Here's a Python script to optimize JSON documents:
#!/usr/bin/python3
"""jopo - Optimize JSON documents."""
from collections import defaultdict
def optimize(data):
"""Optimize a JSON-like data structure."""
if isinstance(data, list):
if len(data) == 1:
return optimize(data[0])
digest = defaultdict(lambda: defaultdict(set))
for element in data:
if not isinstance(element, dict) or len(element) != 2:
break
for key, value in element.items():
digest[key][type(value)].add(repr(value))
else:
if len(digest) == 2:
pool = {k for k in digest if len(digest[k][str]) == len(data)}
if pool:
if len(pool) > 1:
meta = input("Which key (%s)? " % "|".join(pool))
else:
meta = next(iter(pool))
infra = (digest.keys() - {meta}).pop()
return {d[meta]: optimize(d[infra]) for d in data}
return [optimize(x) for x in data]
if isinstance(data, dict):
if len(data) == 1:
return optimize(next(iter(data.values())))
else:
return {k: optimize(v) for k, v in data.items()}
return data
if __name__ == "__main__":
import json
import sys
for arg in sys.argv[1:]:
with open(arg) as f:
doc = json.load(f)
print(json.dumps(optimize(doc), indent=2))
... which, given a JSON document recipes.json
like this:
{
"recipes": [
{
"name": "pizza",
"ingredients": [
{
"quantity": "100g",
"ingredient": "cheese"
},
{
"quantity": "200g",
"ingredient": "tomato"
}
]
},
{
"name": "pizza 2",
"ingredients": [
{
"quantity": "300g",
"ingredient": "ham"
},
{
"quantity": "300g",
"ingredient": "pineapple"
}
]
}
]
}
... is used like this:
$ ./jopo.py recipes.json
Which key (ingredient|quantity)? ingredient
{
"pizza 2": {
"ham": "300g",
"pineapple": "300g"
},
"pizza": {
"tomato": "200g",
"cheese": "100g"
}
}
It does two things:
Collapse 1-item objects and arrays.
Reduce arrays of congruent 2-item objects to a single object, if possible.
I think the optimize()
function is both too long and too hard to follow.
Obviously I could separate out the if isinstance(data, <type>): # ...
blocks into separate functions, but even then, an optimize_list()
function along those lines is still going to be quite long and, IMO, somewhat impenetrable.
How can I improve the legibility of this code?