JSON templates in Python

Question

I am looking for a general review of my project here. I have written some Python, but not enough to feel confident in my ability to produce idiomatic code.

#!/usr/bin/env python

"""mockjson.py: Library for mocking JSON objects from a template."""

__author__ = "James McMahon"
__copyright__ = "Copyright 2012, James McMahon"
__license__ = "MIT"

try:
    import simplejson as json
except ImportError:
    import json
import random
import re
import string
import sys

from datetime import datetime, timedelta

_male_first_name = ("James", "John", "Robert", "Michael", "William", "David",
    "Richard", "Charles", "Joseph", "Thomas", "Christopher", "Daniel",
    "Paul", "Mark", "Donald", "George", "Kenneth", "Steven", "Edward",
    "Brian", "Ronald", "Anthony", "Kevin", "Jason", "Matthew", "Gary",
    "Timothy", "Jose", "Larry", "Jeffrey", "Frank", "Scott", "Eric")
_female_first_name = ("Mary", "Patricia", "Linda", "Barbara", "Elizabeth",
    "Jennifer", "Maria", "Susan", "Margaret", "Dorothy", "Lisa", "Nancy",
    "Karen", "Betty", "Helen", "Sandra", "Donna", "Carol", "Ruth", "Sharon",
    "Michelle", "Laura", "Sarah", "Kimberly", "Deborah", "Jessica",
    "Shirley", "Cynthia", "Angela", "Melissa", "Brenda", "Amy", "Anna")
_last_name = ("Smith", "Johnson", "Williams", "Brown", "Jones", "Miller",
    "Davis", "Garcia", "Rodriguez", "Wilson", "Martinez", "Anderson",
    "Taylor", "Thomas", "Hernandez", "Moore", "Martin", "Jackson",
    "Thompson", "White", "Lopez", "Lee", "Gonzalez", "Harris", "Clark",
    "Lewis", "Robinson", "Walker", "Perez", "Hall", "Young", "Allen")
_lorem = tuple("""lorem ipsum dolor sit amet consectetur adipisicing elit
        sed do eiusmod tempor incididunt ut labore et dolore magna aliqua
        Ut enim ad minim veniam quis nostrud exercitation ullamco laboris
        nisi ut aliquip ex ea commodo consequat Duis aute irure dolor in
        reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla
        pariatur Excepteur sint occaecat cupidatat non proident sunt in
        culpa qui officia deserunt mollit anim id est laborum""".split())


def _random_item(items):
    return items[random.randrange(len(items))]


def _random_data(key):
    key = key.lstrip('@')
    if not key in data:
        return key
    return data[key]()


def _lorem_ipsum():
    length = random.randrange(2, len(_lorem) / 2)
    result = ''
    for i in xrange(length):
        result += ' ' + _lorem[random.randrange(len(_lorem))]
    return result.strip()


def _random_date():
    return datetime.today() - timedelta(days=random.randrange(6571, 27375))

data = {
    'NUMBER': lambda: _random_item("0123456789"),
    'LETTER_UPPER': lambda: _random_item(string.uppercase),
    'LETTER_LOWER': lambda: _random_item(string.lowercase),
    'MALE_FIRST_NAME': lambda: _random_item(_male_first_name),
    'FEMALE_FIRST_NAME': lambda: _random_item(_female_first_name),
    'LAST_NAME': lambda: _random_item(_last_name),
    'EMAIL': lambda: (_random_data('@LETTER_LOWER')
                      + '.'
                      + _random_data('@LAST_NAME').lower()
                      + '@'
                      + _random_data('@LAST_NAME').lower()
                      + '.com'),
    'LOREM': lambda: _random_item(_lorem),
    'LOREM_IPSUM': _lorem_ipsum,
    'DATE_YYYY': lambda: str(_random_date().year),
    'DATE_MM': lambda: str(_random_date().month).zfill(2),
    'DATE_DD': lambda: str(_random_date().day).zfill(2),
    'TIME_HH': lambda: str(_random_date().hour).zfill(2),
    'TIME_MM': lambda: str(_random_date().minute).zfill(2),
    'TIME_SS': lambda: str(_random_date().second).zfill(2)
}


def generate_json_object(template, name=None):
    length = 0
    if name:
        matches = re.search(r"\w+\|(\d+)-(\d+)", name)
        if matches:
            groups = matches.groups()
            length_min = int(groups[0])
            length_max = int(groups[1])
            length = random.randint(length_min, length_max)

    t_type = type(template)
    if t_type is dict:
        generated = {}
        for key, value in template.iteritems():
            stripped_key = re.sub(r"\|(\d+-\d+|\+\d+)", '', key)
            generated[stripped_key] = generate_json_object(value, key)

            # handle increments
            inc_matches = re.search(r"\w+\|\+(\d+)", key)
            if inc_matches and type(template[key]) is int:
                increment = int(inc_matches.groups()[0])
                template[key] += increment
    elif t_type is list:
        generated = []
        for i in xrange(length):
            generated.append(generate_json_object(template[0]))
    elif t_type is int:
        generated = length if matches else template
    elif t_type is bool:
        # apparently getrandbits(1) is faster...
        generated = random.choice([True, False]) if matches else template
    # is this always just going to be unicode here?
    elif t_type is str or t_type is unicode:
        if template:
            generated = ''
            length = length if length else 1
            for i in range(length):
                generated += template
            matches = re.findall(r"(@[A-Z_0-9\(\),]+)", generated)
            if matches:
                for key in matches:
                    rd = _random_data(key)
                    generated = generated.replace(key, rd, 1)
        else:
            generated = (''.join(random.choice(string.letters)
                         for i in xrange(length)))
    else:
        generated = template
    return generated


def generate_json(template, name=None):
    return json.dumps(generate_json_object(json_data), sort_keys=False)


if __name__ == '__main__':
    arg = sys.argv[1:][0]
    with open(arg) as f:
        json_data = json.load(f)
    print(generate_json(json_data))

It would be better if you comment(heavily) on your code so that it could be easily understood. — user16275, Sep 3 '12 at 8:09

Gareth Rees · Accepted Answer · 2012-09-03 18:33:58Z

1. Introduction

This review grew to be very long, so I'll say up front that you shouldn't take the length of this to heart: your code is not bad, especially if you are new to Python. There's always a lot of things to say about a piece of code of this length, and idiomatic Python has a bunch of features (sets, generators, comprehensions, iterators) that will be unfamiliar to users of some other languages. So take everything I have to say with a pinch of salt (except for item #1 under "General comments", which is really the only big problem here).

2. General comments

By far the most important problem is your documentation. What is your library supposed to do? How do it use it? It is impossible to review code without understanding its purpose.

As a user I expect to be able to write help(mockjson) to get a description of a module and help(mockjson.generate_json) to get a description of a function, but there's nothing here. Your online documentation is also unhelpful.

After some hunting around I eventually found Mennon van Slooten's mockJSON documentation. I'm going to proceed on the basis that your purpose is to reimplement this in Python.
There's no test suite. I know it's hard for you to write test cases because of the randomness. You probably want to take a hint from Mennon van Slooten and provide a way for to change the random choices to a deterministic sequence of choices.
Your collections of example data (_male_first_name etc.) are stored as tuples, but it would be better to store these as lists. Tuples are fixed-size collections typically used as lightweight representations of records. Lists sets are variable-size collections typically containing similar items.
Your example data is very America-centric. Given that the purpose of your module is to produce example data for testing, it will be useful to have a wider variety of names, including names with accented letters, or in different scripts.

It is generally results in source code that is easier to read (with fewer quotation characters and commas) if you produce this kind of data using split. For example,

_female_first_names = u"""
    Mary Patricia Linda Barbara Elizabeth
    Zoé Yến София فاطمة 明子 美玲 ยิ่งลักษณ์
    """.split()

The function _random_item is already in the Python library as random.choice.
The string "0123456789" is already in the Python library as string.digits.
It's not clear why _random_data needs to strip all initial @ signs from its argument before trying to look it up. You only call this function in a context where you could easily strip the @. It also fails to raise an error if the key is missing (so that mistakes like @NUBMER are not caught).

Your definitions of data will look nicer (fewer quotation marks) if you use the dict constructor:

data = dict(
    NUMBER = lambda: _random_item("0123456789"),
    LETTER_UPPER = lambda: _random_item(string.uppercase),
    LETTER_LOWER = lambda: _random_item(string.lowercase),
    ...

Since most of the values in data are of the form lambda: _random_item(...), why not special-case them to save on boilerplate? You could write something like:

def _random_data(key):
    """
    Construct a random data item for `key` and return it.
    Raise KeyError if `key` is unknown.
    """
    constructor = data[key]
    if isinstance(constructor, types.FunctionType):
        return constructor()
    else:
        return random.choice(constructor)

and then:

data = dict(
    NUMBER = string.digits,
    LETTER_UPPER = string.uppercase,
    LETTER_LOWER = string.lowercase,
    ...

Building a string by repeatedly extending it with += is a well-known anti-pattern in Python. This is because extending a string is inefficient in Python: a new string gets allocated and the old string copied across each time. This means that any algorithm that builds a string by repeated extension runs like O(n²). It is nearly always better to generate the items to be assembled into the string and then join them to produce the result. This technique also avoids fencepost errors like spurious extra spaces at the start or end. So your could write:
```
def _lorem_ipsum():
    """
    Return a random paragraph of placeholder text.
    """
    length = random.randrange(2, len(_lorem) / 2)
    return ' '.join(random.choice(_lorem) for _ in xrange(length))
```
generate_json takes an optional argument name which is always ignored. You can omit this.

3. Comments on `generate_json_object`

The name could be improved: generate has a special meaning in Python (referring to the output of a generator), and the json part is misleading since there's nothing JSON-specific about this function (it operates on Python objects, not on JSON representations). Since instantiate is often used for filling in a template, I think I would use a name like instantiate_object. (And instantiate_json for generate_json.)
The variable length is misleadingly named because it's not always a length (it might just be a number). Perhaps n would be better. You also have to go through contortions because you are overloading its meaning: it's None if no number was specified (in which case sometimes it needs to be treated as if it has the value 1), or else it's a number. It would be much clearer to have a separate variable (have_n in my code below) that indicates whether a number was supplied.
Your regular expression for finding repeat counts in JSON keys is \w+\|(\d+)-(\d+). This matches strings like "a|0-2trailing garbage". It's probably a good idea to anchor it to the end of the string (\w\|(\d+)-(\d+)$). Or maybe, depending on exactly what you want to match, to the beginning of the string as well: re.match(r'\w+\|(\d+)-(\d+)$', name). But then you need to decide what to do for non-matching keys. Maybe raise an exception?
The structure
```
matches = re.search(...)
if matches:
     ...
```
is so common that readers will know what you mean if you abbreviate matches to m.
matches.group(0) is simpler than matches.groups()[0].
You perform three regular expression matches on the key: first, to find the repeat counts, second, to strip all suffixes, and third, to find the increment. If you refactor the function to take the repeat count as an argument (instead of the name), you can do these three matches all at the same time, like this:
```
m = re.match(r"^(\w+)(?:\|(?:(\d+)-(\d+)|\+(\d+)))?$", '', key)
if not m:
    raise ValueError("Bad key: {0}".format(key))
have_n = False
n = 1
if m.group(2):
    have_n = True
    n = random.randint(int(m.group(2)), int(m.group(3)))
increment = 0
if m.group(4):
    increment = int(m.group(4))
```
Now this doesn't quite work as I've written it above because of the way you've structured the function. Having used your regular expression to parsed the number and increment out of the key, you then pass the name when you make your recursive call, which means that the number has to be parsed again. To do this, you need to restructure the function so that instead of passing name to the recursive call, you pass have_n, n and increment. See later on where I give the whole text of the function as revised.
It is usually better in Python to test if an object belongs to a type t by writing isinstance(object, t) rather than type(object) is t. The reason is that object may belong to a subtype of t (for example, a defaultdict or OrderedDict instead of a plain dict).

There is a gotcha here, which is that bool is a subtype of int, so you need to order your tests so that Booleans get tested before integers.
The structure of generate_json_object involves various branches which assign an object to the variable generated. This is then returned at the end. Why not just return the object directly and avoid the local variable?
When you're building a list, you do this by repeatedly calling append. This is not quite as bad as repeatedly calling += on a string (lists are a bit more flexible), but you can simplify this code by using a list comprehension:
```
elif isinstance(template, list):
    return [generate_json_object(template[0]) for _ in xrange(length)]
```
Note that when the loop variable is ignored (as here) it's conventional in Python to name it _.
When you handle an incremented value, you write template[key] += increment which updates the template permanently. That's all very well if you are going to use the template just once, but what if you want to use the same template many times?

In order to allow re-use of templates, you are going to have to store the current increment for each template key in a separate data structure from the template, so that you can throw away this data structure when your instantiation is complete. The way to do this is to use a dictionary whose keys are locations in the template and whose values are the current increment for that location.

Now, how can we represent a location in a template in such a way that we can use it as a key in a dictionary? Well, a location is a template is always a key in a dictionary, so we'd like to represent it as the pair (dictionary, key). However, this won't work, because only immutable (constant) objects can be used as keys in dictionaries in Python, and the template (being a dictionary) is mutable. But Python provides the function id which returns a unique identifier for each object in memory, and we can use that instead:
```
increment = 0
template_key = (id(template), key)
if template_key in _increments:
    increment = _increments[template_key]
elif m.group(4):
    increment = int(m.group(4))
    _increments[template_key] = increment
```
Where should this _increments dictionary be stored? We need a new one for each instantiation we do, but it needs to persist throughout the recursive series of calls. We could pass it around as another function parameter, but if you find yourself doing this, that's a sign that you need to use a class. Again, see the final version at the end for how this can be done.

The code I gave above isn't quite right, because although it remembers the increment, it doesn't actually update it each time it is used. Python has a very neat way to make this happen: iterators. Instead of storing increment, we'll construct an iterator that yields the values 0, increment, 2*increment, 3*increment, and so on. The function itertools.count does this job. And when there's no increment, we'll construct an iterator that always yields the value 0 no matter how many times we call it (using itertools.repeat):
```
increment = itertools.repeat(0)
if template_key in self._increments:
    increment = self._increments[template_key]
elif m.group(4):
    increment = itertools.count(start = 0, step = int(m.group(4)))
    self._increments[template_key] = increment
```

4. Revised instantiation code

OK, let's put all that together. (There are a couple of bonus improvements in here for you to spot for yourself!)

_constantly_zero = itertools.repeat(0)

class _Instantiator(object):
    def __init__(self):
        self._increments = dict()

    def instantiate(self, template, have_n = False, n = 1,
                    increment = _constantly_zero):
        if isinstance(template, dict):
            generated = {}
            for key, value in template.iteritems():
                m = re.match(r"^(\w+)(?:\|(?:(\d+)-(\d+)|\+(\d+)))?$", key)
                if not m:
                    raise ValueError("Bad key: {0}".format(key))
                have_n = False
                n = 1
                if m.group(2):
                    have_n = True
                    n = random.randint(int(m.group(2)), int(m.group(3)))
                template_key = (id(template), key)
                increment = _constantly_zero
                if template_key in self._increments:
                    increment = self._increments[template_key]
                elif m.group(4):
                    increment = itertools.count(start = 0, step = int(m.group(4)))
                    self._increments[template_key] = increment
                generated[m.group(1)] = self.instantiate(value, have_n, n, increment)
            return generated
        elif isinstance(template, list):
            return [self.instantiate(template[0]) for _ in xrange(n)]
        elif isinstance(template, bool):
            if have_n:
                return bool(n)
            else:
                return template
        elif isinstance(template, int):
            if have_n:
                return n
            else:
                return template + next(increment)
        elif isinstance(template, str) or isinstance(template, unicode):
            if template:
                pattern = re.compile(r'@([A-Z_0-9]+)')
                repl = lambda m: _random_data(m.group(1))
                return ''.join(pattern.sub(repl, template) for _ in xrange(n))
            else:
                return ''.join(random.choice(string.letters) for _ in xrange(n))
        else:
            return template

def instantiate_object(template):
    """
    Instantiate an object based on `template` and return it.
    """
    return _Instantiator().instantiate(template)

I really appreciate you taking the time to review my code and write this up. As time permits I will go through my code and make many of the improvements you suggested. I do have a question about the list vs tuple point (number 3). I has assumed because my data is immutable I should make it a tuple, is this not the case? — James McMahon, Sep 4 '12 at 2:13
You're right that tuples are immutable and lists are mutable. But mutability only matters when you are storing objects in sets or as dictionary keys. When you're not worried about whether your collection can be hashed, the rule of thumb is that fixed-size records of different objects are best represented as tuples, while variable-size collections of similar objects are best represented as lists. But nothing particularly bad will happen if you don't follow this rule of thumb. — Gareth Rees, Sep 4 '12 at 9:42

asked	3 years ago
viewed	1080 times
active	1 month ago

current community

your communities

more stack exchange communities

JSON templates in Python

1 Answer 1

1. Introduction

2. General comments

3. Comments on `generate_json_object`

4. Revised instantiation code

Your Answer

Not the answer you're looking for? Browse other questions tagged python json or ask your own question.

Hot Network Questions

current community

your communities

more stack exchange communities

JSON templates in Python

1 Answer 1

1. Introduction

2. General comments

3. Comments on generate_json_object

4. Revised instantiation code

Your Answer

Sign up or log in

Post as a guest

Not the answer you're looking for? Browse other questions tagged python json or ask your own question.

Related

Hot Network Questions

3. Comments on `generate_json_object`