Take the 2-minute tour ×
Code Review Stack Exchange is a question and answer site for peer programmer code reviews. It's 100% free, no registration required.
#!/usr/bin/env python

import re
import nltk.corpus

def tokenize(text, **kw):
    """
    Break down text into a set of unique lower case words.
    """
    plain_words = nltk.corpus.abc.words()
    stop_words = nltk.corpus.stopwords.words()
    # Defaults
    kw.setdefault('min_len', 4)
    kw.setdefault('max_len', 0)
    kw.setdefault('ignore_list', [])
    kw['ignore_list'] += stop_words

    ok_length = lambda token: [len(token) >= kw['min_len']
                             and kw['max_len'] <= kw['max_len']][0]

    tokens = set([token.lower()
                  for token in re.findall('([A-z]+)+', text)
                  if ok_length(token) and
                  token not in kw['ignore_list']]).intersection(plain_words)

    return tokens

## TESTING tokenize

TEXT = open('/bin/bash', 'r').read()

print tokenize(TEXT,
               min_len=4,
               max_len=0,
               ignore_list=['code'])

My concerns:

  1. Is tokenize() too complex as one function?
  2. Should I replace the list comprehensions?
  3. Does this regex '([A-z]+)+' need improving?
  4. How could I make this code more idiomatic?
  5. Have I overlooked any faults in any of the logic?

Example output:

11:26 PM$ python Tokenize.py 
set(['replacing', 'default', 'all', 'forget', 'chain', 'skip', 'global', 'dollar', 'splitting', 'existing', 'four', 'executing', 'go', 'follow', 'expressions', 'activates', 'saved', 'children', 'causes', 'row', 'whose', 'tv', 'graph', 'discard', 'send', 'environment', 'to', 'topic', 'program', 'marks', 'include', 'sent', 'allocate', 'division', 'random', 'slash', 'dynamic', 'reserved', 'removing', 'manipulated', 'every', 'nesting', 'decide', 'entries', 'locked', 'syntax', 'exact', 'condition', 'entire', 'redistribute', 'magic', 'exits', 'level', 'turns', 'array', 'exec', 'list', 'fewer', 'try', 'mm', 'quick', 'refer', 'upper', 'unexpected', 'force', 'portable', 'be', 'obsolete', 'sign', 'jump', 'consists', 'second', 'displays', 'insertion', 'pass', 'gm', 'even', 'index', 'errors', 'adds', 'sub', 'directive', 'near', 'supplied', 'current', 'seconds', 'waiting', 'version', 'new', 'movement', 'redirect', 'full', 'simultaneously', 'exchange', 'respectively', 'error', 'commercial', 'equals', 'reported', 'objects', 'let', 'undo', 'groups', 'erm', 'active', 'path', '[', 'diagnostic', 'appears', 'change', 'wait', 'digits', 'great', 'copyright', 'coerced', 'handlers', 'items', 'changed', 'allows', 'reports', 'ignoring', 'amount', 'resulting', 'menu', 'usually', 'history', 'makes', 'exited', 'missing', 'composed', 'named', 'via', 'useful', 'extra', 'prefer', 'logical', 'replace', 'visible', 'names', 'apply', 'unit', 'use', 'takes', 'working', 'nine', 'escapes', 'two', 'next', 'r', 'duplicate', 'handler', 'call', 'memory', 'scope', 'type', 'until', 'more', 'separated', 'successful', 'initial', 'operators', 'tested', 'flag', 'controlling', 'encountered', 'disabling', 'must', 'me', 'escaped', 'none', 'te', 'word', 'err', 'indicates', 'this', 'loops', 'work', 'mi', 'modified', 'abort', 'values', 'can', 'socket', 'following', 'making', 'closing', 'my', 'example', 'performed', 'control', 'del', 'prompt', 'links', 'give', 'process', 'lock', 'functions', 'share', 'accept', 'trap', 'high', 'effectively', 'tag', 'numbers', 'allowed', 'scheduling', 'counting', 'audible', 'information', 'rather', 'means', 'j', 'write', 'how', 'silent', 'instead', 'profile', 'map', 'fr', 'blocks', 'description', 'may', 'max', 'resumes', 'tries', 'disable', 'coming', 'date', 'horizontal', 'law', 'data', 'types', 'fo', 'a', 'ambiguous', 'short', 'physical', 'remember', 'third', 'whenever', 'maybe', 'lines', 'bugs', 'element', 'provide', 'expression', 'allow', 'decreasing', 'scroll', 'subsequently', 'operate', 'order', 'se', 'feed', 'breaking', 'interpretation', 'help', 'disables', 'move', 'displayed', 'interpreted', 'disabled', 'timing', 'suspend', 'un', 'differs', 'interpreter', 'still', 'pointer', 'positional', 'style', 'le', 'group', 'monitor', 'curly', 'shifts', 'lo', 'll', 'detected', 'lu', 'systems', 'listing', 'mail', 'hidden', 'main', 'pending', 'split', 'non', 'return', 'greater', 'output', 'matches', 'auto', 'runs', 'number', 'break', 'internally', 'blink', 'killed', 'matched', 'term', 'name', 'ifs', 'always', 'revert', 'identified', 'privileged', 'possibilities', 'applied', 'token', 'inequality', 'stopped', 'mode', 'arrow', 'each', 'found', 'cc', 'reset', 'preceded', 'square', 'invoked', 'generation', 'ed', 'chunk', 'hard', 'frames', 'expect', 'exceeded', 'eu', 'et', 'operation', 'bay', 'event', 'special', 'intended', 'large', 'shown', 'network', 'space', 'restricted', 'since', 'preserve', 'unknown', 'looking', 're', 'acting', 'flushing', 'exporting', 'print', 'got', 'rn', 'cause', 'occurs', 'common', 'foundation', 'turning', 'resume', 'free', 'standard', 'indices', 'base', 'execute', 'put', 'org', 'wanted', 'beginning', 'l', 'software', 'resumed', 'definition', 'g', 'created', 'locations', 'retrieving', 'messages', 'times', 'creates', 'turn', 'length', 'place', 'w', 'assumed', 'timed', 'onto', 'assign', 'first', 'origin', 'already', 'succeeds', 'omitted', 'variables', 'symbolic', 'primary', 'owned', 'one', 'restrict', 'hook', 'done', 'notify', 'suspended', 'blank', 'reached', 'message', 'open', 'braces', 'size', 'given', 'checked', 'exists', 'service', 'redirection', 'meaningful', 'top', 'behaves', 'accent', 'system', 'construct', 'priority', 'indicate', 'returns', 'listed', 'passed', 'typing', 'white', 'final', 'gives', 'shell', 'option', 'trapped', 'ch', 'completed', 'exactly', 'lists', 'copy', 'completes', 'specify', 'character', 'begins', 'b', 'target', 'quantum', 'instruction', 'enabled', 'depends', 'i', 'determined', 'bind', 'enables', 'declare', 'interactive', 'and', 'files', 'false', 'topics', 'turned', 'argument', 'dash', 'width', 'need', 'seen', 'any', 'contents', 'forced', 'zero', 'depending', 'self', 'note', 'also', 'internal', 'build', 'indexed', 'destroy', 'copied', 'brace', 'begin', 'added', 'unless', 'trace', 'normal', 'buffer', 'object', 'leave', 'regular', 'eight', 'printed', 'letter', 'termination', 'nothing', 'alpha', 'segment', 'associative', 'grave', 'appear', 'kg', 'foreground', 'clear', 'later', 'm', 'km', 'looked', 'bracket', 'keywords', 'pattern', 'normally', 'notion', 'selection', 'show', 'text', 'supported', 'brief', 'session', 'beg', 'conditional', 'find', 'completion', 'access', 'based', 'quoted', 'parameters', 'implementation', 'true', 'specified', 'assertion', 'controls', 'terminal', 'failed', 'only', 'inherited', 'override', 'query', 'local', 'columns', 'do', 'specifications', 'invoke', 'get', 'convert', 'de', 'stop', 'da', 'cannot', 'negative', 'words', 'reply', 'report', 'du', 'procedures', 'sorts', 'secondary', 'processes', 'resource', 'horizontally', 'fields', 'remove', 'calling', 'arrays', 'bad', 'processed', 'contain', 'release', 'x', 'fixed', 'automatic', 'flagged', 'ignored', 'set', 'dump', 'frame', 'prints', 'maximum', 'relative', 'see', 'result', 'successive', 'sequences', 'fails', 'evaluation', 'vertical', 'placed', 'ways', 'subsequent', 'currently', 'written', 'protected', 'neither', 'reading', 'conditions', 'checks', 'available', 'suppresses', 'jobs', 'parent', 'opening', 'modify', 'screen', 'sole', 'transpose', 'disallow', 'nd', 'job', 'succeed', 'selectively', 'key', 'interface', 'printing', 'optional', 'valid', 'hits', 'last', 'reverse', 'limits', 'many', 'region', 'la', 'according', 'minus', 'etc', 's', 'context', 'attributes', 'delete', 'whole', 'botched', 'otherwise', 'load', 'pre', 'permitted', 'co', 'extent', 'point', 'simple', 'effective', 'period', 'pop', 'cz', 'simply', 'unsuccessful', 'table', 'allocated', 'indefinite', 'suppressing', 'described', 'duo', 'addition', 'shells', 'create', 'three', 'mark', 'pc', 'treat', 'expected', 'entered', 'empty', 'define', 'generating', 'enable', 'corresponding', 'suppress', 'sufficient', 'search', 'else', 'child', 'an', 'assigning', 'present', '^', 'case', 'handling', 'license', 'these', 'plain', 'expanded', 'examine', 'value', 'n', 'while', 'replaced', 'behavior', 'shift', 'evaluates', 'di', 'property', 'precede', 'loop', 'seven', 've', 'resident', 'is', 'dumped', 'binding', 'it', 'equal', 'vu', 'in', 'ie', 'if', 'binary', 'containing', 'perform', 'make', 'attribute', 'member', 'read', 'arguments', 'freed', 'modification', 'document', 'events', 'resources', 'status', 'used', 'temporary', 'receives', 'keys', 'reporting', 'upon', 'effect', 'alert', 'action', 'running', 'levels', 'uses', 'user', 'characters', 'stack', 'expand', 'recent', 'lower', 'older', 'shared', 'changes', 'well', 'spent', 'options', 'patterns', 'without', 'flags', 'sets', 'y', 'position', 'the', 'left', 'comment', 'newest', 'sourced', 'less', 'percent', 'obtain', 'actions', 'assigned', 'stored', 'kill', 'immediately', 'followed', 'alternative', 'rotates', 'previous', 'adding', 'loading', 'generator', 'grouped', 'bell', 'guaranteed', 'except', 'signals', 'source', 'add', 'setting', 'combine', 'location', 'usage', 'input', 'reusable', 'interprets', 'remaining', 'match', 'take', 'real', 'tests', 'format', 'rules', 'evaluate', 'showing', 'unlimited', 'possible', 'five', 'background', 'using', 'bit', 'accepted', 'string', 'd', 'insert', 'appearing', 'like', 'success', 'sizes', 'signal', 'performing', 'manual', 'specific', 'exhausted', 'continue', 'hosts', 't', 'become', 'soft', 'attempting', 'right', 'old', 'often', 'sequence', 'oriented', 'creation', 'some', 'back', 'oh', 'export', 'evaluated', 'loaded', 'duration', 'multiple', 'matching', 'reasons', 'ignore', 'describing', 'for', 'notification', 'avoid', 'though', 'comments', 'disk', 'exit', 'select', 'provides', 'indication', 'leader', 'either', 'core', 'command', 'run', 'remembered', 'equivalent', 'processing', 'continuing', 'bi', 'expansion', 'utilities', 'host', 'display', 'offset', 'leftover', 'post', 'refers', 'by', 'comparison', 'pipeline', 'ok', 'would', 'getting', 'column', 'of', 'http', 'o', 'page', 'stamp', 'range', 'plus', 'stand', 'illegal', 'connected', 'os', 'or', 'block', 'op', 'contains', 'letters', 'previously', 'within', 'bound', 'son', 'en', 'determine', 'operator', 'accumulated', 'exchanges', 'terminated', 'statistics', 'additional', 'waits', ']', 'there', 'question', 'long', 'start', 'restricts', 'editor', 'way', 'forward', 'eg', 'combined', 'function', 'head', 'successfully', 'complete', 'form', 'hr', 'attempted', 'removes', 'commands', 'failure', 'manipulate', 'hi', 'link', 'newer', 'line', 'with', 'bug', 'he', 'count', 'entry', 'places', 'versions', 'whether', 'wish', 'caller', 'up', 'us', 'record', 'carriage', 'converted', 'limit', 'fetch', 'pm', 'similar', 'called', 'connect', 'detailed', 'storing', 'definitions', 'associated', 'ad', 'ag', 'defined', 'pseudo', 'universal', 'escape', 'incremental', 'al', 'general', 'consumed', 'single', 'warning', 'exist', 'at', 'file', 'home', 'importing', 'trailing', 'check', 'defines', 'echo', 'pipe', 'marking', 'remainder', 'no', 'when', 'virtual', 'started', 'other', 'outputs', 'test', 'you', 'acceptable', 'arithmetic', 'formats', 'elements', 'star', 'colon', 'separate', 'preceding', 'searched', 'reused', 'includes', 'generated', 'exported', 'variable', 'structure', 'opened', 'e', 'requires', 'required', 'mask', 'visual', 'strings', 'u', 'time', 'directory', 'backward', 'starting', 'original'])
share|improve this question
add comment

2 Answers

  1. No, tokenize is not too complex for one function. It does a single operation: it tokenizes a string.
  2. Your list comprehensions are fine. They are somewhat complex but are still readable and easily understandable.
  3. Instead of using the regex ([A-z]+)+ you could simply use the 'shortcut' `\w+'. This regex will match one-to-many words.

    If you wanted your regex to recognize hyphenated words, a simple change will suffice:

    \w(-?\w)*
    
  4. Your code is quite Pythonic. Your spacing and indentation is fine. The same goes for your variable names. If you have more concerns about style, consult PEP8, the official Python style guide.
  5. There is a flaw in your logic though. Take your lambda expression:

    ok_length = lambda token: [len(token) >= kw['min_len']
                         and kw['max_len'] <= kw['max_len']][0]
    

    Currently, this will except any string whose length is greater than the min_length this is because you have kw['max_len'] == kw['max_len'] which will always evaluate True. You also have the conditionals switched around. This is how you should have the check:

    ok_length = lambda token: [kw['min_length'] <= len(token) <= kw['max_length']][0]
    

    Now, the question you need to ask is, whether or not it pays off to use a lambda expression here. Is the functionality 'deserving' of a lambda expression? Or can the check simply be placed into the list comprehension a little later on? Here's what it would look like, I'll let you decide (I like this way):

    tokens = set([token.lower()
              for token in re.findall('([A-z]+)+', text)
              if kw['min_len'] <= len(token) <= kw['max_len'] and
              token not in kw['ignore_list']]).intersection(plain_words)
    
  6. The next thing I would consider is your used of the kw dict. How needed is it? Could you just implement optional parameters in the function declaration? I like that implementation because this function does not (and probably won't) take an arbitrary number of keyword arguments. It will only take the 3. As a related side note, you have swapped your max_len and min_len values. Currently, max_len < min_len.

  7. Finally, I would remove the intersection function call when you assign the tokens variable. The name tokens and the context with which its generated implies that tokens will hold all tokens that match the given regex. However, in your case, it only contains that are also in plain_words. I would wait to do the intersection until the return statement:

    return tokens.intersection(plain_words)
    

Here is 'my' version of the tokenize function:

def tokenize(text, min_len=0, max_len=4, ignore_list=[]):
    """
    Break down text into a set of unique lower case words.
    """
    plain_words = nltk.corpus.abc.words()
    stop_words = nltk.corpus.stopwords.words()
    ignore_list += stop_words

    tokens = set([token.lower()
                  for token in re.findall('\w+', text)
                  if min_len <= len(token) <= max_len and
                  token not in kw['ignore_list']])

    return tokens.intersection(plain_words)
share|improve this answer
add comment

Here is my second version of tokenize after reading DarinDouglass's answer.

#!/usr/bin/env python

"""
Tokenization is the process of breaking a stream of text up into words, phrases,
symbols, or other meaningful elements called tokens. The list of tokens becomes
input for further processing such as parsing or text mining. -- Wikipedia
"""

import re
import nltk.corpus

def tokenize(**kw):
    """ Tokenize string data """
    # Defaults
    kw.setdefault('text', None)
    kw.setdefault('min_len', 4)
    kw.setdefault('ignore_list', [])
    kw.setdefault('filter_stopwords', False)
    kw.setdefault('filter_plainwords', False)

    if  kw['text'] == None:
        return None

    if not kw['filter_stopwords'] == False:
        kw['ignore_list'] += nltk.corpus.stopwords.words()

    if  kw['filter_plainwords'] == True:
        plain_words = nltk.corpus.abc.words()
    else:
        plain_words = []

    matches = set(re.findall('([A-z]+)+', kw['text']))

    tokens = set([token.lower() for token in matches
                  if token not in kw['ignore_list']
                  and len(token) >= kw['min_len']])

    if not kw['filter_plainwords']:
        return tokens
    else:
        return tokens.intersection(plain_words)

def test():
    """ tokenize() test """
    import requests
    text = raw_input("path or url to string data > ")
    if not text.startswith('http'):
        text = open(text, 'r').read()
    else:
        text = requests.get(text).content
    print tokenize(text=text,
                   min_len=4,
                   ignore_list=['code'],
                   filter_plainwords=True,
                   filter_stopwords=True)


## TESTING tokenize
if __name__ == '__main__':
    test()
share|improve this answer
add comment

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Not the answer you're looking for? Browse other questions tagged or ask your own question.