Python function to find and tokenize string data

Question

#!/usr/bin/env python

import re
import nltk.corpus

def tokenize(text, **kw):
    """
    Break down text into a set of unique lower case words.
    """
    plain_words = nltk.corpus.abc.words()
    stop_words = nltk.corpus.stopwords.words()
    # Defaults
    kw.setdefault('min_len', 4)
    kw.setdefault('max_len', 0)
    kw.setdefault('ignore_list', [])
    kw['ignore_list'] += stop_words

    ok_length = lambda token: [len(token) >= kw['min_len']
                             and kw['max_len'] <= kw['max_len']][0]

    tokens = set([token.lower()
                  for token in re.findall('([A-z]+)+', text)
                  if ok_length(token) and
                  token not in kw['ignore_list']]).intersection(plain_words)

    return tokens

## TESTING tokenize

TEXT = open('/bin/bash', 'r').read()

print tokenize(TEXT,
               min_len=4,
               max_len=0,
               ignore_list=['code'])

My concerns:

Is tokenize() too complex as one function?
Should I replace the list comprehensions?
Does this regex '([A-z]+)+' need improving?
How could I make this code more idiomatic?
Have I overlooked any faults in any of the logic?

Example output:

11:26 PM$ python Tokenize.py 
set(['replacing', 'default', 'all', 'forget', 'chain', 'skip', 'global', 'dollar', 'splitting', 'existing', 'four', 'executing', 'go', 'follow', 'expressions', 'activates', 'saved', 'children', 'causes', 'row', 'whose', 'tv', 'graph', 'discard', 'send', 'environment', 'to', 'topic', 'program', 'marks', 'include', 'sent', 'allocate', 'division', 'random', 'slash', 'dynamic', 'reserved', 'removing', 'manipulated', 'every', 'nesting', 'decide', 'entries', 'locked', 'syntax', 'exact', 'condition', 'entire', 'redistribute', 'magic', 'exits', 'level', 'turns', 'array', 'exec', 'list', 'fewer', 'try', 'mm', 'quick', 'refer', 'upper', 'unexpected', 'force', 'portable', 'be', 'obsolete', 'sign', 'jump', 'consists', 'second', 'displays', 'insertion', 'pass', 'gm', 'even', 'index', 'errors', 'adds', 'sub', 'directive', 'near', 'supplied', 'current', 'seconds', 'waiting', 'version', 'new', 'movement', 'redirect', 'full', 'simultaneously', 'exchange', 'respectively', 'error', 'commercial', 'equals', 'reported', 'objects', 'let', 'undo', 'groups', 'erm', 'active', 'path', '[', 'diagnostic', 'appears', 'change', 'wait', 'digits', 'great', 'copyright', 'coerced', 'handlers', 'items', 'changed', 'allows', 'reports', 'ignoring', 'amount', 'resulting', 'menu', 'usually', 'history', 'makes', 'exited', 'missing', 'composed', 'named', 'via', 'useful', 'extra', 'prefer', 'logical', 'replace', 'visible', 'names', 'apply', 'unit', 'use', 'takes', 'working', 'nine', 'escapes', 'two', 'next', 'r', 'duplicate', 'handler', 'call', 'memory', 'scope', 'type', 'until', 'more', 'separated', 'successful', 'initial', 'operators', 'tested', 'flag', 'controlling', 'encountered', 'disabling', 'must', 'me', 'escaped', 'none', 'te', 'word', 'err', 'indicates', 'this', 'loops', 'work', 'mi', 'modified', 'abort', 'values', 'can', 'socket', 'following', 'making', 'closing', 'my', 'example', 'performed', 'control', 'del', 'prompt', 'links', 'give', 'process', 'lock', 'functions', 'share', 'accept', 'trap', 'high', 'effectively', 'tag', 'numbers', 'allowed', 'scheduling', 'counting', 'audible', 'information', 'rather', 'means', 'j', 'write', 'how', 'silent', 'instead', 'profile', 'map', 'fr', 'blocks', 'description', 'may', 'max', 'resumes', 'tries', 'disable', 'coming', 'date', 'horizontal', 'law', 'data', 'types', 'fo', 'a', 'ambiguous', 'short', 'physical', 'remember', 'third', 'whenever', 'maybe', 'lines', 'bugs', 'element', 'provide', 'expression', 'allow', 'decreasing', 'scroll', 'subsequently', 'operate', 'order', 'se', 'feed', 'breaking', 'interpretation', 'help', 'disables', 'move', 'displayed', 'interpreted', 'disabled', 'timing', 'suspend', 'un', 'differs', 'interpreter', 'still', 'pointer', 'positional', 'style', 'le', 'group', 'monitor', 'curly', 'shifts', 'lo', 'll', 'detected', 'lu', 'systems', 'listing', 'mail', 'hidden', 'main', 'pending', 'split', 'non', 'return', 'greater', 'output', 'matches', 'auto', 'runs', 'number', 'break', 'internally', 'blink', 'killed', 'matched', 'term', 'name', 'ifs', 'always', 'revert', 'identified', 'privileged', 'possibilities', 'applied', 'token', 'inequality', 'stopped', 'mode', 'arrow', 'each', 'found', 'cc', 'reset', 'preceded', 'square', 'invoked', 'generation', 'ed', 'chunk', 'hard', 'frames', 'expect', 'exceeded', 'eu', 'et', 'operation', 'bay', 'event', 'special', 'intended', 'large', 'shown', 'network', 'space', 'restricted', 'since', 'preserve', 'unknown', 'looking', 're', 'acting', 'flushing', 'exporting', 'print', 'got', 'rn', 'cause', 'occurs', 'common', 'foundation', 'turning', 'resume', 'free', 'standard', 'indices', 'base', 'execute', 'put', 'org', 'wanted', 'beginning', 'l', 'software', 'resumed', 'definition', 'g', 'created', 'locations', 'retrieving', 'messages', 'times', 'creates', 'turn', 'length', 'place', 'w', 'assumed', 'timed', 'onto', 'assign', 'first', 'origin', 'already', 'succeeds', 'omitted', 'variables', 'symbolic', 'primary', 'owned', 'one', 'restrict', 'hook', 'done', 'notify', 'suspended', 'blank', 'reached', 'message', 'open', 'braces', 'size', 'given', 'checked', 'exists', 'service', 'redirection', 'meaningful', 'top', 'behaves', 'accent', 'system', 'construct', 'priority', 'indicate', 'returns', 'listed', 'passed', 'typing', 'white', 'final', 'gives', 'shell', 'option', 'trapped', 'ch', 'completed', 'exactly', 'lists', 'copy', 'completes', 'specify', 'character', 'begins', 'b', 'target', 'quantum', 'instruction', 'enabled', 'depends', 'i', 'determined', 'bind', 'enables', 'declare', 'interactive', 'and', 'files', 'false', 'topics', 'turned', 'argument', 'dash', 'width', 'need', 'seen', 'any', 'contents', 'forced', 'zero', 'depending', 'self', 'note', 'also', 'internal', 'build', 'indexed', 'destroy', 'copied', 'brace', 'begin', 'added', 'unless', 'trace', 'normal', 'buffer', 'object', 'leave', 'regular', 'eight', 'printed', 'letter', 'termination', 'nothing', 'alpha', 'segment', 'associative', 'grave', 'appear', 'kg', 'foreground', 'clear', 'later', 'm', 'km', 'looked', 'bracket', 'keywords', 'pattern', 'normally', 'notion', 'selection', 'show', 'text', 'supported', 'brief', 'session', 'beg', 'conditional', 'find', 'completion', 'access', 'based', 'quoted', 'parameters', 'implementation', 'true', 'specified', 'assertion', 'controls', 'terminal', 'failed', 'only', 'inherited', 'override', 'query', 'local', 'columns', 'do', 'specifications', 'invoke', 'get', 'convert', 'de', 'stop', 'da', 'cannot', 'negative', 'words', 'reply', 'report', 'du', 'procedures', 'sorts', 'secondary', 'processes', 'resource', 'horizontally', 'fields', 'remove', 'calling', 'arrays', 'bad', 'processed', 'contain', 'release', 'x', 'fixed', 'automatic', 'flagged', 'ignored', 'set', 'dump', 'frame', 'prints', 'maximum', 'relative', 'see', 'result', 'successive', 'sequences', 'fails', 'evaluation', 'vertical', 'placed', 'ways', 'subsequent', 'currently', 'written', 'protected', 'neither', 'reading', 'conditions', 'checks', 'available', 'suppresses', 'jobs', 'parent', 'opening', 'modify', 'screen', 'sole', 'transpose', 'disallow', 'nd', 'job', 'succeed', 'selectively', 'key', 'interface', 'printing', 'optional', 'valid', 'hits', 'last', 'reverse', 'limits', 'many', 'region', 'la', 'according', 'minus', 'etc', 's', 'context', 'attributes', 'delete', 'whole', 'botched', 'otherwise', 'load', 'pre', 'permitted', 'co', 'extent', 'point', 'simple', 'effective', 'period', 'pop', 'cz', 'simply', 'unsuccessful', 'table', 'allocated', 'indefinite', 'suppressing', 'described', 'duo', 'addition', 'shells', 'create', 'three', 'mark', 'pc', 'treat', 'expected', 'entered', 'empty', 'define', 'generating', 'enable', 'corresponding', 'suppress', 'sufficient', 'search', 'else', 'child', 'an', 'assigning', 'present', '^', 'case', 'handling', 'license', 'these', 'plain', 'expanded', 'examine', 'value', 'n', 'while', 'replaced', 'behavior', 'shift', 'evaluates', 'di', 'property', 'precede', 'loop', 'seven', 've', 'resident', 'is', 'dumped', 'binding', 'it', 'equal', 'vu', 'in', 'ie', 'if', 'binary', 'containing', 'perform', 'make', 'attribute', 'member', 'read', 'arguments', 'freed', 'modification', 'document', 'events', 'resources', 'status', 'used', 'temporary', 'receives', 'keys', 'reporting', 'upon', 'effect', 'alert', 'action', 'running', 'levels', 'uses', 'user', 'characters', 'stack', 'expand', 'recent', 'lower', 'older', 'shared', 'changes', 'well', 'spent', 'options', 'patterns', 'without', 'flags', 'sets', 'y', 'position', 'the', 'left', 'comment', 'newest', 'sourced', 'less', 'percent', 'obtain', 'actions', 'assigned', 'stored', 'kill', 'immediately', 'followed', 'alternative', 'rotates', 'previous', 'adding', 'loading', 'generator', 'grouped', 'bell', 'guaranteed', 'except', 'signals', 'source', 'add', 'setting', 'combine', 'location', 'usage', 'input', 'reusable', 'interprets', 'remaining', 'match', 'take', 'real', 'tests', 'format', 'rules', 'evaluate', 'showing', 'unlimited', 'possible', 'five', 'background', 'using', 'bit', 'accepted', 'string', 'd', 'insert', 'appearing', 'like', 'success', 'sizes', 'signal', 'performing', 'manual', 'specific', 'exhausted', 'continue', 'hosts', 't', 'become', 'soft', 'attempting', 'right', 'old', 'often', 'sequence', 'oriented', 'creation', 'some', 'back', 'oh', 'export', 'evaluated', 'loaded', 'duration', 'multiple', 'matching', 'reasons', 'ignore', 'describing', 'for', 'notification', 'avoid', 'though', 'comments', 'disk', 'exit', 'select', 'provides', 'indication', 'leader', 'either', 'core', 'command', 'run', 'remembered', 'equivalent', 'processing', 'continuing', 'bi', 'expansion', 'utilities', 'host', 'display', 'offset', 'leftover', 'post', 'refers', 'by', 'comparison', 'pipeline', 'ok', 'would', 'getting', 'column', 'of', 'http', 'o', 'page', 'stamp', 'range', 'plus', 'stand', 'illegal', 'connected', 'os', 'or', 'block', 'op', 'contains', 'letters', 'previously', 'within', 'bound', 'son', 'en', 'determine', 'operator', 'accumulated', 'exchanges', 'terminated', 'statistics', 'additional', 'waits', ']', 'there', 'question', 'long', 'start', 'restricts', 'editor', 'way', 'forward', 'eg', 'combined', 'function', 'head', 'successfully', 'complete', 'form', 'hr', 'attempted', 'removes', 'commands', 'failure', 'manipulate', 'hi', 'link', 'newer', 'line', 'with', 'bug', 'he', 'count', 'entry', 'places', 'versions', 'whether', 'wish', 'caller', 'up', 'us', 'record', 'carriage', 'converted', 'limit', 'fetch', 'pm', 'similar', 'called', 'connect', 'detailed', 'storing', 'definitions', 'associated', 'ad', 'ag', 'defined', 'pseudo', 'universal', 'escape', 'incremental', 'al', 'general', 'consumed', 'single', 'warning', 'exist', 'at', 'file', 'home', 'importing', 'trailing', 'check', 'defines', 'echo', 'pipe', 'marking', 'remainder', 'no', 'when', 'virtual', 'started', 'other', 'outputs', 'test', 'you', 'acceptable', 'arithmetic', 'formats', 'elements', 'star', 'colon', 'separate', 'preceding', 'searched', 'reused', 'includes', 'generated', 'exported', 'variable', 'structure', 'opened', 'e', 'requires', 'required', 'mask', 'visual', 'strings', 'u', 'time', 'directory', 'backward', 'starting', 'original'])

DarinDouglass · Answer 1 · 2014-05-17 04:27:39Z

No, tokenize is not too complex for one function. It does a single operation: it tokenizes a string.
Your list comprehensions are fine. They are somewhat complex but are still readable and easily understandable.
Instead of using the regex ([A-z]+)+ you could simply use the 'shortcut' `\w+'. This regex will match one-to-many words.

If you wanted your regex to recognize hyphenated words, a simple change will suffice:
```
\w(-?\w)*
```
Your code is quite Pythonic. Your spacing and indentation is fine. The same goes for your variable names. If you have more concerns about style, consult PEP8, the official Python style guide.
There is a flaw in your logic though. Take your lambda expression:
```
ok_length = lambda token: [len(token) >= kw['min_len']
                     and kw['max_len'] <= kw['max_len']][0]
```
Currently, this will except any string whose length is greater than the min_length this is because you have kw['max_len'] == kw['max_len'] which will always evaluate True. You also have the conditionals switched around. This is how you should have the check:
```
ok_length = lambda token: [kw['min_length'] <= len(token) <= kw['max_length']][0]
```
Now, the question you need to ask is, whether or not it pays off to use a lambda expression here. Is the functionality 'deserving' of a lambda expression? Or can the check simply be placed into the list comprehension a little later on? Here's what it would look like, I'll let you decide (I like this way):
```
tokens = set([token.lower()
          for token in re.findall('([A-z]+)+', text)
          if kw['min_len'] <= len(token) <= kw['max_len'] and
          token not in kw['ignore_list']]).intersection(plain_words)
```
The next thing I would consider is your used of the kw dict. How needed is it? Could you just implement optional parameters in the function declaration? I like that implementation because this function does not (and probably won't) take an arbitrary number of keyword arguments. It will only take the 3. As a related side note, you have swapped your max_len and min_len values. Currently, max_len < min_len.
Finally, I would remove the intersection function call when you assign the tokens variable. The name tokens and the context with which its generated implies that tokens will hold all tokens that match the given regex. However, in your case, it only contains that are also in plain_words. I would wait to do the intersection until the return statement:
```
return tokens.intersection(plain_words)
```

Here is 'my' version of the tokenize function:

def tokenize(text, min_len=0, max_len=4, ignore_list=[]):
    """
    Break down text into a set of unique lower case words.
    """
    plain_words = nltk.corpus.abc.words()
    stop_words = nltk.corpus.stopwords.words()
    ignore_list += stop_words

    tokens = set([token.lower()
                  for token in re.findall('\w+', text)
                  if min_len <= len(token) <= max_len and
                  token not in kw['ignore_list']])

    return tokens.intersection(plain_words)

Ricky Wilson · Answer 2 · 2014-05-17 23:28:39Z

Here is my second version of tokenize after reading DarinDouglass's answer.

#!/usr/bin/env python

"""
Tokenization is the process of breaking a stream of text up into words, phrases,
symbols, or other meaningful elements called tokens. The list of tokens becomes
input for further processing such as parsing or text mining. -- Wikipedia
"""

import re
import nltk.corpus

def tokenize(**kw):
    """ Tokenize string data """
    # Defaults
    kw.setdefault('text', None)
    kw.setdefault('min_len', 4)
    kw.setdefault('ignore_list', [])
    kw.setdefault('filter_stopwords', False)
    kw.setdefault('filter_plainwords', False)

    if  kw['text'] == None:
        return None

    if not kw['filter_stopwords'] == False:
        kw['ignore_list'] += nltk.corpus.stopwords.words()

    if  kw['filter_plainwords'] == True:
        plain_words = nltk.corpus.abc.words()
    else:
        plain_words = []

    matches = set(re.findall('([A-z]+)+', kw['text']))

    tokens = set([token.lower() for token in matches
                  if token not in kw['ignore_list']
                  and len(token) >= kw['min_len']])

    if not kw['filter_plainwords']:
        return tokens
    else:
        return tokens.intersection(plain_words)

def test():
    """ tokenize() test """
    import requests
    text = raw_input("path or url to string data > ")
    if not text.startswith('http'):
        text = open(text, 'r').read()
    else:
        text = requests.get(text).content
    print tokenize(text=text,
                   min_len=4,
                   ignore_list=['code'],
                   filter_plainwords=True,
                   filter_stopwords=True)


## TESTING tokenize
if __name__ == '__main__':
    test()

asked	23 days ago
viewed	65 times
active	22 days ago

current community

your communities

more stack exchange communities

Python function to find and tokenize string data

2 Answers

Your Answer

Not the answer you're looking for? Browse other questions tagged python clean-code regex or ask your own question.

Hot Network Questions

current community

your communities

more stack exchange communities

Python function to find and tokenize string data

2 Answers

Your Answer

Sign up or log in

Post as a guest

Not the answer you're looking for? Browse other questions tagged python clean-code regex or ask your own question.

Related

Hot Network Questions