Tell me more ×
Code Review Stack Exchange is a question and answer site for peer programmer code reviews. It's 100% free, no registration required.

I would love some thoughts on this class that a wrote to extract user information from a logfile line.

I'm concerned about design patterns and the complexity of the code. The extract_user method feels especially awkward.

The class should be used as a public interface ala:

parser = UserExtraction(s)
data = parser.extract_user()

Now, the majority of the strings that I pass in will simply not be authentication strings. How do I make this available as some sort of response code / error-message to an end user? Right now I'm using self.error_message = yadayada. But I have a feeling this is not very pythonic.

I didn't write any documentation yet since I'm very uncertain about the general design.

Every detail is appreciated, I'm trying to get better at this.

import re
import logging
import os
import ConfigParser

class UserExtraction():

    def __init__(self, string):
        self.string = string

    def _string_identification(self):
        identifiers = {
                'Successfully logged in' : 'login', 
                'logout' : 'logout',
                }
        regex_patterns = {

                #TODO Store regexpatterns in outside file
                "login" : "(\w{3}.+?\d{1,2} \d{2}:\d{2}:\d{2}).+?Authentication: \[([0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}) ## (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\] (.+?) - Successfully .+? Role: (.+?),",

                "logout" : "(\w{3}.+?\d{1,2} \d{2}:\d{2}:\d{2}).+?Authentication: Unable to ping (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}), going to logout user (.+)"

                }
        for key, value in identifiers.iteritems():
            if key in self.string:
                self.event_type = value
                self.regex = regex_patterns[value]
                return True
        return False

    def _matching(self):
        self.matches = re.search(self.regex, self.string)
        if self.matches: 
            return True
        else:
            return False

    def _match_to_dict(self):
        matches = self.matches 
        if self.event_type == 'login':
            self.userinfo = { 
                    'date_time' : matches.group(1),
                    'mac_addr'  : matches.group(2),
                    'ip_addr'   : matches.group(3),
                    'login'     : matches.group(4),
                    'role'      : matches.group(5)
                    } 

        elif self.event_type == 'logout':
            self.userinfo = {
                    'date_time' : matches.group(1),
                    'ip_addr'   : matches.group(2),
                    'login'     : matches.group(3)
                    }
        if self.userinfo:
            return True 

    def extract_user(self):
        if self._string_identification() == True:
            if self._matching() == True:
                if self._match_to_dict() == True:
                    return self.userinfo
                else:
                    self.error_message = "COULD NOT PARSE USERINFO INTO DICTIONARY"
            else:
                self.error_message = "COULD NOT MATCH AUTHENTICATION STRING"
        else:
            self.error_message = "NOT AN AUTHENTICATION STRING"


if __name__ == "__main__":

    ROOT_PATH = os.path.dirname(__file__)

    config = ConfigParser.ConfigParser()
    config.readfp(open(os.path.join(ROOT_PATH + '/settings.cfg')))
    filename = config.get('defaults', 'file_location')

    LOG_FILENAME = config.get('defaults', 'log_location')
    LOG_FORMAT = '%(asctime)s - %(levelname)s %(message)s' 
    LOG_LEVEL = config.get('defaults', 'logging_level')
    #TODO Change datefmt to be consistent with Apache logfile
    logging.basicConfig(filename=LOG_FILENAME, format=LOG_FORMAT, datefmt='%m/%d %I:%M:%S', level=eval(LOG_LEVEL))

    sample_strings = []

    sample_strings.append('''Jan 2 15:32:49 cam-1/10.0.0.1 Authentication: [01:26:b2:F8:39:27 ## 172.16.197.23] Anonymous - Successfully logged in, Provider: anon, L2 MAC address: 00:26:B2:F2:39:87, Role: User, OS: Mac OS 10.6''')

    for s in sample_strings:
        Processor = UserExtraction(s)
        Processor.extract_user()
        if hasattr(Processor, 'userinfo'):
            print(Processor.userinfo)
        else:
            print(Processor.error_message)
share|improve this question

1 Answer

up vote 1 down vote accepted
class UserExtraction():

Either put object as an explicit base class, or don't have the parens.

    def __init__(self, string):
        self.string = string

    def _string_identification(self):
        identifiers = {
                'Successfully logged in' : 'login', 
                'logout' : 'logout',
                }
        regex_patterns = {

                #TODO Store regexpatterns in outside file
                "login" : "(\w{3}.+?\d{1,2} \d{2}:\d{2}:\d{2}).+?Authentication: \[([0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}) ## (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\] (.+?) - Successfully .+? Role: (.+?),",

                "logout" : "(\w{3}.+?\d{1,2} \d{2}:\d{2}:\d{2}).+?Authentication: Unable to ping (\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}), going to logout user (.+)"

                }

I'd combine these two dictionaries into one list of tuples. I'd also probably make a global constant. Its not clear to me that putting it into an external file would help.

        for key, value in identifiers.iteritems():

You don't gain much using iteritems here

            if key in self.string:
                self.event_type = value
                self.regex = regex_patterns[value]

Don't pass data between your method by storing it on the object. That just obscures what is happening. Anything the caller needs to know should be returned not stored.

                return True
        return False

    def _matching(self):
        self.matches = re.search(self.regex, self.string)
        if self.matches: 
            return True
        else:
            return False

Just return matches or return bool(matches) rather then introducing an if block to convert a bool into a bool.

    def _match_to_dict(self):
        matches = self.matches 
        if self.event_type == 'login':
            self.userinfo = { 
                    'date_time' : matches.group(1),
                    'mac_addr'  : matches.group(2),
                    'ip_addr'   : matches.group(3),
                    'login'     : matches.group(4),
                    'role'      : matches.group(5)
                    } 

        elif self.event_type == 'logout':
            self.userinfo = {
                    'date_time' : matches.group(1),
                    'ip_addr'   : matches.group(2),
                    'login'     : matches.group(3)
                    }

Python's regular expressions have a named group feature. You can name the groups in the regular expression and thus simplify this code.

        if self.userinfo:
            return True 

    def extract_user(self):
        if self._string_identification() == True:
            if self._matching() == True:
                if self._match_to_dict() == True:

Don't use == True. Just if self._string_identification():

                    return self.userinfo
                else:
                    self.error_message = "COULD NOT PARSE USERINFO INTO DICTIONARY"
            else:
                self.error_message = "COULD NOT MATCH AUTHENTICATION STRING"
        else:
            self.error_message = "NOT AN AUTHENTICATION STRING"

This is python! Throw exceptions, don't store error messages.

if __name__ == "__main__":

It's usually best to put your main code in a main function that you call from here.

    ROOT_PATH = os.path.dirname(__file__)

    config = ConfigParser.ConfigParser()
    config.readfp(open(os.path.join(ROOT_PATH + '/settings.cfg')))
    filename = config.get('defaults', 'file_location')

    LOG_FILENAME = config.get('defaults', 'log_location')
    LOG_FORMAT = '%(asctime)s - %(levelname)s %(message)s' 
    LOG_LEVEL = config.get('defaults', 'logging_level')

Convention says that ALL_CAPS is for constants. These aren't constants

    #TODO Change datefmt to be consistent with Apache logfile
    logging.basicConfig(filename=LOG_FILENAME, format=LOG_FORMAT, datefmt='%m/%d %I:%M:%S', level=eval(LOG_LEVEL))

    sample_strings = []

    sample_strings.append('''Jan 2 15:32:49 cam-1/10.0.0.1 Authentication: [01:26:b2:F8:39:27 ## 172.16.197.23] Anonymous - Successfully logged in, Provider: anon, L2 MAC address: 00:26:B2:F2:39:87, Role: User, OS: Mac OS 10.6''')

Considering combining these two lines into a string literal.

    for s in sample_strings:

Avoid single letter variable names, they are hard to follow

        Processor = UserExtraction(s)
        Processor.extract_user()

By convetion Processor should be the name of a class. Local variables should undercase_with_underscores. Furthermore, should this even be a class? It looks to me more like the job for a function.

        if hasattr(Processor, 'userinfo'):
            print(Processor.userinfo)
        else:
            print(Processor.error_message)

Here's me reworking of your code:

import re
import logging
import os
import ConfigParser

PATTERNS = [
    ('Successfully logged in', "(?P<date_time>\w{3}.+?\d{1,2} \d{2}:\d{2}:\d{2}).+?Authentication: \[(?P<mac_addr>[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}:[0-9a-zA-Z]{2}) ## (?P<ip_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\] (?P<login>.+?) - Successfully .+? Role: (?P<role>.+?),"),
    ('logout', "(?P<date_time>\w{3}.+?\d{1,2} \d{2}:\d{2}:\d{2}).+?Authentication: Unable to ping (?P<ip_addr>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}), going to logout user (?P<login>.+)")
]


class UserExtractionError(Exception):
    pass

def extract_user(string):
    for key, regex in PATTERNS:
        if key in string:
            match = re.search(regex, string)
            if match is None:
                raise UserExtractionError("Could not match authentication string")
            return match.groupdict()
    else:
        raise UserExtractionError("Not an authenication string")


if __name__ == "__main__":


    sample_strings = [
        '''Jan 2 15:32:49 cam-1/10.0.0.1 Authentication: [01:26:b2:F8:39:27 ## 172.16.197.23] Anonymous - Successfully logged in, Provider: anon, L2 MAC address: 00:26:B2:F2:39:87, Role: User, OS: Mac OS 10.6'''
    ]

    for sample in sample_strings:
        try:
            print extract_user(sample)
        except UserExtractionError as error:
            print error

Lightly tested. I didn't keep the logging code.

share|improve this answer
Thank you Winston. What a phenomenally detailed answer. I can't come to terms with the fact that somebody would take time out off their day to teach me how to code. Thanks for the effort, your explanations are tremendously helpful. – Daniel Richter Feb 13 '12 at 8:35

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Not the answer you're looking for? Browse other questions tagged or ask your own question.