Tell me more ×
Code Review Stack Exchange is a question and answer site for peer programmer code reviews. It's 100% free, no registration required.

I have this module which computes checksums from a list of files in a given directory. The problem is that my is_changed_file is long and ugly, but my attempts in refactoring failed, so I would like some other point of views...

import hashlib
import logging

# if available use the faster cPickle module
try:
    import cPickle as pickle
except ImportError:
    import pickle

from os import path


class DirectoryChecksum(object):
    """Manage the checksums of the given files in a directory
    """

    def __init__(self, directory, to_hash):
        self.directory = directory
        self.to_hash = to_hash
        self.checksum_file = path.join(self.directory, '.checksums')
        self.checks = self._compute()
        self.logger = logging.getLogger("checksum(%s): " % self.directory)

    def _abs_path(self, filename):
        return path.join(self.directory, filename)

    def _get_checksum(self, filename):
        content = open(self._abs_path(filename)).read()
        return hashlib.md5(content).hexdigest()

    def _compute(self):
        """Compute all the checksums for the files to hash
        """
        dic = {}
        for f in self.to_hash:
            if self._file_exists(f):
                dic[f] = self._get_checksum(f)
        return dic

    def _file_exists(self, filename):
        return path.isfile(self._abs_path(filename))

    def is_changed(self):
        """True if any of the files to hash has been changed
        """
        return any(self.is_file_changed(x) for x in self.to_hash)

    #FIXME: refactor this mess, there is also a bug which impacts on
    #the airbus. eggs, so probably something to do with the path
    def is_file_changed(self, filename):
        """Return true if the given file was changed
        """
        if not self._has_checksum():
            self.logger.debug("no checksum is available yet, creating it")
            self.write_checksums()
            return True

        stored_checks = self.load_checksums()

        if not self._file_exists(filename):
            if filename in stored_checks:
                self.logger.debug("file %s has been removed" % filename)
                # what if it existed before and now it doesn't??
                return True
            else:
                return False

        checksum = self._get_checksum(filename)
        if filename in stored_checks:
            # if the file is already there but the values are changed
            # then we also need to rewrite the stored checksums
            if stored_checks[filename] != checksum:
                self.write_checksums()
                return True
            else:
                return False
        else:
            # this means that there is a new file to hash, just do it again
            self.write_checksums()
            return True

    def _has_checksum(self):
        return path.isfile(self.checksum_file)

    def load_checksums(self):
        """Load the checksum file, returning the dictionary stored
        """
        return pickle.load(open(self.checksum_file))

    def write_checksums(self):
        """Write to output (potentially overwriting) the computed checksum dictionary
        """
        self.logger.debug("writing the checksums to %s" % self.checksum_file)
        return pickle.dump(self.checks, open(self.checksum_file, 'w'))
share|improve this question

2 Answers

up vote 1 down vote accepted

Probably there is no need to refactor the method as it represents quite clear decision tree. Breaking it down into smaller methods will cause readability problems.

Just make sure you have unit tests covering each outcome and that there are no cases, which the code doesn't handle.

If you definitely want to refactor, I propose to add a method like this:

def _check_log_writesums(cond, logmessage, rewrite=True):
    if cond:
        if logmessage:
            self.logger.debug(logmessage)
        if rewrite:
            self.write_checksums()
        return True
    return False

Then you can call it like this:

    if filename in stored_checks:
        return self._check_log_writesums(stored_checks[filename] != checksum,
                                         "",
                                         rewrite=True)

Maybe the name can be better (_update_checksums or _do_write_checksums).

share|improve this answer
Yes well I don't definitively want to refactor, but I found it very strange that I could not easily refactor it... What is your "if logmessage" for? I normally just use levels or more specialized loggers that can be filtered more easily... – andrea_crotti Jan 10 '12 at 11:57
I just wanted to make it optional for "if stored_checks[filename] != checksum", because logging is missing in the original code. There is no sense to log empty message – Roman Susi Jan 11 '12 at 20:05

I felt there was there was a better way to do what i wanted, and after a refactoring I came up with this.

class DirChecksum(object):
    def __init__(self, directory, files_to_hash):
        self.directory = directory
        self.files_to_hash = files_to_hash

    def _abs_path(self, filename):
        return path.join(self.directory, filename)

    def _get_checksum(self, filename):
        content = open(filename).read()
        return hashlib.md5(content).hexdigest()

    def _compute_checksums(self):
        logger.debug("computing checksums for %s" % self.directory)
        ck = {}
        for fname in self.files_to_hash:
            abs_fname = self._abs_path(fname)
            if path.isfile(abs_fname):
                ck[fname] = self._get_checksum(abs_fname)

        return ck

    @property
    def checksums(self):
        return self._compute_checksums()

    def is_changed(self, old_checksums):
        # there was nothing actually previously stored
        if not old_checksums:
            logger.debug("checksums were never computed before")
            return True

        actual_checksums = self.checksums
        if len(actual_checksums) != len(old_checksums):
            logger.debug("number of files hashed changed")
            return True

        # if _checksums isn't there it means that probably is never been called
        return old_checksums != actual_checksums

Which is quite different, because now it doesn't read and write anything on disk, but it takes the dictionary with the checksums as input. Much cleaner now, even if of course can still made better..

share|improve this answer

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Not the answer you're looking for? Browse other questions tagged or ask your own question.