I have this module which computes checksums from a list of files in a given directory. The problem is that my is_changed_file is long and ugly, but my attempts in refactoring failed, so I would like some other point of views...
import hashlib
import logging
# if available use the faster cPickle module
try:
import cPickle as pickle
except ImportError:
import pickle
from os import path
class DirectoryChecksum(object):
"""Manage the checksums of the given files in a directory
"""
def __init__(self, directory, to_hash):
self.directory = directory
self.to_hash = to_hash
self.checksum_file = path.join(self.directory, '.checksums')
self.checks = self._compute()
self.logger = logging.getLogger("checksum(%s): " % self.directory)
def _abs_path(self, filename):
return path.join(self.directory, filename)
def _get_checksum(self, filename):
content = open(self._abs_path(filename)).read()
return hashlib.md5(content).hexdigest()
def _compute(self):
"""Compute all the checksums for the files to hash
"""
dic = {}
for f in self.to_hash:
if self._file_exists(f):
dic[f] = self._get_checksum(f)
return dic
def _file_exists(self, filename):
return path.isfile(self._abs_path(filename))
def is_changed(self):
"""True if any of the files to hash has been changed
"""
return any(self.is_file_changed(x) for x in self.to_hash)
#FIXME: refactor this mess, there is also a bug which impacts on
#the airbus. eggs, so probably something to do with the path
def is_file_changed(self, filename):
"""Return true if the given file was changed
"""
if not self._has_checksum():
self.logger.debug("no checksum is available yet, creating it")
self.write_checksums()
return True
stored_checks = self.load_checksums()
if not self._file_exists(filename):
if filename in stored_checks:
self.logger.debug("file %s has been removed" % filename)
# what if it existed before and now it doesn't??
return True
else:
return False
checksum = self._get_checksum(filename)
if filename in stored_checks:
# if the file is already there but the values are changed
# then we also need to rewrite the stored checksums
if stored_checks[filename] != checksum:
self.write_checksums()
return True
else:
return False
else:
# this means that there is a new file to hash, just do it again
self.write_checksums()
return True
def _has_checksum(self):
return path.isfile(self.checksum_file)
def load_checksums(self):
"""Load the checksum file, returning the dictionary stored
"""
return pickle.load(open(self.checksum_file))
def write_checksums(self):
"""Write to output (potentially overwriting) the computed checksum dictionary
"""
self.logger.debug("writing the checksums to %s" % self.checksum_file)
return pickle.dump(self.checks, open(self.checksum_file, 'w'))