I wrote a script that is automating the archiving of specific files based on given parameters. The script is crashing on files that are over 500 MB when searching by "special id" or "date", values that should be in the "special" element. There should only be one "special" element per xml file, but in some cases the xml file was created using bad data and there are multiple "special" elements. In this case, it is alright to just check the first "special" element, check for the given values, and then break if those values are not found.
import argparse, sys, os, shutil
def get_parsed_args():
parser = argparse.ArgumentParser(description='Archive old data')
parser.add_argument('-d', action='store', dest='directory', help='main directory for files (optional, assumes current directory if blank and always assumes subdirectories of "current" and "archived")')
parser.add_argument('-t', action='store', dest='type', help='input type, and enum of one of the following: files, date, specialid')
parser.add_argument('-l', action='append', dest='dlist', default=[], help='list of the data type given above, although if the type is "date" this will only use the first parameter')
return parser.parse_args()
def get_directory(results):
tempdir = ""
if results.directory:
tempdir = results.directory
if len(tempdir) > 0 and directory[-1] != "/":
tempdir = tempdir + "/"
return tempdir
def archive_file(fname, cdir, adir):
print "Archiving file: " + fname
shutil.move(cdir+fname, adir+fname)
results = get_parsed_args()
valid_types = ['files','date','specialid']
directory = get_directory(results)
curdir = directory + "current/"
arcdir = directory + "archived/"
extension = ".xml"
#check that the directory has current/archived folders in it
if not(os.path.exists(curdir)) or not(os.path.exists(arcdir)):
print "Error: Missing current or archived folder in directory path"
sys.exit(0)
if not(results.type) or not(results.type in valid_types):
print "Error: Invalid or empty type parameter, quitting program"
sys.exit(0)
dtype = results.type
if len(results.dlist) <= 0:
print "Error: No list of values given, quitting program"
sys.exit(0)
#date should just take the first value, all others will take the list of objects
if dtype == "date":
dlist = results.dlist[0]
else:
dlist = results.dlist
if dtype == "files":
for f in dlist:
print f
if os.path.exists(curdir + f):
archive_file(f, curdir, arcdir)
else:
from lxml import etree
xmlparser = etree.XMLParser()
files = [file for file in os.listdir(curdir) if file.lower().endswith(extension)]
for f in files:
print "Checking file: " + f
data = etree.parse(open(curdir+f),xmlparser)
root = data.getroot()
for element in root.iter("special"):
if dtype == "specialid":
if element.get("id") in dlist:
archive_file(f, curdir, arcdir)
break
elif dtype == "date":
for e in element.iter("date"):
if e.text < dlist:
archive_file(f, curdir, arcdir)
break
break
Example XML to input into this:
<?xml version="1.0" encoding="UTF-8"?>
<our_object xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://schemaurl" schemaVersion="2.0">
<source id="0">
<name>Source Name</name>
<our_id>05</our_id>
<datetime>2011-06-29T14:53:52</datetime>
</source>
<special id="12345">
<date>2011-11-08</date>
<special_type>GenericType</special_type>
<state_id>05</state_id>
</special>
<state id="05">
<name>StateName</name>
</state>
<locality id="001">
<name>Town1</name>
<state_id>05</state_id>
<type>Town</type>
</locality>
<locality id="002">
<name>Town2</name>
<state_id>05</state_id>
<type>Town</type>
</locality>
<locality id="003">
<name>Town3</name>
<state_id>05</state_id>
<type>Town</type>
</locality>
</our_object>
Edited as per suggestions below