After solving the error on SO (as suggested), I am now returning for a code review. :-)
The task is to parse a huge file, dblp.xml
(~800 MB), provided by DBLP. The records in this huge file look, for example, like this or this. In particular:
<?xml version="1.0" encoding="ISO-8859-1"?>
<!DOCTYPE dblp SYSTEM "dblp.dtd">
<dblp>
record_1
...
record_n
</dblp>
I wrote some code that shall get me each tag of some records (which will be stored in a database).
I adapted this approach from an article from IBM developerWorks which refers to the article Incremental Parsing on effbot.org. Is this the correct approach for this task? Or is there a better way?
import sys
import os
import MySQLdb
from lxml import etree
def fast_iter2(context, cursor):
    """Stream records out of an ``iterparse`` context into the database.

    The child tags of interest are collected into a ``paper`` dict and the
    ``author`` tags into a list. When the enclosing record element closes,
    both are handed to ``populate_database`` and the parsed subtree is
    released, so the tree does not keep growing while the file is read.

    Only ``"end"`` events are acted upon. If the context also emits other
    events (e.g. ``"start"``) they are skipped: at that point an element's
    text is not guaranteed to be available yet, and handling the record tag
    on both events would hand an almost empty duplicate of every record to
    ``populate_database``.

    :param context: iterable of ``(event, element)`` pairs, as produced by
        ``lxml.etree.iterparse``.
    :param cursor: database cursor; passed through to ``populate_database``.
    """
    # Available elements are: article|inproceedings|proceedings|book|incollection|phdthesis|mastersthesis|www
    elements = frozenset(['article', 'inproceedings', 'proceedings', 'book',
                          'incollection', 'phdthesis', 'mastersthesis', 'www'])
    # Record types that are still parsed (and cleaned up) but never stored.
    unwantedElements = frozenset(['phdthesis', 'mastersthesis', 'www'])
    # Available tags are: author|editor|title|booktitle|pages|year|address|journal|volume|number|month|url|ee|cdrom|cite|
    # publisher|note|crossref|isbn|series|school|chapter
    childElements = frozenset(['title', 'booktitle', 'year', 'journal', 'ee'])

    paper = {}    # represents a paper with all its tags.
    authors = []  # a list of authors who have written the paper "together".
    paperCounter = 0  # number of record elements seen, stored or not.

    for event, element in context:
        # An element is only complete once its closing tag has been read.
        if event != "end":
            continue

        tag = element.tag
        if tag in childElements:
            # NOTE(review): .text ends at the first nested child element, so
            # a value containing inline markup would be truncated - confirm
            # whether the input can contain such markup.
            if element.text:
                paper[tag] = element.text
        elif tag == "author":
            if element.text:
                authors.append(element.text)
        elif tag in elements:
            paper["element"] = tag
            paper["mdate"] = element.get("mdate")
            paper["dblpkey"] = element.get("key")
            if tag not in unwantedElements:
                populate_database(paper, authors, cursor)
            paperCounter += 1
            # Parenthesised so the line is valid on Python 2 and Python 3.
            print(paperCounter)

            # Start the next record with a clean slate.
            paper = {}
            authors = []

            # Free the finished record and every sibling that precedes it;
            # otherwise the root keeps references to all processed records.
            element.clear()
            while element.getprevious() is not None:
                del element.getparent()[0]

    # Drop this function's reference to the parser context.
    del context
def main():
    """Parse the XML file at ``PATH_TO_XML`` and load it into the database.

    Opens a cursor via ``connectToDatabase``, switches the connection to
    UTF-8, and feeds an incremental lxml parser to ``fast_iter2``. The cursor
    is closed even if parsing or a database call raises.
    """
    cursor = connectToDatabase()
    try:
        cursor.execute("""SET NAMES utf8""")
        # Only "end" events are requested: an element's text and children are
        # complete only once its closing tag has been read, and also emitting
        # "start" events would make every element show up twice.
        context = etree.iterparse(PATH_TO_XML, dtd_validation=True, events=("end",))
        fast_iter2(context, cursor)
    finally:
        cursor.close()
# Run the import only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()