#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Bot to upload NARA images to Commons.
The bot expects two command line arguments: a directory containing the images and a text file mapping each image filename to its record id.
The bot uses http://toolserver.org/~slakr/archives.php to get the description
'''
import sys, os.path, hashlib, base64, glob, re, urllib, time
sys.path.append("..\..\pywikipedia")
import wikipedia, config, query, upload
import shutil, socket
def getRecords(textfile):
    '''
    Read the mapping of image filenames to record ids from a text file.

    Every non-blank line is split at its first space: the part before it is
    the filename, the part after it is parsed as an integer id. Blank lines
    are ignored.

    @param textfile: path of the mapping file
    @return: dict with the filename as key and the integer id as value
    @raise ValueError: if the text after the first space of a non-blank line
        is not an integer
    '''
    result = {}
    # The with-block closes the file even when a line fails to parse.
    with open(textfile, "r") as f:
        for line in f:
            if not line.strip():
                # A blank line carries no mapping; int('') would raise.
                continue
            (filename, sep, arc) = line.partition(u' ')
            result[filename] = int(arc.strip())
    return result
def findDuplicateImages(filename, site=None):
    '''
    Ask the wiki for files that have the same content as a local file.

    Calculates the SHA1 hash of the file and passes its uppercase hexadecimal
    form to site.getFilesFromAnHash.

    @param filename: path of the local file to hash
    @param site: site object to query; when None the Commons site is looked
        up at call time
    @return: the result of site.getFilesFromAnHash for the hash
    TODO: Add exception handling
    '''
    if site is None:
        # Resolved here instead of in the signature, so that merely defining
        # this function does not already require a usable site.
        site = wikipedia.getSite(u'commons', u'commons')
    hashObject = hashlib.sha1()
    with open(filename, 'rb') as f:
        # Hash in chunks so a large image is never held in memory as a whole.
        while True:
            chunk = f.read(1024 * 1024)
            if not chunk:
                break
            hashObject.update(chunk)
    return site.getFilesFromAnHash(base64.b16encode(hashObject.digest()))
def getDescription(fileId):
    '''
    Fetch the description for a record from the archives web tool.

    The page for the given id is requested up to 10 times; an attempt that
    ends in a timeout or another IOError is reported and retried.

    @param fileId: record id, put into the archiveHint parameter of the url
    @return: the text captured from the fetched page, or u'' when every
        attempt failed or the page did not contain the expected markup
    '''
    url = u'http://toolserver.org/~slakr/archives.php?archiveHint=%s' % (fileId,)
    # NOTE(review): the previous pattern had no capture group although
    # group(1) is read below, so every match raised IndexError. It is assumed
    # here that the tool puts the description in a textarea element --
    # confirm against the HTML the tool really returns.
    textareaRe = re.compile(r'<textarea[^>]*>(.*?)</textarea>', re.DOTALL | re.IGNORECASE)
    maxtries = 10

    for attempt in range(maxtries):
        try:
            archivesPage = urllib.urlopen(url)
            try:
                pageText = archivesPage.read().decode('utf-8')
            finally:
                archivesPage.close()
        except socket.timeout:
            # Must come before IOError: on Python 2.6+ socket.timeout is a
            # subclass of IOError and would never reach its own handler.
            wikipedia.output(u'Got a timeout, let\'s try again')
            continue
        except IOError:
            wikipedia.output(u'Got an IOError, let\'s try again')
            continue

        # The page was fetched; no further tries whatever it contains.
        matches = textareaRe.search(pageText)
        if matches:
            return matches.group(1)
        return u''

    # Every attempt failed.
    return u''
def getTitle(fileId, description):
    '''
    Build the title of the upload from the description and the record id.

    The text after "|Title=" on the first such line of the description is
    cut off at 120 characters, combined with the id into
    "<title> - NARA - <id>.tif" and passed through cleanUpTitle.

    @param fileId: record id that becomes part of the title
    @param description: description text that should contain a |Title= line
    @return: the cleaned up title
    @raise ValueError: if the description has no |Title= line with content
    '''
    titleRe = re.compile(r'^\|Title=(.+)$', re.MULTILINE)
    titleMatch = titleRe.search(description)
    if titleMatch is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('No |Title= line found in the description of record %s' % (fileId,))
    # Slicing leaves shorter titles untouched.
    titleText = titleMatch.group(1)[:120]
    title = u'%s - NARA - %s.tif' % (titleText, fileId)
    return cleanUpTitle(title)
def cleanUpTitle(title):
    '''
    Rewrite a proposed page title so that mediawiki is more likely to accept it.

    Surrounding whitespace is stripped, the substitutions below are applied
    one after another in the listed order, and finally every space is turned
    into an underscore.
    '''
    substitutions = (
        (u"[<{\\[]", u"("),              # opening angle, curly, square bracket
        (u"[>}\\]]", u")"),              # closing angle, curly, square bracket
        (u"[ _]?\\(!\\)", u""),          # "(!)" plus one leading space/underscore
        (u",:[ _]", u", "),
        (u"[;:][ _]", u", "),            # ";" or ":" before a space/underscore
        (u"[\t\n ]+", u" "),             # runs of tab, newline, space
        (u"[\r\n ]+", u" "),             # runs of carriage return, newline, space
        (u"[\n]+", u""),
        (u"[?!]([.\"]|$)", u"\\1"),      # "?" or "!" before ".", '"' or the end
        (u"[%?!]", u"^"),
        (u"[;]", u","),
        (u"[/+\\\\:]", u"-"),            # slash, plus, backslash, colon
        (u"--+", u"-"),
        (u",,+", u","),
        (u"[-,^]([.]|$)", u"\\1"),       # "-", "," or "^" before "." or the end
    )
    cleaned = title.strip()
    for (pattern, replacement) in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.replace(u" ", u"_")
def main(args):
    '''
    Main loop.

    Uploads every *.TIF file of a directory whose filename has an id in the
    mapping file. A file is skipped when it has no id, when the site already
    has a file with the same content, or when no description could be fetched.

    @param args: the two command line arguments: the directory containing
        the images and the text file containing the mappings
    Exits with status 1 when the arguments are not usable.
    '''
    site = wikipedia.getSite(u'commons', u'commons')
    wikipedia.setSite(site)

    if len(args) != 2:
        wikipedia.output(u'Wrong number of arguments. Usage: NARA_uploader.py DIRECTORY TEXTFILE')
        sys.exit(1)

    # Decode byte string arguments once, up front. The paths produced by glob
    # are then unicode already and must not be decoded a second time, which
    # used to fail for paths with non-ASCII characters.
    fsEncoding = sys.getfilesystemencoding() or 'utf-8'
    (workdir, textfile) = [arg.decode(fsEncoding) if isinstance(arg, bytes) else arg
                           for arg in args]

    if not os.path.isdir(workdir):
        wikipedia.output(u'%s doesn\'t appear to be a directory. Exiting' % (workdir,))
        sys.exit(1)

    records = getRecords(textfile)

    for sourcefilename in glob.glob(os.path.join(workdir, u"*.TIF")):
        filename = os.path.basename(sourcefilename)

        # Membership test rather than truthiness, so an id of 0 is not
        # mistaken for a missing record.
        if filename not in records:
            wikipedia.output(u'Can\'t find %s in %s. Skipping this file.' % (filename, textfile))
            continue
        fileId = records[filename]

        duplicates = findDuplicateImages(sourcefilename, site)
        if duplicates:
            wikipedia.output(u'Found duplicate image at %s' % duplicates.pop())
            continue

        # No metadata handling. We use a webtool
        description = getDescription(fileId)
        if not description:
            # Without a description there is no title to build and nothing
            # useful to upload; go on with the next file instead of crashing.
            wikipedia.output(u'Can\'t get a description for %s (id %s). Skipping this file.' % (filename, fileId))
            continue

        categories = u'{{Uncategorized-NARA|year={{subst:CURRENTYEAR}}|month={{subst:CURRENTMONTHNAME}}|day={{subst:CURRENTDAY}}}}\n'
        description = description + categories
        title = getTitle(fileId, description)

        wikipedia.output(title)
        wikipedia.output(description)

        bot = upload.UploadRobot(url=sourcefilename, description=description, useFilename=title, keepFilename=True, verifyDescription=False)
        bot.run()
if __name__ == "__main__":
    # Command line arguments without the script name itself.
    arguments = sys.argv[1:]
    try:
        main(arguments)
    finally:
        # Printed however main() ends, including an exception or sys.exit().
        print(u'All done')