I am working on converting a mailing list that has longitude and latitude coordinates within the CSV file. This script I came up with does what I need, but this is my first real-world use of python. I want to know where I am making any mistakes, not using best-practices, and what can be optimized to make it faster.
The input.csv file has the following header:
"Email Address",MEMBER_RATING,OPTIN_TIME,OPTIN_IP,CONFIRM_TIME,CONFIRM_IP,LATITUDE,LONGITUDE,GMTOFF,DSTOFF,TIMEZONE,CC,REGION,LAST_CHANGED,LEID,EUID
And the script:
import sys
import os
import csv
import signal
import json
import urllib
import urllib2
import sqlite3
import codecs
import cStringIO
csv.field_size_limit(sys.maxsize)
class EmailList:
WEB_SERVICE_URL = 'http://open.mapquestapi.com/nominatim/v1/reverse.php?format=json'
def __init__(self, inputFile):
signal.signal(signal.SIGINT, self.signal_handler)
self.conn = None
self.initialize_database(inputFile)
self.convert_rows()
self.db_to_csv()
def signal_handler(self, signal, frame):
try:
self.conn.commit()
self.conn.close()
print '[DB changes committed and connection closed.]'
except sqlite3.ProgrammingError as e:
print '[script stopped]'
print e.message
sys.exit(0)
def initialize_database(self, file):
print 'checking for data.db...'
if not os.path.isfile('data.db'):
print 'data.db does not exist, converting csv to sqlite...'
with open(file) as inputFile:
reader = UnicodeReader(inputFile)
header = reader.next()
if self.conn is None:
self.conn = sqlite3.connect('data.db')
c = self.conn.cursor()
c.execute("DROP TABLE IF EXISTS email_list")
sql = """CREATE TABLE email_list (\n""" + \
",\n".join([("%s varchar" % name) for name in header]) \
+ ")"
c.execute(sql)
for line in reader:
if line:
try:
c.execute('INSERT INTO email_list VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)', line)
except sqlite3.ProgrammingError as e:
print e.message
print line
try:
c.execute('ALTER TABLE email_list ADD COLUMN CITY varchar')
c.execute('ALTER TABLE email_list ADD COLUMN STATE varchar')
c.execute('ALTER TABLE email_list ADD COLUMN COUNTRY varchar')
c.execute('ALTER TABLE email_list ADD COLUMN POSTAL_CODE varchar')
except sqlite3.OperationalError as e:
print 'error creating new columns: '
print e.message
self.conn.commit()
self.conn.close()
print 'converted csv to sqlite, stored in data.db'
def convert_rows(self):
print 'converting coordinates...'
self.conn = sqlite3.connect('data.db')
c = self.conn.cursor()
results = c.execute('SELECT LATITUDE AS lat, LONGITUDE AS lon, Email as email FROM email_list WHERE POSTAL_CODE IS NULL AND CITY IS NULL AND COUNTRY IS NULL AND STATE IS NULL')
rows = []
for row in results:
lat, lon, email = row
data = {'lat': lat, 'lon': lon, 'email': email}
rows.append(data)
self.conn.commit()
self.conn.close()
for item in rows:
try:
converted = self.convert_coordinates(item['lat'], item['lon'])
print str(rows.index(item) + 1) + '/' + str(len(rows))
except TypeError:
converted['city': '']
converted['state': '']
converted['country': '']
converted['postal_code': '']
self.conn = sqlite3.connect('data.db')
c = self.conn.cursor()
try:
c.execute('UPDATE email_list SET CITY=?, STATE=?, COUNTRY=?, POSTAL_CODE=? WHERE Email=?', [converted['city'], converted['state'], converted['country'], converted['postal_code'], item['email']])
except KeyboardInterrupt:
print 'user quit'
self.conn.commit()
self.conn.close()
print 'converted coordinates.'
def convert_coordinates(self, lat, lon):
if lat and long:
try:
values = {'lat': lat, 'lon': lon}
data = urllib.urlencode(values)
request = urllib2.Request(self.WEB_SERVICE_URL, data)
response = urllib2.urlopen(request)
except urllib2.HTTPError as e:
print 'error loading web service'
print e.message
json_result = json.load(response)
try:
city = json_result['address']['city']
except KeyError:
city = ''
try:
state = json_result['address']['state']
except KeyError:
state = ''
try:
cc = json_result['address']['country_code']
except KeyError:
cc = ''
try:
postal_code = json_result['address']['postcode']
except KeyError:
postal_code = ''
else:
city = ''
state = ''
cc = ''
postal_code = ''
return {'city': city, 'state': state, 'country': cc, 'postal_code': postal_code}
def db_to_csv(self):
print 'beginning write to csv...'
self.conn = sqlite3.connect('data.db')
c = self.conn.cursor()
c.execute('SELECT * FROM email_list')
with open('output.csv', 'wb') as outputFile:
writer = UnicodeWriter(outputFile)
writer.writerow([i[0] for i in c.description])
writer.writerows(c)
print 'write finished.'
self.conn.commit()
self.conn.close()
print 'done.'
# The CSV module has issues with reading/writing unicode,
# the following classes were taken from docs.python.org to
# help with that: http://docs.python.org/2/library/csv.html
class UTF8Recoder:
"""
Iterator that reads an encoded stream and reencodes the input to UTF-8
"""
def __init__(self, f, encoding):
self.reader = codecs.getreader(encoding)(f)
def __iter__(self):
return self
def next(self):
return self.reader.next().encode("utf-8")
class UnicodeReader:
"""
A CSV reader which will iterate over lines in the CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
f = UTF8Recoder(f, encoding)
self.reader = csv.reader(f, dialect=dialect, **kwds)
def next(self):
row = self.reader.next()
return [unicode(s, "utf-8") for s in row]
def __iter__(self):
return self
class UnicodeWriter:
"""
A CSV writer which will write rows to CSV file "f",
which is encoded in the given encoding.
"""
def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
# Redirect output to a queue
self.queue = cStringIO.StringIO()
self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
self.stream = f
self.encoder = codecs.getincrementalencoder(encoding)()
def writerow(self, row):
self.writer.writerow([s.encode("utf-8") for s in row])
# Fetch UTF-8 output from the queue ...
data = self.queue.getvalue()
data = data.decode("utf-8")
# ... and reencode it into the target encoding
data = self.encoder.encode(data)
# write to the target stream
self.stream.write(data)
# empty queue
self.queue.truncate(0)
def writerows(self, rows):
for row in rows:
self.writerow(row)
EmailList('input.csv')