I am given a CSV file of stations, with data like this:
station_id,date,temperature_c
68,2000.375,10.500
68,2000.542,5.400
68,2000.958,23.000
68,2001.125,20.400
68,2001.292,13.300
68,2001.375,10.400
68,2001.958,21.800
68,2002.208,15.500
and so on for many different station_ids.
Then I want to create a Python program that (1) reports the minimum reading (the third column) and (2) finds the station with the maximum "travel distance" across its readings. For example, if a station has 3 readings of -5, 0, 8, then that would mean a travel distance of 13. The travel calculation can take an optional date range. Here is what I did.
#!/usr/bin/python
from collections import defaultdict
import csv
import random
import sys
# In order to track each station's statistics, we'll create a Station class
# to hold the data on a per-station basis.
class Station:
    """Per-station accumulator for temperature statistics.

    The three attributes (``readings``, ``minimum``, ``travel``) are plain
    public attributes and can be read or assigned directly, which is the
    idiomatic Python form.  The ``get_*``/``set_*`` methods are kept only so
    that existing callers keep working.
    """

    # One Station exists per station id; __slots__ drops the per-instance
    # __dict__ so each instance carries only these three attributes.
    __slots__ = ("readings", "minimum", "travel")

    def __init__(self):
        # List of {"date": ..., "temp": ...} dicts, in the order recorded.
        self.readings = []
        # Lowest reading seen so far; infinity so any real reading is lower.
        self.minimum = float("inf")
        # Running total of the absolute values passed to set_travel().
        self.travel = 0

    def get_travel(self):
        """Return the accumulated travel total."""
        return self.travel

    def set_travel(self, n):
        """Add ``abs(n)`` to the travel total.

        Despite the name this accumulates; it does not overwrite.
        """
        self.travel += abs(n)

    def get_minimum(self):
        """Return the lowest reading stored via set_minimum()."""
        return self.minimum

    def set_minimum(self, n):
        """Overwrite the stored minimum with ``n`` (no comparison is done)."""
        self.minimum = n

    def get_readings(self):
        """Return the list of recorded readings (the live list, not a copy)."""
        return self.readings

    def set_readings(self, date, temp):
        """Append one reading as ``{"date": date, "temp": temp}``.

        Values are stored exactly as passed in, without conversion.
        """
        self.readings.append({"date": date, "temp": temp})
"""
Reporter class handles a list of Stations
"""
class Reporter:
    """Compute statistics over a CSV of ``station_id,date,temperature_c`` rows.

    Per-station state is kept in ``self.stations`` (a ``defaultdict`` of
    ``Station``), and the running results are kept in ``self.global_minimum``
    and ``self.longest_travel``.  State persists across method calls, so
    calling a method twice on the same instance accumulates twice.
    """

    def __init__(self):
        # stations dict with entries for holding the specified stats.
        self.stations = defaultdict(Station)
        self.global_minimum = {"station": "default", "date": 1, "temp": 9999999}
        self.longest_travel = {"station": "default", "range": 0}

    @staticmethod
    def _read_rows(filename):
        """Yield ``(station, date, temp)`` string triples from ``filename``.

        The first line is treated as a header and skipped; an empty file
        yields nothing.  Blank rows are skipped.  Rows are streamed one at a
        time since the CSV could be a big file.

        Raises:
            SystemExit: if the csv module reports a malformed file.
            ValueError: if a non-blank row does not have exactly 3 fields.
        """
        # newline="" is what the csv module documents for files it reads.
        with open(filename, "r", newline="") as datafile:
            csv_reader = csv.reader(datafile)
            try:
                # Default of None avoids StopIteration on an empty file.
                next(csv_reader, None)
                for row in csv_reader:
                    if not row:
                        continue
                    station, date, temp = row
                    yield station, date, temp
            except csv.Error as e:
                sys.exit('file {}, line {}: {}'.format(
                    filename, csv_reader.line_num, e))

    def minimum_temperature(self, filename):
        """Determine which station recorded the coldest temperature.

        Args:
            filename: path of the CSV file to read.

        Returns:
            dict with keys "station", "date" (string as read from the file)
            and "temp" (float).  If no row is read, the initial placeholder
            dict is returned unchanged.
        """
        for station, date, temp in self._read_rows(filename):
            record = self.stations[station]
            # save the station's readings (raw strings, as read)
            record.set_readings(date, temp)
            temp = float(temp)
            if temp < record.get_minimum():
                record.set_minimum(temp)
            coldest = self.global_minimum["temp"]
            # The specs state that in the event that a tie occurs simply
            # return one pair at random: a tied row replaces the current
            # holder on a coin flip.  The flip is only needed for real ties,
            # not right after a strictly lower reading was stored.
            if temp < coldest or (
                    temp == coldest and random.randint(1, 100) % 2 == 0):
                self.global_minimum = {
                    "station": station, "date": date, "temp": temp}
        return self.global_minimum

    def max_travel(self, filename, begin=1.0, end=9999.9):
        """Determine which station "traveled" the most.

        Args:
            filename: path of the CSV file to read.
            begin: lower date bound (exclusive), optional.
            end: upper date bound (exclusive), optional.

        Returns:
            dict with keys "station" and "range" (the station's accumulated
            travel).  If no row falls inside the range, the initial
            placeholder dict is returned unchanged.
        """
        for station, date, temp in self._read_rows(filename):
            record = self.stations[station]
            # save for future expansion
            # NOTE(review): if minimum_temperature() already ran on this
            # instance, every reading is stored a second time here.
            record.set_readings(date, temp)
            if begin < float(date) < end:
                # NOTE(review): this accumulates abs(temp) of each reading,
                # not the difference between consecutive readings; confirm
                # which definition of "travel" is intended.
                record.set_travel(float(temp))
                travel = record.get_travel()
                if travel > self.longest_travel["range"]:
                    self.longest_travel = {"station": station, "range": travel}
        return self.longest_travel
if __name__ == "__main__":
    # Script entry point: report the global minimum reading, then the
    # station with the greatest travel overall and within a date range.
    if len(sys.argv) < 2:
        # Exit with a usage line instead of an IndexError traceback.
        sys.exit("usage: {} CSV_FILE".format(sys.argv[0]))
    csv_file = sys.argv[1]

    # fetch lowest temperature
    reporter = Reporter()
    global_minimum = reporter.minimum_temperature(csv_file)
    print("station {} had the global minimum on {}".format(global_minimum["station"], global_minimum["date"]))

    # fetch maximum travel overall
    longest_travel = reporter.max_travel(csv_file)
    print("station {} had the greatest travel at {}".format(longest_travel["station"], longest_travel["range"]))

    # now try a date range; a fresh Reporter is used so that the travel
    # totals accumulated above are not carried over.
    reporter2 = Reporter()
    begin = 2001.0
    end = 2006.0
    longest_travel = reporter2.max_travel(csv_file, begin, end)
    print("for {} to {}, station {} had the greatest travel at {}".format(begin, end, longest_travel["station"], longest_travel["range"]))
I'm particularly interested in speeding it up and reducing memory usage, but also in how to deal Pythonically with getters/setters.