I wrote a log parser in Python and Julia. The Python version takes about 8 seconds to process 350 files, while the Julia version takes about 8 seconds to compile, after which it takes over 20 seconds to process 350 log files.
I wrote the original parser in Python several months ago, with regex, a custom class, and lots of methods for detailed analysis. I recently wrote a Julia version with regex, a custom type, and only a single analysis function (the only one I regularly used). I then wrote a very trimmed-down version of each parser, using only basic string functions. I'm experienced with Python but not with Julia, and I can't figure out why the Julia version is so slow (even discounting the compilation time). If I can AoT compile the Julia script, dramatically reduce its run time, and package it into a native x86/64 executable, it might be worth using.
What am I doing wrong with this Julia script? How can I make it faster?
The Python version:
import datetime
import sys
import os
import tzlocal
min_date = tzlocal.get_localzone().localize(datetime.datetime(2002, 1, 1))
min_date -= min_date.utcoffset()
day = min_date
keys = ['Contraband', 'Dilithium', 'Dilithium Ore', 'Energy Credits']
dct = {key:0 for key in keys}
files = sorted(f for f in os.listdir() if f[:5]=="Chat_")
l = len(files)
try:
with open('lastlog.txt') as f:
lastlog = f.read()
except (FileNotFoundError, PermissionError):
lastlog = ''
print('Date', *keys, sep='\t')
for counter,fname in enumerate(files):
if fname >= lastlog:
print('Processing {} out of {}...'.format(counter, l), end='\r', file=sys.stderr)
with open(fname, encoding='utf-8-sig') as f:
for line in f:
if (line[26:70] == ",0,NumericReceived@,@,,,System]You received " and
('Dilithium' in line or 'Energy Credits' in line)):
quantity,item = line.strip()[70:].split(None, 1)
quantity = int(quantity.replace(',', ''))
if item == 'Refined Dilithium':
item = 'Dilithium'
elif line[26:66] == ",0,NumericReceived@,@,,,System]You sold ":
item = 'Energy Credits'
quantity = int(line.rsplit(None, 3)[1].replace(',', ''))
elif line[26:78] == ",0,NumericConversionSuccess@,@,,,System]You refined ":
item = 'Dilithium'
quantity = int(line[78:-11].replace(',', ''))
elif (line[26:56] == ",0,NumericLost@,@,,,System]You" and
(line[56:62] == " lost " or line[56:63] == " spent ") and
('Dilithium' in line or 'Energy Credits' in line)):
quantity,item = line[62:].strip().split(None, 1)
quantity = -int(quantity.replace(',', ''))
if item == 'Refined Dilithium':
item = 'Dilithium'
elif line[26:83] == ',0,ItemReceived@,@,,,System]Items acquired: Contraband x ':
quantity = int(line[70:].rsplit(None, 1)[1].replace(',', ''))
item = 'Contraband'
elif line[26:79] == ",0,ItemReceived@,@,,,System]Item acquired: Contraband":
quantity = 1
item = 'Contraband'
else:
continue
dt = tzlocal.get_localzone().localize(datetime.datetime(year=int(line[11:15]), month=int(line[15:17]),
day=int(line[17:19]), hour=int(line[20:22]), minute=int(line[22:24]), second=int(line[24:25])))
dt -= dt.utcoffset()
if dt.day != day.day and day != min_date:
print('{}-{}-{}'.format(day.year,
day.month,
day.day),
*(dct[k] for k in keys), sep='\t')
dct = {key:0 for key in keys}
day = dt
if item in dct:
dct[item] += quantity
print('{}-{}-{}'.format(dt.year,
dt.month,
dt.day),
*(dct[k] for k in keys), sep='\t')
print('\nDone.', file=sys.stderr)
last = files[-1][5:15]
current = files[-1]
for fname in files[::-1]:
current = fname
if fname[5:15] != last:
break
with open('lastlog.txt', 'w') as f:
f.write(current)
The Julia version:
import TimeZones
function GetTotals()
zone = TimeZones.localzone() # get local time zone
min_date = TimeZones.astimezone(
TimeZones.ZonedDateTime(
DateTime(2002,1,1), zone),
TimeZones.TimeZone("UTC")) # set local minimum date as UTC
day = min_date
k = String["Contraband", "Dilithium", "Dilithium Ore", "Energy Credits"]
dct = Dict{String,Int64}(key=>0 for key in k)
files = sort([f for f in readdir() if f[1:5]=="Chat_"])
l = length(files)
if isfile("lastlog.txt")
lastlog = open("lastlog.txt") do f
readstring(f)
end
else
lastlog = ""
end
println("Date", '\t', join(k, '\t'))
for (counter,fname) in enumerate(files)
if fname >= lastlog # "Chat_2016-11-04" < fname < "Chat_2016-11-12"
print(STDERR, "Processing ", counter, " out of ", l, "...\r")
open(fname) do f
for line in eachline(f)
if length(line) > 70 && line[27:70] == ",0,NumericReceived@,@,,,System]You received " &&
(contains(line, "Dilithium") || contains(line, "Energy Credits"))
quantity,item = split(strip(line)[71:end], ' '; limit=2)
quantity = parse(Int, replace(quantity, ",", ""))
if item == "Refined Dilithium"
item = "Dilithium"
end
elseif length(line) > 66 && line[27:66] == ",0,NumericReceived@,@,,,System]You sold "
item = "Energy Credits"
quantity = parse(Int, replace(rsplit(line, ' '; limit=4)[2], ",", ""))
elseif length(line) > 78 && line[27:78] == ",0,NumericConversionSuccess@,@,,,System]You refined "
item = "Dilithium"
quantity = parse(Int, replace(line[79:end-13], ",", ""))
elseif length(line) > 63 && (line[27:56] == ",0,NumericLost@,@,,,System]You" &&
(line[57:62] == " lost " || line[57:63] == " spent ") &&
(contains(line, "Dilithium") || contains(line, "Energy Credits")))
quantity,item = split(strip(line[63:end]), ' '; limit=2)
quantity = -parse(Int, replace(quantity, ",", ""))
if item == "Refined Dilithium"
item = "Dilithium"
end
elseif length(line) > 83 && line[27:83] == ",0,ItemReceived@,@,,,System]Items acquired: Contraband x "
quantity = parse(Int, replace(rsplit(line[71:end], ' ', limit=2)[2], ",", ""))
item = "Contraband"
elseif length(line) > 79 && line[27:79] == ",0,ItemReceived@,@,,,System]Item acquired: Contraband"
quantity = 1
item = "Contraband"
else
continue
end
dt = TimeZones.astimezone(TimeZones.ZonedDateTime(DateTime(line[12:19]*line[21:26], "yyyymmddHHMMSS"), zone), TimeZones.TimeZone("UTC"))
current = TimeZones.floor(dt, Dates.Day)
if day != current && day != min_date
println(Dates.format(day, "yyyy-mm-dd"),
'\t', join((dct[key] for key in k), '\t'))
dct = Dict{String,Int64}(key=>0 for key in k)
end
day = current
if item in keys(dct)
dct[item] += quantity
end
end
end
end
end
println(Dates.format(day, "yyyy-mm-dd"), '\t', join((dct[key] for key in k), '\t'))
println(STDERR, "\nDone.")
last = files[end][6:15]
current = files[end]
for fname in files[end:-1:1]
current = fname
if fname[6:15] != last
break
end
end
open("lastlog.txt", "w") do f
write(f, current)
end
end
GetTotals()
@code_warntype
? Also,v[a:b]
in Julia creates a copy. You may want to reduce allocations by using views:@view v[a:b]
. To handle compilation times, put the function in a module and precompile the module. \$\endgroup\$ – Chris Rackauckas Jan 2 '17 at 7:43@code_warntype
but didn't understand the output (I can post or link it if you'd like to see it).@view
doesn't work on strings. I added__precompile__()
andmodule MyModule
to the top of the script (andend
at the bottom), and I'm getting the impression from the doc that it'll only make a difference if this module is imported into another script, because it still has a long delay before producing output. \$\endgroup\$ – TigerhawkT3 Jan 2 '17 at 9:00@code_warntype
. (I notified some people in the Gitter channel about this too since strings aren't really my thing. But many people there could help you) \$\endgroup\$ – Chris Rackauckas Jan 2 '17 at 16:06@code_warntype
output into a zip file here, along with some sample log files. \$\endgroup\$ – TigerhawkT3 Jan 3 '17 at 1:10