I'm trying to optimize a python string that works on big data sets, the way it works is by taking in a with a list of keywords and scores and taking in a file loaded with data from the twitter api. The program does a keyword match against tweet text. At the end of the program I want to produce an average for each term found in text object of the json file. e.g.
sad 3
With sad being the keyword and 3 being the average score.
It's running way too slow but I'm new to Python coming from a php background and I think I'm doing things the php way in python.
How can I get this code to run faster?
import sys
import json
import re
def findRecord(key, records):
for r in records:
if r[0] == key:
return r
def average_records(records):
for r in records:
if r[1] > 0:
avg = r[1] / r[2]
print r[0] + ' ' + str(avg)
else:
avg = r[3] / r[4]
print r[0] + ' ' + str(avg)
def hw(sent_file, tweet_file):
scores = {}
sent_file = open(sent_file, 'r')
for line in sent_file:
term, score = line.split("\t")
scores[term] = int(score)
recored_affin = []
#print scores.items()
data = []
with open(tweet_file, 'r') as f:
for line in f:
data.append(json.loads(line))
#print data[4]['text']
for tweet in data:
total = 0
if 'text' in tweet:
for k, v in scores.iteritems():
#print tweet['text']
num_of_aff = len(re.findall(k, tweet['text']))
if num_of_aff > 0:
#print "Number is: " + str(num_of_aff)
#print "Word is: " + k
#print "Tweet is: " + tweet['text']
total += (v * num_of_aff)
#print "Score is: " + str(total)
#while count < len(recorded_affin):
foundRow = findRecord(k, recored_affin)
if foundRow != None:
index = recored_affin.index(foundRow)
quick_rec = recored_affin[index]
if v > 0:
new_value = quick_rec[1] + v
new_count = quick_rec[2] + 1
old_neg_value = 0
old_neg_count = 0
recored_affin.append([k, new_value, new_count, old_neg_value, old_neg_count])
recored_affin.remove(foundRow)
elif v < 0:
old_pos_value = 0
old_pos_count = 0
new_value = quick_rec[3] + v
new_count = quick_rec[4] + 1
recored_affin.append([k, old_pos_value, old_pos_count, new_value, new_count])
recored_affin.remove(foundRow)
else:
if v > 0:
recored_affin.append([k,v,1,0,0])
elif v < 0:
recored_affin.append([k,0,0,v,1])
#print recored_affin
##print foundRow
##print total
average_records(recored_affin)
def lines(fp):
print str(len(fp.readlines()))
def main():
sent_file = open(sys.argv[1])
tweet_file = open(sys.argv[2])
hw(sys.argv[1], sys.argv[2])
#lines(sent_file)
#lines(tweet_file)
if __name__ == '__main__':
main()