I want to optimize my Apriori algorithm for speed. Here is my current implementation:
from itertools import combinations
import pandas as pd
import numpy as np

trans = pd.read_table('output.txt', header=None, index_col=0)

def apriori(trans, support=0.01, minlen=1):
    # One-hot encode: one row per transaction, one 0/1 column per item.
    ts = pd.get_dummies(trans.unstack().dropna()).groupby(level=1).sum()
    n_trans, n_items = ts.shape
    # ------------- Max length (not used)
    # tssum = ts.sum(axis=1)
    # maxlen = int(tssum.loc[tssum.idxmax()])
    pattern = []
    for cnum in range(minlen, n_items + 1):
        # Enumerate every cnum-item candidate over the item columns.
        for cols in combinations(ts, cnum):
            # A transaction supports the candidate when all its item columns are set.
            patsup = ts[list(cols)].all(axis=1).sum()
            patsup = float(patsup) / n_trans
            pattern.append([",".join(cols), patsup])
    sdf = pd.DataFrame(pattern, columns=["Pattern", "Support"])
    results = sdf[sdf.Support >= support]
    return results
Given a DataFrame of transactions like this:
>>> trans
1 2 3 4
0
11 a b c NaN
666 a d e NaN
10101 b c d NaN
1010 a b c d
414147 b c NaN NaN
10101 a b d NaN
1242 d e NaN NaN
101 a b c NaN
411 c d e NaN
444 a b c NaN
[10 rows x 4 columns]
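For anyone who wants to reproduce this without my output.txt, the same frame can be built directly (pd and np are the imports from the snippet above):

rows = [['a', 'b', 'c', np.nan],
        ['a', 'd', 'e', np.nan],
        ['b', 'c', 'd', np.nan],
        ['a', 'b', 'c', 'd'],
        ['b', 'c', np.nan, np.nan],
        ['a', 'b', 'd', np.nan],
        ['d', 'e', np.nan, np.nan],
        ['a', 'b', 'c', np.nan],
        ['c', 'd', 'e', np.nan],
        ['a', 'b', 'c', np.nan]]
trans = pd.DataFrame(rows,
                     index=[11, 666, 10101, 1010, 414147, 10101, 1242, 101, 411, 444],
                     columns=[1, 2, 3, 4])
trans.index.name = 0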
running it on that frame:

Ap = apriori(trans)
print(Ap)

yields:
Pattern Support
0 a 0.6
1 b 0.7
2 c 0.7
3 d 0.6
4 e 0.3
5 a,b 0.5
6 a,c 0.4
7 a,d 0.3
8 a,e 0.1
9 b,c 0.6
10 b,d 0.3
12 c,d 0.3
13 c,e 0.1
14 d,e 0.3
15 a,b,c 0.4
16 a,b,d 0.2
18 a,c,d 0.1
20 a,d,e 0.1
21 b,c,d 0.2
24 c,d,e 0.1
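As far as I can tell, essentially all of the runtime goes into the inner loop, where every candidate itemset is re-checked against every transaction through pandas indexing. One micro-optimization I have been experimenting with is doing that membership check on the underlying numpy array instead; a rough sketch, where ts is the one-hot frame built inside apriori and candidate is a hypothetical 2-item set:

arr = ts.values.astype(bool)                     # one-hot matrix as a plain numpy array
col_pos = {c: i for i, c in enumerate(ts.columns)}

candidate = ('a', 'b')                           # hypothetical candidate itemset
idx = [col_pos[c] for c in candidate]
patsup = arr[:, idx].all(axis=1).mean()          # fraction of transactions containing all items

This avoids building an intermediate DataFrame per candidate, but the number of candidates itself still grows combinatorially.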
Beyond that, I want to know whether the algorithm can be optimized further so that it runs faster on large datasets. I would also like to know if there is a way to do this purely in pandas, without combinations from itertools.
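For pairs at least, the closest thing to a pure-pandas version I can think of is a matrix product on the one-hot frame, which counts every 2-item co-occurrence at once; I do not see how to extend it past pairs, which is part of what I am asking. A sketch, reusing ts from inside apriori:

onehot = (ts > 0).astype(int)                    # force strictly 0/1 values
pair_counts = onehot.T.dot(onehot)               # items x items co-occurrence counts
pair_support = pair_counts / float(len(onehot))
# Diagonal entries are single-item supports; off-diagonal entries are pair supports.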