This program takes a huge data set as input, processes it, does the calculations, and then writes the output to a new file. Most of the calculations are quite simple, such as summation. The input file has about 100 million rows and 3 columns:
- the first column is the name of the gene pair (about 100 million in total)
- the second column is one correlation value for that pair (PCC_A in the code below)
- the third column is another correlation value for that pair (PCC_B in the code below)
The problem I face is a long runtime. How can I reduce it?
I need to write all of the new values I calculate (from GenePair through RM_p_val, with a header) to the new file.
import sys
sys.path.append('/tools/lib/python2.7/site-packages')

import math
import numpy as np
from scipy.stats import norm

fi = open('1.txt')
fo = open('2.txt', 'w')

# header row for the output file, one column per value written inside the loop
fo.write('GenePair\tZVAL_A\tZVAL_B\tmeanES\tVar\tSE\tLL\tUL\tz_score\tp_val\t'
         'Q\ttau_square\tRM_meanES\tRM_Var\tRM_SE\tRM_LL\tRM_UL\tRM_z_score\tRM_p_val\n')
for line in fi:
    tmp = line.split('\t')
    GenePair = tmp[0].strip()
    PCC_A = float(tmp[1].strip())
    PCC_B = float(tmp[2].strip())

    # Fisher z-transform of the two correlation values
    ZVAL_A = 0.5 * math.log((1 + PCC_A) / (1 - PCC_A))
    ZVAL_B = 0.5 * math.log((1 + PCC_B) / (1 - PCC_B))
    ABS_ZVAL_A = abs(ZVAL_A)
    ABS_ZVAL_B = abs(ZVAL_B)

    # within-study variances and weights (SAMPLESIZE - 3)
    Var_A = 1.0 / (21 - 3)
    Var_B = 1.0 / (18 - 3)
    WT_A = 1 / Var_A
    WT_B = 1 / Var_B
    ZVAL_A_X_WT_A = ZVAL_A * WT_A
    ZVAL_B_X_WT_B = ZVAL_B * WT_B
    SumofWT = WT_A + WT_B
    SumofZVAL_X_WT = ZVAL_A_X_WT_A + ZVAL_B_X_WT_B
    # FIXED MODEL
    meanES = SumofZVAL_X_WT / SumofWT
    Var = 1.0 / SumofWT
    SE = math.sqrt(Var)
    LL = meanES - (1.96 * SE)
    UL = meanES + (1.96 * SE)   # upper limit uses '+'; the original had '-', which just repeats LL
    z_score = meanES / SE
    p_val = norm.sf(z_score)
    # CAL: weighted squared effect sizes and squared weights
    ES_POWER_X_WT_A = pow(ZVAL_A, 2) * WT_A
    ES_POWER_X_WT_B = pow(ZVAL_B, 2) * WT_B
    WT_POWER_A = pow(WT_A, 2)
    WT_POWER_B = pow(WT_B, 2)
    SumofES_POWER_X_WT = ES_POWER_X_WT_A + ES_POWER_X_WT_B
    SumofWT_POWER = WT_POWER_A + WT_POWER_B

    # COMPUTE TAU: heterogeneity statistic Q
    tmp_A = ZVAL_A - meanES
    tmp_B = ZVAL_B - meanES
    temp = pow(SumofZVAL_X_WT, 2)
    Q = SumofES_POWER_X_WT - (temp / SumofWT)
    if PCC_A != 0 or PCC_B != 0:
        df = 0
    else:
        df = 1
    # note: the original used pow(SumofWT, 2) here, which makes c always 0;
    # SumofWT_POWER (the sum of squared weights) is what this formula needs
    c = SumofWT - (SumofWT_POWER / SumofWT)
    if c == 0:
        tau_square = 0
    else:
        tau_square = (Q - df) / c
    # random-effects variances and weights (within-study variance + tau^2)
    Var_total_A = Var_A + tau_square
    Var_total_B = Var_B + tau_square
    WT_total_A = 1.0 / Var_total_A
    WT_total_B = 1.0 / Var_total_B
    ZVAL_X_WT_total_A = ZVAL_A * WT_total_A
    ZVAL_X_WT_total_B = ZVAL_B * WT_total_B
    Sumoftotal_WT = WT_total_A + WT_total_B
    Sumoftotal_ZVAL_X_WT = ZVAL_X_WT_total_A + ZVAL_X_WT_total_B
    # RANDOM MODEL
    RM_meanES = Sumoftotal_ZVAL_X_WT / Sumoftotal_WT
    RM_Var = 1.0 / Sumoftotal_WT
    RM_SE = math.sqrt(RM_Var)
    RM_LL = RM_meanES - (1.96 * RM_SE)
    RM_UL = RM_meanES + (1.96 * RM_SE)
    RM_z_score = RM_meanES / RM_SE   # divide by the standard error; the original divided by RM_Var
    RM_p_val = norm.sf(RM_z_score)

    # write the values for this gene pair, in the same order as the header above
    out = [GenePair, ZVAL_A, ZVAL_B, meanES, Var, SE, LL, UL, z_score, p_val,
           Q, tau_square, RM_meanES, RM_Var, RM_SE, RM_LL, RM_UL, RM_z_score, RM_p_val]
    fo.write('\t'.join(map(str, out)) + '\n')

fi.close()
fo.close()
You import numpy but don't take advantage of its vectorized operations. See What is NumPy? to get some ideas. – Janne Karila Nov 1 '13 at 8:16
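Following that comment, here is a rough sketch of what a vectorized version could look like. It is not a drop-in replacement: it assumes pandas is available for chunked reading, the chunk size and column names are made up, and only the fixed-effect columns are shown (the random-effects part would be added the same way, column by column). The sample sizes 21 and 18 are the ones from the question.

import numpy as np
import pandas as pd
from scipy.stats import norm

# per-study variances and weights are constants, so compute them once
VAR_A = 1.0 / (21 - 3)   # SAMPLESIZE - 3, as in the question
VAR_B = 1.0 / (18 - 3)
WT_A = 1.0 / VAR_A
WT_B = 1.0 / VAR_B
SUM_WT = WT_A + WT_B
SE = np.sqrt(1.0 / SUM_WT)

# read the 100M-row file in chunks so it never has to fit in memory at once
reader = pd.read_csv('1.txt', sep='\t', header=None,
                     names=['GenePair', 'PCC_A', 'PCC_B'], chunksize=1000000)

first = True
for chunk in reader:
    # Fisher z-transform of both correlation columns, whole chunk at once
    z_a = 0.5 * np.log((1 + chunk['PCC_A']) / (1 - chunk['PCC_A']))
    z_b = 0.5 * np.log((1 + chunk['PCC_B']) / (1 - chunk['PCC_B']))

    # fixed-effect model, vectorized over every row of the chunk
    mean_es = (z_a * WT_A + z_b * WT_B) / SUM_WT
    z_score = mean_es / SE
    p_val = norm.sf(z_score)

    out = pd.DataFrame({'GenePair': chunk['GenePair'], 'meanES': mean_es,
                        'z_score': z_score, 'p_val': p_val})
    # write the header only for the first chunk, then append
    out.to_csv('2.txt', sep='\t', index=False, header=first,
               mode='w' if first else 'a')
    first = False

Even without the chunking, replacing the per-row Python arithmetic with whole-column numpy operations like this is usually where most of the speed-up comes from.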