Is there anything I can improve? The distance function is Pearson correlation.
import os
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
def corrpairs(df1, df2):
    """
    Correlate every column of ``df1`` with every column of ``df2``.

    Each column of ``df1`` is handed to ``df2.corrwith``, and the resulting
    Series are assembled by ``DataFrame.apply``.

    :param df1: frame whose columns are correlated one at a time
    :type df1: pandas.core.frame.DataFrame
    :param df2: frame whose columns each ``df1`` column is correlated against
    :type df2: pandas.core.frame.DataFrame
    :return: correlations, one column per column of ``df1`` and one row per
        column of ``df2``
    :rtype: pandas.core.frame.DataFrame
    """
    def against_df2(column):
        # Correlation of a single df1 column with each column of df2.
        return df2.corrwith(column)

    return df1.apply(against_df2)
import pdb
def _update_centroids(cols, groups, previous):
    """Average each cluster's member columns; an empty cluster keeps its previous centroid."""
    refreshed = []
    for label in previous.columns:
        members = cols.loc[:, groups == label]
        if members.shape[1] == 0:
            # The mean of zero columns is all-NaN, which would make every later
            # correlation with this centroid NaN and kill the cluster for good.
            refreshed.append(previous[label])
        else:
            refreshed.append(members.mean(axis=1))
    return pd.concat(refreshed, axis=1, keys=list(previous.columns))


def kcluster(cols, k=4, max_iter=100, tol=1e-5, random_state=None):
    """
    K-means style clustering applied to the columns of a data frame.

    Pearson correlation is used as the similarity measure: every column is
    assigned to the centroid it correlates with most strongly. The initial
    centroids are the first ``k`` columns plus standard-normal noise. Each
    iteration re-assigns the columns and replaces every centroid by the mean
    of its member columns, until no centroid entry moves by more than ``tol``.

    :param cols: data whose columns are to be clustered (cast to float)
    :type cols: pandas.core.frame.DataFrame
    :param k: number of clusters; must satisfy ``1 <= k <= number of columns``
    :type k: int
    :param max_iter: upper bound on assign/update iterations; if it is reached
        a ``RuntimeWarning`` is issued and the last assignment is returned
    :type max_iter: int
    :param tol: largest absolute centroid change still treated as converged
    :type tol: float
    :param random_state: seed for the initial noise; ``None`` draws from
        numpy's global random state
    :type random_state: int | None
    :return: cluster label (``0 .. k-1``) for each column, indexed by column name
    :rtype: pandas.core.series.Series
    :raises ValueError: if ``k`` or ``max_iter`` is out of range
    """
    cols = cols.astype(float)
    ncol = cols.shape[1]
    if not 1 <= k <= ncol:
        raise ValueError(
            "k must be between 1 and the number of columns (%d), got %r" % (ncol, k)
        )
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1, got %r" % (max_iter,))

    if random_state is None:
        rng = np.random
    else:
        rng = np.random.RandomState(random_state)

    # Copy before perturbing: adding noise in place to an ``iloc`` slice can
    # write through to ``cols`` and corrupt the data being clustered.
    centroids = cols.iloc[:, :k].copy()
    centroids.columns = range(k)
    centroids += rng.standard_normal(size=centroids.shape)

    groups = None
    for _ in range(max_iter):
        # Assignment step: the best-correlated centroid label for each column.
        groups = corrpairs(cols, centroids).idxmax(axis=0)
        # Update step: recompute the centroids from the current members.
        updated = _update_centroids(cols, groups, centroids)
        moved = ((centroids - updated).abs() > tol).any().any()
        centroids = updated
        if not moved:
            break
    else:
        import warnings

        warnings.warn(
            "kcluster did not converge within %d iterations" % max_iter,
            RuntimeWarning,
        )
    return groups