Take the 2-minute tour ×
Code Review Stack Exchange is a question and answer site for peer programmer code reviews. It's 100% free, no registration required.

Is there anything I can improve in this K-means implementation? It clusters the columns of a data frame, using Pearson correlation as the distance function.

import os
import pandas as pd
import numpy as np
from pandas import Series, DataFrame


def corrpairs(df1, df2):
    """
    Correlate every column of ``df1`` with every column of ``df2``.

    :param df1: frame whose columns are correlated one at a time against
        ``df2``; its column labels become the columns of the result
    :type df1: pandas.core.frame.DataFrame
    :param df2: frame whose ``corrwith`` method is called for each column
        of ``df1``; its column labels become the index of the result
    :type df2: pandas.core.frame.DataFrame
    :return: table of correlations (as computed by ``DataFrame.corrwith``
        with its default settings) between the columns of ``df2`` (rows)
        and the columns of ``df1`` (columns)
    :rtype: pandas.core.frame.DataFrame
    """
    def against_df2(column):
        # One Series of correlations for this df1 column, one entry per
        # column of df2.
        return df2.corrwith(column)

    return df1.apply(against_df2)



import pdb
def _assign_columns(cols, centroids):
    """Return, for each column of ``cols``, the label of the centroid it correlates with best."""
    return corrpairs(cols, centroids).idxmax(axis=0)


def _centroids_from_groups(cols, groups, k):
    """Return a frame whose column ``i`` is the row-wise mean of the columns labelled ``i``."""
    means = [cols.loc[:, groups == label].mean(axis=1) for label in range(k)]
    return pd.concat(means, axis=1)


def kcluster(cols, k=4, tol=0.00001, max_iter=300):
    """
    K Means clustering algorithm, applied to columns of a data frame.
    Using Pearson correlation as the similarity measure: each column is
    assigned to the centroid it correlates with most strongly, and each
    centroid is then recomputed as the row-wise mean of its columns.

    The initial centroids are the first ``k`` columns plus standard normal
    noise drawn from the global ``np.random`` state, so results are
    reproducible after ``np.random.seed``.

    :param cols: data frame whose columns are the items to cluster; it is
        converted with ``astype(float)`` and the caller's frame is not
        modified
    :type cols: pandas.core.frame.DataFrame
    :param k: number of clusters, ``1 <= k <= number of columns``
    :type k: int
    :param tol: iteration stops once no centroid entry moved by more than
        this amount between two consecutive updates
    :type tol: float
    :param max_iter: upper bound on the number of centroid updates, which
        guards against assignments that oscillate and never settle;
        ``None`` disables the bound
    :type max_iter: int or None
    :return: cluster label (``0 .. k-1``) for every column, indexed by the
        column labels of ``cols``
    :rtype: pandas.core.series.Series
    :raises ValueError: if ``k`` is smaller than 1 or larger than the
        number of columns
    """
    cols = cols.astype(float)
    ncol = cols.shape[1]
    if not 1 <= k <= ncol:
        raise ValueError(
            "k must be between 1 and the number of columns (%d), got %r"
            % (ncol, k)
        )

    # Build the starting centroids as a brand-new frame instead of renaming
    # and += on a slice of `cols`, so the data being clustered can never be
    # touched by the perturbation.
    seed = cols.iloc[:, :k]
    noise = np.random.randn(*seed.shape)
    centroids = pd.DataFrame(
        seed.values + noise, index=seed.index, columns=range(k)
    )

    groups = _assign_columns(cols, centroids)
    iteration = 0
    while max_iter is None or iteration < max_iter:
        updated = _centroids_from_groups(cols, groups, k)
        # NOTE: a label that received no columns yields a NaN centroid;
        # NaN differences compare as "not greater than tol", so they do not
        # keep the loop alive (same as the comparison always did).
        moved = ((centroids - updated).abs() > tol).any().any()
        if not moved:
            break
        centroids = updated
        groups = _assign_columns(cols, centroids)
        iteration += 1

    return groups
share|improve this question
    
Are you sure that using Pearson correlation with K-means is a good idea? See here –  Janne Karila Jan 8 at 6:33

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Browse other questions tagged or ask your own question.