Code Review Stack Exchange is a question and answer site for peer programmer code reviews. Join them; it only takes a minute:

Sign up
Here's how it works:
  1. Anybody can ask a question
  2. Anybody can answer
  3. The best answers are voted up and rise to the top

I've implemented the K-Means clustering algorithm in Python2, and I wanted to know what remarks you guys could make regarding my code. I've included a small test set with 2D-vectors and 2 classes, but it works with higher dimensions and more classes. Here, it should sort all the elements starting with the same letters in the same classes (except ak, with is quite in between).

What I'm interested in:

  • Did I do something unpythonic?
  • Are there unclear or unnecessary parts?
  • What changes could drastically improve the code's performance?

from __future__ import print_function
from operator import add, sub
from _collections import defaultdict
import random as rd
import math


class Element:
    def __init__(self, idf, data, cls='ukn'):
        self.idf = idf
        self.data = data
        self.cls = cls

    def __str__(self):
        return '%s: %s -> %s' % (str(self.idf), str(self.data), str(self.cls))

    def eucl_dist_to(self, other):
        diff = map(sub, self.data, other.data)
        norm2 = math.sqrt(sum([math.pow(coord, 2) for coord in diff]))
        return norm2


def main():
    test_data_tuple = [('aa', 1, 7), ('ab', 1, 8), ('ac', 1, 9), ('ba', 2, 1), ('bb', 2, 3), ('ad', 2, 8), ('ae', 2, 9),
                   ('bc', 3, 2), ('bd', 3, 4), ('af', 3, 7), ('be', 4, 2), ('bf', 4, 3), ('ag', 4, 8),
                   ('bg', 5, 1), ('bh', 5, 3), ('ah', 5, 6), ('bi', 6, 2), ('bj', 6, 4), ('ai', 6, 6), ('aj', 6, 7),
                   ('bk', 7, 0), ('bl', 7, 2), ('bm', 7, 3), ('ak', 7, 5)]
    test_data_elt = list()
    for t in test_data_tuple:
        test_data_elt.append(Element(t[0], tuple(t[1:])))
    # test_data_elt.sort(key=lambda e: e.idf)
    results = k_means(test_data_elt, 2, 5)


def k_means(elts, nb_classes, nb_steps):
    k = nb_classes
    #init
    centroids = list()
    cls_content = defaultdict(list)
    for cls_nb in range(k):
        cls_name = 'class_'+str(cls_nb)
        centroids.append(Element(cls_name, rd.choice(elts).data, cls='centroid'))
        cls_content[cls_name] = list()

    for itr in range(nb_steps):
        # assign
        cls_content.clear()
        for elt in elts:
            min_dist = float('inf')
            for c in centroids:
                dist = c.eucl_dist_to(elt)
                if dist < min_dist:
                    min_dist = dist
                    best_class = c.idf
            cls_content[best_class].append(elt)
            elt.cls = best_class
        # adjust
        for c in centroids:
            elts_in = list(cls_content[c.idf])
            if len(elts_in) == 0:
                sum_elts_data = 0
            else:
                sum_elts_data = list(elts_in[0].data)
                for elt in elts_in[1:]:
                    sum_elts_data = map(add, sum_elts_data, elt.data)
                sum_elts_data[:] = [e / float(len(elts_in)) for e in sum_elts_data]
                c.data = tuple(sum_elts_data)
        print('\nIteration %d' % itr)
        for c in centroids:
            cls_name = c.idf
            print('\n'+str(c))
            for e in cls_content[cls_name]:
                print(e)
    return cls_content


if __name__ == '__main__':
    main()
share|improve this question
    
Did you consider using scipy.cluster.vq.kmeans? – Gareth Rees Jan 9 at 19:41
    
Well, the point was to code the thing myself, I sure assumed that really good implementations already existed, I just wanted to have a go on my own :) Although I didn't know this one, thanks for having linked it! – BusyAnt Jan 10 at 11:30
1  
That's fine, I just wanted to check. (Take a look at this answer for some implementation ideas.) – Gareth Rees Jan 14 at 19:20

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Browse other questions tagged or ask your own question.