Package nltk :: Package cluster :: Module kmeans
[hide private]
[frames] | no frames]

Source Code for Module nltk.cluster.kmeans

  1  # Natural Language Toolkit: K-Means Clusterer 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Trevor Cohn <tacohn@cs.mu.oz.au> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  import numpy 
  9  import random 
 10   
 11  from api import * 
 12  from util import * 
 13   
14 -class KMeansClusterer(VectorSpaceClusterer):
15 """ 16 The K-means clusterer starts with k arbitrary chosen means then allocates 17 each vector to the cluster with the closest mean. It then recalculates the 18 means of each cluster as the centroid of the vectors in the cluster. This 19 process repeats until the cluster memberships stabilise. This is a 20 hill-climbing algorithm which may converge to a local maximum. Hence the 21 clustering is often repeated with random initial means and the most 22 commonly occuring output means are chosen. 23 """ 24
25 - def __init__(self, num_means, distance, repeats=1, 26 conv_test=1e-6, initial_means=None, 27 normalise=False, svd_dimensions=None, 28 rng=None):
29 """ 30 @param num_means: the number of means to use (may use fewer) 31 @type num_means: int 32 @param distance: measure of distance between two vectors 33 @type distance: function taking two vectors and returing a float 34 @param repeats: number of randomised clustering trials to use 35 @type repeats: int 36 @param conv_test: maximum variation in mean differences before 37 deemed convergent 38 @type conv_test: number 39 @param initial_means: set of k initial means 40 @type initial_means: sequence of vectors 41 @param normalise: should vectors be normalised to length 1 42 @type normalise: boolean 43 @param svd_dimensions: number of dimensions to use in reducing vector 44 dimensionsionality with SVD 45 @type svd_dimensions: int 46 @param rng: random number generator (or None) 47 @type rng: Random 48 """ 49 VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) 50 self._num_means = num_means 51 self._distance = distance 52 self._max_difference = conv_test 53 assert not initial_means or len(initial_means) == num_means 54 self._means = initial_means 55 assert repeats >= 1 56 assert not (initial_means and repeats > 1) 57 self._repeats = repeats 58 if rng: self._rng = rng 59 else: self._rng = random.Random()
60
61 - def cluster_vectorspace(self, vectors, trace=False):
62 if self._means and self._repeats > 1: 63 print 'Warning: means will be discarded for subsequent trials' 64 65 meanss = [] 66 for trial in range(self._repeats): 67 if trace: print 'k-means trial', trial 68 if not self._means or trial > 1: 69 self._means = self._rng.sample(vectors, self._num_means) 70 self._cluster_vectorspace(vectors, trace) 71 meanss.append(self._means) 72 73 if len(meanss) > 1: 74 # sort the means first (so that different cluster numbering won't 75 # effect the distance comparison) 76 for means in meanss: 77 means.sort(cmp = _vector_compare) 78 79 # find the set of means that's minimally different from the others 80 min_difference = min_means = None 81 for i in range(len(meanss)): 82 d = 0 83 for j in range(len(meanss)): 84 if i != j: 85 d += self._sum_distances(meanss[i], meanss[j]) 86 if min_difference == None or d < min_difference: 87 min_difference, min_means = d, meanss[i] 88 89 # use the best means 90 self._means = min_means
91
92 - def _cluster_vectorspace(self, vectors, trace=False):
93 if self._num_means < len(vectors): 94 # perform k-means clustering 95 converged = False 96 while not converged: 97 # assign the tokens to clusters based on minimum distance to 98 # the cluster means 99 clusters = [[] for m in range(self._num_means)] 100 for vector in vectors: 101 index = self.classify_vectorspace(vector) 102 clusters[index].append(vector) 103 104 if trace: print 'iteration' 105 #for i in range(self._num_means): 106 #print ' mean', i, 'allocated', len(clusters[i]), 'vectors' 107 108 # recalculate cluster means by computing the centroid of each cluster 109 new_means = map(self._centroid, clusters) 110 111 # measure the degree of change from the previous step for convergence 112 difference = self._sum_distances(self._means, new_means) 113 if difference < self._max_difference: 114 converged = True 115 116 # remember the new means 117 self._means = new_means
118
119 - def classify_vectorspace(self, vector):
120 # finds the closest cluster centroid 121 # returns that cluster's index 122 best_distance = best_index = None 123 for index in range(len(self._means)): 124 mean = self._means[index] 125 dist = self._distance(vector, mean) 126 if best_distance == None or dist < best_distance: 127 best_index, best_distance = index, dist 128 return best_index
129
130 - def num_clusters(self):
131 if self._means: 132 return len(self._means) 133 else: 134 return self._num_means
135
136 - def means(self):
137 """ 138 The means used for clustering. 139 """ 140 return self._means
141
142 - def _sum_distances(self, vectors1, vectors2):
143 difference = 0.0 144 for u, v in zip(vectors1, vectors2): 145 difference += self._distance(u, v) 146 return difference
147
148 - def _centroid(self, cluster):
149 assert len(cluster) > 0 150 centroid = copy.copy(cluster[0]) 151 for vector in cluster[1:]: 152 centroid += vector 153 return centroid / float(len(cluster))
154
155 - def __repr__(self):
156 return '<KMeansClusterer means=%s repeats=%d>' % \ 157 (self._means, self._repeats)
158
159 -def _vector_compare(x, y):
160 xs, ys = sum(x), sum(y) 161 if xs < ys: return -1 162 elif xs > ys: return 1 163 else: return 0
164 165 ################################################################################# 166
167 -def demo():
168 # example from figure 14.9, page 517, Manning and Schutze 169 170 from nltk import cluster 171 172 vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]] 173 means = [[4, 3], [5, 5]] 174 175 clusterer = cluster.KMeansClusterer(2, euclidean_distance, initial_means=means) 176 clusters = clusterer.cluster(vectors, True, trace=True) 177 178 print 'Clustered:', vectors 179 print 'As:', clusters 180 print 'Means:', clusterer.means() 181 print 182 183 vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] 184 185 # test k-means using the euclidean distance metric, 2 means and repeat 186 # clustering 10 times with random seeds 187 188 clusterer = cluster.KMeansClusterer(2, euclidean_distance, repeats=10) 189 clusters = clusterer.cluster(vectors, True) 190 print 'Clustered:', vectors 191 print 'As:', clusters 192 print 'Means:', clusterer.means() 193 print 194 195 # classify a new vector 196 vector = numpy.array([3, 3]) 197 print 'classify(%s):' % vector, 198 print clusterer.classify(vector) 199 print
200 201 if __name__ == '__main__': 202 demo() 203