1
2
3
4
5
6
7
8 import numpy
9 import random
10
11 from api import *
12 from util import *
13
15 """
16 The K-means clusterer starts with k arbitrary chosen means then allocates
17 each vector to the cluster with the closest mean. It then recalculates the
18 means of each cluster as the centroid of the vectors in the cluster. This
19 process repeats until the cluster memberships stabilise. This is a
20 hill-climbing algorithm which may converge to a local maximum. Hence the
21 clustering is often repeated with random initial means and the most
22 commonly occuring output means are chosen.
23 """
24
25 - def __init__(self, num_means, distance, repeats=1,
26 conv_test=1e-6, initial_means=None,
27 normalise=False, svd_dimensions=None,
28 rng=None):
29 """
30 @param num_means: the number of means to use (may use fewer)
31 @type num_means: int
32 @param distance: measure of distance between two vectors
33 @type distance: function taking two vectors and returing a float
34 @param repeats: number of randomised clustering trials to use
35 @type repeats: int
36 @param conv_test: maximum variation in mean differences before
37 deemed convergent
38 @type conv_test: number
39 @param initial_means: set of k initial means
40 @type initial_means: sequence of vectors
41 @param normalise: should vectors be normalised to length 1
42 @type normalise: boolean
43 @param svd_dimensions: number of dimensions to use in reducing vector
44 dimensionsionality with SVD
45 @type svd_dimensions: int
46 @param rng: random number generator (or None)
47 @type rng: Random
48 """
49 VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
50 self._num_means = num_means
51 self._distance = distance
52 self._max_difference = conv_test
53 assert not initial_means or len(initial_means) == num_means
54 self._means = initial_means
55 assert repeats >= 1
56 assert not (initial_means and repeats > 1)
57 self._repeats = repeats
58 if rng: self._rng = rng
59 else: self._rng = random.Random()
60
62 if self._means and self._repeats > 1:
63 print 'Warning: means will be discarded for subsequent trials'
64
65 meanss = []
66 for trial in range(self._repeats):
67 if trace: print 'k-means trial', trial
68 if not self._means or trial > 1:
69 self._means = self._rng.sample(vectors, self._num_means)
70 self._cluster_vectorspace(vectors, trace)
71 meanss.append(self._means)
72
73 if len(meanss) > 1:
74
75
76 for means in meanss:
77 means.sort(cmp = _vector_compare)
78
79
80 min_difference = min_means = None
81 for i in range(len(meanss)):
82 d = 0
83 for j in range(len(meanss)):
84 if i != j:
85 d += self._sum_distances(meanss[i], meanss[j])
86 if min_difference == None or d < min_difference:
87 min_difference, min_means = d, meanss[i]
88
89
90 self._means = min_means
91
93 if self._num_means < len(vectors):
94
95 converged = False
96 while not converged:
97
98
99 clusters = [[] for m in range(self._num_means)]
100 for vector in vectors:
101 index = self.classify_vectorspace(vector)
102 clusters[index].append(vector)
103
104 if trace: print 'iteration'
105
106
107
108
109 new_means = map(self._centroid, clusters)
110
111
112 difference = self._sum_distances(self._means, new_means)
113 if difference < self._max_difference:
114 converged = True
115
116
117 self._means = new_means
118
120
121
122 best_distance = best_index = None
123 for index in range(len(self._means)):
124 mean = self._means[index]
125 dist = self._distance(vector, mean)
126 if best_distance == None or dist < best_distance:
127 best_index, best_distance = index, dist
128 return best_index
129
131 if self._means:
132 return len(self._means)
133 else:
134 return self._num_means
135
137 """
138 The means used for clustering.
139 """
140 return self._means
141
143 difference = 0.0
144 for u, v in zip(vectors1, vectors2):
145 difference += self._distance(u, v)
146 return difference
147
154
156 return '<KMeansClusterer means=%s repeats=%d>' % \
157 (self._means, self._repeats)
158
160 xs, ys = sum(x), sum(y)
161 if xs < ys: return -1
162 elif xs > ys: return 1
163 else: return 0
164
165
166
168
169
170 from nltk import cluster
171
172 vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
173 means = [[4, 3], [5, 5]]
174
175 clusterer = cluster.KMeansClusterer(2, euclidean_distance, initial_means=means)
176 clusters = clusterer.cluster(vectors, True, trace=True)
177
178 print 'Clustered:', vectors
179 print 'As:', clusters
180 print 'Means:', clusterer.means()
181 print
182
183 vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]
184
185
186
187
188 clusterer = cluster.KMeansClusterer(2, euclidean_distance, repeats=10)
189 clusters = clusterer.cluster(vectors, True)
190 print 'Clustered:', vectors
191 print 'As:', clusters
192 print 'Means:', clusterer.means()
193 print
194
195
196 vector = numpy.array([3, 3])
197 print 'classify(%s):' % vector,
198 print clusterer.classify(vector)
199 print
200
201 if __name__ == '__main__':
202 demo()
203