1
2
3
4
5
6
7
8 import copy
9 import sys
10 import math
11 import numpy
12
13 from api import *
14
16 """
17 Abstract clusterer which takes tokens and maps them into a vector space.
18 Optionally performs singular value decomposition to reduce the
19 dimensionality.
20 """
21 - def __init__(self, normalise=False, svd_dimensions=None):
22 """
23 @param normalise: should vectors be normalised to length 1
24 @type normalise: boolean
25 @param svd_dimensions: number of dimensions to use in reducing vector
26 dimensionsionality with SVD
27 @type svd_dimensions: int
28 """
29 self._Tt = None
30 self._should_normalise = normalise
31 self._svd_dimensions = svd_dimensions
32
33 - def cluster(self, vectors, assign_clusters=False, trace=False):
34 assert len(vectors) > 0
35
36
37 if self._should_normalise:
38 vectors = map(self._normalise, vectors)
39
40
41 if self._svd_dimensions and self._svd_dimensions < len(vectors[0]):
42 [u, d, vt] = numpy.linalg.svd(numpy.transpose(array(vectors)))
43 S = d[:self._svd_dimensions] * \
44 numpy.identity(self._svd_dimensions, numpy.Float64)
45 T = u[:,:self._svd_dimensions]
46 Dt = vt[:self._svd_dimensions,:]
47 vectors = numpy.transpose(numpy.matrixmultiply(S, Dt))
48 self._Tt = numpy.transpose(T)
49
50
51 self.cluster_vectorspace(vectors, trace)
52
53
54 if assign_clusters:
55 print self._Tt, vectors
56 return [self.classify(vector) for vector in vectors]
57
59 """
60 Finds the clusters using the given set of vectors.
61 """
62 raise AssertionError()
63
71
73 """
74 Returns the index of the appropriate cluster for the vector.
75 """
76 raise AssertionError()
77
84
86 """
87 Returns the likelihood of the vector belonging to the cluster.
88 """
89 predicted = self.classify_vectorspace(vector)
90 if cluster == predicted: return 1.0
91 else: return 0.0
92
94 """
95 Returns the vector after normalisation and dimensionality reduction
96 """
97 if self._should_normalise:
98 vector = self._normalise(vector)
99 if self._Tt != None:
100 vector = numpy.matrixmultiply(self._Tt, vector)
101 return vector
102
108
110 """
111 Returns the euclidean distance between vectors u and v. This is equivalent
112 to the length of the vector (u - v).
113 """
114 diff = u - v
115 return math.sqrt(numpy.dot(diff, diff))
116
118 """
119 Returns the cosine of the angle between vectors v and u. This is equal to
120 u.v / |u||v|.
121 """
122 return numpy.dot(u, v) / (math.sqrt(numpy.dot(u, u)) * math.sqrt(numpy.dot(v, v)))
123
125 """ Tree node of a dendrogram. """
126
128 self._value = value
129 self._children = children
130
131 - def leaves(self, values=True):
141
162
164 """
165 Represents a dendrogram, a tree with a specified branching order. This
166 must be initialised with the leaf items, then iteratively call merge for
167 each branch. This class constructs a tree representing the order of calls
168 to the merge function.
169 """
170
172 """
173 @param items: the items at the leaves of the dendrogram
174 @type items: sequence of (any)
175 """
176 self._items = [_DendrogramNode(item) for item in items]
177 self._original_items = copy.copy(self._items)
178 self._merge = 1
179
180 - def merge(self, *indices):
181 """
182 Merges nodes at given indices in the dendrogram. The nodes will be
183 combined which then replaces the first node specified. All other nodes
184 involved in the merge will be removed.
185
186 @param indices: indices of the items to merge (at least two)
187 @type indices: seq of int
188 """
189 assert len(indices) >= 2
190 node = _DendrogramNode(self._merge, *[self._items[i] for i in indices])
191 self._merge += 1
192 self._items[indices[0]] = node
193 for i in indices[1:]:
194 del self._items[i]
195
197 """
198 Finds the n-groups of items (leaves) reachable from a cut at depth n.
199 @param n: number of groups
200 @type n: int
201 """
202 if len(self._items) > 1:
203 root = _DendrogramNode(self._merge, *self._items)
204 else:
205 root = self._items[0]
206 return root.groups(n)
207
208 - def show(self, leaf_labels=[]):
209 """
210 Print the dendrogram in ASCII art to standard out.
211 @param leaf_labels: an optional list of strings to use for labeling the leaves
212 @type leaf_labels: list
213 """
214
215
216 JOIN, HLINK, VLINK = '+', '-', '|'
217
218
219 if len(self._items) > 1:
220 root = _DendrogramNode(self._merge, *self._items)
221 else:
222 root = self._items[0]
223 leaves = self._original_items
224
225 if leaf_labels:
226 last_row = leaf_labels
227 else:
228 last_row = [str(leaf._value) for leaf in leaves]
229
230
231 width = max(map(len, last_row)) + 1
232 lhalf = width / 2
233 rhalf = width - lhalf - 1
234
235
236 def format(centre, left=' ', right=' '):
237 return '%s%s%s' % (lhalf*left, centre, right*rhalf)
238 def display(str):
239 sys.stdout.write(str)
240
241
242 queue = [(root._value, root)]
243 verticals = [ format(' ') for leaf in leaves ]
244 while queue:
245 priority, node = queue.pop()
246 child_left_leaf = map(lambda c: c.leaves(False)[0], node._children)
247 indices = map(leaves.index, child_left_leaf)
248 if child_left_leaf:
249 min_idx = min(indices)
250 max_idx = max(indices)
251 for i in range(len(leaves)):
252 if leaves[i] in child_left_leaf:
253 if i == min_idx: display(format(JOIN, ' ', HLINK))
254 elif i == max_idx: display(format(JOIN, HLINK, ' '))
255 else: display(format(JOIN, HLINK, HLINK))
256 verticals[i] = format(VLINK)
257 elif min_idx <= i <= max_idx:
258 display(format(HLINK, HLINK, HLINK))
259 else:
260 display(verticals[i])
261 display('\n')
262 for child in node._children:
263 if child._children:
264 queue.append((child._value, child))
265 queue.sort()
266
267 for vertical in verticals:
268 display(vertical)
269 display('\n')
270
271
272 display(''.join(item.center(width) for item in last_row))
273 display('\n')
274
282