1
2
3
4
5
6
7
8 import numpy
9
10 from api import *
11 from util import *
12
14 """
The Group Average Agglomerative clusterer starts with each of the N vectors
as a singleton cluster. It then iteratively merges the pair of clusters which
have the closest centroids. This continues until there is only one cluster.
The order of merges gives rise to a dendrogram: a tree with the earlier merges
lower than later merges. The membership of a given number of clusters c, 1
<= c <= N, can be found by cutting the dendrogram at depth c.
21
22 This clusterer uses the cosine similarity metric only, which allows for
23 efficient speed-up in the clustering process.
24 """
25
26 - def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
31
32 - def cluster(self, vectors, assign_clusters=False, trace=False):
37
39
# Start with every vector in its own singleton cluster.
clusters = [[vector] for vector in vectors]

# Running sum of the member vectors of each cluster, kept index-aligned with
# `clusters`. It is copied into a new list because entries are overwritten
# and deleted below (`del` needs a real list), and the caller's `vectors`
# container must be left untouched.
vector_sum = list(vectors)

# Merge the most similar pair until the requested number of clusters (never
# fewer than one) remains.
while len(clusters) > max(self._num_clusters, 1):
    # Score every unordered pair (i < j) and remember the best one. The
    # comparison is strict, so on ties the first pair found is kept.
    best = None
    for i in range(len(clusters)):
        for j in range(i + 1, len(clusters)):
            sim = self._average_similarity(
                vector_sum[i], len(clusters[i]),
                vector_sum[j], len(clusters[j]))
            if best is None or sim > best[0]:
                best = (sim, i, j)

    # Fold cluster j into cluster i. Because i < j, deleting index j leaves
    # index i pointing at the merged cluster.
    i, j = best[1:]
    merged = clusters[i] + clusters[j]
    if trace:
        # Single parenthesised argument: same output on Python 2 and 3.
        print('merging %d and %d' % (i, j))

    clusters[i] = merged
    del clusters[j]
    vector_sum[i] = vector_sum[i] + vector_sum[j]
    del vector_sum[j]

    # Record the merge in the dendrogram.
    self._dendrogram.merge(i, j)

self.update_clusters(self._num_clusters)
70
88
90 best = None
91 for i in range(self._num_clusters):
92 centroid = self._centroids[i]
93 sim = self._average_similarity(vector, 1, centroid, 1)
94 if not best or sim > best[0]:
95 best = (sim, i)
96 return best[1]
97
99 """
100 @return: The dendrogram representing the current clustering
101 @rtype: Dendrogram
102 """
103 return self._dendrogram
104
106 return self._num_clusters
107
112
114 return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
115
117 """
118 Non-interactive demonstration of the clusterers with simple 2-D data.
119 """
120
from nltk import cluster

# Six sample points in the plane, one numpy vector per point.
vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]]

# Build a clusterer that stops at four clusters and run it on the sample.
# NOTE(review): the second positional argument is presumably
# `assign_clusters`; confirm against the clusterer's `cluster` signature.
clusterer = cluster.GAAClusterer(4)
clusters = clusterer.cluster(vectors, True)

# Each print takes one pre-formatted string so the statement parses, and
# prints the same text, on both Python 2 and Python 3.
print('Clusterer: %s' % (clusterer,))
print('Clustered: %s' % (vectors,))
print('As: %s' % (clusters,))
print('')

# Display the dendrogram built during clustering.
clusterer.dendrogram().show()

# Classify a single new vector and report the chosen cluster.
vector = numpy.array([3, 3])
print('classify(%s): %s' % (vector, clusterer.classify(vector)))
print('')
143
144
# Run the demonstration only when this file is executed as a script.
if __name__ == "__main__":
    demo()
147