Package nltk :: Package cluster :: Module gaac
[hide private]
[frames] | no frames]

Source Code for Module nltk.cluster.gaac

  1  # Natural Language Toolkit: Group Average Agglomerative Clusterer 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Trevor Cohn <tacohn@cs.mu.oz.au> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  import numpy 
  9   
 10  from api import * 
 11  from util import * 
 12   
13 -class GAAClusterer(VectorSpaceClusterer):
14 """ 15 The Group Average Agglomerative starts with each of the N vectors as singleton 16 clusters. It then iteratively merges pairs of clusters which have the 17 closest centroids. This continues until there is only one cluster. The 18 order of merges gives rise to a dendrogram: a tree with the earlier merges 19 lower than later merges. The membership of a given number of clusters c, 1 20 <= c <= N, can be found by cutting the dendrogram at depth c. 21 22 This clusterer uses the cosine similarity metric only, which allows for 23 efficient speed-up in the clustering process. 24 """ 25
26 - def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
27 VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) 28 self._num_clusters = num_clusters 29 self._dendrogram = None 30 self._groups_values = None
31
32 - def cluster(self, vectors, assign_clusters=False, trace=False):
33 # stores the merge order 34 self._dendrogram = Dendrogram( 35 [numpy.array(vector, numpy.float64) for vector in vectors]) 36 return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace)
37
38 - def cluster_vectorspace(self, vectors, trace=False):
39 # create a cluster for each vector 40 clusters = [[vector] for vector in vectors] 41 42 # the sum vectors 43 vector_sum = copy.copy(vectors) 44 45 while len(clusters) > max(self._num_clusters, 1): 46 # find the two best candidate clusters to merge, based on their 47 # S(union c_i, c_j) 48 best = None 49 for i in range(len(clusters)): 50 for j in range(i + 1, len(clusters)): 51 sim = self._average_similarity( 52 vector_sum[i], len(clusters[i]), 53 vector_sum[j], len(clusters[j])) 54 if not best or sim > best[0]: 55 best = (sim, i, j) 56 57 # merge them and replace in cluster list 58 i, j = best[1:] 59 sum = clusters[i] + clusters[j] 60 if trace: print 'merging %d and %d' % (i, j) 61 62 clusters[i] = sum 63 del clusters[j] 64 vector_sum[i] = vector_sum[i] + vector_sum[j] 65 del vector_sum[j] 66 67 self._dendrogram.merge(i, j) 68 69 self.update_clusters(self._num_clusters)
70
71 - def update_clusters(self, num_clusters):
72 clusters = self._dendrogram.groups(num_clusters) 73 self._centroids = [] 74 for cluster in clusters: 75 assert len(cluster) > 0 76 if self._should_normalise: 77 centroid = self._normalise(cluster[0]) 78 else: 79 centroid = numpy.array(cluster[0]) 80 for vector in cluster[1:]: 81 if self._should_normalise: 82 centroid += self._normalise(vector) 83 else: 84 centroid += vector 85 centroid /= float(len(cluster)) 86 self._centroids.append(centroid) 87 self._num_clusters = len(self._centroids)
88
89 - def classify_vectorspace(self, vector):
90 best = None 91 for i in range(self._num_clusters): 92 centroid = self._centroids[i] 93 sim = self._average_similarity(vector, 1, centroid, 1) 94 if not best or sim > best[0]: 95 best = (sim, i) 96 return best[1]
97
98 - def dendrogram(self):
99 """ 100 @return: The dendrogram representing the current clustering 101 @rtype: Dendrogram 102 """ 103 return self._dendrogram
104
105 - def num_clusters(self):
106 return self._num_clusters
107
108 - def _average_similarity(self, v1, l1, v2, l2):
109 sum = v1 + v2 110 length = l1 + l2 111 return (numpy.dot(sum, sum) - length) / (length * (length - 1))
112
113 - def __repr__(self):
114 return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
115
116 -def demo():
117 """ 118 Non-interactive demonstration of the clusterers with simple 2-D data. 119 """ 120 121 from nltk import cluster 122 123 # use a set of tokens with 2D indices 124 vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] 125 126 # test the GAAC clusterer with 4 clusters 127 clusterer = cluster.GAAClusterer(4) 128 clusters = clusterer.cluster(vectors, True) 129 130 print 'Clusterer:', clusterer 131 print 'Clustered:', vectors 132 print 'As:', clusters 133 print 134 135 # show the dendrogram 136 clusterer.dendrogram().show() 137 138 # classify a new vector 139 vector = numpy.array([3, 3]) 140 print 'classify(%s):' % vector, 141 print clusterer.classify(vector) 142 print
143 144 145 if __name__ == '__main__': 146 demo() 147