Package nltk :: Package cluster :: Module util
[hide private]
[frames] | no frames]

Source Code for Module nltk.cluster.util

  1  # Natural Language Toolkit: Clusterer Utilities 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Trevor Cohn <tacohn@cs.mu.oz.au> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  import copy 
  9  import sys 
 10  import math 
 11  import numpy 
 12   
 13  from api import * 
 14   
15 -class VectorSpaceClusterer(ClusterI):
16 """ 17 Abstract clusterer which takes tokens and maps them into a vector space. 18 Optionally performs singular value decomposition to reduce the 19 dimensionality. 20 """
21 - def __init__(self, normalise=False, svd_dimensions=None):
22 """ 23 @param normalise: should vectors be normalised to length 1 24 @type normalise: boolean 25 @param svd_dimensions: number of dimensions to use in reducing vector 26 dimensionsionality with SVD 27 @type svd_dimensions: int 28 """ 29 self._Tt = None 30 self._should_normalise = normalise 31 self._svd_dimensions = svd_dimensions
32
33 - def cluster(self, vectors, assign_clusters=False, trace=False):
34 assert len(vectors) > 0 35 36 # normalise the vectors 37 if self._should_normalise: 38 vectors = map(self._normalise, vectors) 39 40 # use SVD to reduce the dimensionality 41 if self._svd_dimensions and self._svd_dimensions < len(vectors[0]): 42 [u, d, vt] = numpy.linalg.svd(numpy.transpose(array(vectors))) 43 S = d[:self._svd_dimensions] * \ 44 numpy.identity(self._svd_dimensions, numpy.Float64) 45 T = u[:,:self._svd_dimensions] 46 Dt = vt[:self._svd_dimensions,:] 47 vectors = numpy.transpose(numpy.matrixmultiply(S, Dt)) 48 self._Tt = numpy.transpose(T) 49 50 # call abstract method to cluster the vectors 51 self.cluster_vectorspace(vectors, trace) 52 53 # assign the vectors to clusters 54 if assign_clusters: 55 print self._Tt, vectors 56 return [self.classify(vector) for vector in vectors]
57
58 - def cluster_vectorspace(self, vectors, trace):
59 """ 60 Finds the clusters using the given set of vectors. 61 """ 62 raise AssertionError()
63
64 - def classify(self, vector):
65 if self._should_normalise: 66 vector = self._normalise(vector) 67 if self._Tt != None: 68 vector = numpy.matrixmultiply(self._Tt, vector) 69 cluster = self.classify_vectorspace(vector) 70 return self.cluster_name(cluster)
71
72 - def classify_vectorspace(self, vector):
73 """ 74 Returns the index of the appropriate cluster for the vector. 75 """ 76 raise AssertionError()
77
78 - def likelihood(self, vector, label):
79 if self._should_normalise: 80 vector = self._normalise(vector) 81 if self._Tt != None: 82 vector = numpy.matrixmultiply(self._Tt, vector) 83 return self.likelihood_vectorspace(vector, label)
84
85 - def likelihood_vectorspace(self, vector, cluster):
86 """ 87 Returns the likelihood of the vector belonging to the cluster. 88 """ 89 predicted = self.classify_vectorspace(vector) 90 if cluster == predicted: return 1.0 91 else: return 0.0
92
93 - def vector(self, vector):
94 """ 95 Returns the vector after normalisation and dimensionality reduction 96 """ 97 if self._should_normalise: 98 vector = self._normalise(vector) 99 if self._Tt != None: 100 vector = numpy.matrixmultiply(self._Tt, vector) 101 return vector
102
103 - def _normalise(self, vector):
104 """ 105 Normalises the vector to unit length. 106 """ 107 return vector / math.sqrt(numpy.dot(vector, vector))
108
109 -def euclidean_distance(u, v):
110 """ 111 Returns the euclidean distance between vectors u and v. This is equivalent 112 to the length of the vector (u - v). 113 """ 114 diff = u - v 115 return math.sqrt(numpy.dot(diff, diff))
116
117 -def cosine_distance(u, v):
118 """ 119 Returns the cosine of the angle between vectors v and u. This is equal to 120 u.v / |u||v|. 121 """ 122 return numpy.dot(u, v) / (math.sqrt(numpy.dot(u, u)) * math.sqrt(numpy.dot(v, v)))
123
124 -class _DendrogramNode(object):
125 """ Tree node of a dendrogram. """ 126
127 - def __init__(self, value, *children):
128 self._value = value 129 self._children = children
130
131 - def leaves(self, values=True):
132 if self._children: 133 leaves = [] 134 for child in self._children: 135 leaves.extend(child.leaves(values)) 136 return leaves 137 elif values: 138 return [self._value] 139 else: 140 return [self]
141
142 - def groups(self, n):
143 queue = [(self._value, self)] 144 145 while len(queue) < n: 146 priority, node = queue.pop() 147 if not node._children: 148 queue.push((priority, node)) 149 break 150 for child in node._children: 151 if child._children: 152 queue.append((child._value, child)) 153 else: 154 queue.append((0, child)) 155 # makes the earliest merges at the start, latest at the end 156 queue.sort() 157 158 groups = [] 159 for priority, node in queue: 160 groups.append(node.leaves()) 161 return groups
162
163 -class Dendrogram(object):
164 """ 165 Represents a dendrogram, a tree with a specified branching order. This 166 must be initialised with the leaf items, then iteratively call merge for 167 each branch. This class constructs a tree representing the order of calls 168 to the merge function. 169 """ 170
171 - def __init__(self, items=[]):
172 """ 173 @param items: the items at the leaves of the dendrogram 174 @type items: sequence of (any) 175 """ 176 self._items = [_DendrogramNode(item) for item in items] 177 self._original_items = copy.copy(self._items) 178 self._merge = 1
179
180 - def merge(self, *indices):
181 """ 182 Merges nodes at given indices in the dendrogram. The nodes will be 183 combined which then replaces the first node specified. All other nodes 184 involved in the merge will be removed. 185 186 @param indices: indices of the items to merge (at least two) 187 @type indices: seq of int 188 """ 189 assert len(indices) >= 2 190 node = _DendrogramNode(self._merge, *[self._items[i] for i in indices]) 191 self._merge += 1 192 self._items[indices[0]] = node 193 for i in indices[1:]: 194 del self._items[i]
195
196 - def groups(self, n):
197 """ 198 Finds the n-groups of items (leaves) reachable from a cut at depth n. 199 @param n: number of groups 200 @type n: int 201 """ 202 if len(self._items) > 1: 203 root = _DendrogramNode(self._merge, *self._items) 204 else: 205 root = self._items[0] 206 return root.groups(n)
207
208 - def show(self, leaf_labels=[]):
209 """ 210 Print the dendrogram in ASCII art to standard out. 211 @param leaf_labels: an optional list of strings to use for labeling the leaves 212 @type leaf_labels: list 213 """ 214 215 # ASCII rendering characters 216 JOIN, HLINK, VLINK = '+', '-', '|' 217 218 # find the root (or create one) 219 if len(self._items) > 1: 220 root = _DendrogramNode(self._merge, *self._items) 221 else: 222 root = self._items[0] 223 leaves = self._original_items 224 225 if leaf_labels: 226 last_row = leaf_labels 227 else: 228 last_row = [str(leaf._value) for leaf in leaves] 229 230 # find the bottom row and the best cell width 231 width = max(map(len, last_row)) + 1 232 lhalf = width / 2 233 rhalf = width - lhalf - 1 234 235 # display functions 236 def format(centre, left=' ', right=' '): 237 return '%s%s%s' % (lhalf*left, centre, right*rhalf)
238 def display(str): 239 sys.stdout.write(str)
240 241 # for each merge, top down 242 queue = [(root._value, root)] 243 verticals = [ format(' ') for leaf in leaves ] 244 while queue: 245 priority, node = queue.pop() 246 child_left_leaf = map(lambda c: c.leaves(False)[0], node._children) 247 indices = map(leaves.index, child_left_leaf) 248 if child_left_leaf: 249 min_idx = min(indices) 250 max_idx = max(indices) 251 for i in range(len(leaves)): 252 if leaves[i] in child_left_leaf: 253 if i == min_idx: display(format(JOIN, ' ', HLINK)) 254 elif i == max_idx: display(format(JOIN, HLINK, ' ')) 255 else: display(format(JOIN, HLINK, HLINK)) 256 verticals[i] = format(VLINK) 257 elif min_idx <= i <= max_idx: 258 display(format(HLINK, HLINK, HLINK)) 259 else: 260 display(verticals[i]) 261 display('\n') 262 for child in node._children: 263 if child._children: 264 queue.append((child._value, child)) 265 queue.sort() 266 267 for vertical in verticals: 268 display(vertical) 269 display('\n') 270 271 # finally, display the last line 272 display(''.join(item.center(width) for item in last_row)) 273 display('\n') 274
275 - def __repr__(self):
276 if len(self._items) > 1: 277 root = _DendrogramNode(self._merge, *self._items) 278 else: 279 root = self._items[0] 280 leaves = root.leaves(False) 281 return '<Dendrogram with %d leaves>' % len(leaves)
282