Package nltk :: Module collocations
[hide private]
[frames] | no frames]

Source Code for Module nltk.collocations

  1  # Natural Language Toolkit: Collocations and Association Measures  
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Joel Nothman <jnothman@student.usyd.edu.au> 
  5  # URL: <http://nltk.org> 
  6  # For license information, see LICENSE.TXT 
  7  # 
  8  """ 
  9  Tools to identify X{collocation}s --- words that often appear consecutively 
 10  --- within corpora. They may also be used to find other X{association}s between 
 11  word occurrences. 
 12  See Manning and Schutze ch. 5 at http://nlp.stanford.edu/fsnlp/promo/colloc.pdf 
 13  and the Text::NSP Perl package at http://ngram.sourceforge.net 
 14   
 15  Finding collocations requires first calculating the frequencies of words and 
 16  their appearance in the context of other words. Often the collection of words 
 17  will then requiring filtering to only retain useful content terms. Each ngram 
 18  of words may then be scored according to some X{association measure}, in order 
 19  to determine the relative likelihood of each ngram being a collocation. 
 20   
 21  The L{BigramCollocationFinder} and L{TrigramCollocationFinder} classes provide 
 22  these functionalities, dependent on being provided a function which scores a 
 23  ngram given appropriate frequency counts. A number of standard association 
 24  measures are provided in L{bigram_measures} and L{trigram_measures}. 
 25  """ 
 26   
 27  # Possible TODOs: 
 28  # - consider the distinction between f(x,_) and f(x) and whether our 
 29  #   approximation is good enough for fragmented data, and mention it 
 30  # - add a n-gram collocation finder with measures which only utilise n-gram 
 31  #   and unigram counts (raw_freq, pmi, student_t) 
 32   
 33  import itertools as _itertools 
 34  from operator import itemgetter as _itemgetter 
 35   
 36  from nltk.compat import any 
 37  from nltk.probability import FreqDist 
 38  from nltk.util import ingrams 
 39  from nltk.metrics import ContingencyMeasures, BigramAssocMeasures, TrigramAssocMeasures 
 40  from nltk.metrics.spearman import * 
41 42 -class AbstractCollocationFinder(object):
43 """ 44 An abstract base class for X{collocation finder}s whose purpose is to 45 collect collocation candidate frequencies, filter and rank them. 46 """ 47
48 - def __init__(self, word_fd, ngram_fd):
49 """As a minimum, collocation finders require the frequencies of each 50 word in a corpus, and the joint frequency of word tuples. This data 51 should be provided through L{nltk.probability.FreqDist} objects or an 52 identical interface. 53 """ 54 self.word_fd = word_fd 55 self.ngram_fd = ngram_fd
56 57 @classmethod
58 - def from_documents(cls, documents):
59 """Constructs a collocation finder given a collection of documents, 60 each of which is a list (or iterable) of tokens. 61 """ 62 return cls.from_words(_itertools.chain(*documents))
63 64 @staticmethod
65 - def _ngram_freqdist(words, n):
66 return FreqDist(tuple(words[i:i+n]) for i in range(len(words)-1))
67
68 - def _apply_filter(self, fn=lambda ngram, freq: False):
69 """Generic filter removes ngrams from the frequency distribution 70 if the function returns True when passed an ngram tuple. 71 """ 72 for ngram, freq in self.ngram_fd.items(): 73 if fn(ngram, freq): 74 try: 75 del self.ngram_fd[ngram] 76 except KeyError: 77 pass
78
79 - def apply_freq_filter(self, min_freq):
80 """Removes candidate ngrams which have frequency less than min_freq.""" 81 self._apply_filter(lambda ng, freq: freq < min_freq)
82
83 - def apply_ngram_filter(self, fn):
84 """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...) 85 evaluates to True. 86 """ 87 self._apply_filter(lambda ng, f: fn(*ng))
88
89 - def apply_word_filter(self, fn):
90 """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2), 91 ...) evaluates to True. 92 """ 93 self._apply_filter(lambda ng, f: any(fn(w) for w in ng))
94
95 - def _score_ngrams(self, score_fn):
96 """Generates of (ngram, score) pairs as determined by the scoring 97 function provided. 98 """ 99 for tup in self.ngram_fd: 100 score = self.score_ngram(score_fn, *tup) 101 if score is not None: 102 yield tup, score
103
104 - def score_ngrams(self, score_fn):
105 """Returns a sequence of (ngram, score) pairs ordered from highest to 106 lowest score, as determined by the scoring function provided. 107 """ 108 return sorted(self._score_ngrams(score_fn), 109 key=_itemgetter(1), reverse=True)
110
111 - def nbest(self, score_fn, n):
112 """Returns the top n ngrams when scored by the given function.""" 113 return [p for p,s in self.score_ngrams(score_fn)[:n]]
114
115 - def above_score(self, score_fn, min_score):
116 """Returns a sequence of ngrams, ordered by decreasing score, whose 117 scores each exceed the given minimum score. 118 """ 119 for ngram, score in self.score_ngrams(score_fn): 120 if score > min_score: 121 yield ngram 122 else: 123 break
124
125 126 -class BigramCollocationFinder(AbstractCollocationFinder):
127 """A tool for the finding and ranking of bigram collocations or other 128 association measures. It is often useful to use from_words() rather than 129 constructing an instance directly. 130 """ 131 132 @classmethod
133 - def from_words(cls, words, window_size=2):
134 """Construct a BigramCollocationFinder for all bigrams in the given 135 sequence. By default, bigrams must be contiguous. 136 """ 137 wfd = FreqDist() 138 bfd = FreqDist() 139 140 if window_size < 2: 141 raise ValueError, "Specify window_size at least 2" 142 143 for window in ingrams(words, window_size, pad_right=True): 144 w1 = window[0] 145 try: 146 window = window[:list(window).index(w1, 1)] 147 except ValueError: 148 pass 149 wfd.inc(w1) 150 for w2 in set(window[1:]): 151 if w2 is not None: 152 bfd.inc((w1, w2)) 153 return cls(wfd, bfd)
154
155 - def score_ngram(self, score_fn, w1, w2):
156 """Returns the score for a given bigram using the given scoring 157 function. 158 """ 159 n_all = self.word_fd.N() 160 n_ii = self.ngram_fd[(w1, w2)] 161 if not n_ii: 162 return 163 n_ix = self.word_fd[w1] 164 n_xi = self.word_fd[w2] 165 return score_fn(n_ii, (n_ix, n_xi), n_all)
166
167 168 -class TrigramCollocationFinder(AbstractCollocationFinder):
169 """A tool for the finding and ranking of bigram collocations or other 170 association measures. It is often useful to use from_words() rather than 171 constructing an instance directly. 172 """ 173
174 - def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
175 """Construct a TrigramCollocationFinder, given FreqDists for 176 appearances of words, bigrams, two words with any word between them, 177 and trigrams. 178 """ 179 AbstractCollocationFinder.__init__(self, word_fd, trigram_fd) 180 self.wildcard_fd = wildcard_fd 181 self.bigram_fd = bigram_fd
182 183 @classmethod
184 - def from_words(cls, words):
185 """Construct a TrigramCollocationFinder for all trigrams in the given 186 sequence. 187 """ 188 wfd = FreqDist() 189 wildfd = FreqDist() 190 bfd = FreqDist() 191 tfd = FreqDist() 192 193 for w1, w2, w3 in ingrams(words, 3, pad_right=True): 194 wfd.inc(w1) 195 if w2 is None: 196 continue 197 bfd.inc((w1, w2)) 198 if w3 is None: 199 continue 200 wildfd.inc((w1, w3)) 201 tfd.inc((w1, w2, w3)) 202 return cls(wfd, bfd, wildfd, tfd)
203
204 - def bigram_finder(self):
205 """Constructs a bigram collocation finder with the bigram and unigram 206 data from this finder. Note that this does not include any filtering 207 applied to this finder. 208 """ 209 return BigramCollocationFinder(self.word_fd, self.bigram_fd)
210
211 - def score_ngram(self, score_fn, w1, w2, w3):
212 """Returns the score for a given trigram using the given scoring 213 function. 214 """ 215 n_all = self.word_fd.N() 216 n_iii = self.ngram_fd[(w1, w2, w3)] 217 if not n_iii: 218 return 219 n_iix = self.bigram_fd[(w1, w2)] 220 n_ixi = self.wildcard_fd[(w1, w3)] 221 n_xii = self.bigram_fd[(w2, w3)] 222 n_ixx = self.word_fd[w1] 223 n_xix = self.word_fd[w2] 224 n_xxi = self.word_fd[w3] 225 return score_fn(n_iii, 226 (n_iix, n_ixi, n_xii), 227 (n_ixx, n_xix, n_xxi), 228 n_all)
229
230 231 -def demo(scorer=None, compare_scorer=None):
232 """Finds trigram collocations in the files of the WebText corpus.""" 233 from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores 234 235 if scorer is None: 236 scorer = BigramAssocMeasures.likelihood_ratio 237 if compare_scorer is None: 238 compare_scorer = BigramAssocMeasures.raw_freq 239 240 from nltk import corpus 241 242 ignored_words = corpus.stopwords.words('english') 243 word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words 244 245 for file in corpus.webtext.files(): 246 words = [word.lower() 247 for word in corpus.webtext.words(file)] 248 249 cf = BigramCollocationFinder.from_words(words) 250 cf.apply_freq_filter(3) 251 cf.apply_word_filter(word_filter) 252 253 print file 254 print '\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)] 255 print '\t Correlation to %s: %0.4f' % (compare_scorer.__name__, 256 spearman_correlation( 257 ranks_from_scores(cf.score_ngrams(scorer)), 258 ranks_from_scores(cf.score_ngrams(compare_scorer))))
259 260 # Slows down loading too much 261 # bigram_measures = BigramAssocMeasures() 262 # trigram_measures = TrigramAssocMeasures() 263 264 if __name__ == '__main__': 265 import sys 266 from nltk.metrics import BigramAssocMeasures 267 try: 268 scorer = eval('BigramAssocMeasures.' + sys.argv[1]) 269 except IndexError: 270 scorer = None 271 try: 272 compare_scorer = eval('BigramAssocMeasures.' + sys.argv[2]) 273 except IndexError: 274 compare_scorer = None 275 276 demo(scorer, compare_scorer) 277 278 279 __all__ = ['BigramCollocationFinder', 'TrigramCollocationFinder'] 280