1
2
3
4
5
6
7
8 """
9 Tools to identify X{collocation}s --- words that often appear consecutively
10 --- within corpora. They may also be used to find other X{association}s between
11 word occurrences.
12 See Manning and Schutze ch. 5 at http://nlp.stanford.edu/fsnlp/promo/colloc.pdf
13 and the Text::NSP Perl package at http://ngram.sourceforge.net
14
15 Finding collocations requires first calculating the frequencies of words and
16 their appearance in the context of other words. Often the collection of words
17 will then requiring filtering to only retain useful content terms. Each ngram
18 of words may then be scored according to some X{association measure}, in order
19 to determine the relative likelihood of each ngram being a collocation.
20
21 The L{BigramCollocationFinder} and L{TrigramCollocationFinder} classes provide
22 these functionalities, dependent on being provided a function which scores a
23 ngram given appropriate frequency counts. A number of standard association
24 measures are provided in L{bigram_measures} and L{trigram_measures}.
25 """
26
27
28
29
30
31
32
33 import itertools as _itertools
34 from operator import itemgetter as _itemgetter
35
36 from nltk.compat import any
37 from nltk.probability import FreqDist
38 from nltk.util import ingrams
39 from nltk.metrics import ContingencyMeasures, BigramAssocMeasures, TrigramAssocMeasures
40 from nltk.metrics.spearman import *
43 """
44 An abstract base class for X{collocation finder}s whose purpose is to
45 collect collocation candidate frequencies, filter and rank them.
46 """
47
49 """As a minimum, collocation finders require the frequencies of each
50 word in a corpus, and the joint frequency of word tuples. This data
51 should be provided through L{nltk.probability.FreqDist} objects or an
52 identical interface.
53 """
54 self.word_fd = word_fd
55 self.ngram_fd = ngram_fd
56
57 @classmethod
59 """Constructs a collocation finder given a collection of documents,
60 each of which is a list (or iterable) of tokens.
61 """
62 return cls.from_words(_itertools.chain(*documents))
63
64 @staticmethod
67
69 """Generic filter removes ngrams from the frequency distribution
70 if the function returns True when passed an ngram tuple.
71 """
72 for ngram, freq in self.ngram_fd.items():
73 if fn(ngram, freq):
74 try:
75 del self.ngram_fd[ngram]
76 except KeyError:
77 pass
78
80 """Removes candidate ngrams which have frequency less than min_freq."""
81 self._apply_filter(lambda ng, freq: freq < min_freq)
82
84 """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...)
85 evaluates to True.
86 """
87 self._apply_filter(lambda ng, f: fn(*ng))
88
90 """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2),
91 ...) evaluates to True.
92 """
93 self._apply_filter(lambda ng, f: any(fn(w) for w in ng))
94
96 """Generates of (ngram, score) pairs as determined by the scoring
97 function provided.
98 """
99 for tup in self.ngram_fd:
100 score = self.score_ngram(score_fn, *tup)
101 if score is not None:
102 yield tup, score
103
105 """Returns a sequence of (ngram, score) pairs ordered from highest to
106 lowest score, as determined by the scoring function provided.
107 """
108 return sorted(self._score_ngrams(score_fn),
109 key=_itemgetter(1), reverse=True)
110
111 - def nbest(self, score_fn, n):
112 """Returns the top n ngrams when scored by the given function."""
113 return [p for p,s in self.score_ngrams(score_fn)[:n]]
114
116 """Returns a sequence of ngrams, ordered by decreasing score, whose
117 scores each exceed the given minimum score.
118 """
119 for ngram, score in self.score_ngrams(score_fn):
120 if score > min_score:
121 yield ngram
122 else:
123 break
124
127 """A tool for the finding and ranking of bigram collocations or other
128 association measures. It is often useful to use from_words() rather than
129 constructing an instance directly.
130 """
131
132 @classmethod
134 """Construct a BigramCollocationFinder for all bigrams in the given
135 sequence. By default, bigrams must be contiguous.
136 """
137 wfd = FreqDist()
138 bfd = FreqDist()
139
140 if window_size < 2:
141 raise ValueError, "Specify window_size at least 2"
142
143 for window in ingrams(words, window_size, pad_right=True):
144 w1 = window[0]
145 try:
146 window = window[:list(window).index(w1, 1)]
147 except ValueError:
148 pass
149 wfd.inc(w1)
150 for w2 in set(window[1:]):
151 if w2 is not None:
152 bfd.inc((w1, w2))
153 return cls(wfd, bfd)
154
156 """Returns the score for a given bigram using the given scoring
157 function.
158 """
159 n_all = self.word_fd.N()
160 n_ii = self.ngram_fd[(w1, w2)]
161 if not n_ii:
162 return
163 n_ix = self.word_fd[w1]
164 n_xi = self.word_fd[w2]
165 return score_fn(n_ii, (n_ix, n_xi), n_all)
166
169 """A tool for the finding and ranking of bigram collocations or other
170 association measures. It is often useful to use from_words() rather than
171 constructing an instance directly.
172 """
173
174 - def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd):
175 """Construct a TrigramCollocationFinder, given FreqDists for
176 appearances of words, bigrams, two words with any word between them,
177 and trigrams.
178 """
179 AbstractCollocationFinder.__init__(self, word_fd, trigram_fd)
180 self.wildcard_fd = wildcard_fd
181 self.bigram_fd = bigram_fd
182
183 @classmethod
185 """Construct a TrigramCollocationFinder for all trigrams in the given
186 sequence.
187 """
188 wfd = FreqDist()
189 wildfd = FreqDist()
190 bfd = FreqDist()
191 tfd = FreqDist()
192
193 for w1, w2, w3 in ingrams(words, 3, pad_right=True):
194 wfd.inc(w1)
195 if w2 is None:
196 continue
197 bfd.inc((w1, w2))
198 if w3 is None:
199 continue
200 wildfd.inc((w1, w3))
201 tfd.inc((w1, w2, w3))
202 return cls(wfd, bfd, wildfd, tfd)
203
205 """Constructs a bigram collocation finder with the bigram and unigram
206 data from this finder. Note that this does not include any filtering
207 applied to this finder.
208 """
209 return BigramCollocationFinder(self.word_fd, self.bigram_fd)
210
212 """Returns the score for a given trigram using the given scoring
213 function.
214 """
215 n_all = self.word_fd.N()
216 n_iii = self.ngram_fd[(w1, w2, w3)]
217 if not n_iii:
218 return
219 n_iix = self.bigram_fd[(w1, w2)]
220 n_ixi = self.wildcard_fd[(w1, w3)]
221 n_xii = self.bigram_fd[(w2, w3)]
222 n_ixx = self.word_fd[w1]
223 n_xix = self.word_fd[w2]
224 n_xxi = self.word_fd[w3]
225 return score_fn(n_iii,
226 (n_iix, n_ixi, n_xii),
227 (n_ixx, n_xix, n_xxi),
228 n_all)
229
230
231 -def demo(scorer=None, compare_scorer=None):
259
260
261
262
263
264 if __name__ == '__main__':
265 import sys
266 from nltk.metrics import BigramAssocMeasures
267 try:
268 scorer = eval('BigramAssocMeasures.' + sys.argv[1])
269 except IndexError:
270 scorer = None
271 try:
272 compare_scorer = eval('BigramAssocMeasures.' + sys.argv[2])
273 except IndexError:
274 compare_scorer = None
275
276 demo(scorer, compare_scorer)
277
278
279 __all__ = ['BigramCollocationFinder', 'TrigramCollocationFinder']
280