1
2
3
4
5
6
7
8
9 """
10 Utility functions and classes for classifiers.
11 """
12 import math
13
14
15 import nltk.classify.util
16 from nltk.util import LazyMap
17
18
19
20
21
22
23
24
25
27 """
28 Use the L{LazyMap} class to construct a lazy list-like
29 object that is analogous to C{map(feature_func, toks)}. In
30 particular, if C{labeled=False}, then the returned list-like
31 object's values are equal to::
32
33 [feature_func(tok) for tok in toks]
34
35 If C{labeled=True}, then the returned list-like object's values
36 are equal to::
37
38 [(feature_func(tok), label) for (tok, label) in toks]
39
40 The primary purpose of this function is to avoid the memory
41 overhead involved in storing all the featuresets for every token
42 in a corpus. Instead, these featuresets are constructed lazily,
43 as-needed. The reduction in memory overhead can be especially
44 significant when the underlying list of tokens is itself lazy (as
45 is the case with many corpus readers).
46
47 @param feature_func: The function that will be applied to each
48 token. It should return a featureset -- i.e., a C{dict}
49 mapping feature names to feature values.
@param toks: The list of tokens to which C{feature_func} should be
applied. If C{labeled=False}, then the list elements will be
passed directly to C{feature_func()}. If C{labeled=True},
then the list elements should be tuples C{(tok,label)}, and
C{tok} will be passed to C{feature_func()}.
55 @param labeled: If true, then C{toks} contains labeled tokens --
56 i.e., tuples of the form C{(tok, label)}. (Default:
57 auto-detect based on types.)
58 """
59 if labeled is None:
60 labeled = toks and isinstance(toks[0], (tuple, list))
61 if labeled:
62 def lazy_func(labeled_token):
63 return (feature_func(labeled_token[0]), labeled_token[1])
64 return LazyMap(lazy_func, toks)
65 else:
66 return LazyMap(feature_func, toks)
67
69 """
@return: A tuple of all distinct labels that are attested in the
given list of tokens.
@rtype: C{tuple} of (immutable)
73 @param tokens: The list of classified tokens from which to extract
74 labels. A classified token has the form C{(token, label)}.
75 @type tokens: C{list}
76 """
77 return tuple(set([label for (tok,label) in tokens]))
78
83
91
93 """
94 A helper class that implements cutoff checks based on number of
95 iterations and log likelihood.
96
97 Accuracy cutoffs are also implemented, but they're almost never
98 a good idea to use.
99 """
101 self.cutoffs = cutoffs.copy()
102 if 'min_ll' in cutoffs:
103 cutoffs['min_ll'] = -abs(cutoffs['min_ll'])
104 if 'min_lldelta' in cutoffs:
105 cutoffs['min_lldelta'] = abs(cutoffs['min_lldelta'])
106 self.ll = None
107 self.acc = None
108 self.iter = 1
109
def check(self, classifier, train_toks):
    """
    Decide whether any of the configured training cutoffs has been
    reached.

    Every call increments the iteration counter C{self.iter}.  The
    cutoffs are read from C{self.cutoffs}; the recognised keys are
    C{'max_iter'}, C{'min_ll'}, C{'min_lldelta'}, C{'max_acc'} and
    C{'min_accdelta'}.  Keys that are absent are simply not checked.

    @param classifier: The classifier to evaluate.  It is passed
        unchanged to C{nltk.classify.util.log_likelihood} and, when an
        accuracy cutoff is configured, to
        C{nltk.classify.util.accuracy}.
    @param train_toks: The training tokens, passed unchanged to the
        same evaluation functions.
    @return: C{True} if a cutoff has been reached or the log
        likelihood is NaN; C{False} otherwise.
    @rtype: C{bool}
    """
    cutoffs = self.cutoffs
    self.iter += 1
    if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
        return True  # iteration cutoff

    new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
    if math.isnan(new_ll):
        return True  # the likelihood is undefined; stop

    if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
        # The sign of 'min_ll' is normalised here (as the deltas already
        # are with abs()), so the comparison does not rely on the stored
        # cutoffs having been normalised beforehand.
        if 'min_ll' in cutoffs and new_ll >= -abs(cutoffs['min_ll']):
            return True  # log likelihood cutoff
        # Compare against None explicitly: a previous value of exactly
        # 0.0 is a valid measurement, not "no previous value".
        if ('min_lldelta' in cutoffs and self.ll is not None and
                (new_ll - self.ll) <= abs(cutoffs['min_lldelta'])):
            return True  # log likelihood delta cutoff
        self.ll = new_ll

    if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
        # Accuracy cutoffs must be evaluated against the accuracy, not
        # against the log likelihood.
        new_acc = nltk.classify.util.accuracy(classifier, train_toks)
        if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
            return True  # accuracy cutoff
        if ('min_accdelta' in cutoffs and self.acc is not None and
                (new_acc - self.acc) <= abs(cutoffs['min_accdelta'])):
            return True  # accuracy delta cutoff
        self.acc = new_acc

    return False  # no cutoff reached
139
140
141
142
143
145 features = {}
146 features['alwayson'] = True
147 features['startswith'] = name[0].lower()
148 features['endswith'] = name[-1].lower()
149 for letter in 'abcdefghijklmnopqrstuvwxyz':
150 features['count(%s)' % letter] = name.lower().count(letter)
151 features['has(%s)' % letter] = letter in name.lower()
152 return features
153
155 features = {}
156 features['alwayson'] = True
157 features['startswith(vowel)'] = name[0].lower() in 'aeiouy'
158 features['endswith(vowel)'] = name[-1].lower() in 'aeiouy'
159 for letter in 'abcdefghijklmnopqrstuvwxyz':
160 features['count(%s)' % letter] = name.lower().count(letter)
161 features['has(%s)' % letter] = letter in name.lower()
162 features['startswith(%s)' % letter] = (letter==name[0].lower())
163 features['endswith(%s)' % letter] = (letter==name[-1].lower())
164 return features
165
167 from nltk.corpus import names
168 import random
169
170
171 namelist = ([(name, 'male') for name in names.words('male.txt')] +
172 [(name, 'female') for name in names.words('female.txt')])
173
174
175 random.seed(123456)
176 random.shuffle(namelist)
177 train = namelist[:5000]
178 test = namelist[5000:5500]
179
180
181 print 'Training classifier...'
182 classifier = trainer( [(features(n), g) for (n,g) in train] )
183
184
185 print 'Testing classifier...'
186 acc = accuracy(classifier, [(features(n),g) for (n,g) in test])
187 print 'Accuracy: %6.4f' % acc
188
189
190
191 try:
192 test_featuresets = [features(n) for (n,g) in test]
193 pdists = classifier.batch_prob_classify(test_featuresets)
194 ll = [pdist.logprob(gold)
195 for ((name, gold), pdist) in zip(test, pdists)]
196 print 'Avg. log likelihood: %6.4f' % (sum(ll)/len(test))
197 print
198 print 'Unseen Names P(Male) P(Female)\n'+'-'*40
199 for ((name, gender), pdist) in zip(test, pdists)[:5]:
200 if gender == 'male':
201 fmt = ' %-15s *%6.4f %6.4f'
202 else:
203 fmt = ' %-15s %6.4f *%6.4f'
204 print fmt % (name, pdist.prob('male'), pdist.prob('female'))
205 except NotImplementedError:
206 pass
207
208
209 return classifier
210
# Cache of Senseval instances, keyed by word, so that repeated demo runs
# for the same word do not re-read the corpus.
_inst_cache = {}


def wsd_demo(trainer, word, features, n=1000):
    """
    Train and evaluate a word-sense classifier on Senseval instances
    of a single word, printing progress and results as it goes.

    At most C{n} instances are used; after a fixed-seed shuffle the
    first 80% of them form the training set and the remainder the test
    set.

    @param trainer: A callable that is given a list of
        C{(featureset, label)} pairs and returns a classifier.
    @param word: The item passed to C{senseval.instances()} to select
        the instances.
    @param features: A function that is applied to each instance to
        produce its featureset.
    @param n: The maximum number of instances to use.  It is capped at
        the number of instances available.
    @return: The trained classifier.
    """
    from nltk.corpus import senseval
    import random

    # Get the instances (each labeled with its first sense).
    print('Reading data...')
    if word not in _inst_cache:
        _inst_cache[word] = [(inst, inst.senses[0])
                             for inst in senseval.instances(word)]
    instances = _inst_cache[word][:]  # copy: the shuffle below is in place
    n = min(n, len(instances))
    # Sorted so that the printed sense list is deterministic.
    senses = sorted(set(label for (_, label) in instances))
    print(' Senses: ' + ' '.join(senses))

    # Randomly split the instances into a test & train set.
    print('Splitting into test & train...')
    random.seed(123456)
    random.shuffle(instances)
    split = int(.8 * n)
    train = instances[:split]
    test = instances[split:n]

    # Train up a classifier.
    print('Training classifier...')
    classifier = trainer([(features(inst), label) for (inst, label) in train])

    # Run the classifier on the test data.
    print('Testing classifier...')
    acc = accuracy(classifier,
                   [(features(inst), label) for (inst, label) in test])
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the average log
    # likelihood that they assign to the gold senses of the test data.
    try:
        test_featuresets = [features(inst) for (inst, _) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((_, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll) / len(test)))
    except NotImplementedError:
        # The classifier does not support probabilistic classification;
        # the likelihood report is optional, so it is skipped.
        pass

    # Return the classifier.
    return classifier
255