Package nltk :: Package classify :: Module util
[hide private]
[frames | no frames]

Source Code for Module nltk.classify.util

  1  # Natural Language Toolkit: Classifier Utility Functions 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Steven Bird <sb@csse.unimelb.edu.au> (minor additions) 
  6  # URL: <http://www.nltk.org/> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  Utility functions and classes for classifiers. 
 11  """ 
 12  import math 
 13   
 14  #from nltk.util import Deprecated 
 15  import nltk.classify.util # for accuracy & log_likelihood 
 16  from nltk.util import LazyMap 
 17   
 18  ###################################################################### 
 19  #{ Helper Functions 
 20  ###################################################################### 
 21   
 22  # alternative name possibility: 'map_featurefunc()'? 
 23  # alternative name possibility: 'detect_features()'? 
 24  # alternative name possibility: 'map_featuredetect()'? 
 25  # or.. just have users use LazyMap directly? 
def apply_features(feature_func, toks, labeled=None):
    """
    Use the L{LazyMap} class to construct a lazy list-like
    object that is analogous to C{map(feature_func, toks)}.  In
    particular, if C{labeled=False}, then the returned list-like
    object's values are equal to::

        [feature_func(tok) for tok in toks]

    If C{labeled=True}, then the returned list-like object's values
    are equal to::

        [(feature_func(tok), label) for (tok, label) in toks]

    The primary purpose of this function is to avoid the memory
    overhead involved in storing all the featuresets for every token
    in a corpus.  Instead, these featuresets are constructed lazily,
    as-needed.  The reduction in memory overhead can be especially
    significant when the underlying list of tokens is itself lazy (as
    is the case with many corpus readers).

    @param feature_func: The function that will be applied to each
        token.  It should return a featureset -- i.e., a C{dict}
        mapping feature names to feature values.
    @param toks: The list of tokens to which C{feature_func} should be
        applied.  If C{labeled=False}, then the list elements will be
        passed directly to C{feature_func()}.  If C{labeled=True},
        then the list elements should be tuples C{(tok,label)}, and
        C{tok} will be passed to C{feature_func()}.
    @param labeled: If true, then C{toks} contains labeled tokens --
        i.e., tuples of the form C{(tok, label)}.  (Default:
        auto-detect -- C{toks} is treated as labeled when it is
        non-empty and its first element is a C{tuple} or C{list}.
        Pass C{labeled} explicitly when unlabeled tokens are
        themselves tuples or lists.)
    @return: A L{LazyMap} over C{toks}.
    """
    if labeled is None:
        # Auto-detect from the first element only; an empty C{toks} is
        # falsy and therefore handled as unlabeled.
        labeled = toks and isinstance(toks[0], (tuple, list))
    if labeled:
        def lazy_func(labeled_token):
            # Featurize the token part and carry the label through untouched.
            return (feature_func(labeled_token[0]), labeled_token[1])
        return LazyMap(lazy_func, toks)
    else:
        return LazyMap(feature_func, toks)
def attested_labels(tokens):
    """
    Collect the distinct labels that occur in a list of classified
    tokens.

    @return: The labels attested in C{tokens}, each appearing once.
        The order of the labels is that of a C{set} and should not be
        relied upon.
    @rtype: C{tuple} of (immutable)
    @param tokens: The list of classified tokens from which to extract
        labels.  A classified token has the form C{(token, label)}.
    @type tokens: C{list}
    """
    distinct = set()
    for (_tok, label) in tokens:
        distinct.add(label)
    return tuple(distinct)
78
def log_likelihood(classifier, gold):
    """
    Compute the log of the mean probability that C{classifier}
    assigns to the correct label of each gold item.

    @param classifier: An object providing
        C{batch_prob_classify(featuresets)}, which returns one
        distribution per featureset; each distribution must provide
        C{prob(label)}.
    @param gold: A list of C{(featureset, label)} pairs.
    @return: C{math.log} of the average of the per-item probabilities
        of the gold labels.
    @raise ZeroDivisionError: If no (gold, result) pairs are produced
        (e.g. C{gold} is empty).
    @raise ValueError: Raised by C{math.log} if the average
        probability is zero.
    """
    featuresets = [featureset for (featureset, _label) in gold]
    pdists = classifier.batch_prob_classify(featuresets)

    # Probability assigned to the correct label, one entry per gold item.
    gold_probs = []
    for ((_featureset, label), pdist) in zip(gold, pdists):
        gold_probs.append(pdist.prob(label))

    mean_prob = float(sum(gold_probs)) / len(gold_probs)
    return math.log(mean_prob)
83
def accuracy(classifier, gold):
    """
    Compute the fraction of gold items whose label C{classifier}
    predicts correctly.

    @param classifier: An object providing
        C{batch_classify(featuresets)}, which returns one predicted
        label per featureset.
    @param gold: A list of C{(featureset, label)} pairs.
    @return: The number of predictions equal to the gold label divided
        by the number of compared items, as a C{float}; C{0} when
        there is nothing to compare.
    """
    featuresets = [featureset for (featureset, _label) in gold]
    predictions = classifier.batch_classify(featuresets)

    # One boolean per (gold, prediction) pair; True counts as 1 when summed.
    hits = [label == guess
            for ((_featureset, label), guess) in zip(gold, predictions)]
    if not hits:
        return 0
    return float(sum(hits)) / len(hits)
91
class CutoffChecker(object):
    """
    A helper class that implements cutoff checks based on number of
    iterations and log likelihood.

    Accuracy cutoffs are also implemented, but they're almost never
    a good idea to use.

    The recognised cutoff keys are C{'max_iter'}, C{'min_ll'},
    C{'min_lldelta'}, C{'max_acc'} and C{'min_accdelta'}.
    """
    def __init__(self, cutoffs):
        """
        @param cutoffs: A C{dict} mapping cutoff names to threshold
            values.  It is copied; the caller's dictionary is never
            modified.  In the copy, C{'min_ll'} is forced to be
            non-positive and C{'min_lldelta'} to be non-negative.
        """
        # Normalise the private copy, not the caller's dict: check()
        # reads self.cutoffs, so this is where the signs must be fixed.
        self.cutoffs = cutoffs.copy()
        if 'min_ll' in self.cutoffs:
            self.cutoffs['min_ll'] = -abs(self.cutoffs['min_ll'])
        if 'min_lldelta' in self.cutoffs:
            self.cutoffs['min_lldelta'] = abs(self.cutoffs['min_lldelta'])
        self.ll = None   # log likelihood seen by the previous check()
        self.acc = None  # accuracy seen by the previous check()
        self.iter = 1

    def check(self, classifier, train_toks):
        """
        Decide whether any configured cutoff has been reached.

        The iteration counter starts at 1 and is incremented before it
        is compared, so with C{max_iter=k} the C{(k-1)}th call returns
        true.  The log likelihood is computed on every call that gets
        past the iteration cutoff, because a NaN log likelihood always
        counts as a cutoff.

        @param classifier: The classifier to evaluate.
        @param train_toks: The C{(featureset, label)} pairs on which
            log likelihood and accuracy are measured.
        @return: C{True} if a cutoff was reached, C{False} otherwise.
        @rtype: C{bool}
        """
        cutoffs = self.cutoffs
        self.iter += 1
        if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']:
            return True # iteration cutoff.

        new_ll = nltk.classify.util.log_likelihood(classifier, train_toks)
        if math.isnan(new_ll):
            return True

        if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs:
            if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']:
                return True # log likelihood cutoff
            # Compare against None explicitly: a previous log likelihood
            # of exactly 0 is a valid value, not "no previous value".
            if ('min_lldelta' in cutoffs and self.ll is not None and
                ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))):
                return True # log likelihood delta cutoff
            self.ll = new_ll

        if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs:
            # Accuracy cutoffs must be measured with accuracy(), not
            # with log_likelihood().
            new_acc = nltk.classify.util.accuracy(
                classifier, train_toks)
            if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']:
                return True # accuracy cutoff
            if ('min_accdelta' in cutoffs and self.acc is not None and
                ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))):
                return True # accuracy delta cutoff
            self.acc = new_acc

        return False # no cutoff reached.
139 140 ###################################################################### 141 #{ Demos 142 ###################################################################### 143
def names_demo_features(name):
    """
    Build the featureset used by the names demo for a single name.

    @param name: The name to featurize; it must be non-empty, since
        its first and last characters are read.
    @return: A C{dict} with an always-true C{'alwayson'} feature, the
        lowercased first and last characters (C{'startswith'},
        C{'endswith'}), and, for each letter a-z, a C{'count(X)'}
        occurrence count and a boolean C{'has(X)'} flag, both computed
        on the lowercased name.
    """
    features = {
        'alwayson': True,
        'startswith': name[0].lower(),
        'endswith': name[-1].lower(),
    }
    lowered = name.lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = letter in lowered
    return features
153
def binary_names_demo_features(name):
    """
    Build a featureset for a single name in which the first and last
    characters are encoded as per-letter boolean features.

    @param name: The name to featurize; it must be non-empty, since
        its first and last characters are read.
    @return: A C{dict} with an always-true C{'alwayson'} feature,
        C{'startswith(vowel)'} and C{'endswith(vowel)'} flags (the
        vowels being the letters of C{'aeiouy'}), and, for each letter
        a-z, C{'count(X)'}, C{'has(X)'}, C{'startswith(X)'} and
        C{'endswith(X)'} computed on the lowercased name.
    """
    initial = name[0].lower()
    final = name[-1].lower()
    features = {
        'alwayson': True,
        'startswith(vowel)': initial in 'aeiouy',
        'endswith(vowel)': final in 'aeiouy',
    }
    lowered = name.lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features['count(%s)' % letter] = lowered.count(letter)
        features['has(%s)' % letter] = letter in lowered
        features['startswith(%s)' % letter] = (letter == initial)
        features['endswith(%s)' % letter] = (letter == final)
    return features
165
def names_demo(trainer, features=names_demo_features):
    """
    Train and evaluate a classifier on the names corpus, printing the
    results.

    The names in C{'male.txt'} and C{'female.txt'} are labeled
    C{'male'} and C{'female'}, shuffled with a fixed seed, and split
    into the first 5000 items for training and the next 500 for
    testing.  The accuracy on the test items is printed.  If the
    classifier's C{batch_prob_classify} does not raise
    C{NotImplementedError}, the average log likelihood and the
    probabilities for the first five test names are printed as well.

    @param trainer: A callable that takes a list of
        C{(featureset, label)} pairs and returns a classifier.
    @param features: The function used to turn a name into a
        featureset.
    @return: The trained classifier.
    """
    from nltk.corpus import names
    import random

    # Construct a list of classified names, using the names corpus
    # (all male entries first, then all female entries).
    namelist = []
    for (fileid, gender) in (('male.txt', 'male'), ('female.txt', 'female')):
        for name in names.words(fileid):
            namelist.append((name, gender))

    # Randomly split the names into a test & train set.  The fixed
    # seed makes the split the same on every run.
    random.seed(123456)
    random.shuffle(namelist)
    train = namelist[:5000]
    test = namelist[5000:5500]

    # Train up a classifier.
    print('Training classifier...')
    train_set = [(features(name), gender) for (name, gender) in train]
    classifier = trainer(train_set)

    # Run the classifier on the test data.
    print('Testing classifier...')
    test_set = [(features(name), gender) for (name, gender) in test]
    acc = accuracy(classifier, test_set)
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood and some sample probability distributions.
    # NOTE(review): the spacing inside the header and row format
    # strings below may have been collapsed when this source was
    # extracted -- confirm the column widths against upstream.
    try:
        test_featuresets = [features(name) for (name, _gender) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((_name, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
        print('')  # blank separator line
        print('Unseen Names P(Male) P(Female)\n'+'-'*40)
        samples = list(zip(test, pdists))[:5]
        for ((name, gender), pdist) in samples:
            # The asterisk marks the column of the gold label.
            if gender == 'male':
                fmt = ' %-15s *%6.4f %6.4f'
            else:
                fmt = ' %-15s %6.4f *%6.4f'
            print(fmt % (name, pdist.prob('male'), pdist.prob('female')))
    except NotImplementedError:
        # The classifier cannot produce probability distributions.
        pass

    # Return the classifier
    return classifier
# Cache of labeled senseval instances, keyed by word, so repeated demo
# runs for the same word read the corpus only once.
_inst_cache = {}

def wsd_demo(trainer, word, features, n=1000):
    """
    Train and evaluate a word sense classifier on the senseval corpus
    instances of C{word}, printing the results.

    Each instance is labeled with the first of its senses.  The
    instances are shuffled with a fixed seed; of the first C{n}, the
    first 80% are used for training and the rest for testing.  The
    accuracy on the test items is printed, followed by the average log
    likelihood if the classifier's C{batch_prob_classify} does not
    raise C{NotImplementedError}.

    @param trainer: A callable that takes a list of
        C{(featureset, label)} pairs and returns a classifier.
    @param word: The senseval item whose instances are used.
    @param features: The function used to turn an instance into a
        featureset.
    @param n: The maximum number of instances to use; it is capped at
        the number of instances available.
    @return: The trained classifier.
    """
    from nltk.corpus import senseval
    import random

    # Get the instances, reading the corpus only on a cache miss.
    print('Reading data...')
    if word not in _inst_cache:
        _inst_cache[word] = [(inst, inst.senses[0])
                             for inst in senseval.instances(word)]
    # Work on a copy so that shuffling does not reorder the cached list.
    instances = _inst_cache[word][:]
    n = min(n, len(instances))
    senses = list(set(sense for (_inst, sense) in instances))
    # NOTE(review): leading spaces in this label may have been collapsed
    # when this source was extracted -- confirm against upstream.
    print(' Senses: ' + ' '.join(senses))

    # Randomly split the instances into a test & train set.
    print('Splitting into test & train...')
    random.seed(123456)
    random.shuffle(instances)
    cut = int(.8*n)
    train = instances[:cut]
    test = instances[cut:n]

    # Train up a classifier.
    print('Training classifier...')
    train_set = [(features(inst), sense) for (inst, sense) in train]
    classifier = trainer(train_set)

    # Run the classifier on the test data.
    print('Testing classifier...')
    test_set = [(features(inst), sense) for (inst, sense) in test]
    acc = accuracy(classifier, test_set)
    print('Accuracy: %6.4f' % acc)

    # For classifiers that can find probabilities, show the log
    # likelihood.
    try:
        test_featuresets = [features(inst) for (inst, _sense) in test]
        pdists = classifier.batch_prob_classify(test_featuresets)
        ll = [pdist.logprob(gold)
              for ((_inst, gold), pdist) in zip(test, pdists)]
        print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test)))
    except NotImplementedError:
        # The classifier cannot produce probability distributions.
        pass

    # Return the classifier
    return classifier
255