Package nltk :: Package classify :: Module maxent
[hide private]
[frames] | no frames]

Source Code for Module nltk.classify.maxent

   1  # Natural Language Toolkit: Maximum Entropy Classifiers 
   2  # 
   3  # Copyright (C) 2001-2011 NLTK Project 
   4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
   5  #         Dmitry Chichkov <dchichkov@gmail.com> (TypedMaxentFeatureEncoding) 
   6  # URL: <http://www.nltk.org/> 
   7  # For license information, see LICENSE.TXT 
   8   
   9  """ 
  10  A classifier model based on maximum entropy modeling framework.  This 
  11  framework considers all of the probability distributions that are 
  12  empirically consistant with the training data; and chooses the 
  13  distribution with the highest entropy.  A probability distribution is 
  14  X{empirically consistant} with a set of training data if its estimated 
  15  frequency with which a class and a feature vector value co-occur is 
  16  equal to the actual frequency in the data. 
  17   
  18  Terminology: 'feature' 
  19  ====================== 
  20  The term I{feature} is usually used to refer to some property of an 
  21  unlabeled token.  For example, when performing word sense 
  22  disambiguation, we might define a C{'prevword'} feature whose value is 
  23  the word preceeding the target word.  However, in the context of 
  24  maxent modeling, the term I{feature} is typically used to refer to a 
  25  property of a X{labeled} token.  In order to prevent confusion, we 
  26  will introduce two distinct terms to disambiguate these two different 
  27  concepts: 
  28   
  29    - An X{input-feature} is a property of an unlabeled token. 
  30    - A X{joint-feature} is a property of a labeled token. 
  31   
  32  In the rest of the C{nltk.classify} module, the term X{features} is 
  33  used to refer to what we will call X{input-features} in this module. 
  34   
  35  In literature that describes and discusses maximum entropy models, 
  36  input-features are typically called X{contexts}, and joint-features 
  37  are simply referred to as X{features}. 
  38   
  39  Converting Input-Features to Joint-Features 
  40  ------------------------------------------- 
  41  In maximum entropy models, joint-features are required to have numeric 
  42  values.  Typically, each input-feature C{input_feat} is mapped to a 
  43  set of joint-features of the form:: 
  44   
  45      joint_feat(token, label) = { 1 if input_feat(token) == feat_val 
  46                                 {      and label == some_label 
  47                                 { 
  48                                 { 0 otherwise 
  49   
  50  For all values of C{feat_val} and C{some_label}.  This mapping is 
  51  performed by classes that implement the L{MaxentFeatureEncodingI} 
  52  interface. 
  53  """ 
  54  __docformat__ = 'epytext en' 
  55   
  56  import numpy 
  57  import time 
  58  import tempfile 
  59  import os 
  60  import gzip 
  61   
  62  from nltk.compat import defaultdict 
  63  from nltk.util import OrderedDict 
  64  from nltk.probability import * 
  65   
  66  import nltk.classify.util # for accuracy & log_likelihood 
  67  from api import * 
  68  from util import attested_labels, CutoffChecker 
  69  from megam import call_megam, write_megam_file, parse_megam_weights 
  70  from tadm import call_tadm, write_tadm_file, parse_tadm_weights 
71 72 ###################################################################### 73 #{ Classifier Model 74 ###################################################################### 75 76 -class MaxentClassifier(ClassifierI):
77 """ 78 A maximum entropy classifier (also known as a X{conditional 79 exponential classifier}). This classifier is parameterized by a 80 set of X{weights}, which are used to combine the joint-features 81 that are generated from a featureset by an X{encoding}. In 82 particular, the encoding maps each C{(featureset, label)} pair to 83 a vector. The probability of each label is then computed using 84 the following equation:: 85 86 dotprod(weights, encode(fs,label)) 87 prob(fs|label) = --------------------------------------------------- 88 sum(dotprod(weights, encode(fs,l)) for l in labels) 89 90 Where C{dotprod} is the dot product:: 91 92 dotprod(a,b) = sum(x*y for (x,y) in zip(a,b)) 93 """
94 - def __init__(self, encoding, weights, logarithmic=True):
95 """ 96 Construct a new maxent classifier model. Typically, new 97 classifier models are created using the L{train()} method. 98 99 @type encoding: L{MaxentFeatureEncodingI} 100 @param encoding: An encoding that is used to convert the 101 featuresets that are given to the C{classify} method into 102 joint-feature vectors, which are used by the maxent 103 classifier model. 104 105 @type weights: C{list} of C{float} 106 @param weights: The feature weight vector for this classifier. 107 108 @type logarithmic: C{bool} 109 @param logarithmic: If false, then use non-logarithmic weights. 110 """ 111 self._encoding = encoding 112 self._weights = weights 113 self._logarithmic = logarithmic 114 #self._logarithmic = False 115 assert encoding.length() == len(weights)
116
117 - def labels(self):
118 return self._encoding.labels()
119
120 - def set_weights(self, new_weights):
121 """ 122 Set the feature weight vector for this classifier. 123 @param new_weights: The new feature weight vector. 124 @type new_weights: C{list} of C{float} 125 """ 126 self._weights = new_weights 127 assert (self._encoding.length() == len(new_weights))
128
129 - def weights(self):
130 """ 131 @return: The feature weight vector for this classifier. 132 @rtype: C{list} of C{float} 133 """ 134 return self._weights
135
136 - def classify(self, featureset):
137 return self.prob_classify(featureset).max()
138
139 - def prob_classify(self, featureset):
140 prob_dict = {} 141 for label in self._encoding.labels(): 142 feature_vector = self._encoding.encode(featureset, label) 143 144 if self._logarithmic: 145 total = 0.0 146 for (f_id, f_val) in feature_vector: 147 total += self._weights[f_id] * f_val 148 prob_dict[label] = total 149 150 else: 151 prod = 1.0 152 for (f_id, f_val) in feature_vector: 153 prod *= self._weights[f_id] ** f_val 154 prob_dict[label] = prod 155 156 # Normalize the dictionary to give a probability distribution 157 return DictionaryProbDist(prob_dict, log=self._logarithmic, 158 normalize=True)
159
160 - def explain(self, featureset, columns=4):
161 """ 162 Print a table showing the effect of each of the features in 163 the given feature set, and how they combine to determine the 164 probabilities of each label for that featureset. 165 """ 166 descr_width = 50 167 TEMPLATE = ' %-'+str(descr_width-2)+'s%s%8.3f' 168 169 pdist = self.prob_classify(featureset) 170 labels = sorted(pdist.samples(), key=pdist.prob, reverse=True) 171 labels = labels[:columns] 172 print ' Feature'.ljust(descr_width)+''.join( 173 '%8s' % str(l)[:7] for l in labels) 174 print ' '+'-'*(descr_width-2+8*len(labels)) 175 sums = defaultdict(int) 176 for i, label in enumerate(labels): 177 feature_vector = self._encoding.encode(featureset, label) 178 feature_vector.sort(key=lambda (fid,_): abs(self._weights[fid]), 179 reverse=True) 180 for (f_id, f_val) in feature_vector: 181 if self._logarithmic: score = self._weights[f_id] * f_val 182 else: score = self._weights[fid] ** f_val 183 descr = self._encoding.describe(f_id) 184 descr = descr.split(' and label is ')[0] # hack 185 descr += ' (%s)' % f_val # hack 186 if len(descr) > 47: descr = descr[:44]+'...' 187 print TEMPLATE % (descr, i*8*' ', score) 188 sums[label] += score 189 print ' '+'-'*(descr_width-1+8*len(labels)) 190 print ' TOTAL:'.ljust(descr_width)+''.join( 191 '%8.3f' % sums[l] for l in labels) 192 print ' PROBS:'.ljust(descr_width)+''.join( 193 '%8.3f' % pdist.prob(l) for l in labels)
194
195 - def show_most_informative_features(self, n=10, show='all'):
196 """ 197 @param show: all, neg, or pos (for negative-only or positive-only) 198 """ 199 fids = sorted(range(len(self._weights)), 200 key=lambda fid: abs(self._weights[fid]), 201 reverse=True) 202 if show == 'pos': 203 fids = [fid for fid in fids if self._weights[fid]>0] 204 elif show == 'neg': 205 fids = [fid for fid in fids if self._weights[fid]<0] 206 for fid in fids[:n]: 207 print '%8.3f %s' % (self._weights[fid], 208 self._encoding.describe(fid))
209
210 - def __repr__(self):
211 return ('<ConditionalExponentialClassifier: %d labels, %d features>' % 212 (len(self._encoding.labels()), self._encoding.length()))
213 214 #: A list of the algorithm names that are accepted for the 215 #: L{train()} method's C{algorithm} parameter. 216 ALGORITHMS = ['GIS', 'IIS', 'CG', 'BFGS', 'Powell', 'LBFGSB', 217 'Nelder-Mead', 'MEGAM', 'TADM'] 218 219 @classmethod
220 - def train(cls, train_toks, algorithm=None, trace=3, encoding=None, 221 labels=None, sparse=True, gaussian_prior_sigma=0, **cutoffs):
222 """ 223 Train a new maxent classifier based on the given corpus of 224 training samples. This classifier will have its weights 225 chosen to maximize entropy while remaining empirically 226 consistent with the training corpus. 227 228 @rtype: L{MaxentClassifier} 229 @return: The new maxent classifier 230 231 @type train_toks: C{list} 232 @param train_toks: Training data, represented as a list of 233 pairs, the first member of which is a featureset, 234 and the second of which is a classification label. 235 236 @type algorithm: C{str} 237 @param algorithm: A case-insensitive string, specifying which 238 algorithm should be used to train the classifier. The 239 following algorithms are currently available. 240 241 - Iterative Scaling Methods 242 - C{'GIS'}: Generalized Iterative Scaling 243 - C{'IIS'}: Improved Iterative Scaling 244 245 - Optimization Methods (require C{scipy}) 246 - C{'CG'}: Conjugate gradient 247 - C{'BFGS'}: Broyden-Fletcher-Goldfarb-Shanno algorithm 248 - C{'Powell'}: Powell agorithm 249 - C{'LBFGSB'}: A limited-memory variant of the BFGS algorithm 250 - C{'Nelder-Mead'}: The Nelder-Mead algorithm 251 252 - External Libraries 253 - C{'megam'}: LM-BFGS algorithm, with training performed 254 by an U{megam <http://www.cs.utah.edu/~hal/megam/>}. 255 (requires that C{megam} be installed.) 256 257 The default algorithm is C{'CG'} if C{'scipy'} is 258 installed; and C{'iis'} otherwise. 259 260 @type trace: C{int} 261 @param trace: The level of diagnostic tracing output to produce. 262 Higher values produce more verbose output. 263 264 @type encoding: L{MaxentFeatureEncodingI} 265 @param encoding: A feature encoding, used to convert featuresets 266 into feature vectors. If none is specified, then a 267 L{BinaryMaxentFeatureEncoding} will be built based on the 268 features that are attested in the training corpus. 269 270 @type labels: C{list} of C{str} 271 @param labels: The set of possible labels. If none is given, then 272 the set of all labels attested in the training data will be 273 used instead. 274 275 @param sparse: If true, then use sparse matrices instead of 276 dense matrices. Currently, this is only supported by 277 the scipy (optimization method) algorithms. For other 278 algorithms, its value is ignored. 279 280 @param gaussian_prior_sigma: The sigma value for a gaussian 281 prior on model weights. Currently, this is supported by 282 the scipy (optimization method) algorithms and C{megam}. 283 For other algorithms, its value is ignored. 284 285 @param cutoffs: Arguments specifying various conditions under 286 which the training should be halted. (Some of the cutoff 287 conditions are not supported by some algorithms.) 288 289 - C{max_iter=v}: Terminate after C{v} iterations. 290 - C{min_ll=v}: Terminate after the negative average 291 log-likelihood drops under C{v}. 292 - C{min_lldelta=v}: Terminate if a single iteration improves 293 log likelihood by less than C{v}. 294 - C{tolerance=v}: Terminate a scipy optimization method when 295 improvement drops below a tolerance level C{v}. The 296 exact meaning of this tolerance depends on the scipy 297 algorithm used. See C{scipy} documentation for more 298 info. Default values: 1e-3 for CG, 1e-5 for LBFGSB, 299 and 1e-4 for other algorithms. I{(C{scipy} only)} 300 """ 301 if algorithm is None: 302 try: 303 import scipy 304 algorithm = 'cg' 305 except ImportError: 306 algorithm = 'iis' 307 for key in cutoffs: 308 if key not in ('max_iter', 'min_ll', 'min_lldelta', 'tolerance', 309 'max_acc', 'min_accdelta', 'count_cutoff', 310 'norm', 'explicit', 'bernoulli'): 311 raise TypeError('Unexpected keyword arg %r' % key) 312 algorithm = algorithm.lower() 313 if algorithm == 'iis': 314 return train_maxent_classifier_with_iis( 315 train_toks, trace, encoding, labels, **cutoffs) 316 elif algorithm == 'gis': 317 return train_maxent_classifier_with_gis( 318 train_toks, trace, encoding, labels, **cutoffs) 319 elif algorithm in cls._SCIPY_ALGS: 320 return train_maxent_classifier_with_scipy( 321 train_toks, trace, encoding, labels, 322 cls._SCIPY_ALGS[algorithm], sparse, 323 gaussian_prior_sigma, **cutoffs) 324 elif algorithm == 'megam': 325 return train_maxent_classifier_with_megam( 326 train_toks, trace, encoding, labels, 327 gaussian_prior_sigma, **cutoffs) 328 elif algorithm == 'tadm': 329 kwargs = cutoffs 330 kwargs['trace'] = trace 331 kwargs['encoding'] = encoding 332 kwargs['labels'] = labels 333 kwargs['gaussian_prior_sigma'] = gaussian_prior_sigma 334 return TadmMaxentClassifier.train(train_toks, **kwargs) 335 else: 336 raise ValueError('Unknown algorithm %s' % algorithm)
337 338 _SCIPY_ALGS = {'cg':'CG', 'bfgs':'BFGS', 'powell':'Powell', 339 'lbfgsb':'LBFGSB', 'nelder-mead':'Nelder-Mead'}
340
341 342 #: Alias for MaxentClassifier. 343 ConditionalExponentialClassifier = MaxentClassifier 344 345 346 ###################################################################### 347 #{ Feature Encodings 348 ###################################################################### 349 350 -class MaxentFeatureEncodingI(object):
351 """ 352 A mapping that converts a set of input-feature values to a vector 353 of joint-feature values, given a label. This conversion is 354 necessary to translate featuresets into a format that can be used 355 by maximum entropy models. 356 357 The set of joint-features used by a given encoding is fixed, and 358 each index in the generated joint-feature vectors corresponds to a 359 single joint-feature. The length of the generated joint-feature 360 vectors is therefore constant (for a given encoding). 361 362 Because the joint-feature vectors generated by 363 C{MaxentFeatureEncodingI} are typically very sparse, they are 364 represented as a list of C{(index, value)} tuples, specifying the 365 value of each non-zero joint-feature. 366 367 Feature encodings are generally created using the L{train()} 368 method, which generates an appropriate encoding based on the 369 input-feature values and labels that are present in a given 370 corpus. 371 """
372 - def encode(self, featureset, label):
373 """ 374 Given a (featureset, label) pair, return the corresponding 375 vector of joint-feature values. This vector is represented as 376 a list of C{(index, value)} tuples, specifying the value of 377 each non-zero joint-feature. 378 379 @type featureset: C{dict} 380 @rtype: C{list} of C{(int, number)} 381 """ 382 raise AssertionError('Not implemented')
383
384 - def length(self):
385 """ 386 @return: The size of the fixed-length joint-feature vectors 387 that are generated by this encoding. 388 @rtype: C{int} 389 """ 390 raise AssertionError('Not implemented')
391
392 - def labels(self):
393 """ 394 @return: A list of the \"known labels\" -- i.e., all labels 395 C{l} such that C{self.encode(fs,l)} can be a nonzero 396 joint-feature vector for some value of C{fs}. 397 @rtype: C{list} 398 """ 399 raise AssertionError('Not implemented')
400
401 - def describe(self, fid):
402 """ 403 @return: A string describing the value of the joint-feature 404 whose index in the generated feature vectors is C{fid}. 405 @rtype: C{str} 406 """ 407 raise AssertionError('Not implemented')
408
409 - def train(cls, train_toks):
410 """ 411 Construct and return new feature encoding, based on a given 412 training corpus C{train_toks}. 413 414 @type train_toks: C{list} of C{tuples} of (C{dict}, C{str}) 415 @param train_toks: Training data, represented as a list of 416 pairs, the first member of which is a feature dictionary, 417 and the second of which is a classification label. 418 """ 419 raise AssertionError('Not implemented')
420
421 -class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI):
422 """ 423 A feature encoding that calls a user-supplied function to map a 424 given featureset/label pair to a sparse joint-feature vector. 425 """
426 - def __init__(self, func, length, labels):
427 """ 428 Construct a new feature encoding based on the given function. 429 430 @type func: (callable) 431 @param func: A function that takes two arguments, a featureset 432 and a label, and returns the sparse joint feature vector 433 that encodes them: 434 435 >>> func(featureset, label) -> feature_vector 436 437 This sparse joint feature vector (C{feature_vector}) is a 438 list of C{(index,value)} tuples. 439 440 @type length: C{int} 441 @param length: The size of the fixed-length joint-feature 442 vectors that are generated by this encoding. 443 444 @type labels: C{list} 445 @param labels: A list of the \"known labels\" for this 446 encoding -- i.e., all labels C{l} such that 447 C{self.encode(fs,l)} can be a nonzero joint-feature vector 448 for some value of C{fs}. 449 """ 450 self._length = length 451 self._func = func 452 self._labels = labels
453
454 - def encode(self, featureset, label):
455 return self._func(featureset, label)
456
457 - def length(self):
458 return self._length
459
460 - def labels(self):
461 return self._labels
462
463 - def describe(self, fid):
464 return 'no description available'
465
466 -class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI):
467 """ 468 A feature encoding that generates vectors containing a binary 469 joint-features of the form:: 470 471 joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) 472 { 473 { 0 otherwise 474 475 Where C{fname} is the name of an input-feature, C{fval} is a value 476 for that input-feature, and C{label} is a label. 477 478 Typically, these features are constructed based on a training 479 corpus, using the L{train()} method. This method will create one 480 feature for each combination of C{fname}, C{fval}, and C{label} 481 that occurs at least once in the training corpus. 482 483 The C{unseen_features} parameter can be used to add X{unseen-value 484 features}, which are used whenever an input feature has a value 485 that was not encountered in the training corpus. These features 486 have the form:: 487 488 joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) 489 { and l == label 490 { 491 { 0 otherwise 492 493 Where C{is_unseen(fname, fval)} is true if the encoding does not 494 contain any joint features that are true when C{fs[fname]==fval}. 495 496 The C{alwayson_features} parameter can be used to add X{always-on 497 features}, which have the form:: 498 499 joint_feat(fs, l) = { 1 if (l == label) 500 { 501 { 0 otherwise 502 503 These always-on features allow the maxent model to directly model 504 the prior probabilities of each label. 505 """
506 - def __init__(self, labels, mapping, unseen_features=False, 507 alwayson_features=False):
508 """ 509 @param labels: A list of the \"known labels\" for this encoding. 510 511 @param mapping: A dictionary mapping from C{(fname,fval,label)} 512 tuples to corresponding joint-feature indexes. These 513 indexes must be the set of integers from 0...len(mapping). 514 If C{mapping[fname,fval,label]=id}, then 515 C{self.encode({..., fname:fval, ...}, label)[id]} is 1; 516 otherwise, it is 0. 517 518 @param unseen_features: If true, then include unseen value 519 features in the generated joint-feature vectors. 520 521 @param alwayson_features: If true, then include always-on 522 features in the generated joint-feature vectors. 523 """ 524 if set(mapping.values()) != set(range(len(mapping))): 525 raise ValueError('Mapping values must be exactly the ' 526 'set of integers from 0...len(mapping)') 527 528 self._labels = list(labels) 529 """A list of attested labels.""" 530 531 self._mapping = mapping 532 """dict mapping from (fname,fval,label) -> fid""" 533 534 self._length = len(mapping) 535 """The length of generated joint feature vectors.""" 536 537 self._alwayson = None 538 """dict mapping from label -> fid""" 539 540 self._unseen = None 541 """dict mapping from fname -> fid""" 542 543 if alwayson_features: 544 self._alwayson = dict([(label,i+self._length) 545 for (i,label) in enumerate(labels)]) 546 self._length += len(self._alwayson) 547 548 if unseen_features: 549 fnames = set(fname for (fname, fval, label) in mapping) 550 self._unseen = dict([(fname, i+self._length) 551 for (i, fname) in enumerate(fnames)]) 552 self._length += len(fnames)
553
554 - def encode(self, featureset, label):
555 # Inherit docs. 556 encoding = [] 557 558 # Convert input-features to joint-features: 559 for fname, fval in featureset.items(): 560 # Known feature name & value: 561 if (fname, fval, label) in self._mapping: 562 encoding.append((self._mapping[fname, fval, label], 1)) 563 564 # Otherwise, we might want to fire an "unseen-value feature". 565 elif self._unseen: 566 # Have we seen this fname/fval combination with any label? 567 for label2 in self._labels: 568 if (fname, fval, label2) in self._mapping: 569 break # we've seen this fname/fval combo 570 # We haven't -- fire the unseen-value feature 571 else: 572 if fname in self._unseen: 573 encoding.append((self._unseen[fname], 1)) 574 575 # Add always-on features: 576 if self._alwayson and label in self._alwayson: 577 encoding.append((self._alwayson[label], 1)) 578 579 return encoding
580
581 - def describe(self, f_id):
582 # Inherit docs. 583 if not isinstance(f_id, (int, long)): 584 raise TypeError('describe() expected an int') 585 try: 586 self._inv_mapping 587 except AttributeError: 588 self._inv_mapping = [-1]*len(self._mapping) 589 for (info, i) in self._mapping.items(): 590 self._inv_mapping[i] = info 591 592 if f_id < len(self._mapping): 593 (fname, fval, label) = self._inv_mapping[f_id] 594 return '%s==%r and label is %r' % (fname, fval, label) 595 elif self._alwayson and f_id in self._alwayson.values(): 596 for (label, f_id2) in self._alwayson.items(): 597 if f_id==f_id2: return 'label is %r' % label 598 elif self._unseen and f_id in self._unseen.values(): 599 for (fname, f_id2) in self._unseen.items(): 600 if f_id==f_id2: return '%s is unseen' % fname 601 else: 602 raise ValueError('Bad feature id')
603
604 - def labels(self):
605 # Inherit docs. 606 return self._labels
607
608 - def length(self):
609 # Inherit docs. 610 return self._length
611 612 @classmethod
613 - def train(cls, train_toks, count_cutoff=0, labels=None, **options):
614 """ 615 Construct and return new feature encoding, based on a given 616 training corpus C{train_toks}. See the L{class description 617 <BinaryMaxentFeatureEncoding>} for a description of the 618 joint-features that will be included in this encoding. 619 620 @type train_toks: C{list} of C{tuples} of (C{dict}, C{str}) 621 @param train_toks: Training data, represented as a list of 622 pairs, the first member of which is a feature dictionary, 623 and the second of which is a classification label. 624 625 @type count_cutoff: C{int} 626 @param count_cutoff: A cutoff value that is used to discard 627 rare joint-features. If a joint-feature's value is 1 628 fewer than C{count_cutoff} times in the training corpus, 629 then that joint-feature is not included in the generated 630 encoding. 631 632 @type labels: C{list} 633 @param labels: A list of labels that should be used by the 634 classifier. If not specified, then the set of labels 635 attested in C{train_toks} will be used. 636 637 @param options: Extra parameters for the constructor, such as 638 C{unseen_features} and C{alwayson_features}. 639 """ 640 mapping = {} # maps (fname, fval, label) -> fid 641 seen_labels = set() # The set of labels we've encountered 642 count = defaultdict(int) # maps (fname, fval) -> count 643 644 for (tok, label) in train_toks: 645 if labels and label not in labels: 646 raise ValueError('Unexpected label %s' % label) 647 seen_labels.add(label) 648 649 # Record each of the features. 650 for (fname, fval) in tok.items(): 651 652 # If a count cutoff is given, then only add a joint 653 # feature once the corresponding (fname, fval, label) 654 # tuple exceeds that cutoff. 655 count[fname,fval] += 1 656 if count[fname,fval] >= count_cutoff: 657 if (fname, fval, label) not in mapping: 658 mapping[fname, fval, label] = len(mapping) 659 660 if labels is None: labels = seen_labels 661 return cls(labels, mapping, **options)
662
663 -class GISEncoding(BinaryMaxentFeatureEncoding):
664 """ 665 A binary feature encoding which adds one new joint-feature to the 666 joint-features defined by L{BinaryMaxentFeatureEncoding}: a 667 correction feature, whose value is chosen to ensure that the 668 sparse vector always sums to a constant non-negative number. This 669 new feature is used to ensure two preconditions for the GIS 670 training algorithm: 671 - At least one feature vector index must be nonzero for every 672 token. 673 - The feature vector must sum to a constant non-negative number 674 for every token. 675 """
676 - def __init__(self, labels, mapping, unseen_features=False, 677 alwayson_features=False, C=None):
678 """ 679 @param C: The correction constant. The value of the correction 680 feature is based on this value. In particular, its value is 681 C{C - sum([v for (f,v) in encoding])}. 682 @seealso: L{BinaryMaxentFeatureEncoding.__init__} 683 """ 684 BinaryMaxentFeatureEncoding.__init__( 685 self, labels, mapping, unseen_features, alwayson_features) 686 if C is None: 687 C = len(set([fname for (fname,fval,label) in mapping]))+1 688 689 self._C = C 690 """The non-negative constant that all encoded feature vectors 691 will sum to."""
692 693 C = property(lambda self: self._C, doc=""" 694 The non-negative constant that all encoded feature vectors 695 will sum to.""") 696
697 - def encode(self, featureset, label):
698 # Get the basic encoding. 699 encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label) 700 base_length = BinaryMaxentFeatureEncoding.length(self) 701 702 # Add a correction feature. 703 total = sum([v for (f,v) in encoding]) 704 if total >= self._C: 705 raise ValueError('Correction feature is not high enough!') 706 encoding.append( (base_length, self._C-total) ) 707 708 # Return the result 709 return encoding
710
711 - def length(self):
712 return BinaryMaxentFeatureEncoding.length(self) + 1
713
714 - def describe(self, f_id):
715 if f_id == BinaryMaxentFeatureEncoding.length(self): 716 return 'Correction feature (%s)' % self._C 717 else: 718 return BinaryMaxentFeatureEncoding.describe(self, f_id)
719
720 721 -class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding):
722 - def __init__(self, labels, mapping, unseen_features=False, 723 alwayson_features=False):
724 self._mapping = OrderedDict(mapping) 725 self._label_mapping = OrderedDict() 726 BinaryMaxentFeatureEncoding.__init__(self, labels, self._mapping, 727 unseen_features, 728 alwayson_features)
729
730 - def encode(self, featureset, label):
731 encoding = [] 732 for feature, value in featureset.items(): 733 if (feature, label) not in self._mapping: 734 self._mapping[(feature, label)] = len(self._mapping) 735 if value not in self._label_mapping: 736 if not isinstance(value, int): 737 self._label_mapping[value] = len(self._label_mapping) 738 else: 739 self._label_mapping[value] = value 740 encoding.append((self._mapping[(feature, label)], 741 self._label_mapping[value])) 742 return encoding
743
744 - def labels(self):
745 return self._labels
746
747 - def describe(self, fid):
748 for (feature, label) in self._mapping: 749 if self._mapping[(feature, label)] == fid: 750 return (feature, label)
751
752 - def length(self):
753 return len(self._mapping)
754 755 @classmethod
756 - def train(cls, train_toks, count_cutoff=0, labels=None, **options):
757 mapping = OrderedDict() 758 if not labels: 759 labels = [] 760 761 # This gets read twice, so compute the values in case it's lazy. 762 train_toks = list(train_toks) 763 764 for (featureset, label) in train_toks: 765 if label not in labels: 766 labels.append(label) 767 768 for (featureset, label) in train_toks: 769 for label in labels: 770 for feature in featureset: 771 if (feature, label) not in mapping: 772 mapping[(feature, label)] = len(mapping) 773 774 return cls(labels, mapping, **options)
775
776 777 -class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI):
778 """ 779 A feature encoding that generates vectors containing integer, 780 float and binary joint-features of the form:: 781 782 Binary (for string and boolean features): 783 joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) 784 { 785 { 0 otherwise 786 Value (for integer and float features): 787 joint_feat(fs, l) = { fval if (fs[fname] == type(fval)) 788 { and (l == label) 789 { 790 { not encoded otherwise 791 792 Where C{fname} is the name of an input-feature, C{fval} is a value 793 for that input-feature, and C{label} is a label. 794 795 Typically, these features are constructed based on a training 796 corpus, using the L{train()} method. 797 798 For string and boolean features [type(fval) not in (int, float)] 799 this method will create one feature for each combination of 800 C{fname}, C{fval}, and C{label} that occurs at least once in the 801 training corpus. 802 803 For integer and float features [type(fval) in (int, float)] this 804 method will create one feature for each combination of C{fname} 805 and C{label} that occurs at least once in the training corpus. 806 807 For binary features the C{unseen_features} parameter can be used 808 to add X{unseen-value features}, which are used whenever an input 809 feature has a value that was not encountered in the training 810 corpus. These features have the form:: 811 812 joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) 813 { and l == label 814 { 815 { 0 otherwise 816 817 Where C{is_unseen(fname, fval)} is true if the encoding does not 818 contain any joint features that are true when C{fs[fname]==fval}. 819 820 The C{alwayson_features} parameter can be used to add X{always-on 821 features}, which have the form:: 822 823 joint_feat(fs, l) = { 1 if (l == label) 824 { 825 { 0 otherwise 826 827 These always-on features allow the maxent model to directly model 828 the prior probabilities of each label. 829 """
830 - def __init__(self, labels, mapping, unseen_features=False, 831 alwayson_features=False):
832 """ 833 @param labels: A list of the \"known labels\" for this encoding. 834 835 @param mapping: A dictionary mapping from C{(fname,fval,label)} 836 tuples to corresponding joint-feature indexes. These 837 indexes must be the set of integers from 0...len(mapping). 838 If C{mapping[fname,fval,label]=id}, then 839 C{self.encode({..., fname:fval, ...}, label)[id]} is 1; 840 otherwise, it is 0. 841 842 @param unseen_features: If true, then include unseen value 843 features in the generated joint-feature vectors. 844 845 @param alwayson_features: If true, then include always-on 846 features in the generated joint-feature vectors. 847 """ 848 if set(mapping.values()) != set(range(len(mapping))): 849 raise ValueError('Mapping values must be exactly the ' 850 'set of integers from 0...len(mapping)') 851 852 self._labels = list(labels) 853 """A list of attested labels.""" 854 855 self._mapping = mapping 856 """dict mapping from (fname,fval,label) -> fid""" 857 858 self._length = len(mapping) 859 """The length of generated joint feature vectors.""" 860 861 self._alwayson = None 862 """dict mapping from label -> fid""" 863 864 self._unseen = None 865 """dict mapping from fname -> fid""" 866 867 if alwayson_features: 868 self._alwayson = dict([(label,i+self._length) 869 for (i,label) in enumerate(labels)]) 870 self._length += len(self._alwayson) 871 872 if unseen_features: 873 fnames = set(fname for (fname, fval, label) in mapping) 874 self._unseen = dict([(fname, i+self._length) 875 for (i, fname) in enumerate(fnames)]) 876 self._length += len(fnames)
877
878 - def encode(self, featureset, label):
879 # Inherit docs. 880 encoding = [] 881 882 # Convert input-features to joint-features: 883 for fname, fval in featureset.items(): 884 if(type(fval) in (int, float)): 885 # Known feature name & value: 886 if (fname, type(fval), label) in self._mapping: 887 encoding.append((self._mapping[fname, type(fval), label], fval)) 888 else: 889 # Known feature name & value: 890 if (fname, fval, label) in self._mapping: 891 encoding.append((self._mapping[fname, fval, label], 1)) 892 893 # Otherwise, we might want to fire an "unseen-value feature". 894 elif self._unseen: 895 # Have we seen this fname/fval combination with any label? 896 for label2 in self._labels: 897 if (fname, fval, label2) in self._mapping: 898 break # we've seen this fname/fval combo 899 # We haven't -- fire the unseen-value feature 900 else: 901 if fname in self._unseen: 902 encoding.append((self._unseen[fname], 1)) 903 904 905 # Add always-on features: 906 if self._alwayson and label in self._alwayson: 907 encoding.append((self._alwayson[label], 1)) 908 909 return encoding
910
911 - def describe(self, f_id):
912 # Inherit docs. 913 if not isinstance(f_id, (int, long)): 914 raise TypeError('describe() expected an int') 915 try: 916 self._inv_mapping 917 except AttributeError: 918 self._inv_mapping = [-1]*len(self._mapping) 919 for (info, i) in self._mapping.items(): 920 self._inv_mapping[i] = info 921 922 if f_id < len(self._mapping): 923 (fname, fval, label) = self._inv_mapping[f_id] 924 return '%s==%r and label is %r' % (fname, fval, label) 925 elif self._alwayson and f_id in self._alwayson.values(): 926 for (label, f_id2) in self._alwayson.items(): 927 if f_id==f_id2: return 'label is %r' % label 928 elif self._unseen and f_id in self._unseen.values(): 929 for (fname, f_id2) in self._unseen.items(): 930 if f_id==f_id2: return '%s is unseen' % fname 931 else: 932 raise ValueError('Bad feature id')
933
934 - def labels(self):
935 # Inherit docs. 936 return self._labels
937
938 - def length(self):
939 # Inherit docs. 940 return self._length
941 942 @classmethod
943 - def train(cls, train_toks, count_cutoff=0, labels=None, **options):
944 """ 945 Construct and return new feature encoding, based on a given 946 training corpus C{train_toks}. See the L{class description 947 <TypedMaxentFeatureEncoding>} for a description of the 948 joint-features that will be included in this encoding. 949 950 Note: recognized feature values types are (int, float), over 951 types are interpreted as regular binary features. 952 953 @type train_toks: C{list} of C{tuples} of (C{dict}, C{str}) 954 @param train_toks: Training data, represented as a list of 955 pairs, the first member of which is a feature dictionary, 956 and the second of which is a classification label. 957 958 @type count_cutoff: C{int} 959 @param count_cutoff: A cutoff value that is used to discard 960 rare joint-features. If a joint-feature's value is 1 961 fewer than C{count_cutoff} times in the training corpus, 962 then that joint-feature is not included in the generated 963 encoding. 964 965 @type labels: C{list} 966 @param labels: A list of labels that should be used by the 967 classifier. If not specified, then the set of labels 968 attested in C{train_toks} will be used. 969 970 @param options: Extra parameters for the constructor, such as 971 C{unseen_features} and C{alwayson_features}. 972 """ 973 mapping = {} # maps (fname, fval, label) -> fid 974 seen_labels = set() # The set of labels we've encountered 975 count = defaultdict(int) # maps (fname, fval) -> count 976 977 for (tok, label) in train_toks: 978 if labels and label not in labels: 979 raise ValueError('Unexpected label %s' % label) 980 seen_labels.add(label) 981 982 # Record each of the features. 983 for (fname, fval) in tok.items(): 984 if(type(fval) in (int, float)): fval = type(fval) 985 # If a count cutoff is given, then only add a joint 986 # feature once the corresponding (fname, fval, label) 987 # tuple exceeds that cutoff. 988 count[fname,fval] += 1 989 if count[fname,fval] >= count_cutoff: 990 if (fname, fval, label) not in mapping: 991 mapping[fname, fval, label] = len(mapping) 992 993 if labels is None: labels = seen_labels 994 return cls(labels, mapping, **options)
995
996 997 998 999 ###################################################################### 1000 #{ Classifier Trainer: Generalized Iterative Scaling 1001 ###################################################################### 1002 1003 -def train_maxent_classifier_with_gis(train_toks, trace=3, encoding=None, 1004 labels=None, **cutoffs):
1005 """ 1006 Train a new C{ConditionalExponentialClassifier}, using the given 1007 training samples, using the Generalized Iterative Scaling 1008 algorithm. This C{ConditionalExponentialClassifier} will encode 1009 the model that maximizes entropy from all the models that are 1010 empirically consistent with C{train_toks}. 1011 1012 @see: L{train_maxent_classifier()} for parameter descriptions. 1013 """ 1014 cutoffs.setdefault('max_iter', 100) 1015 cutoffchecker = CutoffChecker(cutoffs) 1016 1017 # Construct an encoding from the training data. 1018 if encoding is None: 1019 encoding = GISEncoding.train(train_toks, labels=labels) 1020 1021 if not hasattr(encoding, 'C'): 1022 raise TypeError('The GIS algorithm requires an encoding that ' 1023 'defines C (e.g., GISEncoding).') 1024 1025 # Cinv is the inverse of the sum of each joint feature vector. 1026 # This controls the learning rate: higher Cinv (or lower C) gives 1027 # faster learning. 1028 Cinv = 1.0/encoding.C 1029 1030 # Count how many times each feature occurs in the training data. 1031 empirical_fcount = calculate_empirical_fcount(train_toks, encoding) 1032 1033 # Check for any features that are not attested in train_toks. 1034 unattested = set(numpy.nonzero(empirical_fcount==0)[0]) 1035 1036 # Build the classifier. Start with weight=0 for each attested 1037 # feature, and weight=-infinity for each unattested feature. 1038 weights = numpy.zeros(len(empirical_fcount), 'd') 1039 for fid in unattested: weights[fid] = numpy.NINF 1040 classifier = ConditionalExponentialClassifier(encoding, weights) 1041 1042 # Take the log of the empirical fcount. 1043 log_empirical_fcount = numpy.log2(empirical_fcount) 1044 del empirical_fcount 1045 1046 # Old log-likelihood and accuracy; used to check if the change 1047 # in log-likelihood or accuracy is sufficient to indicate convergence. 1048 ll_old = None 1049 acc_old = None 1050 1051 if trace > 0: print ' ==> Training (%d iterations)' % cutoffs['max_iter'] 1052 if trace > 2: 1053 print 1054 print ' Iteration Log Likelihood Accuracy' 1055 print ' ---------------------------------------' 1056 1057 # Train the classifier. 1058 try: 1059 while True: 1060 if trace > 2: 1061 ll = cutoffchecker.ll or nltk.classify.util.log_likelihood( 1062 classifier, train_toks) 1063 acc = cutoffchecker.acc or nltk.classify.util.accuracy( 1064 classifier, train_toks) 1065 iternum = cutoffchecker.iter 1066 print ' %9d %14.5f %9.3f' % (iternum, ll, acc) 1067 1068 # Use the model to estimate the number of times each 1069 # feature should occur in the training data. 1070 estimated_fcount = calculate_estimated_fcount( 1071 classifier, train_toks, encoding) 1072 1073 # Take the log of estimated fcount (avoid taking log(0).) 1074 for fid in unattested: estimated_fcount[fid] += 1 1075 log_estimated_fcount = numpy.log2(estimated_fcount) 1076 del estimated_fcount 1077 1078 # Update the classifier weights 1079 weights = classifier.weights() 1080 weights += (log_empirical_fcount - log_estimated_fcount) * Cinv 1081 classifier.set_weights(weights) 1082 1083 # Check the log-likelihood & accuracy cutoffs. 1084 if cutoffchecker.check(classifier, train_toks): 1085 break 1086 1087 except KeyboardInterrupt: 1088 print ' Training stopped: keyboard interrupt' 1089 except: 1090 raise 1091 1092 if trace > 2: 1093 ll = nltk.classify.util.log_likelihood(classifier, train_toks) 1094 acc = nltk.classify.util.accuracy(classifier, train_toks) 1095 print ' Final %14.5f %9.3f' % (ll, acc) 1096 1097 # Return the classifier. 1098 return classifier
1099
1100 -def calculate_empirical_fcount(train_toks, encoding):
1101 fcount = numpy.zeros(encoding.length(), 'd') 1102 1103 for tok, label in train_toks: 1104 for (index, val) in encoding.encode(tok, label): 1105 fcount[index] += val 1106 1107 return fcount
1108
1109 -def calculate_estimated_fcount(classifier, train_toks, encoding):
1110 fcount = numpy.zeros(encoding.length(), 'd') 1111 1112 for tok, label in train_toks: 1113 pdist = classifier.prob_classify(tok) 1114 for label in pdist.samples(): 1115 prob = pdist.prob(label) 1116 for (fid, fval) in encoding.encode(tok, label): 1117 fcount[fid] += prob*fval 1118 1119 return fcount
1120
1121 1122 ###################################################################### 1123 #{ Classifier Trainer: Improved Iterative Scaling 1124 ###################################################################### 1125 1126 -def train_maxent_classifier_with_iis(train_toks, trace=3, encoding=None, 1127 labels=None, **cutoffs):
1128 """ 1129 Train a new C{ConditionalExponentialClassifier}, using the given 1130 training samples, using the Improved Iterative Scaling algorithm. 1131 This C{ConditionalExponentialClassifier} will encode the model 1132 that maximizes entropy from all the models that are empirically 1133 consistent with C{train_toks}. 1134 1135 @see: L{train_maxent_classifier()} for parameter descriptions. 1136 """ 1137 cutoffs.setdefault('max_iter', 100) 1138 cutoffchecker = CutoffChecker(cutoffs) 1139 1140 # Construct an encoding from the training data. 1141 if encoding is None: 1142 encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels) 1143 1144 # Count how many times each feature occurs in the training data. 1145 empirical_ffreq = (calculate_empirical_fcount(train_toks, encoding) / 1146 len(train_toks)) 1147 1148 # Find the nf map, and related variables nfarray and nfident. 1149 # nf is the sum of the features for a given labeled text. 1150 # nfmap compresses this sparse set of values to a dense list. 1151 # nfarray performs the reverse operation. nfident is 1152 # nfarray multiplied by an identity matrix. 1153 nfmap = calculate_nfmap(train_toks, encoding) 1154 nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd') 1155 nftranspose = numpy.reshape(nfarray, (len(nfarray), 1)) 1156 1157 # Check for any features that are not attested in train_toks. 1158 unattested = set(numpy.nonzero(empirical_ffreq==0)[0]) 1159 1160 # Build the classifier. Start with weight=0 for each attested 1161 # feature, and weight=-infinity for each unattested feature. 1162 weights = numpy.zeros(len(empirical_ffreq), 'd') 1163 for fid in unattested: weights[fid] = numpy.NINF 1164 classifier = ConditionalExponentialClassifier(encoding, weights) 1165 1166 if trace > 0: print ' ==> Training (%d iterations)' % cutoffs['max_iter'] 1167 if trace > 2: 1168 print 1169 print ' Iteration Log Likelihood Accuracy' 1170 print ' ---------------------------------------' 1171 1172 # Old log-likelihood and accuracy; used to check if the change 1173 # in log-likelihood or accuracy is sufficient to indicate convergence. 1174 ll_old = None 1175 acc_old = None 1176 1177 # Train the classifier. 1178 try: 1179 while True: 1180 if trace > 2: 1181 ll = cutoffchecker.ll or nltk.classify.util.log_likelihood( 1182 classifier, train_toks) 1183 acc = cutoffchecker.acc or nltk.classify.util.accuracy( 1184 classifier, train_toks) 1185 iternum = cutoffchecker.iter 1186 print ' %9d %14.5f %9.3f' % (iternum, ll, acc) 1187 1188 # Calculate the deltas for this iteration, using Newton's method. 1189 deltas = calculate_deltas( 1190 train_toks, classifier, unattested, empirical_ffreq, 1191 nfmap, nfarray, nftranspose, encoding) 1192 1193 # Use the deltas to update our weights. 1194 weights = classifier.weights() 1195 weights += deltas 1196 classifier.set_weights(weights) 1197 1198 # Check the log-likelihood & accuracy cutoffs. 1199 if cutoffchecker.check(classifier, train_toks): 1200 break 1201 1202 except KeyboardInterrupt: 1203 print ' Training stopped: keyboard interrupt' 1204 except: 1205 raise 1206 1207 1208 if trace > 2: 1209 ll = nltk.classify.util.log_likelihood(classifier, train_toks) 1210 acc = nltk.classify.util.accuracy(classifier, train_toks) 1211 print ' Final %14.5f %9.3f' % (ll, acc) 1212 1213 # Return the classifier. 1214 return classifier
1215
1216 -def calculate_nfmap(train_toks, encoding):
1217 """ 1218 Construct a map that can be used to compress C{nf} (which is 1219 typically sparse). 1220 1221 M{nf(feature_vector)} is the sum of the feature values for 1222 M{feature_vector}. 1223 1224 This represents the number of features that are active for a 1225 given labeled text. This method finds all values of M{nf(t)} 1226 that are attested for at least one token in the given list of 1227 training tokens; and constructs a dictionary mapping these 1228 attested values to a continuous range M{0...N}. For example, 1229 if the only values of M{nf()} that were attested were 3, 5, 1230 and 7, then C{_nfmap} might return the dictionary {3:0, 5:1, 1231 7:2}. 1232 1233 @return: A map that can be used to compress C{nf} to a dense 1234 vector. 1235 @rtype: C{dictionary} from C{int} to C{int} 1236 """ 1237 # Map from nf to indices. This allows us to use smaller arrays. 1238 nfset = set() 1239 for tok, _ in train_toks: 1240 for label in encoding.labels(): 1241 nfset.add(sum([val for (id,val) in encoding.encode(tok,label)])) 1242 return dict([(nf, i) for (i, nf) in enumerate(nfset)])
1243
1244 -def calculate_deltas(train_toks, classifier, unattested, ffreq_empirical, 1245 nfmap, nfarray, nftranspose, encoding):
1246 """ 1247 Calculate the update values for the classifier weights for 1248 this iteration of IIS. These update weights are the value of 1249 C{delta} that solves the equation:: 1250 1251 ffreq_empirical[i] 1252 = 1253 SUM[fs,l] (classifier.prob_classify(fs).prob(l) * 1254 feature_vector(fs,l)[i] * 1255 exp(delta[i] * nf(feature_vector(fs,l)))) 1256 1257 Where: 1258 - M{(fs,l)} is a (featureset, label) tuple from C{train_toks} 1259 - M{feature_vector(fs,l)} = C{encoding.encode(fs,l)} 1260 - M{nf(vector)} = C{sum([val for (id,val) in vector])} 1261 1262 This method uses Newton's method to solve this equation for 1263 M{delta[i]}. In particular, it starts with a guess of 1264 C{delta[i]}=1; and iteratively updates C{delta} with:: 1265 1266 delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i]) 1267 1268 until convergence, where M{sum1} and M{sum2} are defined as:: 1269 1270 sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta) 1271 1272 sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta) * 1273 nf(feature_vector(fs,l))) 1274 1275 f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) * 1276 feature_vector(fs,l)[i] * 1277 exp(delta[i] * nf(feature_vector(fs,l)))) 1278 1279 Note that M{sum1} and M{sum2} depend on C{delta}; so they need 1280 to be re-computed each iteration. 1281 1282 The variables C{nfmap}, C{nfarray}, and C{nftranspose} are 1283 used to generate a dense encoding for M{nf(ltext)}. This 1284 allows C{_deltas} to calculate M{sum1} and M{sum2} using 1285 matrices, which yields a signifigant performance improvement. 1286 1287 @param train_toks: The set of training tokens. 1288 @type train_toks: C{list} of C{tuples} of (C{dict}, C{str}) 1289 @param classifier: The current classifier. 1290 @type classifier: C{ClassifierI} 1291 @param ffreq_empirical: An array containing the empirical 1292 frequency for each feature. The M{i}th element of this 1293 array is the empirical frequency for feature M{i}. 1294 @type ffreq_empirical: C{sequence} of C{float} 1295 @param unattested: An array that is 1 for features that are 1296 not attested in the training data; and 0 for features that 1297 are attested. In other words, C{unattested[i]==0} iff 1298 C{ffreq_empirical[i]==0}. 1299 @type unattested: C{sequence} of C{int} 1300 @param nfmap: A map that can be used to compress C{nf} to a dense 1301 vector. 1302 @type nfmap: C{dictionary} from C{int} to C{int} 1303 @param nfarray: An array that can be used to uncompress C{nf} 1304 from a dense vector. 1305 @type nfarray: C{array} of C{float} 1306 @param nftranspose: C{array} of C{float} 1307 @type nftranspose: The transpose of C{nfarray} 1308 """ 1309 # These parameters control when we decide that we've 1310 # converged. It probably should be possible to set these 1311 # manually, via keyword arguments to train. 1312 NEWTON_CONVERGE = 1e-12 1313 MAX_NEWTON = 300 1314 1315 deltas = numpy.ones(encoding.length(), 'd') 1316 1317 # Precompute the A matrix: 1318 # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) ) 1319 # over all label,fs s.t. num_features[label,fs]=nf 1320 A = numpy.zeros((len(nfmap), encoding.length()), 'd') 1321 1322 for tok, label in train_toks: 1323 dist = classifier.prob_classify(tok) 1324 1325 for label in encoding.labels(): 1326 # Generate the feature vector 1327 feature_vector = encoding.encode(tok,label) 1328 # Find the number of active features 1329 nf = sum([val for (id, val) in feature_vector]) 1330 # Update the A matrix 1331 for (id, val) in feature_vector: 1332 A[nfmap[nf], id] += dist.prob(label) * val 1333 A /= len(train_toks) 1334 1335 # Iteratively solve for delta. Use the following variables: 1336 # - nf_delta[x][y] = nfarray[x] * delta[y] 1337 # - exp_nf_delta[x][y] = exp(nf[x] * delta[y]) 1338 # - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y]) 1339 # - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) 1340 # exp(delta[i]nf) 1341 # - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) 1342 # nf exp(delta[i]nf) 1343 for rangenum in range(MAX_NEWTON): 1344 nf_delta = numpy.outer(nfarray, deltas) 1345 exp_nf_delta = 2 ** nf_delta 1346 nf_exp_nf_delta = nftranspose * exp_nf_delta 1347 sum1 = numpy.sum(exp_nf_delta * A, axis=0) 1348 sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0) 1349 1350 # Avoid division by zero. 1351 for fid in unattested: sum2[fid] += 1 1352 1353 # Update the deltas. 1354 deltas -= (ffreq_empirical - sum1) / -sum2 1355 1356 # We can stop once we converge. 1357 n_error = (numpy.sum(abs((ffreq_empirical-sum1)))/ 1358 numpy.sum(abs(deltas))) 1359 if n_error < NEWTON_CONVERGE: 1360 return deltas 1361 1362 return deltas
1363
1364 ###################################################################### 1365 #{ Classifier Trainer: scipy algorithms (GC, LBFGSB, etc.) 1366 ###################################################################### 1367 1368 # [xx] n.b.: it's possible to supply custom trace functions, which 1369 # could be used to make trace output consistent with iis/gis. 1370 -def train_maxent_classifier_with_scipy(train_toks, trace=3, encoding=None, 1371 labels=None, algorithm='CG', 1372 sparse=True, gaussian_prior_sigma=0, 1373 **cutoffs):
1374 """ 1375 Train a new C{ConditionalExponentialClassifier}, using the given 1376 training samples, using the specified C{scipy} optimization 1377 algorithm. This C{ConditionalExponentialClassifier} will encode 1378 the model that maximizes entropy from all the models that are 1379 empirically consistent with C{train_toks}. 1380 1381 @see: L{train_maxent_classifier()} for parameter descriptions. 1382 @require: The C{scipy} package must be installed. 1383 """ 1384 try: 1385 import scipy 1386 except ImportError, e: 1387 raise ValueError('The maxent training algorithm %r requires ' 1388 'that the scipy package be installed. See ' 1389 'http://www.scipy.org/' % algorithm) 1390 try: 1391 # E.g., if libgfortran.2.dylib is not found. 1392 import scipy.sparse, scipy.maxentropy 1393 except ImportError, e: 1394 raise ValueError('Import of scipy package failed: %s' % e) 1395 1396 # Construct an encoding from the training data. 1397 if encoding is None: 1398 encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels) 1399 elif labels is not None: 1400 raise ValueError('Specify encoding or labels, not both') 1401 1402 labels = encoding.labels() 1403 labelnum = dict([(label, i) for (i, label) in enumerate(labels)]) 1404 num_features = encoding.length() 1405 num_toks = len(train_toks) 1406 num_labels = len(labels) 1407 1408 # Decide whether to use a sparse matrix or a dense one. Very 1409 # limited testing has shown that the lil matrix format 1410 # (list-of-lists) performs better than csr and csc formats. 1411 # Limited testing also suggests that the sparse matrix format 1412 # doesn't save much memory over the dense format in practice 1413 # (in terms of max memory usage). 1414 if sparse: zeros = scipy.sparse.lil_matrix 1415 else: zeros = numpy.zeros 1416 1417 # Construct the 'F' matrix, which lists the feature values for 1418 # each training instance. F[i, j*len(labels)+k] is equal to the 1419 # value of the i'th feature for the feature vector corresponding 1420 # to (tok[j], label[k]). 1421 F = zeros((num_features, num_toks*num_labels)) 1422 1423 # Construct the 'N' matrix, which specifies the correct label for 1424 # each training instance. N[0, j*len(labels)+k] is equal to one 1425 # iff label[k] is the correct label for tok[j]. 1426 N = zeros((1, num_toks*num_labels)) 1427 1428 # Fill in the 'F' and 'N' matrices (just make one pass through the 1429 # training tokens.) 1430 for toknum, (featureset, label) in enumerate(train_toks): 1431 N[0, toknum*len(labels) + labelnum[label]] += 1 1432 for label2 in labels: 1433 for (fid, fval) in encoding.encode(featureset, label2): 1434 F[fid, toknum*len(labels) + labelnum[label2]] = fval 1435 1436 # Set up the scipy model, based on the matrices F and N. 1437 model = scipy.maxentropy.conditionalmodel(F, N, num_toks) 1438 # note -- model.setsmooth() is buggy. 1439 if gaussian_prior_sigma: 1440 model.sigma2 = gaussian_prior_sigma**2 1441 if algorithm == 'LBFGSB': 1442 model.log = None 1443 if trace >= 3: 1444 model.verbose = True 1445 if 'max_iter' in cutoffs: 1446 model.maxiter = cutoffs['max_iter'] 1447 if 'tolerance' in cutoffs: 1448 if algorithm == 'CG': model.avegtol = cutoffs['tolerance'] 1449 elif algorithm == 'LBFGSB': model.maxgtol = cutoffs['tolerance'] 1450 else: model.tol = cutoffs['tolerance'] 1451 1452 # Train the model. 1453 model.fit(algorithm=algorithm) 1454 1455 # Convert the model's weights from base-e to base-2 weights. 1456 weights = model.params * numpy.log2(numpy.e) 1457 1458 # Build the classifier 1459 return MaxentClassifier(encoding, weights)
1460
1461 ###################################################################### 1462 #{ Classifier Trainer: megam 1463 ###################################################################### 1464 1465 # [xx] possible extension: add support for using implicit file format; 1466 # this would need to put requirements on what encoding is used. But 1467 # we may need this for other maxent classifier trainers that require 1468 # implicit formats anyway. 1469 -def train_maxent_classifier_with_megam(train_toks, trace=3, encoding=None, 1470 labels=None, gaussian_prior_sigma=0, 1471 **kwargs):
1472 """ 1473 Train a new C{ConditionalExponentialClassifier}, using the given 1474 training samples, using the external C{megam} library. This 1475 C{ConditionalExponentialClassifier} will encode the model that 1476 maximizes entropy from all the models that are empirically 1477 consistent with C{train_toks}. 1478 1479 @see: L{train_maxent_classifier()} for parameter descriptions. 1480 @see: L{nltk.classify.megam} 1481 """ 1482 1483 explicit = True 1484 bernoulli = True 1485 if 'explicit' in kwargs: explicit = kwargs['explicit'] 1486 if 'bernoulli' in kwargs: bernoulli = kwargs['bernoulli'] 1487 1488 # Construct an encoding from the training data. 1489 if encoding is None: 1490 # Count cutoff can also be controlled by megam with the -minfc 1491 # option. Not sure where the best place for it is. 1492 count_cutoff = kwargs.get('count_cutoff', 0) 1493 encoding = BinaryMaxentFeatureEncoding.train(train_toks, count_cutoff, 1494 labels=labels, 1495 alwayson_features=True) 1496 elif labels is not None: 1497 raise ValueError('Specify encoding or labels, not both') 1498 1499 # Write a training file for megam. 1500 try: 1501 fd, trainfile_name = tempfile.mkstemp(prefix='nltk-', suffix='.gz') 1502 trainfile = gzip.open(trainfile_name, 'wb') 1503 write_megam_file(train_toks, encoding, trainfile, \ 1504 explicit=explicit, bernoulli=bernoulli) 1505 trainfile.close() 1506 except (OSError, IOError, ValueError), e: 1507 raise ValueError('Error while creating megam training file: %s' % e) 1508 1509 # Run megam on the training file. 1510 options = [] 1511 options += ['-nobias', '-repeat', '10'] 1512 if explicit: 1513 options += ['-explicit'] 1514 if not bernoulli: 1515 options += ['-fvals'] 1516 if gaussian_prior_sigma: 1517 # Lambda is just the precision of the Gaussian prior, i.e. it's the 1518 # inverse variance, so the parameter conversion is 1.0/sigma**2. 1519 # See http://www.cs.utah.edu/~hal/docs/daume04cg-bfgs.pdf. 1520 inv_variance = 1.0 / gaussian_prior_sigma**2 1521 else: 1522 inv_variance = 0 1523 options += ['-lambda', '%.2f' % inv_variance, '-tune'] 1524 if trace < 3: 1525 options += ['-quiet'] 1526 if 'max_iter' in kwargs: 1527 options += ['-maxi', '%s' % kwargs['max_iter']] 1528 if 'll_delta' in kwargs: 1529 # [xx] this is actually a perplexity delta, not a log 1530 # likelihood delta 1531 options += ['-dpp', '%s' % abs(kwargs['ll_delta'])] 1532 options += ['multiclass', trainfile_name] 1533 stdout = call_megam(options) 1534 # print './megam_i686.opt ', ' '.join(options) 1535 # Delete the training file 1536 try: os.remove(trainfile_name) 1537 except (OSError, IOError), e: 1538 print 'Warning: unable to delete %s: %s' % (trainfile_name, e) 1539 1540 # Parse the generated weight vector. 1541 weights = parse_megam_weights(stdout, encoding.length(), explicit) 1542 1543 # Convert from base-e to base-2 weights. 1544 weights *= numpy.log2(numpy.e) 1545 1546 # Build the classifier 1547 return MaxentClassifier(encoding, weights)
1548
1549 ###################################################################### 1550 #{ Classifier Trainer: tadm 1551 ###################################################################### 1552 1553 -class TadmMaxentClassifier(MaxentClassifier):
1554 @classmethod
1555 - def train(cls, train_toks, **kwargs):
1556 algorithm = kwargs.get('algorithm', 'tao_lmvm') 1557 trace = kwargs.get('trace', 3) 1558 encoding = kwargs.get('encoding', None) 1559 labels = kwargs.get('labels', None) 1560 sigma = kwargs.get('gaussian_prior_sigma', 0) 1561 count_cutoff = kwargs.get('count_cutoff', 0) 1562 max_iter = kwargs.get('max_iter') 1563 ll_delta = kwargs.get('min_lldelta') 1564 1565 # Construct an encoding from the training data. 1566 if not encoding: 1567 encoding = TadmEventMaxentFeatureEncoding.train(train_toks, 1568 count_cutoff, 1569 labels=labels) 1570 1571 trainfile_fd, trainfile_name = \ 1572 tempfile.mkstemp(prefix='nltk-tadm-events-', suffix='.gz') 1573 weightfile_fd, weightfile_name = \ 1574 tempfile.mkstemp(prefix='nltk-tadm-weights-') 1575 1576 trainfile = gzip.open(trainfile_name, 'wb') 1577 write_tadm_file(train_toks, encoding, trainfile) 1578 trainfile.close() 1579 1580 options = [] 1581 options.extend(['-monitor']) 1582 options.extend(['-method', algorithm]) 1583 if sigma: 1584 options.extend(['-l2', '%.6f' % sigma**2]) 1585 if max_iter: 1586 options.extend(['-max_it', '%d' % max_iter]) 1587 if ll_delta: 1588 options.extend(['-fatol', '%.6f' % abs(ll_delta)]) 1589 options.extend(['-events_in', trainfile_name]) 1590 options.extend(['-params_out', weightfile_name]) 1591 if trace < 3: 1592 options.extend(['2>&1']) 1593 else: 1594 options.extend(['-summary']) 1595 1596 call_tadm(options) 1597 1598 weightfile = open(weightfile_name, 'rb') 1599 weights = parse_tadm_weights(weightfile) 1600 weightfile.close() 1601 1602 os.remove(trainfile_name) 1603 os.remove(weightfile_name) 1604 1605 # Convert from base-e to base-2 weights. 1606 weights *= numpy.log2(numpy.e) 1607 1608 # Build the classifier 1609 return cls(encoding, weights)
1610
1611 ###################################################################### 1612 #{ Demo 1613 ###################################################################### 1614 -def demo():
1615 from nltk.classify.util import names_demo 1616 classifier = names_demo(MaxentClassifier.train)
1617 1618 if __name__ == '__main__': 1619 demo() 1620