Package nltk :: Module probability
[hide private]
[frames] | no frames]

Source Code for Module nltk.probability

   1  # -*- coding: utf-8 -*- 
   2  # Natural Language Toolkit: Probability and Statistics 
   3  # 
   4  # Copyright (C) 2001-2011 NLTK Project 
   5  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
   6  #         Steven Bird <sb@csse.unimelb.edu.au> (additions) 
   7  #         Trevor Cohn <tacohn@cs.mu.oz.au> (additions) 
   8  #         Peter Ljunglöf <peter.ljunglof@heatherleaf.se> (additions) 
   9  #         Liang Dong <ldong@clemson.edu> (additions) 
  10  #         Geoffrey Sampson <sampson@cantab.net> (additions) 
  11  # 
  12  # URL: <http://www.nltk.org/> 
  13  # For license information, see LICENSE.TXT 
  14  # 
  15  # $Id: probability.py 8776 2011-04-10 06:43:28Z StevenBird1 $ 
  16   
  17  _NINF = float('-1e300') 
  18   
  19  """ 
  20  Classes for representing and processing probabilistic information. 
  21   
  22  The L{FreqDist} class is used to encode X{frequency distributions}, 
  23  which count the number of times that each outcome of an experiment 
  24  occurs. 
  25   
  26  The L{ProbDistI} class defines a standard interface for X{probability 
  27  distributions}, which encode the probability of each outcome for an 
  28  experiment.  There are two types of probability distribution: 
  29   
  30    - X{derived probability distributions} are created from frequency 
  31      distributions.  They attempt to model the probability distribution 
  32      that generated the frequency distribution. 
  33    - X{analytic probability distributions} are created directly from 
  34      parameters (such as variance). 
  35   
  36  The L{ConditionalFreqDist} class and L{ConditionalProbDistI} interface 
  37  are used to encode conditional distributions.  Conditional probability 
  38  distributions can be derived or analytic; but currently the only 
  39  implementation of the C{ConditionalProbDistI} interface is 
  40  L{ConditionalProbDist}, a derived distribution. 
  41   
  42  """ 
  43   
  44  import math 
  45  import random 
  46  import warnings 
  47  from operator import itemgetter 
  48  from itertools import imap, islice 
  49   
  50  from nltk.compat import all 
  51   
  52  ##////////////////////////////////////////////////////// 
  53  ##  Frequency Distributions 
  54  ##////////////////////////////////////////////////////// 
  55   
  56  # [SB] inherit from defaultdict? 
  57  # [SB] for NLTK 3.0, inherit from collections.Counter? 
  58   
59 -class FreqDist(dict):
60 """ 61 A frequency distribution for the outcomes of an experiment. A 62 frequency distribution records the number of times each outcome of 63 an experiment has occurred. For example, a frequency distribution 64 could be used to record the frequency of each word type in a 65 document. Formally, a frequency distribution can be defined as a 66 function mapping from each sample to the number of times that 67 sample occurred as an outcome. 68 69 Frequency distributions are generally constructed by running a 70 number of experiments, and incrementing the count for a sample 71 every time it is an outcome of an experiment. For example, the 72 following code will produce a frequency distribution that encodes 73 how often each word occurs in a text: 74 75 >>> fdist = FreqDist() 76 >>> for word in tokenize.whitespace(sent): 77 ... fdist.inc(word.lower()) 78 79 An equivalent way to do this is with the initializer: 80 81 >>> fdist = FreqDist(word.lower() for word in tokenize.whitespace(sent)) 82 83 """
84 - def __init__(self, samples=None):
85 """ 86 Construct a new frequency distribution. If C{samples} is 87 given, then the frequency distribution will be initialized 88 with the count of each object in C{samples}; otherwise, it 89 will be initialized to be empty. 90 91 In particular, C{FreqDist()} returns an empty frequency 92 distribution; and C{FreqDist(samples)} first creates an empty 93 frequency distribution, and then calls C{update} with the 94 list C{samples}. 95 96 @param samples: The samples to initialize the frequency 97 distribution with. 98 @type samples: Sequence 99 """ 100 dict.__init__(self) 101 self._N = 0 102 self._reset_caches() 103 if samples: 104 self.update(samples)
105
106 - def inc(self, sample, count=1):
107 """ 108 Increment this C{FreqDist}'s count for the given 109 sample. 110 111 @param sample: The sample whose count should be incremented. 112 @type sample: any 113 @param count: The amount to increment the sample's count by. 114 @type count: C{int} 115 @rtype: None 116 @raise NotImplementedError: If C{sample} is not a 117 supported sample type. 118 """ 119 if count == 0: return 120 self[sample] = self.get(sample,0) + count
121
122 - def __setitem__(self, sample, value):
123 """ 124 Set this C{FreqDist}'s count for the given sample. 125 126 @param sample: The sample whose count should be incremented. 127 @type sample: any hashable object 128 @param count: The new value for the sample's count 129 @type count: C{int} 130 @rtype: None 131 @raise TypeError: If C{sample} is not a supported sample type. 132 """ 133 134 self._N += (value - self.get(sample, 0)) 135 dict.__setitem__(self, sample, value) 136 137 # Invalidate the caches 138 self._reset_caches()
139
140 - def N(self):
141 """ 142 @return: The total number of sample outcomes that have been 143 recorded by this C{FreqDist}. For the number of unique 144 sample values (or bins) with counts greater than zero, use 145 C{FreqDist.B()}. 146 @rtype: C{int} 147 """ 148 return self._N
149
150 - def B(self):
151 """ 152 @return: The total number of sample values (or X{bins}) that 153 have counts greater than zero. For the total 154 number of sample outcomes recorded, use C{FreqDist.N()}. 155 (FreqDist.B() is the same as len(FreqDist).) 156 @rtype: C{int} 157 """ 158 return len(self)
159 160 # deprecate this -- use keys() instead?
161 - def samples(self):
162 """ 163 @return: A list of all samples that have been recorded as 164 outcomes by this frequency distribution. Use C{count()} 165 to determine the count for each sample. 166 @rtype: C{list} 167 """ 168 return self.keys()
169
170 - def hapaxes(self):
171 """ 172 @return: A list of all samples that occur once (hapax legomena) 173 @rtype: C{list} 174 """ 175 return [item for item in self if self[item] == 1]
176
177 - def Nr(self, r, bins=None):
178 """ 179 @return: The number of samples with count r. 180 @rtype: C{int} 181 @type r: C{int} 182 @param r: A sample count. 183 @type bins: C{int} 184 @param bins: The number of possible sample outcomes. C{bins} 185 is used to calculate Nr(0). In particular, Nr(0) is 186 C{bins-self.B()}. If C{bins} is not specified, it 187 defaults to C{self.B()} (so Nr(0) will be 0). 188 """ 189 if r < 0: raise IndexError, 'FreqDist.Nr(): r must be non-negative' 190 191 # Special case for Nr(0): 192 if r == 0: 193 if bins is None: return 0 194 else: return bins-self.B() 195 196 # We have to search the entire distribution to find Nr. Since 197 # this is an expensive operation, and is likely to be used 198 # repeatedly, cache the results. 199 if self._Nr_cache is None: 200 self._cache_Nr_values() 201 202 if r >= len(self._Nr_cache): return 0 203 return self._Nr_cache[r]
204
205 - def _cache_Nr_values(self):
206 Nr = [0] 207 for sample in self: 208 c = self.get(sample, 0) 209 if c >= len(Nr): 210 Nr += [0]*(c+1-len(Nr)) 211 Nr[c] += 1 212 self._Nr_cache = Nr
213
214 - def count(self, sample):
215 """ 216 Return the count of a given sample. The count of a sample is 217 defined as the number of times that sample outcome was 218 recorded by this C{FreqDist}. Counts are non-negative 219 integers. This method has been replaced by conventional 220 dictionary indexing; use fd[item] instead of fd.count(item). 221 222 @return: The count of a given sample. 223 @rtype: C{int} 224 @param sample: the sample whose count 225 should be returned. 226 @type sample: any. 227 """ 228 raise AttributeError, "Use indexing to look up an entry in a FreqDist, e.g. fd[item]"
229
230 - def _cumulative_frequencies(self, samples=None):
231 """ 232 Return the cumulative frequencies of the specified samples. 233 If no samples are specified, all counts are returned, starting 234 with the largest. 235 236 @return: The cumulative frequencies of the given samples. 237 @rtype: C{list} of C{float} 238 @param samples: the samples whose frequencies should be returned. 239 @type sample: any. 240 """ 241 cf = 0.0 242 if not samples: 243 samples = self.keys() 244 for sample in samples: 245 cf += self[sample] 246 yield cf
247 248 # slightly odd nomenclature freq() if FreqDist does counts and ProbDist does probs, 249 # here, freq() does probs
250 - def freq(self, sample):
251 """ 252 Return the frequency of a given sample. The frequency of a 253 sample is defined as the count of that sample divided by the 254 total number of sample outcomes that have been recorded by 255 this C{FreqDist}. The count of a sample is defined as the 256 number of times that sample outcome was recorded by this 257 C{FreqDist}. Frequencies are always real numbers in the range 258 [0, 1]. 259 260 @return: The frequency of a given sample. 261 @rtype: float 262 @param sample: the sample whose frequency 263 should be returned. 264 @type sample: any 265 """ 266 if self._N is 0: 267 return 0 268 return float(self[sample]) / self._N
269
270 - def max(self):
271 """ 272 Return the sample with the greatest number of outcomes in this 273 frequency distribution. If two or more samples have the same 274 number of outcomes, return one of them; which sample is 275 returned is undefined. If no outcomes have occurred in this 276 frequency distribution, return C{None}. 277 278 @return: The sample with the maximum number of outcomes in this 279 frequency distribution. 280 @rtype: any or C{None} 281 """ 282 if self._max_cache is None: 283 self._max_cache = max([(a,b) for (b,a) in self.items()])[1] 284 return self._max_cache
285
286 - def plot(self, *args, **kwargs):
287 """ 288 Plot samples from the frequency distribution 289 displaying the most frequent sample first. If an integer 290 parameter is supplied, stop after this many samples have been 291 plotted. If two integer parameters m, n are supplied, plot a 292 subset of the samples, beginning with m and stopping at n-1. 293 For a cumulative plot, specify cumulative=True. 294 (Requires Matplotlib to be installed.) 295 296 @param title: The title for the graph 297 @type title: C{str} 298 @param cumulative: A flag to specify whether the plot is cumulative (default = False) 299 @type title: C{bool} 300 """ 301 try: 302 import pylab 303 except ImportError: 304 raise ValueError('The plot function requires the matplotlib package (aka pylab).' 305 'See http://matplotlib.sourceforge.net/') 306 307 if len(args) == 0: 308 args = [len(self)] 309 samples = list(islice(self, *args)) 310 311 cumulative = _get_kwarg(kwargs, 'cumulative', False) 312 if cumulative: 313 freqs = list(self._cumulative_frequencies(samples)) 314 ylabel = "Cumulative Counts" 315 else: 316 freqs = [self[sample] for sample in samples] 317 ylabel = "Counts" 318 # percents = [f * 100 for f in freqs] only in ProbDist? 319 320 pylab.grid(True, color="silver") 321 if not "linewidth" in kwargs: 322 kwargs["linewidth"] = 2 323 if "title" in kwargs: 324 pylab.title(kwargs["title"]) 325 del kwargs["title"] 326 pylab.plot(freqs, **kwargs) 327 pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90) 328 pylab.xlabel("Samples") 329 pylab.ylabel(ylabel) 330 pylab.show()
331
332 - def tabulate(self, *args, **kwargs):
333 """ 334 Tabulate the given samples from the frequency distribution (cumulative), 335 displaying the most frequent sample first. If an integer 336 parameter is supplied, stop after this many samples have been 337 plotted. If two integer parameters m, n are supplied, plot a 338 subset of the samples, beginning with m and stopping at n-1. 339 (Requires Matplotlib to be installed.) 340 341 @param samples: The samples to plot (default is all samples) 342 @type samples: C{list} 343 """ 344 if len(args) == 0: 345 args = [len(self)] 346 samples = list(islice(self, *args)) 347 348 cumulative = _get_kwarg(kwargs, 'cumulative', False) 349 if cumulative: 350 freqs = list(self._cumulative_frequencies(samples)) 351 else: 352 freqs = [self[sample] for sample in samples] 353 # percents = [f * 100 for f in freqs] only in ProbDist? 354 355 for i in range(len(samples)): 356 print "%4s" % str(samples[i]), 357 print 358 for i in range(len(samples)): 359 print "%4d" % freqs[i], 360 print
361
362 - def sorted_samples(self):
363 raise AttributeError, "Use FreqDist.keys(), or iterate over the FreqDist to get its samples in sorted order (most frequent first)"
364
365 - def sorted(self):
366 raise AttributeError, "Use FreqDist.keys(), or iterate over the FreqDist to get its samples in sorted order (most frequent first)"
367
368 - def _sort_keys_by_value(self):
369 if not self._item_cache: 370 self._item_cache = sorted(dict.items(self), key=lambda x:(-x[1], x[0]))
371
372 - def keys(self):
373 """ 374 Return the samples sorted in decreasing order of frequency. 375 376 @return: A list of samples, in sorted order 377 @rtype: C{list} of any 378 """ 379 self._sort_keys_by_value() 380 return map(itemgetter(0), self._item_cache)
381
382 - def values(self):
383 """ 384 Return the samples sorted in decreasing order of frequency. 385 386 @return: A list of samples, in sorted order 387 @rtype: C{list} of any 388 """ 389 self._sort_keys_by_value() 390 return map(itemgetter(1), self._item_cache)
391
392 - def items(self):
393 """ 394 Return the items sorted in decreasing order of frequency. 395 396 @return: A list of items, in sorted order 397 @rtype: C{list} of C{tuple} 398 """ 399 self._sort_keys_by_value() 400 return self._item_cache[:]
401
402 - def __iter__(self):
403 """ 404 Return the samples sorted in decreasing order of frequency. 405 406 @return: An iterator over the samples, in sorted order 407 @rtype: C{iter} 408 """ 409 return iter(self.keys())
410
411 - def iterkeys(self):
412 """ 413 Return the samples sorted in decreasing order of frequency. 414 415 @return: An iterator over the samples, in sorted order 416 @rtype: C{iter} 417 """ 418 return iter(self.keys())
419
420 - def itervalues(self):
421 """ 422 Return the values sorted in decreasing order. 423 424 @return: An iterator over the values, in sorted order 425 @rtype: C{iter} 426 """ 427 return iter(self.values())
428
429 - def iteritems(self):
430 """ 431 Return the items sorted in decreasing order of frequency. 432 433 @return: An iterator over the items, in sorted order 434 @rtype: C{iter} of any 435 """ 436 self._sort_keys_by_value() 437 return iter(self._item_cache)
438 439 # sort the supplied samples 440 # if samples: 441 # items = [(sample, self[sample]) for sample in set(samples)] 442
443 - def copy(self):
444 """ 445 Create a copy of this frequency distribution. 446 447 @return: A copy of this frequency distribution object. 448 @rtype: C{FreqDist} 449 """ 450 return self.__class__(self)
451
452 - def update(self, samples):
453 """ 454 Update the frequency distribution with the provided list of samples. 455 This is a faster way to add multiple samples to the distribution. 456 457 @param samples: The samples to add. 458 @type samples: C{list} 459 """ 460 try: 461 sample_iter = samples.iteritems() 462 except: 463 sample_iter = imap(lambda x: (x,1), samples) 464 for sample, count in sample_iter: 465 self.inc(sample, count=count)
466
467 - def pop(self, other):
468 self._reset_caches() 469 return dict.pop(self, other)
470
471 - def popitem(self, other):
472 self._reset_caches() 473 return dict.popitem(self, other)
474
475 - def clear(self):
476 self._N = 0 477 self._reset_caches() 478 dict.clear(self)
479
480 - def _reset_caches(self):
481 self._Nr_cache = None 482 self._max_cache = None 483 self._item_cache = None
484
485 - def __add__(self, other):
486 clone = self.copy() 487 clone.update(other) 488 return clone
489 - def __eq__(self, other):
490 if not isinstance(other, FreqDist): return False 491 return self.items() == other.items() # items are already sorted
492 - def __ne__(self, other):
493 return not (self == other)
494 - def __le__(self, other):
495 if not isinstance(other, FreqDist): return False 496 return set(self).issubset(other) and all(self[key] <= other[key] for key in self)
497 - def __lt__(self, other):
498 if not isinstance(other, FreqDist): return False 499 return self <= other and self != other
500 - def __ge__(self, other):
501 if not isinstance(other, FreqDist): return False 502 return other <= self
503 - def __gt__(self, other):
504 if not isinstance(other, FreqDist): return False 505 return other < self
506
507 - def __repr__(self):
508 """ 509 @return: A string representation of this C{FreqDist}. 510 @rtype: string 511 """ 512 return '<FreqDist with %d outcomes>' % self.N()
513
514 - def __str__(self):
515 """ 516 @return: A string representation of this C{FreqDist}. 517 @rtype: string 518 """ 519 items = ['%r: %r' % (s, self[s]) for s in self] 520 return '<FreqDist: %s>' % ', '.join(items)
521
522 - def __getitem__(self, sample):
523 return self.get(sample, 0)
524 525 ##////////////////////////////////////////////////////// 526 ## Probability Distributions 527 ##////////////////////////////////////////////////////// 528
529 -class ProbDistI(object):
530 """ 531 A probability distribution for the outcomes of an experiment. A 532 probability distribution specifies how likely it is that an 533 experiment will have any given outcome. For example, a 534 probability distribution could be used to predict the probability 535 that a token in a document will have a given type. Formally, a 536 probability distribution can be defined as a function mapping from 537 samples to nonnegative real numbers, such that the sum of every 538 number in the function's range is 1.0. C{ProbDist}s are often 539 used to model the probability distribution of the experiment used 540 to generate a frequency distribution. 541 """ 542 SUM_TO_ONE = True 543 """True if the probabilities of the samples in this probability 544 distribution will always sum to one.""" 545
546 - def __init__(self):
547 if self.__class__ == ProbDistI: 548 raise AssertionError, "Interfaces can't be instantiated"
549
550 - def prob(self, sample):
551 """ 552 @return: the probability for a given sample. Probabilities 553 are always real numbers in the range [0, 1]. 554 @rtype: float 555 @param sample: The sample whose probability 556 should be returned. 557 @type sample: any 558 """ 559 raise AssertionError()
560
561 - def logprob(self, sample):
562 """ 563 @return: the base 2 logarithm of the probability for a given 564 sample. Log probabilities range from negitive infinity to 565 zero. 566 @rtype: float 567 @param sample: The sample whose probability 568 should be returned. 569 @type sample: any 570 """ 571 # Default definition, in terms of prob() 572 p = self.prob(sample) 573 if p == 0: 574 # Use some approximation to infinity. What this does 575 # depends on your system's float implementation. 576 return _NINF 577 else: 578 return math.log(p, 2)
579
580 - def max(self):
581 """ 582 @return: the sample with the greatest probability. If two or 583 more samples have the same probability, return one of them; 584 which sample is returned is undefined. 585 @rtype: any 586 """ 587 raise AssertionError()
588 589 # deprecate this (use keys() instead?)
590 - def samples(self):
591 """ 592 @return: A list of all samples that have nonzero 593 probabilities. Use C{prob} to find the probability of 594 each sample. 595 @rtype: C{list} 596 """ 597 raise AssertionError()
598 599 # cf self.SUM_TO_ONE
600 - def discount(self):
601 """ 602 @return: The ratio by which counts are discounted on average: c*/c 603 @rtype: C{float} 604 """ 605 return 0.0
606 607 # Subclasses should define more efficient implementations of this, 608 # where possible.
609 - def generate(self):
610 """ 611 @return: A randomly selected sample from this probability 612 distribution. The probability of returning each sample 613 C{samp} is equal to C{self.prob(samp)}. 614 """ 615 p = random.random() 616 for sample in self.samples(): 617 p -= self.prob(sample) 618 if p <= 0: return sample 619 # allow for some rounding error: 620 if p < .0001: 621 return sample 622 # we *should* never get here 623 if self.SUM_TO_ONE: 624 warnings.warn("Probability distribution %r sums to %r; generate()" 625 " is returning an arbitrary sample." % (self, 1-p)) 626 return random.choice(list(self.samples()))
627
628 -class UniformProbDist(ProbDistI):
629 """ 630 A probability distribution that assigns equal probability to each 631 sample in a given set; and a zero probability to all other 632 samples. 633 """
634 - def __init__(self, samples):
635 """ 636 Construct a new uniform probability distribution, that assigns 637 equal probability to each sample in C{samples}. 638 639 @param samples: The samples that should be given uniform 640 probability. 641 @type samples: C{list} 642 @raise ValueError: If C{samples} is empty. 643 """ 644 if len(samples) == 0: 645 raise ValueError('A Uniform probability distribution must '+ 646 'have at least one sample.') 647 self._sampleset = set(samples) 648 self._prob = 1.0/len(self._sampleset) 649 self._samples = list(self._sampleset)
650
651 - def prob(self, sample):
652 if sample in self._sampleset: return self._prob 653 else: return 0
654 - def max(self): return self._samples[0]
655 - def samples(self): return self._samples
656 - def __repr__(self):
657 return '<UniformProbDist with %d samples>' % len(self._sampleset)
658
659 -class DictionaryProbDist(ProbDistI):
660 """ 661 A probability distribution whose probabilities are directly 662 specified by a given dictionary. The given dictionary maps 663 samples to probabilities. 664 """
665 - def __init__(self, prob_dict=None, log=False, normalize=False):
666 """ 667 Construct a new probability distribution from the given 668 dictionary, which maps values to probabilities (or to log 669 probabilities, if C{log} is true). If C{normalize} is 670 true, then the probability values are scaled by a constant 671 factor such that they sum to 1. 672 """ 673 self._prob_dict = prob_dict.copy() 674 self._log = log 675 676 # Normalize the distribution, if requested. 677 if normalize: 678 if log: 679 value_sum = sum_logs(self._prob_dict.values()) 680 if value_sum <= _NINF: 681 logp = math.log(1.0/len(prob_dict), 2) 682 for x in prob_dict.keys(): 683 self._prob_dict[x] = logp 684 else: 685 for (x, p) in self._prob_dict.items(): 686 self._prob_dict[x] -= value_sum 687 else: 688 value_sum = sum(self._prob_dict.values()) 689 if value_sum == 0: 690 p = 1.0/len(prob_dict) 691 for x in prob_dict: 692 self._prob_dict[x] = p 693 else: 694 norm_factor = 1.0/value_sum 695 for (x, p) in self._prob_dict.items(): 696 self._prob_dict[x] *= norm_factor
697
698 - def prob(self, sample):
699 if self._log: 700 if sample not in self._prob_dict: return 0 701 else: return 2**(self._prob_dict[sample]) 702 else: 703 return self._prob_dict.get(sample, 0)
704
705 - def logprob(self, sample):
706 if self._log: 707 return self._prob_dict.get(sample, _NINF) 708 else: 709 if sample not in self._prob_dict: return _NINF 710 elif self._prob_dict[sample] == 0: return _NINF 711 else: return math.log(self._prob_dict[sample], 2)
712
713 - def max(self):
714 if not hasattr(self, '_max'): 715 self._max = max((p,v) for (v,p) in self._prob_dict.items())[1] 716 return self._max
717 - def samples(self):
718 return self._prob_dict.keys()
719 - def __repr__(self):
720 return '<ProbDist with %d samples>' % len(self._prob_dict)
721
722 -class MLEProbDist(ProbDistI):
723 """ 724 The maximum likelihood estimate for the probability distribution 725 of the experiment used to generate a frequency distribution. The 726 X{maximum likelihood estimate} approximates the probability of 727 each sample as the frequency of that sample in the frequency 728 distribution. 729 """
730 - def __init__(self, freqdist):
731 """ 732 Use the maximum likelihood estimate to create a probability 733 distribution for the experiment used to generate C{freqdist}. 734 735 @type freqdist: C{FreqDist} 736 @param freqdist: The frequency distribution that the 737 probability estimates should be based on. 738 """ 739 self._freqdist = freqdist
740
741 - def freqdist(self):
742 """ 743 @return: The frequency distribution that this probability 744 distribution is based on. 745 @rtype: C{FreqDist} 746 """ 747 return self._freqdist
748
749 - def prob(self, sample):
750 return self._freqdist.freq(sample)
751
752 - def max(self):
753 return self._freqdist.max()
754
755 - def samples(self):
756 return self._freqdist.keys()
757
758 - def __repr__(self):
759 """ 760 @rtype: C{string} 761 @return: A string representation of this C{ProbDist}. 762 """ 763 return '<MLEProbDist based on %d samples>' % self._freqdist.N()
764
765 -class LidstoneProbDist(ProbDistI):
766 """ 767 The Lidstone estimate for the probability distribution of the 768 experiment used to generate a frequency distribution. The 769 C{Lidstone estimate} is paramaterized by a real number M{gamma}, 770 which typically ranges from 0 to 1. The X{Lidstone estimate} 771 approximates the probability of a sample with count M{c} from an 772 experiment with M{N} outcomes and M{B} bins as 773 M{(c+gamma)/(N+B*gamma)}. This is equivalant to adding 774 M{gamma} to the count for each bin, and taking the maximum 775 likelihood estimate of the resulting frequency distribution. 776 """ 777 SUM_TO_ONE = False
778 - def __init__(self, freqdist, gamma, bins=None):
779 """ 780 Use the Lidstone estimate to create a probability distribution 781 for the experiment used to generate C{freqdist}. 782 783 @type freqdist: C{FreqDist} 784 @param freqdist: The frequency distribution that the 785 probability estimates should be based on. 786 @type gamma: C{float} 787 @param gamma: A real number used to paramaterize the 788 estimate. The Lidstone estimate is equivalant to adding 789 M{gamma} to the count for each bin, and taking the 790 maximum likelihood estimate of the resulting frequency 791 distribution. 792 @type bins: C{int} 793 @param bins: The number of sample values that can be generated 794 by the experiment that is described by the probability 795 distribution. This value must be correctly set for the 796 probabilities of the sample values to sum to one. If 797 C{bins} is not specified, it defaults to C{freqdist.B()}. 798 """ 799 if (bins == 0) or (bins is None and freqdist.N() == 0): 800 name = self.__class__.__name__[:-8] 801 raise ValueError('A %s probability distribution ' % name + 802 'must have at least one bin.') 803 if (bins is not None) and (bins < freqdist.B()): 804 name = self.__class__.__name__[:-8] 805 raise ValueError('\nThe number of bins in a %s distribution ' % name + 806 '(%d) must be greater than or equal to\n' % bins + 807 'the number of bins in the FreqDist used ' + 808 'to create it (%d).' % freqdist.N()) 809 810 self._freqdist = freqdist 811 self._gamma = float(gamma) 812 self._N = self._freqdist.N() 813 814 if bins is None: bins = freqdist.B() 815 self._bins = bins 816 817 self._divisor = self._N + bins * gamma 818 if self._divisor == 0.0: 819 # In extreme cases we force the probability to be 0, 820 # which it will be, since the count will be 0: 821 self._gamma = 0 822 self._divisor = 1
823
824 - def freqdist(self):
825 """ 826 @return: The frequency distribution that this probability 827 distribution is based on. 828 @rtype: C{FreqDist} 829 """ 830 return self._freqdist
831
832 - def prob(self, sample):
833 c = self._freqdist[sample] 834 return (c + self._gamma) / self._divisor
835
836 - def max(self):
837 # For Lidstone distributions, probability is monotonic with 838 # frequency, so the most probable sample is the one that 839 # occurs most frequently. 840 return self._freqdist.max()
841
842 - def samples(self):
843 return self._freqdist.keys()
844
845 - def discount(self):
846 gb = self._gamma * self._bins 847 return gb / (self._N + gb)
848
849 - def __repr__(self):
850 """ 851 @rtype: C{string} 852 @return: A string representation of this C{ProbDist}. 853 """ 854 return '<LidstoneProbDist based on %d samples>' % self._freqdist.N()
855 856
857 -class LaplaceProbDist(LidstoneProbDist):
858 """ 859 The Laplace estimate for the probability distribution of the 860 experiment used to generate a frequency distribution. The 861 X{Lidstone estimate} approximates the probability of a sample with 862 count M{c} from an experiment with M{N} outcomes and M{B} bins as 863 M{(c+1)/(N+B)}. This is equivalant to adding one to the count for 864 each bin, and taking the maximum likelihood estimate of the 865 resulting frequency distribution. 866 """
867 - def __init__(self, freqdist, bins=None):
868 """ 869 Use the Laplace estimate to create a probability distribution 870 for the experiment used to generate C{freqdist}. 871 872 @type freqdist: C{FreqDist} 873 @param freqdist: The frequency distribution that the 874 probability estimates should be based on. 875 @type bins: C{int} 876 @param bins: The number of sample values that can be generated 877 by the experiment that is described by the probability 878 distribution. This value must be correctly set for the 879 probabilities of the sample values to sum to one. If 880 C{bins} is not specified, it defaults to C{freqdist.B()}. 881 """ 882 LidstoneProbDist.__init__(self, freqdist, 1, bins)
883
884 - def __repr__(self):
885 """ 886 @rtype: C{string} 887 @return: A string representation of this C{ProbDist}. 888 """ 889 return '<LaplaceProbDist based on %d samples>' % self._freqdist.N()
890
891 -class ELEProbDist(LidstoneProbDist):
892 """ 893 The expected likelihood estimate for the probability distribution 894 of the experiment used to generate a frequency distribution. The 895 X{expected likelihood estimate} approximates the probability of a 896 sample with count M{c} from an experiment with M{N} outcomes and 897 M{B} bins as M{(c+0.5)/(N+B/2)}. This is equivalant to adding 0.5 898 to the count for each bin, and taking the maximum likelihood 899 estimate of the resulting frequency distribution. 900 """
901 - def __init__(self, freqdist, bins=None):
902 """ 903 Use the expected likelihood estimate to create a probability 904 distribution for the experiment used to generate C{freqdist}. 905 906 @type freqdist: C{FreqDist} 907 @param freqdist: The frequency distribution that the 908 probability estimates should be based on. 909 @type bins: C{int} 910 @param bins: The number of sample values that can be generated 911 by the experiment that is described by the probability 912 distribution. This value must be correctly set for the 913 probabilities of the sample values to sum to one. If 914 C{bins} is not specified, it defaults to C{freqdist.B()}. 915 """ 916 LidstoneProbDist.__init__(self, freqdist, 0.5, bins)
917
918 - def __repr__(self):
919 """ 920 @rtype: C{string} 921 @return: A string representation of this C{ProbDist}. 922 """ 923 return '<ELEProbDist based on %d samples>' % self._freqdist.N()
924
925 -class HeldoutProbDist(ProbDistI):
926 """ 927 The heldout estimate for the probability distribution of the 928 experiment used to generate two frequency distributions. These 929 two frequency distributions are called the "heldout frequency 930 distribution" and the "base frequency distribution." The 931 X{heldout estimate} uses uses the X{heldout frequency 932 distribution} to predict the probability of each sample, given its 933 frequency in the X{base frequency distribution}. 934 935 In particular, the heldout estimate approximates the probability 936 for a sample that occurs M{r} times in the base distribution as 937 the average frequency in the heldout distribution of all samples 938 that occur M{r} times in the base distribution. 939 940 This average frequency is M{Tr[r]/(Nr[r]*N)}, where: 941 - M{Tr[r]} is the total count in the heldout distribution for 942 all samples that occur M{r} times in the base 943 distribution. 944 - M{Nr[r]} is the number of samples that occur M{r} times in 945 the base distribution. 946 - M{N} is the number of outcomes recorded by the heldout 947 frequency distribution. 948 949 In order to increase the efficiency of the C{prob} member 950 function, M{Tr[r]/(Nr[r]*N)} is precomputed for each value of M{r} 951 when the C{HeldoutProbDist} is created. 952 953 @type _estimate: C{list} of C{float} 954 @ivar _estimate: A list mapping from M{r}, the number of 955 times that a sample occurs in the base distribution, to the 956 probability estimate for that sample. C{_estimate[M{r}]} is 957 calculated by finding the average frequency in the heldout 958 distribution of all samples that occur M{r} times in the base 959 distribution. In particular, C{_estimate[M{r}]} = 960 M{Tr[r]/(Nr[r]*N)}. 961 @type _max_r: C{int} 962 @ivar _max_r: The maximum number of times that any sample occurs 963 in the base distribution. C{_max_r} is used to decide how 964 large C{_estimate} must be. 965 """ 966 SUM_TO_ONE = False
967 - def __init__(self, base_fdist, heldout_fdist, bins=None):
968 """ 969 Use the heldout estimate to create a probability distribution 970 for the experiment used to generate C{base_fdist} and 971 C{heldout_fdist}. 972 973 @type base_fdist: C{FreqDist} 974 @param base_fdist: The base frequency distribution. 975 @type heldout_fdist: C{FreqDist} 976 @param heldout_fdist: The heldout frequency distribution. 977 @type bins: C{int} 978 @param bins: The number of sample values that can be generated 979 by the experiment that is described by the probability 980 distribution. This value must be correctly set for the 981 probabilities of the sample values to sum to one. If 982 C{bins} is not specified, it defaults to C{freqdist.B()}. 983 """ 984 985 self._base_fdist = base_fdist 986 self._heldout_fdist = heldout_fdist 987 988 # The max number of times any sample occurs in base_fdist. 989 self._max_r = base_fdist[base_fdist.max()] 990 991 # Calculate Tr, Nr, and N. 992 Tr = self._calculate_Tr() 993 Nr = [base_fdist.Nr(r, bins) for r in range(self._max_r+1)] 994 N = heldout_fdist.N() 995 996 # Use Tr, Nr, and N to compute the probability estimate for 997 # each value of r. 998 self._estimate = self._calculate_estimate(Tr, Nr, N)
999
1000 - def _calculate_Tr(self):
1001 """ 1002 @return: the list M{Tr}, where M{Tr[r]} is the total count in 1003 C{heldout_fdist} for all samples that occur M{r} 1004 times in C{base_fdist}. 1005 @rtype: C{list} of C{float} 1006 """ 1007 Tr = [0.0] * (self._max_r+1) 1008 for sample in self._heldout_fdist: 1009 r = self._base_fdist[sample] 1010 Tr[r] += self._heldout_fdist[sample] 1011 return Tr
1012
1013 - def _calculate_estimate(self, Tr, Nr, N):
1014 """ 1015 @return: the list M{estimate}, where M{estimate[r]} is the 1016 probability estimate for any sample that occurs M{r} times 1017 in the base frequency distribution. In particular, 1018 M{estimate[r]} is M{Tr[r]/(N[r]*N)}. In the special case 1019 that M{N[r]=0}, M{estimate[r]} will never be used; so we 1020 define M{estimate[r]=None} for those cases. 1021 @rtype: C{list} of C{float} 1022 @type Tr: C{list} of C{float} 1023 @param Tr: the list M{Tr}, where M{Tr[r]} is the total count in 1024 the heldout distribution for all samples that occur M{r} 1025 times in base distribution. 1026 @type Nr: C{list} of C{float} 1027 @param Nr: The list M{Nr}, where M{Nr[r]} is the number of 1028 samples that occur M{r} times in the base distribution. 1029 @type N: C{int} 1030 @param N: The total number of outcomes recorded by the heldout 1031 frequency distribution. 1032 """ 1033 estimate = [] 1034 for r in range(self._max_r+1): 1035 if Nr[r] == 0: estimate.append(None) 1036 else: estimate.append(Tr[r]/(Nr[r]*N)) 1037 return estimate
1038
1039 - def base_fdist(self):
1040 """ 1041 @return: The base frequency distribution that this probability 1042 distribution is based on. 1043 @rtype: C{FreqDist} 1044 """ 1045 return self._base_fdist
1046
1047 - def heldout_fdist(self):
1048 """ 1049 @return: The heldout frequency distribution that this 1050 probability distribution is based on. 1051 @rtype: C{FreqDist} 1052 """ 1053 return self._heldout_fdist
1054
1055 - def samples(self):
1056 return self._base_fdist.keys()
1057
1058 - def prob(self, sample):
1059 # Use our precomputed probability estimate. 1060 r = self._base_fdist[sample] 1061 return self._estimate[r]
1062
1063 - def max(self):
1064 # Note: the Heldout estimation is *not* necessarily monotonic; 1065 # so this implementation is currently broken. However, it 1066 # should give the right answer *most* of the time. :) 1067 return self._base_fdist.max()
1068
1069 - def discount(self):
1070 raise NotImplementedError()
1071
1072 - def __repr__(self):
1073 """ 1074 @rtype: C{string} 1075 @return: A string representation of this C{ProbDist}. 1076 """ 1077 s = '<HeldoutProbDist: %d base samples; %d heldout samples>' 1078 return s % (self._base_fdist.N(), self._heldout_fdist.N())
1079
1080 -class CrossValidationProbDist(ProbDistI):
1081 """ 1082 The cross-validation estimate for the probability distribution of 1083 the experiment used to generate a set of frequency distribution. 1084 The X{cross-validation estimate} for the probability of a sample 1085 is found by averaging the held-out estimates for the sample in 1086 each pair of frequency distributions. 1087 """ 1088 SUM_TO_ONE = False
1089 - def __init__(self, freqdists, bins):
1090 """ 1091 Use the cross-validation estimate to create a probability 1092 distribution for the experiment used to generate 1093 C{freqdists}. 1094 1095 @type freqdists: C{list} of C{FreqDist} 1096 @param freqdists: A list of the frequency distributions 1097 generated by the experiment. 1098 @type bins: C{int} 1099 @param bins: The number of sample values that can be generated 1100 by the experiment that is described by the probability 1101 distribution. This value must be correctly set for the 1102 probabilities of the sample values to sum to one. If 1103 C{bins} is not specified, it defaults to C{freqdist.B()}. 1104 """ 1105 self._freqdists = freqdists 1106 1107 # Create a heldout probability distribution for each pair of 1108 # frequency distributions in freqdists. 1109 self._heldout_probdists = [] 1110 for fdist1 in freqdists: 1111 for fdist2 in freqdists: 1112 if fdist1 is not fdist2: 1113 probdist = HeldoutProbDist(fdist1, fdist2, bins) 1114 self._heldout_probdists.append(probdist)
1115
1116 - def freqdists(self):
1117 """ 1118 @rtype: C{list} of C{FreqDist} 1119 @return: The list of frequency distributions that this 1120 C{ProbDist} is based on. 1121 """ 1122 return self._freqdists
1123
1124 - def samples(self):
1125 # [xx] nb: this is not too efficient 1126 return set(sum([fd.keys() for fd in self._freqdists], []))
1127
1128 - def prob(self, sample):
1129 # Find the average probability estimate returned by each 1130 # heldout distribution. 1131 prob = 0.0 1132 for heldout_probdist in self._heldout_probdists: 1133 prob += heldout_probdist.prob(sample) 1134 return prob/len(self._heldout_probdists)
1135
1136 - def discount(self):
1137 raise NotImplementedError()
1138
1139 - def __repr__(self):
1140 """ 1141 @rtype: C{string} 1142 @return: A string representation of this C{ProbDist}. 1143 """ 1144 return '<CrossValidationProbDist: %d-way>' % len(self._freqdists)
1145
1146 -class WittenBellProbDist(ProbDistI):
1147 """ 1148 The Witten-Bell estimate of a probability distribution. This distribution 1149 allocates uniform probability mass to as yet unseen events by using the 1150 number of events that have only been seen once. The probability mass 1151 reserved for unseen events is equal to: 1152 1153 - M{T / (N + T)} 1154 1155 where M{T} is the number of observed event types and M{N} is the total 1156 number of observed events. This equates to the maximum likelihood estimate 1157 of a new type event occuring. The remaining probability mass is discounted 1158 such that all probability estimates sum to one, yielding: 1159 1160 - M{p = T / Z (N + T)}, if count = 0 1161 - M{p = c / (N + T)}, otherwise 1162 """ 1163
1164 - def __init__(self, freqdist, bins=None):
1165 """ 1166 Creates a distribution of Witten-Bell probability estimates. This 1167 distribution allocates uniform probability mass to as yet unseen 1168 events by using the number of events that have only been seen once. 1169 The probability mass reserved for unseen events is equal to: 1170 1171 - M{T / (N + T)} 1172 1173 where M{T} is the number of observed event types and M{N} is the total 1174 number of observed events. This equates to the maximum likelihood 1175 estimate of a new type event occuring. The remaining probability mass 1176 is discounted such that all probability estimates sum to one, 1177 yielding: 1178 1179 - M{p = T / Z (N + T)}, if count = 0 1180 - M{p = c / (N + T)}, otherwise 1181 1182 The parameters M{T} and M{N} are taken from the C{freqdist} parameter 1183 (the C{B()} and C{N()} values). The normalising factor M{Z} is 1184 calculated using these values along with the C{bins} parameter. 1185 1186 @param freqdist: The frequency counts upon which to base the 1187 estimation. 1188 @type freqdist: C{FreqDist} 1189 @param bins: The number of possible event types. This must be 1190 at least as large as the number of bins in the 1191 C{freqdist}. If C{None}, then it's assumed to be 1192 equal to that of the C{freqdist} 1193 @type bins: C{Int} 1194 """ 1195 assert bins == None or bins >= freqdist.B(),\ 1196 'Bins parameter must not be less than freqdist.B()' 1197 if bins == None: 1198 bins = freqdist.B() 1199 self._freqdist = freqdist 1200 self._T = self._freqdist.B() 1201 self._Z = bins - self._freqdist.B() 1202 self._N = self._freqdist.N() 1203 # self._P0 is P(0), precalculated for efficiency: 1204 if self._N==0: 1205 # if freqdist is empty, we approximate P(0) by a UniformProbDist: 1206 self._P0 = 1.0 / self._Z 1207 else: 1208 self._P0 = self._T / float(self._Z * (self._N + self._T))
1209
1210 - def prob(self, sample):
1211 # inherit docs from ProbDistI 1212 c = self._freqdist[sample] 1213 if c == 0: 1214 return self._P0 1215 else: 1216 return c / float(self._N + self._T)
1217
1218 - def max(self):
1219 return self._freqdist.max()
1220
1221 - def samples(self):
1222 return self._freqdist.keys()
1223
1224 - def freqdist(self):
1225 return self._freqdist
1226
1227 - def discount(self):
1228 raise NotImplementedError()
1229
1230 - def __repr__(self):
1231 """ 1232 @rtype: C{string} 1233 @return: A string representation of this C{ProbDist}. 1234 """ 1235 return '<WittenBellProbDist based on %d samples>' % self._freqdist.N()
1236 1237 1238 ##////////////////////////////////////////////////////// 1239 ## Good-Turing Probablity Distributions 1240 ##////////////////////////////////////////////////////// 1241 1242 # Good-Turing frequency estimation was contributed by Alan Turing and 1243 # his statistical assistant I.J. Good, during their collaboration in 1244 # the WWII. It is a statistical technique for predicting the 1245 # probability of occurrence of objects belonging to an unknown number 1246 # of species, given past observations of such objects and their 1247 # species. (In drawing balls from an urn, the 'objects' would be balls 1248 # and the 'species' would be the distinct colors of the balls (finite 1249 # but unknown in number). 1250 # 1251 # The situation frequency zero is quite common in the original 1252 # Good-Turing estimation. Bill Gale and Geoffrey Sampson present a 1253 # simple and effective approach, Simple Good-Turing. As a smoothing 1254 # curve they simply use a power curve: 1255 # 1256 # Nr = a*r^b (with b < -1 to give the appropriate hyperbolic 1257 # relationsihp) 1258 # 1259 # They estimate a and b by simple linear regression technique on the 1260 # logarithmic form of the equation: 1261 # 1262 # log Nr = a + b*log(r) 1263 # 1264 # However, they suggest that such a simple curve is probably only 1265 # appropriate for high values of r. For low values of r, they use the 1266 # measured Nr directly. (see M&S, p.213) 1267 # 1268 # Gale and Sampson propose to use r while the difference between r and 1269 # r* is 1.96 greather than the standar deviation, and switch to r* if 1270 # it is less or equal: 1271 # 1272 # |r - r*| > 1.96 * sqrt((r + 1)^2 (Nr+1 / Nr^2) (1 + Nr+1 / Nr)) 1273 # 1274 # The 1.96 coefficient correspond to a 0.05 significance criterion, 1275 # some implementations can use a coefficient of 1.65 for a 0.1 1276 # significance criterion. 1277 # 1278
1279 -class GoodTuringProbDist(ProbDistI):
1280 """ 1281 The Good-Turing estimate of a probability distribution. This method 1282 calculates the probability mass to assign to events with zero or low 1283 counts based on the number of events with higher counts. It does so by 1284 using the smoothed count M{c*}: 1285 1286 - M{c* = (c + 1) N(c + 1) / N(c)} for c >= 1 1287 - M{things with frequency zero in training} = N(1) for c == 0 1288 1289 where M{c} is the original count, M{N(i)} is the number of event types 1290 observed with count M{i}. We can think the count of unseen as the count 1291 of frequency one. 1292 (see Jurafsky & Martin 2nd Edition, p101) 1293 1294 """
1295 - def __init__(self, freqdist, bins=None):
1296 """ 1297 @param freqdist: The frequency counts upon which to base the 1298 estimation. 1299 @type freqdist: C{FreqDist} 1300 @param bins: The number of possible event types. This must be 1301 at least as large as the number of bins in the 1302 C{freqdist}. If C{None}, then it's assumed to be 1303 equal to that of the C{freqdist} 1304 @type bins: C{Int} 1305 """ 1306 assert bins == None or bins >= freqdist.B(),\ 1307 'Bins parameter must not be less than freqdist.B()' 1308 if bins == None: 1309 bins = freqdist.B() 1310 self._freqdist = freqdist 1311 self._bins = bins
1312
1313 - def prob(self, sample):
1314 count = self._freqdist[sample] 1315 1316 # unseen sample's frequency (count zero) uses frequency one's 1317 if count == 0 and self._freqdist.N() != 0: 1318 p0 = 1.0 * self._freqdist.Nr(1) / self._freqdist.N() 1319 if self._bins == self._freqdist.B(): 1320 p0 = 0.0 1321 else: 1322 p0 = p0 / (1.0 * self._bins - self._freqdist.B()) 1323 1324 nc = self._freqdist.Nr(count) 1325 ncn = self._freqdist.Nr(count + 1) 1326 1327 # avoid divide-by-zero errors for sparse datasets 1328 if nc == 0 or self._freqdist.N() == 0: 1329 return 0 1330 1331 return 1.0 * (count + 1) * ncn / (nc * self._freqdist.N())
1332
1333 - def max(self):
1334 return self._freqdist.max()
1335
1336 - def samples(self):
1337 return self._freqdist.keys()
1338
1339 - def discount(self):
1340 """ 1341 @return: The probability mass transferred from the 1342 seen samples to the unseen samples. 1343 @rtype: C{float} 1344 """ 1345 return 1.0 * self._freqdist.Nr(1) / self._freqdist.N()
1346
1347 - def freqdist(self):
1348 return self._freqdist
1349
1350 - def __repr__(self):
1351 """ 1352 @rtype: C{string} 1353 @return: A string representation of this C{ProbDist}. 1354 """ 1355 return '<GoodTuringProbDist based on %d samples>' % self._freqdist.N()
1356 1357 1358 1359 ##////////////////////////////////////////////////////// 1360 ## Simple Good-Turing Probablity Distributions 1361 ##////////////////////////////////////////////////////// 1362
1363 -class SimpleGoodTuringProbDist(ProbDistI):
1364 """ 1365 SimpleGoodTuring ProbDist approximates from frequency to freqency of 1366 frequency into a linear line under log space by linear regression. 1367 Details of Simple Good-Turing algorithm can be found in: 1368 (1) Bill Gale and Geoffrey Sampson's joint paper 1369 "Good Turing Smoothing Without Tear", published in 1370 Journal of Quantitative Linguistics, vol. 2 pp. 217-237, 1995 1371 (2) Jurafsky & Martin's Book "Speech and Language Processing" 1372 2e Chap 4.5 p103 (log(Nc) = a + b*log(c)) 1373 (3) Website maintained by Geoffrey Sampson: 1374 http://www.grsampson.net/RGoodTur.html 1375 1376 Given a set of pair (xi, yi), where the xi denotes the freqency and 1377 yi denotes the freqency of freqency, we want to minimize their 1378 square variation. E(x) and E(y) represent the mean of xi and yi. 1379 1380 -Slope: b = sigma ((xi-E(x)*(yi-E(y))) / sigma ((xi-E(x))*(xi-E(x))) 1381 -Intercept: a = E(y)- b * E(x) 1382 1383 """
1384 - def __init__(self, freqdist, bins=None):
1385 """ 1386 @param freqdist: The frequency counts upon which to base the 1387 estimation. 1388 @type freqdist: C{FreqDist} 1389 @param bins: The number of possible event types. This must be 1390 at least as large as the number of bins in the 1391 C{freqdist}. If C{None}, then it's assumed to be 1392 equal to that of the C{freqdist} 1393 @type bins: C{Int} 1394 """ 1395 assert bins == None or bins >= freqdist.B(),\ 1396 'Bins parameter must not be less than freqdist.B()' 1397 if bins == None: 1398 bins = freqdist.B() 1399 self._freqdist = freqdist 1400 self._bins = bins 1401 r, nr = self._r_Nr() 1402 self.find_best_fit(r, nr) 1403 self._switch(r, nr) 1404 self._renormalize(r, nr)
1405
1406 - def _r_Nr(self):
1407 """ 1408 Split the frequency distribution in two list (r, Nr), where Nr(r) > 0 1409 """ 1410 r, nr = [], [] 1411 b, i = 0, 0 1412 while b != self._freqdist.B(): 1413 nr_i = self._freqdist.Nr(i) 1414 if nr_i > 0: 1415 b += nr_i 1416 r.append(i) 1417 nr.append(nr_i) 1418 i += 1 1419 return (r, nr)
1420
1421 - def find_best_fit(self, r, nr):
1422 """ 1423 Use simple linear regression to tune parameters self._slope and 1424 self._intercept in the log-log space based on count and Nr(count) 1425 (Work in log space to avoid floating point underflow.) 1426 """ 1427 # For higher sample frequencies the data points becomes horizontal 1428 # along line Nr=1. To create a more evident linear model in log-log 1429 # space, we average positive Nr values with the surrounding zero 1430 # values. (Church and Gale, 1991) 1431 1432 if not r or not nr: 1433 # Empty r or nr? 1434 return 1435 1436 zr = [] 1437 for j in range(len(r)): 1438 if j > 0: 1439 i = r[j-1] 1440 else: 1441 i = 0 1442 if j != len(r) - 1: 1443 k = r[j+1] 1444 else: 1445 k = 2 * r[j] - i 1446 zr_ = 2.0 * nr[j] / (k - i) 1447 zr.append(zr_) 1448 1449 log_r = [math.log(i) for i in r] 1450 log_zr = [math.log(i) for i in zr] 1451 1452 xy_cov = x_var = 0.0 1453 x_mean = 1.0 * sum(log_r) / len(log_r) 1454 y_mean = 1.0 * sum(log_zr) / len(log_zr) 1455 for (x, y) in zip(log_r, log_zr): 1456 xy_cov += (x - x_mean) * (y - y_mean) 1457 x_var += (x - x_mean)**2 1458 if x_var != 0: 1459 self._slope = xy_cov / x_var 1460 else: 1461 self._slope = 0.0 1462 self._intercept = y_mean - self._slope * x_mean
1463
1464 - def _switch(self, r, nr):
1465 """ 1466 Calculate the r frontier where we must switch from Nr to Sr 1467 when estimating E[Nr]. 1468 """ 1469 for i, r_ in enumerate(r): 1470 if len(r) == i + 1 or r[i+1] != r_ + 1: 1471 # We are at the end of r, or there is a gap in r 1472 self._switch_at = r_ 1473 break 1474 1475 Sr = self.smoothedNr 1476 smooth_r_star = (r_ + 1) * Sr(r_+1) / Sr(r_) 1477 unsmooth_r_star = 1.0 * (r_ + 1) * nr[i+1] / nr[i] 1478 1479 std = math.sqrt(self._variance(r_, nr[i], nr[i+1])) 1480 if abs(unsmooth_r_star-smooth_r_star) <= 1.96 * std: 1481 self._switch_at = r_ 1482 break
1483
1484 - def _variance(self, r, nr, nr_1):
1485 r = float(r) 1486 nr = float(nr) 1487 nr_1 = float(nr_1) 1488 return (r + 1.0)**2 * (nr_1 / nr**2) * (1.0 + nr_1 / nr)
1489
1490 - def _renormalize(self, r, nr):
1491 """ 1492 It is necessary to renormalize all the probability estimates to 1493 ensure a proper probability distribution results. This can be done 1494 by keeping the estimate of the probability mass for unseen items as 1495 N(1)/N and renormalizing all the estimates for previously seen items 1496 (as Gale and Sampson (1995) propose). (See M&S P.213, 1999) 1497 """ 1498 prob_cov = 0.0 1499 for r_, nr_ in zip(r, nr): 1500 prob_cov += nr_ * self._prob_measure(r_) 1501 if prob_cov: 1502 self._renormal = (1 - self._prob_measure(0)) / prob_cov
1503
1504 - def smoothedNr(self, r):
1505 """ 1506 @return: The number of samples with count r. 1507 @rtype: C{float} 1508 @param r: The amount of freqency. 1509 @type r: C{int} 1510 """ 1511 1512 # Nr = a*r^b (with b < -1 to give the appropriate hyperbolic 1513 # relationship) 1514 # Estimate a and b by simple linear regression technique on 1515 # the logarithmic form of the equation: log Nr = a + b*log(r) 1516 1517 return math.exp(self._intercept + self._slope * math.log(r))
1518
1519 - def prob(self, sample):
1520 """ 1521 @param sample: sample of the event 1522 @type sample: C{string} 1523 @return: The sample's probability. 1524 @rtype: C{float} 1525 """ 1526 count = self._freqdist[sample] 1527 p = self._prob_measure(count) 1528 if count == 0: 1529 if self._bins == self._freqdist.B(): 1530 p = 0.0 1531 else: 1532 p = p / (1.0 * self._bins - self._freqdist.B()) 1533 else: 1534 p = p * self._renormal 1535 return p
1536
1537 - def _prob_measure(self, count):
1538 if count == 0 and self._freqdist.N() == 0 : 1539 return 1.0 1540 elif count == 0 and self._freqdist.N() != 0: 1541 return 1.0 * self._freqdist.Nr(1) / self._freqdist.N() 1542 1543 if self._switch_at > count: 1544 Er_1 = 1.0 * self._freqdist.Nr(count+1) 1545 Er = 1.0 * self._freqdist.Nr(count) 1546 else: 1547 Er_1 = self.smoothedNr(count+1) 1548 Er = self.smoothedNr(count) 1549 1550 r_star = (count + 1) * Er_1 / Er 1551 return r_star / self._freqdist.N()
1552
1553 - def check(self):
1554 prob_sum = 0.0 1555 for i in range(0, len(self._Nr)): 1556 prob_sum += self._Nr[i] * self._prob_measure(i) / self._renormal 1557 print "Probability Sum:", prob_sum
1558 #assert prob_sum != 1.0, "probability sum should be one!" 1559
1560 - def discount(self):
1561 """ 1562 This function returns the total mass of probability transfers from the 1563 seen samples to the unseen samples. 1564 """ 1565 return 1.0 * self.smoothedNr(1) / self._freqdist.N()
1566
1567 - def max(self):
1568 return self._freqdist.max()
1569
1570 - def samples(self):
1571 return self._freqdist.keys()
1572
1573 - def freqdist(self):
1574 return self._freqdist
1575
1576 - def __repr__(self):
1577 """ 1578 @rtype: C{string} 1579 @return: A string representation of this C{ProbDist}. 1580 """ 1581 return '<SimpleGoodTuringProbDist based on %d samples>'\ 1582 % self._freqdist.N()
1583 1584
1585 -class MutableProbDist(ProbDistI):
1586 """ 1587 An mutable probdist where the probabilities may be easily modified. This 1588 simply copies an existing probdist, storing the probability values in a 1589 mutable dictionary and providing an update method. 1590 """ 1591
1592 - def __init__(self, prob_dist, samples, store_logs=True):
1593 """ 1594 Creates the mutable probdist based on the given prob_dist and using 1595 the list of samples given. These values are stored as log 1596 probabilities if the store_logs flag is set. 1597 1598 @param prob_dist: the distribution from which to garner the 1599 probabilities 1600 @type prob_dist: ProbDist 1601 @param samples: the complete set of samples 1602 @type samples: sequence of any 1603 @param store_logs: whether to store the probabilities as logarithms 1604 @type store_logs: bool 1605 """ 1606 try: 1607 import numpy 1608 except ImportError: 1609 print "Error: Please install numpy; for instructions see http://www.nltk.org/" 1610 exit() 1611 self._samples = samples 1612 self._sample_dict = dict((samples[i], i) for i in range(len(samples))) 1613 self._data = numpy.zeros(len(samples), numpy.float64) 1614 for i in range(len(samples)): 1615 if store_logs: 1616 self._data[i] = prob_dist.logprob(samples[i]) 1617 else: 1618 self._data[i] = prob_dist.prob(samples[i]) 1619 self._logs = store_logs
1620
1621 - def samples(self):
1622 # inherit documentation 1623 return self._samples
1624
1625 - def prob(self, sample):
1626 # inherit documentation 1627 i = self._sample_dict.get(sample) 1628 if i != None: 1629 if self._logs: 1630 return 2**(self._data[i]) 1631 else: 1632 return self._data[i] 1633 else: 1634 return 0.0
1635
1636 - def logprob(self, sample):
1637 # inherit documentation 1638 i = self._sample_dict.get(sample) 1639 if i != None: 1640 if self._logs: 1641 return self._data[i] 1642 else: 1643 return math.log(self._data[i], 2) 1644 else: 1645 return float('-inf')
1646
1647 - def update(self, sample, prob, log=True):
1648 """ 1649 Update the probability for the given sample. This may cause the object 1650 to stop being the valid probability distribution - the user must 1651 ensure that they update the sample probabilities such that all samples 1652 have probabilities between 0 and 1 and that all probabilities sum to 1653 one. 1654 1655 @param sample: the sample for which to update the probability 1656 @type sample: C{any} 1657 @param prob: the new probability 1658 @type prob: C{float} 1659 @param log: is the probability already logged 1660 @type log: C{bool} 1661 """ 1662 i = self._sample_dict.get(sample) 1663 assert i != None 1664 if self._logs: 1665 if log: self._data[i] = prob 1666 else: self._data[i] = math.log(prob, 2) 1667 else: 1668 if log: self._data[i] = 2**(prob) 1669 else: self._data[i] = prob
1670 1671 ##////////////////////////////////////////////////////// 1672 ## Probability Distribution Operations 1673 ##////////////////////////////////////////////////////// 1674
1675 -def log_likelihood(test_pdist, actual_pdist):
1676 if (not isinstance(test_pdist, ProbDistI) or 1677 not isinstance(actual_pdist, ProbDistI)): 1678 raise ValueError('expected a ProbDist.') 1679 # Is this right? 1680 return sum(actual_pdist.prob(s) * math.log(test_pdist.prob(s), 2) 1681 for s in actual_pdist.keys())
1682
1683 -def entropy(pdist):
1684 probs = [pdist.prob(s) for s in pdist.samples()] 1685 return -sum([p * math.log(p,2) for p in probs])
1686 1687 ##////////////////////////////////////////////////////// 1688 ## Conditional Distributions 1689 ##////////////////////////////////////////////////////// 1690
1691 -class ConditionalFreqDist(object):
1692 """ 1693 A collection of frequency distributions for a single experiment 1694 run under different conditions. Conditional frequency 1695 distributions are used to record the number of times each sample 1696 occurred, given the condition under which the experiment was run. 1697 For example, a conditional frequency distribution could be used to 1698 record the frequency of each word (type) in a document, given its 1699 length. Formally, a conditional frequency distribution can be 1700 defined as a function that maps from each condition to the 1701 C{FreqDist} for the experiment under that condition. 1702 1703 The frequency distribution for each condition is accessed using 1704 the indexing operator: 1705 1706 >>> cfdist[3] 1707 <FreqDist with 73 outcomes> 1708 >>> cfdist[3].freq('the') 1709 0.4 1710 >>> cfdist[3]['dog'] 1711 2 1712 1713 When the indexing operator is used to access the frequency 1714 distribution for a condition that has not been accessed before, 1715 C{ConditionalFreqDist} creates a new empty C{FreqDist} for that 1716 condition. 1717 1718 Conditional frequency distributions are typically constructed by 1719 repeatedly running an experiment under a variety of conditions, 1720 and incrementing the sample outcome counts for the appropriate 1721 conditions. For example, the following code will produce a 1722 conditional frequency distribution that encodes how often each 1723 word type occurs, given the length of that word type: 1724 1725 >>> cfdist = ConditionalFreqDist() 1726 >>> for word in tokenize.whitespace(sent): 1727 ... condition = len(word) 1728 ... cfdist[condition].inc(word) 1729 1730 An equivalent way to do this is with the initializer: 1731 1732 >>> cfdist = ConditionalFreqDist((len(word), word) for word in tokenize.whitespace(sent)) 1733 1734 """
1735 - def __init__(self, cond_samples=None):
1736 """ 1737 Construct a new empty conditional frequency distribution. In 1738 particular, the count for every sample, under every condition, 1739 is zero. 1740 1741 @param cond_samples: The samples to initialize the conditional 1742 frequency distribution with 1743 @type cond_samples: Sequence of (condition, sample) tuples 1744 """ 1745 self._fdists = {} 1746 if cond_samples: 1747 for (cond, sample) in cond_samples: 1748 self[cond].inc(sample)
1749
1750 - def __getitem__(self, condition):
1751 """ 1752 @return: The frequency distribution that encodes the frequency 1753 of each sample outcome, given that the experiment was run 1754 under the given condition. If the frequency distribution for 1755 the given condition has not been accessed before, then this 1756 will create a new empty C{FreqDist} for that condition. 1757 @rtype: C{FreqDist} 1758 @param condition: The condition under which the experiment was run. 1759 @type condition: any 1760 """ 1761 # Create the conditioned freq dist, if it doesn't exist 1762 if condition not in self._fdists: 1763 self._fdists[condition] = FreqDist() 1764 return self._fdists[condition]
1765
1766 - def conditions(self):
1767 """ 1768 @return: A list of the conditions that have been accessed for 1769 this C{ConditionalFreqDist}. Use the indexing operator to 1770 access the frequency distribution for a given condition. 1771 Note that the frequency distributions for some conditions 1772 may contain zero sample outcomes. 1773 @rtype: C{list} 1774 """ 1775 return sorted(self._fdists.keys())
1776
1777 - def __len__(self):
1778 """ 1779 @return: The number of conditions that have been accessed 1780 for this C{ConditionalFreqDist}. 1781 @rtype: C{int} 1782 """ 1783 return len(self._fdists)
1784
1785 - def N(self):
1786 """ 1787 @return: The total number of sample outcomes that have been 1788 recorded by this C{ConditionalFreqDist}. 1789 @rtype: C{int} 1790 """ 1791 return sum(fdist.N() for fdist in self._fdists.values())
1792
1793 - def plot(self, *args, **kwargs):
1794 """ 1795 Plot the given samples from the conditional frequency distribution. 1796 For a cumulative plot, specify cumulative=True. 1797 (Requires Matplotlib to be installed.) 1798 1799 @param samples: The samples to plot 1800 @type samples: C{list} 1801 @param title: The title for the graph 1802 @type title: C{str} 1803 @param conditions: The conditions to plot (default is all) 1804 @type conditions: C{list} 1805 """ 1806 try: 1807 import pylab 1808 except ImportError: 1809 raise ValueError('The plot function requires the matplotlib package (aka pylab).' 1810 'See http://matplotlib.sourceforge.net/') 1811 1812 cumulative = _get_kwarg(kwargs, 'cumulative', False) 1813 conditions = _get_kwarg(kwargs, 'conditions', self.conditions()) 1814 title = _get_kwarg(kwargs, 'title', '') 1815 samples = _get_kwarg(kwargs, 'samples', 1816 sorted(set(v for c in conditions for v in self[c]))) # this computation could be wasted 1817 if not "linewidth" in kwargs: 1818 kwargs["linewidth"] = 2 1819 1820 for condition in conditions: 1821 if cumulative: 1822 freqs = list(self[condition]._cumulative_frequencies(samples)) 1823 ylabel = "Cumulative Counts" 1824 legend_loc = 'lower right' 1825 else: 1826 freqs = [self[condition][sample] for sample in samples] 1827 ylabel = "Counts" 1828 legend_loc = 'upper right' 1829 # percents = [f * 100 for f in freqs] only in ConditionalProbDist? 1830 kwargs['label'] = condition 1831 pylab.plot(freqs, *args, **kwargs) 1832 1833 pylab.legend(loc=legend_loc) 1834 pylab.grid(True, color="silver") 1835 pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90) 1836 if title: 1837 pylab.title(title) 1838 pylab.xlabel("Samples") 1839 pylab.ylabel(ylabel) 1840 pylab.show()
1841
1842 - def tabulate(self, *args, **kwargs):
1843 """ 1844 Tabulate the given samples from the conditional frequency distribution. 1845 1846 @param samples: The samples to plot 1847 @type samples: C{list} 1848 @param title: The title for the graph 1849 @type title: C{str} 1850 @param conditions: The conditions to plot (default is all) 1851 @type conditions: C{list} 1852 """ 1853 1854 cumulative = _get_kwarg(kwargs, 'cumulative', False) 1855 conditions = _get_kwarg(kwargs, 'conditions', self.conditions()) 1856 samples = _get_kwarg(kwargs, 'samples', 1857 sorted(set(v for c in conditions for v in self[c]))) # this computation could be wasted 1858 1859 condition_size = max(len(str(c)) for c in conditions) 1860 print ' ' * condition_size, 1861 for s in samples: 1862 print "%4s" % str(s), 1863 print 1864 for c in conditions: 1865 print "%*s" % (condition_size, str(c)), 1866 if cumulative: 1867 freqs = list(self[c]._cumulative_frequencies(samples)) 1868 else: 1869 freqs = [self[c][sample] for sample in samples] 1870 1871 for f in freqs: 1872 print "%4d" % f, 1873 print
1874
1875 - def __eq__(self, other):
1876 if not isinstance(other, ConditionalFreqDist): return False 1877 return self.conditions() == other.conditions() \ 1878 and all(self[c] == other[c] for c in self.conditions()) # conditions are already sorted
1879 - def __ne__(self, other):
1880 return not (self == other)
1881 - def __le__(self, other):
1882 if not isinstance(other, ConditionalFreqDist): return False 1883 return set(self.conditions()).issubset(other.conditions()) \ 1884 and all(self[c] <= other[c] for c in self.conditions())
1885 - def __lt__(self, other):
1886 if not isinstance(other, ConditionalFreqDist): return False 1887 return self <= other and self != other
1888 - def __ge__(self, other):
1889 if not isinstance(other, ConditionalFreqDist): return False 1890 return other <= self
1891 - def __gt__(self, other):
1892 if not isinstance(other, ConditionalFreqDist): return False 1893 return other < self
1894
1895 - def __repr__(self):
1896 """ 1897 @return: A string representation of this 1898 C{ConditionalProbDist}. 1899 @rtype: C{string} 1900 """ 1901 return '<ConditionalProbDist with %d conditions>' % self.__len__()
1902 1903
1904 - def __repr__(self):
1905 """ 1906 @return: A string representation of this 1907 C{ConditionalFreqDist}. 1908 @rtype: C{string} 1909 """ 1910 n = len(self._fdists) 1911 return '<ConditionalFreqDist with %d conditions>' % n
1912
1913 -class ConditionalProbDistI(object):
1914 """ 1915 A collection of probability distributions for a single experiment 1916 run under different conditions. Conditional probability 1917 distributions are used to estimate the likelihood of each sample, 1918 given the condition under which the experiment was run. For 1919 example, a conditional probability distribution could be used to 1920 estimate the probability of each word type in a document, given 1921 the length of the word type. Formally, a conditional probability 1922 distribution can be defined as a function that maps from each 1923 condition to the C{ProbDist} for the experiment under that 1924 condition. 1925 """
1926 - def __init__(self):
1927 raise AssertionError, 'ConditionalProbDistI is an interface'
1928
1929 - def __getitem__(self, condition):
1930 """ 1931 @return: The probability distribution for the experiment run 1932 under the given condition. 1933 @rtype: C{ProbDistI} 1934 @param condition: The condition whose probability distribution 1935 should be returned. 1936 @type condition: any 1937 """ 1938 raise AssertionError
1939
1940 - def __len__(self):
1941 """ 1942 @return: The number of conditions that are represented by 1943 this C{ConditionalProbDist}. 1944 @rtype: C{int} 1945 """ 1946 raise AssertionError
1947
1948 - def conditions(self):
1949 """ 1950 @return: A list of the conditions that are represented by 1951 this C{ConditionalProbDist}. Use the indexing operator to 1952 access the probability distribution for a given condition. 1953 @rtype: C{list} 1954 """ 1955 raise AssertionError
1956 1957 # For now, this is the only implementation of ConditionalProbDistI; 1958 # but we would want a different implementation if we wanted to build a 1959 # conditional probability distribution analytically (e.g., a gaussian 1960 # distribution), rather than basing it on an underlying frequency 1961 # distribution.
1962 -class ConditionalProbDist(ConditionalProbDistI):
1963 """ 1964 A conditional probability distribution modelling the experiments 1965 that were used to generate a conditional frequency distribution. 1966 A C{ConditoinalProbDist} is constructed from a 1967 C{ConditionalFreqDist} and a X{C{ProbDist} factory}: 1968 1969 - The B{C{ConditionalFreqDist}} specifies the frequency 1970 distribution for each condition. 1971 - The B{C{ProbDist} factory} is a function that takes a 1972 condition's frequency distribution, and returns its 1973 probability distribution. A C{ProbDist} class's name (such as 1974 C{MLEProbDist} or C{HeldoutProbDist}) can be used to specify 1975 that class's constructor. 1976 1977 The first argument to the C{ProbDist} factory is the frequency 1978 distribution that it should model; and the remaining arguments are 1979 specified by the C{factory_args} parameter to the 1980 C{ConditionalProbDist} constructor. For example, the following 1981 code constructs a C{ConditionalProbDist}, where the probability 1982 distribution for each condition is an C{ELEProbDist} with 10 bins: 1983 1984 >>> cpdist = ConditionalProbDist(cfdist, ELEProbDist, 10) 1985 >>> print cpdist['run'].max() 1986 'NN' 1987 >>> print cpdist['run'].prob('NN') 1988 0.0813 1989 """
1990 - def __init__(self, cfdist, probdist_factory, 1991 *factory_args, **factory_kw_args):
1992 """ 1993 Construct a new conditional probability distribution, based on 1994 the given conditional frequency distribution and C{ProbDist} 1995 factory. 1996 1997 @type cfdist: L{ConditionalFreqDist} 1998 @param cfdist: The C{ConditionalFreqDist} specifying the 1999 frequency distribution for each condition. 2000 @type probdist_factory: C{class} or C{function} 2001 @param probdist_factory: The function or class that maps 2002 a condition's frequency distribution to its probability 2003 distribution. The function is called with the frequency 2004 distribution as its first argument, 2005 C{factory_args} as its remaining arguments, and 2006 C{factory_kw_args} as keyword arguments. 2007 @type factory_args: (any) 2008 @param factory_args: Extra arguments for C{probdist_factory}. 2009 These arguments are usually used to specify extra 2010 properties for the probability distributions of individual 2011 conditions, such as the number of bins they contain. 2012 @type factory_kw_args: (any) 2013 @param factory_kw_args: Extra keyword arguments for C{probdist_factory}. 2014 """ 2015 self._probdist_factory = probdist_factory 2016 self._cfdist = cfdist 2017 self._factory_args = factory_args 2018 self._factory_kw_args = factory_kw_args 2019 2020 self._pdists = {} 2021 for c in cfdist.conditions(): 2022 pdist = probdist_factory(cfdist[c], *factory_args, 2023 **factory_kw_args) 2024 self._pdists[c] = pdist
2025
2026 - def __contains__(self, condition):
2027 return condition in self._pdists
2028
2029 - def __getitem__(self, condition):
2030 if condition not in self._pdists: 2031 # If it's a condition we haven't seen, create a new prob 2032 # dist from the empty freq dist. Typically, this will 2033 # give a uniform prob dist. 2034 pdist = self._probdist_factory(FreqDist(), *self._factory_args, 2035 **self._factory_kw_args) 2036 self._pdists[condition] = pdist 2037 2038 return self._pdists[condition]
2039
2040 - def conditions(self):
2041 return self._pdists.keys()
2042
2043 - def __len__(self):
2044 return len(self._pdists)
2045
2046 -class DictionaryConditionalProbDist(ConditionalProbDistI):
2047 """ 2048 An alternative ConditionalProbDist that simply wraps a dictionary of 2049 ProbDists rather than creating these from FreqDists. 2050 """ 2051
2052 - def __init__(self, probdist_dict):
2053 """ 2054 @param probdist_dict: a dictionary containing the probdists indexed 2055 by the conditions 2056 @type probdist_dict: dict any -> probdist 2057 """ 2058 self._dict = probdist_dict
2059
2060 - def __getitem__(self, condition):
2061 # inherit documentation 2062 # this will cause an exception for unseen conditions 2063 return self._dict[condition]
2064
2065 - def conditions(self):
2066 # inherit documentation 2067 return self._dict.keys()
2068 2069 ##////////////////////////////////////////////////////// 2070 ## Adding in log-space. 2071 ##////////////////////////////////////////////////////// 2072 2073 # If the difference is bigger than this, then just take the bigger one: 2074 _ADD_LOGS_MAX_DIFF = math.log(1e-30, 2) 2075
2076 -def add_logs(logx, logy):
2077 """ 2078 Given two numbers C{logx}=M{log(x)} and C{logy}=M{log(y)}, return 2079 M{log(x+y)}. Conceptually, this is the same as returning 2080 M{log(2**(C{logx})+2**(C{logy}))}, but the actual implementation 2081 avoids overflow errors that could result from direct computation. 2082 """ 2083 if (logx < logy + _ADD_LOGS_MAX_DIFF): 2084 return logy 2085 if (logy < logx + _ADD_LOGS_MAX_DIFF): 2086 return logx 2087 base = min(logx, logy) 2088 return base + math.log(2**(logx-base) + 2**(logy-base), 2)
2089
2090 -def sum_logs(logs):
2091 if len(logs) == 0: 2092 # Use some approximation to infinity. What this does 2093 # depends on your system's float implementation. 2094 return _NINF 2095 else: 2096 return reduce(add_logs, logs[1:], logs[0])
2097 2098 ##////////////////////////////////////////////////////// 2099 ## Probabilistic Mix-in 2100 ##////////////////////////////////////////////////////// 2101
2102 -class ProbabilisticMixIn(object):
2103 """ 2104 A mix-in class to associate probabilities with other classes 2105 (trees, rules, etc.). To use the C{ProbabilisticMixIn} class, 2106 define a new class that derives from an existing class and from 2107 ProbabilisticMixIn. You will need to define a new constructor for 2108 the new class, which explicitly calls the constructors of both its 2109 parent classes. For example: 2110 2111 >>> class A: 2112 ... def __init__(self, x, y): self.data = (x,y) 2113 ... 2114 >>> class ProbabilisticA(A, ProbabilisticMixIn): 2115 ... def __init__(self, x, y, **prob_kwarg): 2116 ... A.__init__(self, x, y) 2117 ... ProbabilisticMixIn.__init__(self, **prob_kwarg) 2118 2119 See the documentation for the ProbabilisticMixIn 2120 L{constructor<__init__>} for information about the arguments it 2121 expects. 2122 2123 You should generally also redefine the string representation 2124 methods, the comparison methods, and the hashing method. 2125 """
2126 - def __init__(self, **kwargs):
2127 """ 2128 Initialize this object's probability. This initializer should 2129 be called by subclass constructors. C{prob} should generally be 2130 the first argument for those constructors. 2131 2132 @kwparam prob: The probability associated with the object. 2133 @type prob: C{float} 2134 @kwparam logprob: The log of the probability associated with 2135 the object. 2136 @type logprob: C{float} 2137 """ 2138 if 'prob' in kwargs: 2139 if 'logprob' in kwargs: 2140 raise TypeError('Must specify either prob or logprob ' 2141 '(not both)') 2142 else: 2143 ProbabilisticMixIn.set_prob(self, kwargs['prob']) 2144 elif 'logprob' in kwargs: 2145 ProbabilisticMixIn.set_logprob(self, kwargs['logprob']) 2146 else: 2147 self.__prob = self.__logprob = None
2148
2149 - def set_prob(self, prob):
2150 """ 2151 Set the probability associated with this object to C{prob}. 2152 @param prob: The new probability 2153 @type prob: C{float} 2154 """ 2155 self.__prob = prob 2156 self.__logprob = None
2157
2158 - def set_logprob(self, logprob):
2159 """ 2160 Set the log probability associated with this object to 2161 C{logprob}. I.e., set the probability associated with this 2162 object to C{2**(logprob)}. 2163 @param logprob: The new log probability 2164 @type logprob: C{float} 2165 """ 2166 self.__logprob = logprob 2167 self.__prob = None
2168
2169 - def prob(self):
2170 """ 2171 @return: The probability associated with this object. 2172 @rtype: C{float} 2173 """ 2174 if self.__prob is None: 2175 if self.__logprob is None: return None 2176 self.__prob = 2**(self.__logprob) 2177 return self.__prob
2178
2179 - def logprob(self):
2180 """ 2181 @return: C{log(p)}, where C{p} is the probability associated 2182 with this object. 2183 2184 @rtype: C{float} 2185 """ 2186 if self.__logprob is None: 2187 if self.__prob is None: return None 2188 self.__logprob = math.log(self.__prob, 2) 2189 return self.__logprob
2190
2191 -class ImmutableProbabilisticMixIn(ProbabilisticMixIn):
2192 - def set_prob(self, prob):
2193 raise ValueError, '%s is immutable' % self.__class__.__name__
2194 - def set_logprob(self, prob):
2195 raise ValueError, '%s is immutable' % self.__class__.__name__
2196 2197 ## Helper function for processing keyword arguments 2198
2199 -def _get_kwarg(kwargs, key, default):
2200 if key in kwargs: 2201 arg = kwargs[key] 2202 del kwargs[key] 2203 else: 2204 arg = default 2205 return arg
2206 2207 ##////////////////////////////////////////////////////// 2208 ## Demonstration 2209 ##////////////////////////////////////////////////////// 2210
2211 -def _create_rand_fdist(numsamples, numoutcomes):
2212 """ 2213 Create a new frequency distribution, with random samples. The 2214 samples are numbers from 1 to C{numsamples}, and are generated by 2215 summing two numbers, each of which has a uniform distribution. 2216 """ 2217 import random 2218 fdist = FreqDist() 2219 for x in range(numoutcomes): 2220 y = (random.randint(1, (1+numsamples)/2) + 2221 random.randint(0, numsamples/2)) 2222 fdist.inc(y) 2223 return fdist
2224
2225 -def _create_sum_pdist(numsamples):
2226 """ 2227 Return the true probability distribution for the experiment 2228 C{_create_rand_fdist(numsamples, x)}. 2229 """ 2230 fdist = FreqDist() 2231 for x in range(1, (1+numsamples)/2+1): 2232 for y in range(0, numsamples/2+1): 2233 fdist.inc(x+y) 2234 return MLEProbDist(fdist)
2235
2236 -def demo(numsamples=6, numoutcomes=500):
2237 """ 2238 A demonstration of frequency distributions and probability 2239 distributions. This demonstration creates three frequency 2240 distributions with, and uses them to sample a random process with 2241 C{numsamples} samples. Each frequency distribution is sampled 2242 C{numoutcomes} times. These three frequency distributions are 2243 then used to build six probability distributions. Finally, the 2244 probability estimates of these distributions are compared to the 2245 actual probability of each sample. 2246 2247 @type numsamples: C{int} 2248 @param numsamples: The number of samples to use in each demo 2249 frequency distributions. 2250 @type numoutcomes: C{int} 2251 @param numoutcomes: The total number of outcomes for each 2252 demo frequency distribution. These outcomes are divided into 2253 C{numsamples} bins. 2254 @rtype: C{None} 2255 """ 2256 2257 # Randomly sample a stochastic process three times. 2258 fdist1 = _create_rand_fdist(numsamples, numoutcomes) 2259 fdist2 = _create_rand_fdist(numsamples, numoutcomes) 2260 fdist3 = _create_rand_fdist(numsamples, numoutcomes) 2261 2262 # Use our samples to create probability distributions. 2263 pdists = [ 2264 MLEProbDist(fdist1), 2265 LidstoneProbDist(fdist1, 0.5, numsamples), 2266 HeldoutProbDist(fdist1, fdist2, numsamples), 2267 HeldoutProbDist(fdist2, fdist1, numsamples), 2268 CrossValidationProbDist([fdist1, fdist2, fdist3], numsamples), 2269 GoodTuringProbDist(fdist1), 2270 SimpleGoodTuringProbDist(fdist1), 2271 SimpleGoodTuringProbDist(fdist1, 7), 2272 _create_sum_pdist(numsamples), 2273 ] 2274 2275 # Find the probability of each sample. 2276 vals = [] 2277 for n in range(1,numsamples+1): 2278 vals.append(tuple([n, fdist1.freq(n)] + 2279 [pdist.prob(n) for pdist in pdists])) 2280 2281 # Print the results in a formatted table. 2282 print ('%d samples (1-%d); %d outcomes were sampled for each FreqDist' % 2283 (numsamples, numsamples, numoutcomes)) 2284 print '='*9*(len(pdists)+2) 2285 FORMATSTR = ' FreqDist '+ '%8s '*(len(pdists)-1) + '| Actual' 2286 print FORMATSTR % tuple(`pdist`[1:9] for pdist in pdists[:-1]) 2287 print '-'*9*(len(pdists)+2) 2288 FORMATSTR = '%3d %8.6f ' + '%8.6f '*(len(pdists)-1) + '| %8.6f' 2289 for val in vals: 2290 print FORMATSTR % val 2291 2292 # Print the totals for each column (should all be 1.0) 2293 zvals = zip(*vals) 2294 def sum(lst): return reduce(lambda x,y:x+y, lst, 0) 2295 sums = [sum(val) for val in zvals[1:]] 2296 print '-'*9*(len(pdists)+2) 2297 FORMATSTR = 'Total ' + '%8.6f '*(len(pdists)) + '| %8.6f' 2298 print FORMATSTR % tuple(sums) 2299 print '='*9*(len(pdists)+2) 2300 2301 # Display the distributions themselves, if they're short enough. 2302 if len(`str(fdist1)`) < 70: 2303 print ' fdist1:', str(fdist1) 2304 print ' fdist2:', str(fdist2) 2305 print ' fdist3:', str(fdist3) 2306 print 2307 2308 print 'Generating:' 2309 for pdist in pdists: 2310 fdist = FreqDist(pdist.generate() for i in range(5000)) 2311 print '%20s %s' % (pdist.__class__.__name__[:20], str(fdist)[:55]) 2312 print
2313
2314 -def gt_demo():
2315 from nltk import corpus 2316 emma_words = corpus.gutenberg.words('austen-emma.txt') 2317 fd = FreqDist(emma_words) 2318 gt = GoodTuringProbDist(fd) 2319 sgt = SimpleGoodTuringProbDist(fd) 2320 katz = SimpleGoodTuringProbDist(fd, 7) 2321 print '%18s %8s %12s %14s %12s' \ 2322 % ("word", "freqency", "GoodTuring", "SimpleGoodTuring", "Katz-cutoff" ) 2323 for key in fd.keys(): 2324 print '%18s %8d %12e %14e %12e' \ 2325 % (key, fd[key], gt.prob(key), sgt.prob(key), katz.prob(key))
2326 2327 if __name__ == '__main__': 2328 demo(6, 10) 2329 demo(5, 5000) 2330 gt_demo() 2331 2332 __all__ = ['ConditionalFreqDist', 'ConditionalProbDist', 2333 'ConditionalProbDistI', 'CrossValidationProbDist', 2334 'DictionaryConditionalProbDist', 'DictionaryProbDist', 'ELEProbDist', 2335 'FreqDist', 'GoodTuringProbDist', 'SimpleGoodTuringProbDist', 'HeldoutProbDist', 2336 'ImmutableProbabilisticMixIn', 'LaplaceProbDist', 'LidstoneProbDist', 2337 'MLEProbDist', 'MutableProbDist', 'ProbDistI', 'ProbabilisticMixIn', 2338 'UniformProbDist', 'WittenBellProbDist', 'add_logs', 2339 'log_likelihood', 'sum_logs', 'entropy'] 2340