1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 _NINF = float('-1e300')
18
19 """
20 Classes for representing and processing probabilistic information.
21
22 The L{FreqDist} class is used to encode X{frequency distributions},
23 which count the number of times that each outcome of an experiment
24 occurs.
25
26 The L{ProbDistI} class defines a standard interface for X{probability
27 distributions}, which encode the probability of each outcome for an
28 experiment. There are two types of probability distribution:
29
30 - X{derived probability distributions} are created from frequency
31 distributions. They attempt to model the probability distribution
32 that generated the frequency distribution.
33 - X{analytic probability distributions} are created directly from
34 parameters (such as variance).
35
36 The L{ConditionalFreqDist} class and L{ConditionalProbDistI} interface
37 are used to encode conditional distributions. Conditional probability
38 distributions can be derived or analytic; but currently the only
39 implementation of the C{ConditionalProbDistI} interface is
40 L{ConditionalProbDist}, a derived distribution.
41
42 """
43
44 import math
45 import random
46 import warnings
47 from operator import itemgetter
48 from itertools import imap, islice
49
50 from nltk.compat import all
51
52
53
54
55
56
57
58
60 """
61 A frequency distribution for the outcomes of an experiment. A
62 frequency distribution records the number of times each outcome of
63 an experiment has occurred. For example, a frequency distribution
64 could be used to record the frequency of each word type in a
65 document. Formally, a frequency distribution can be defined as a
66 function mapping from each sample to the number of times that
67 sample occurred as an outcome.
68
69 Frequency distributions are generally constructed by running a
70 number of experiments, and incrementing the count for a sample
71 every time it is an outcome of an experiment. For example, the
72 following code will produce a frequency distribution that encodes
73 how often each word occurs in a text:
74
75 >>> fdist = FreqDist()
76 >>> for word in tokenize.whitespace(sent):
77 ... fdist.inc(word.lower())
78
79 An equivalent way to do this is with the initializer:
80
81 >>> fdist = FreqDist(word.lower() for word in tokenize.whitespace(sent))
82
83 """
85 """
86 Construct a new frequency distribution. If C{samples} is
87 given, then the frequency distribution will be initialized
88 with the count of each object in C{samples}; otherwise, it
89 will be initialized to be empty.
90
91 In particular, C{FreqDist()} returns an empty frequency
92 distribution; and C{FreqDist(samples)} first creates an empty
93 frequency distribution, and then calls C{update} with the
94 list C{samples}.
95
96 @param samples: The samples to initialize the frequency
97 distribution with.
98 @type samples: Sequence
99 """
100 dict.__init__(self)
101 self._N = 0
102 self._reset_caches()
103 if samples:
104 self.update(samples)
105
106 - def inc(self, sample, count=1):
107 """
108 Increment this C{FreqDist}'s count for the given
109 sample.
110
111 @param sample: The sample whose count should be incremented.
112 @type sample: any
113 @param count: The amount to increment the sample's count by.
114 @type count: C{int}
115 @rtype: None
116 @raise NotImplementedError: If C{sample} is not a
117 supported sample type.
118 """
119 if count == 0: return
120 self[sample] = self.get(sample,0) + count
121
123 """
124 Set this C{FreqDist}'s count for the given sample.
125
126 @param sample: The sample whose count should be incremented.
127 @type sample: any hashable object
128 @param count: The new value for the sample's count
129 @type count: C{int}
130 @rtype: None
131 @raise TypeError: If C{sample} is not a supported sample type.
132 """
133
134 self._N += (value - self.get(sample, 0))
135 dict.__setitem__(self, sample, value)
136
137
138 self._reset_caches()
139
141 """
142 @return: The total number of sample outcomes that have been
143 recorded by this C{FreqDist}. For the number of unique
144 sample values (or bins) with counts greater than zero, use
145 C{FreqDist.B()}.
146 @rtype: C{int}
147 """
148 return self._N
149
151 """
152 @return: The total number of sample values (or X{bins}) that
153 have counts greater than zero. For the total
154 number of sample outcomes recorded, use C{FreqDist.N()}.
155 (FreqDist.B() is the same as len(FreqDist).)
156 @rtype: C{int}
157 """
158 return len(self)
159
160
162 """
163 @return: A list of all samples that have been recorded as
164 outcomes by this frequency distribution. Use C{count()}
165 to determine the count for each sample.
166 @rtype: C{list}
167 """
168 return self.keys()
169
171 """
172 @return: A list of all samples that occur once (hapax legomena)
173 @rtype: C{list}
174 """
175 return [item for item in self if self[item] == 1]
176
177 - def Nr(self, r, bins=None):
178 """
179 @return: The number of samples with count r.
180 @rtype: C{int}
181 @type r: C{int}
182 @param r: A sample count.
183 @type bins: C{int}
184 @param bins: The number of possible sample outcomes. C{bins}
185 is used to calculate Nr(0). In particular, Nr(0) is
186 C{bins-self.B()}. If C{bins} is not specified, it
187 defaults to C{self.B()} (so Nr(0) will be 0).
188 """
189 if r < 0: raise IndexError, 'FreqDist.Nr(): r must be non-negative'
190
191
192 if r == 0:
193 if bins is None: return 0
194 else: return bins-self.B()
195
196
197
198
199 if self._Nr_cache is None:
200 self._cache_Nr_values()
201
202 if r >= len(self._Nr_cache): return 0
203 return self._Nr_cache[r]
204
206 Nr = [0]
207 for sample in self:
208 c = self.get(sample, 0)
209 if c >= len(Nr):
210 Nr += [0]*(c+1-len(Nr))
211 Nr[c] += 1
212 self._Nr_cache = Nr
213
214 - def count(self, sample):
215 """
216 Return the count of a given sample. The count of a sample is
217 defined as the number of times that sample outcome was
218 recorded by this C{FreqDist}. Counts are non-negative
219 integers. This method has been replaced by conventional
220 dictionary indexing; use fd[item] instead of fd.count(item).
221
222 @return: The count of a given sample.
223 @rtype: C{int}
224 @param sample: the sample whose count
225 should be returned.
226 @type sample: any.
227 """
228 raise AttributeError, "Use indexing to look up an entry in a FreqDist, e.g. fd[item]"
229
231 """
232 Return the cumulative frequencies of the specified samples.
233 If no samples are specified, all counts are returned, starting
234 with the largest.
235
236 @return: The cumulative frequencies of the given samples.
237 @rtype: C{list} of C{float}
238 @param samples: the samples whose frequencies should be returned.
239 @type sample: any.
240 """
241 cf = 0.0
242 if not samples:
243 samples = self.keys()
244 for sample in samples:
245 cf += self[sample]
246 yield cf
247
248
249
250 - def freq(self, sample):
251 """
252 Return the frequency of a given sample. The frequency of a
253 sample is defined as the count of that sample divided by the
254 total number of sample outcomes that have been recorded by
255 this C{FreqDist}. The count of a sample is defined as the
256 number of times that sample outcome was recorded by this
257 C{FreqDist}. Frequencies are always real numbers in the range
258 [0, 1].
259
260 @return: The frequency of a given sample.
261 @rtype: float
262 @param sample: the sample whose frequency
263 should be returned.
264 @type sample: any
265 """
266 if self._N is 0:
267 return 0
268 return float(self[sample]) / self._N
269
271 """
272 Return the sample with the greatest number of outcomes in this
273 frequency distribution. If two or more samples have the same
274 number of outcomes, return one of them; which sample is
275 returned is undefined. If no outcomes have occurred in this
276 frequency distribution, return C{None}.
277
278 @return: The sample with the maximum number of outcomes in this
279 frequency distribution.
280 @rtype: any or C{None}
281 """
282 if self._max_cache is None:
283 self._max_cache = max([(a,b) for (b,a) in self.items()])[1]
284 return self._max_cache
285
286 - def plot(self, *args, **kwargs):
287 """
288 Plot samples from the frequency distribution
289 displaying the most frequent sample first. If an integer
290 parameter is supplied, stop after this many samples have been
291 plotted. If two integer parameters m, n are supplied, plot a
292 subset of the samples, beginning with m and stopping at n-1.
293 For a cumulative plot, specify cumulative=True.
294 (Requires Matplotlib to be installed.)
295
296 @param title: The title for the graph
297 @type title: C{str}
298 @param cumulative: A flag to specify whether the plot is cumulative (default = False)
299 @type title: C{bool}
300 """
301 try:
302 import pylab
303 except ImportError:
304 raise ValueError('The plot function requires the matplotlib package (aka pylab).'
305 'See http://matplotlib.sourceforge.net/')
306
307 if len(args) == 0:
308 args = [len(self)]
309 samples = list(islice(self, *args))
310
311 cumulative = _get_kwarg(kwargs, 'cumulative', False)
312 if cumulative:
313 freqs = list(self._cumulative_frequencies(samples))
314 ylabel = "Cumulative Counts"
315 else:
316 freqs = [self[sample] for sample in samples]
317 ylabel = "Counts"
318
319
320 pylab.grid(True, color="silver")
321 if not "linewidth" in kwargs:
322 kwargs["linewidth"] = 2
323 if "title" in kwargs:
324 pylab.title(kwargs["title"])
325 del kwargs["title"]
326 pylab.plot(freqs, **kwargs)
327 pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90)
328 pylab.xlabel("Samples")
329 pylab.ylabel(ylabel)
330 pylab.show()
331
333 """
334 Tabulate the given samples from the frequency distribution (cumulative),
335 displaying the most frequent sample first. If an integer
336 parameter is supplied, stop after this many samples have been
337 plotted. If two integer parameters m, n are supplied, plot a
338 subset of the samples, beginning with m and stopping at n-1.
339 (Requires Matplotlib to be installed.)
340
341 @param samples: The samples to plot (default is all samples)
342 @type samples: C{list}
343 """
344 if len(args) == 0:
345 args = [len(self)]
346 samples = list(islice(self, *args))
347
348 cumulative = _get_kwarg(kwargs, 'cumulative', False)
349 if cumulative:
350 freqs = list(self._cumulative_frequencies(samples))
351 else:
352 freqs = [self[sample] for sample in samples]
353
354
355 for i in range(len(samples)):
356 print "%4s" % str(samples[i]),
357 print
358 for i in range(len(samples)):
359 print "%4d" % freqs[i],
360 print
361
363 raise AttributeError, "Use FreqDist.keys(), or iterate over the FreqDist to get its samples in sorted order (most frequent first)"
364
366 raise AttributeError, "Use FreqDist.keys(), or iterate over the FreqDist to get its samples in sorted order (most frequent first)"
367
371
373 """
374 Return the samples sorted in decreasing order of frequency.
375
376 @return: A list of samples, in sorted order
377 @rtype: C{list} of any
378 """
379 self._sort_keys_by_value()
380 return map(itemgetter(0), self._item_cache)
381
383 """
384 Return the samples sorted in decreasing order of frequency.
385
386 @return: A list of samples, in sorted order
387 @rtype: C{list} of any
388 """
389 self._sort_keys_by_value()
390 return map(itemgetter(1), self._item_cache)
391
393 """
394 Return the items sorted in decreasing order of frequency.
395
396 @return: A list of items, in sorted order
397 @rtype: C{list} of C{tuple}
398 """
399 self._sort_keys_by_value()
400 return self._item_cache[:]
401
403 """
404 Return the samples sorted in decreasing order of frequency.
405
406 @return: An iterator over the samples, in sorted order
407 @rtype: C{iter}
408 """
409 return iter(self.keys())
410
412 """
413 Return the samples sorted in decreasing order of frequency.
414
415 @return: An iterator over the samples, in sorted order
416 @rtype: C{iter}
417 """
418 return iter(self.keys())
419
421 """
422 Return the values sorted in decreasing order.
423
424 @return: An iterator over the values, in sorted order
425 @rtype: C{iter}
426 """
427 return iter(self.values())
428
430 """
431 Return the items sorted in decreasing order of frequency.
432
433 @return: An iterator over the items, in sorted order
434 @rtype: C{iter} of any
435 """
436 self._sort_keys_by_value()
437 return iter(self._item_cache)
438
439
440
441
442
444 """
445 Create a copy of this frequency distribution.
446
447 @return: A copy of this frequency distribution object.
448 @rtype: C{FreqDist}
449 """
450 return self.__class__(self)
451
453 """
454 Update the frequency distribution with the provided list of samples.
455 This is a faster way to add multiple samples to the distribution.
456
457 @param samples: The samples to add.
458 @type samples: C{list}
459 """
460 try:
461 sample_iter = samples.iteritems()
462 except:
463 sample_iter = imap(lambda x: (x,1), samples)
464 for sample, count in sample_iter:
465 self.inc(sample, count=count)
466
467 - def pop(self, other):
470
474
479
481 self._Nr_cache = None
482 self._max_cache = None
483 self._item_cache = None
484
490 if not isinstance(other, FreqDist): return False
491 return self.items() == other.items()
493 return not (self == other)
495 if not isinstance(other, FreqDist): return False
496 return set(self).issubset(other) and all(self[key] <= other[key] for key in self)
498 if not isinstance(other, FreqDist): return False
499 return self <= other and self != other
501 if not isinstance(other, FreqDist): return False
502 return other <= self
504 if not isinstance(other, FreqDist): return False
505 return other < self
506
508 """
509 @return: A string representation of this C{FreqDist}.
510 @rtype: string
511 """
512 return '<FreqDist with %d outcomes>' % self.N()
513
515 """
516 @return: A string representation of this C{FreqDist}.
517 @rtype: string
518 """
519 items = ['%r: %r' % (s, self[s]) for s in self]
520 return '<FreqDist: %s>' % ', '.join(items)
521
523 return self.get(sample, 0)
524
525
526
527
528
530 """
531 A probability distribution for the outcomes of an experiment. A
532 probability distribution specifies how likely it is that an
533 experiment will have any given outcome. For example, a
534 probability distribution could be used to predict the probability
535 that a token in a document will have a given type. Formally, a
536 probability distribution can be defined as a function mapping from
537 samples to nonnegative real numbers, such that the sum of every
538 number in the function's range is 1.0. C{ProbDist}s are often
539 used to model the probability distribution of the experiment used
540 to generate a frequency distribution.
541 """
542 SUM_TO_ONE = True
543 """True if the probabilities of the samples in this probability
544 distribution will always sum to one."""
545
547 if self.__class__ == ProbDistI:
548 raise AssertionError, "Interfaces can't be instantiated"
549
550 - def prob(self, sample):
551 """
552 @return: the probability for a given sample. Probabilities
553 are always real numbers in the range [0, 1].
554 @rtype: float
555 @param sample: The sample whose probability
556 should be returned.
557 @type sample: any
558 """
559 raise AssertionError()
560
562 """
563 @return: the base 2 logarithm of the probability for a given
564 sample. Log probabilities range from negitive infinity to
565 zero.
566 @rtype: float
567 @param sample: The sample whose probability
568 should be returned.
569 @type sample: any
570 """
571
572 p = self.prob(sample)
573 if p == 0:
574
575
576 return _NINF
577 else:
578 return math.log(p, 2)
579
581 """
582 @return: the sample with the greatest probability. If two or
583 more samples have the same probability, return one of them;
584 which sample is returned is undefined.
585 @rtype: any
586 """
587 raise AssertionError()
588
589
591 """
592 @return: A list of all samples that have nonzero
593 probabilities. Use C{prob} to find the probability of
594 each sample.
595 @rtype: C{list}
596 """
597 raise AssertionError()
598
599
601 """
602 @return: The ratio by which counts are discounted on average: c*/c
603 @rtype: C{float}
604 """
605 return 0.0
606
607
608
610 """
611 @return: A randomly selected sample from this probability
612 distribution. The probability of returning each sample
613 C{samp} is equal to C{self.prob(samp)}.
614 """
615 p = random.random()
616 for sample in self.samples():
617 p -= self.prob(sample)
618 if p <= 0: return sample
619
620 if p < .0001:
621 return sample
622
623 if self.SUM_TO_ONE:
624 warnings.warn("Probability distribution %r sums to %r; generate()"
625 " is returning an arbitrary sample." % (self, 1-p))
626 return random.choice(list(self.samples()))
627
658
660 """
661 A probability distribution whose probabilities are directly
662 specified by a given dictionary. The given dictionary maps
663 samples to probabilities.
664 """
665 - def __init__(self, prob_dict=None, log=False, normalize=False):
666 """
667 Construct a new probability distribution from the given
668 dictionary, which maps values to probabilities (or to log
669 probabilities, if C{log} is true). If C{normalize} is
670 true, then the probability values are scaled by a constant
671 factor such that they sum to 1.
672 """
673 self._prob_dict = prob_dict.copy()
674 self._log = log
675
676
677 if normalize:
678 if log:
679 value_sum = sum_logs(self._prob_dict.values())
680 if value_sum <= _NINF:
681 logp = math.log(1.0/len(prob_dict), 2)
682 for x in prob_dict.keys():
683 self._prob_dict[x] = logp
684 else:
685 for (x, p) in self._prob_dict.items():
686 self._prob_dict[x] -= value_sum
687 else:
688 value_sum = sum(self._prob_dict.values())
689 if value_sum == 0:
690 p = 1.0/len(prob_dict)
691 for x in prob_dict:
692 self._prob_dict[x] = p
693 else:
694 norm_factor = 1.0/value_sum
695 for (x, p) in self._prob_dict.items():
696 self._prob_dict[x] *= norm_factor
697
698 - def prob(self, sample):
699 if self._log:
700 if sample not in self._prob_dict: return 0
701 else: return 2**(self._prob_dict[sample])
702 else:
703 return self._prob_dict.get(sample, 0)
704
706 if self._log:
707 return self._prob_dict.get(sample, _NINF)
708 else:
709 if sample not in self._prob_dict: return _NINF
710 elif self._prob_dict[sample] == 0: return _NINF
711 else: return math.log(self._prob_dict[sample], 2)
712
714 if not hasattr(self, '_max'):
715 self._max = max((p,v) for (v,p) in self._prob_dict.items())[1]
716 return self._max
718 return self._prob_dict.keys()
720 return '<ProbDist with %d samples>' % len(self._prob_dict)
721
723 """
724 The maximum likelihood estimate for the probability distribution
725 of the experiment used to generate a frequency distribution. The
726 X{maximum likelihood estimate} approximates the probability of
727 each sample as the frequency of that sample in the frequency
728 distribution.
729 """
731 """
732 Use the maximum likelihood estimate to create a probability
733 distribution for the experiment used to generate C{freqdist}.
734
735 @type freqdist: C{FreqDist}
736 @param freqdist: The frequency distribution that the
737 probability estimates should be based on.
738 """
739 self._freqdist = freqdist
740
742 """
743 @return: The frequency distribution that this probability
744 distribution is based on.
745 @rtype: C{FreqDist}
746 """
747 return self._freqdist
748
749 - def prob(self, sample):
750 return self._freqdist.freq(sample)
751
753 return self._freqdist.max()
754
756 return self._freqdist.keys()
757
759 """
760 @rtype: C{string}
761 @return: A string representation of this C{ProbDist}.
762 """
763 return '<MLEProbDist based on %d samples>' % self._freqdist.N()
764
766 """
767 The Lidstone estimate for the probability distribution of the
768 experiment used to generate a frequency distribution. The
769 C{Lidstone estimate} is paramaterized by a real number M{gamma},
770 which typically ranges from 0 to 1. The X{Lidstone estimate}
771 approximates the probability of a sample with count M{c} from an
772 experiment with M{N} outcomes and M{B} bins as
773 M{(c+gamma)/(N+B*gamma)}. This is equivalant to adding
774 M{gamma} to the count for each bin, and taking the maximum
775 likelihood estimate of the resulting frequency distribution.
776 """
777 SUM_TO_ONE = False
778 - def __init__(self, freqdist, gamma, bins=None):
779 """
780 Use the Lidstone estimate to create a probability distribution
781 for the experiment used to generate C{freqdist}.
782
783 @type freqdist: C{FreqDist}
784 @param freqdist: The frequency distribution that the
785 probability estimates should be based on.
786 @type gamma: C{float}
787 @param gamma: A real number used to paramaterize the
788 estimate. The Lidstone estimate is equivalant to adding
789 M{gamma} to the count for each bin, and taking the
790 maximum likelihood estimate of the resulting frequency
791 distribution.
792 @type bins: C{int}
793 @param bins: The number of sample values that can be generated
794 by the experiment that is described by the probability
795 distribution. This value must be correctly set for the
796 probabilities of the sample values to sum to one. If
797 C{bins} is not specified, it defaults to C{freqdist.B()}.
798 """
799 if (bins == 0) or (bins is None and freqdist.N() == 0):
800 name = self.__class__.__name__[:-8]
801 raise ValueError('A %s probability distribution ' % name +
802 'must have at least one bin.')
803 if (bins is not None) and (bins < freqdist.B()):
804 name = self.__class__.__name__[:-8]
805 raise ValueError('\nThe number of bins in a %s distribution ' % name +
806 '(%d) must be greater than or equal to\n' % bins +
807 'the number of bins in the FreqDist used ' +
808 'to create it (%d).' % freqdist.N())
809
810 self._freqdist = freqdist
811 self._gamma = float(gamma)
812 self._N = self._freqdist.N()
813
814 if bins is None: bins = freqdist.B()
815 self._bins = bins
816
817 self._divisor = self._N + bins * gamma
818 if self._divisor == 0.0:
819
820
821 self._gamma = 0
822 self._divisor = 1
823
825 """
826 @return: The frequency distribution that this probability
827 distribution is based on.
828 @rtype: C{FreqDist}
829 """
830 return self._freqdist
831
832 - def prob(self, sample):
833 c = self._freqdist[sample]
834 return (c + self._gamma) / self._divisor
835
837
838
839
840 return self._freqdist.max()
841
843 return self._freqdist.keys()
844
846 gb = self._gamma * self._bins
847 return gb / (self._N + gb)
848
850 """
851 @rtype: C{string}
852 @return: A string representation of this C{ProbDist}.
853 """
854 return '<LidstoneProbDist based on %d samples>' % self._freqdist.N()
855
856
858 """
859 The Laplace estimate for the probability distribution of the
860 experiment used to generate a frequency distribution. The
861 X{Lidstone estimate} approximates the probability of a sample with
862 count M{c} from an experiment with M{N} outcomes and M{B} bins as
863 M{(c+1)/(N+B)}. This is equivalant to adding one to the count for
864 each bin, and taking the maximum likelihood estimate of the
865 resulting frequency distribution.
866 """
867 - def __init__(self, freqdist, bins=None):
868 """
869 Use the Laplace estimate to create a probability distribution
870 for the experiment used to generate C{freqdist}.
871
872 @type freqdist: C{FreqDist}
873 @param freqdist: The frequency distribution that the
874 probability estimates should be based on.
875 @type bins: C{int}
876 @param bins: The number of sample values that can be generated
877 by the experiment that is described by the probability
878 distribution. This value must be correctly set for the
879 probabilities of the sample values to sum to one. If
880 C{bins} is not specified, it defaults to C{freqdist.B()}.
881 """
882 LidstoneProbDist.__init__(self, freqdist, 1, bins)
883
885 """
886 @rtype: C{string}
887 @return: A string representation of this C{ProbDist}.
888 """
889 return '<LaplaceProbDist based on %d samples>' % self._freqdist.N()
890
892 """
893 The expected likelihood estimate for the probability distribution
894 of the experiment used to generate a frequency distribution. The
895 X{expected likelihood estimate} approximates the probability of a
896 sample with count M{c} from an experiment with M{N} outcomes and
897 M{B} bins as M{(c+0.5)/(N+B/2)}. This is equivalant to adding 0.5
898 to the count for each bin, and taking the maximum likelihood
899 estimate of the resulting frequency distribution.
900 """
901 - def __init__(self, freqdist, bins=None):
902 """
903 Use the expected likelihood estimate to create a probability
904 distribution for the experiment used to generate C{freqdist}.
905
906 @type freqdist: C{FreqDist}
907 @param freqdist: The frequency distribution that the
908 probability estimates should be based on.
909 @type bins: C{int}
910 @param bins: The number of sample values that can be generated
911 by the experiment that is described by the probability
912 distribution. This value must be correctly set for the
913 probabilities of the sample values to sum to one. If
914 C{bins} is not specified, it defaults to C{freqdist.B()}.
915 """
916 LidstoneProbDist.__init__(self, freqdist, 0.5, bins)
917
919 """
920 @rtype: C{string}
921 @return: A string representation of this C{ProbDist}.
922 """
923 return '<ELEProbDist based on %d samples>' % self._freqdist.N()
924
926 """
927 The heldout estimate for the probability distribution of the
928 experiment used to generate two frequency distributions. These
929 two frequency distributions are called the "heldout frequency
930 distribution" and the "base frequency distribution." The
931 X{heldout estimate} uses uses the X{heldout frequency
932 distribution} to predict the probability of each sample, given its
933 frequency in the X{base frequency distribution}.
934
935 In particular, the heldout estimate approximates the probability
936 for a sample that occurs M{r} times in the base distribution as
937 the average frequency in the heldout distribution of all samples
938 that occur M{r} times in the base distribution.
939
940 This average frequency is M{Tr[r]/(Nr[r]*N)}, where:
941 - M{Tr[r]} is the total count in the heldout distribution for
942 all samples that occur M{r} times in the base
943 distribution.
944 - M{Nr[r]} is the number of samples that occur M{r} times in
945 the base distribution.
946 - M{N} is the number of outcomes recorded by the heldout
947 frequency distribution.
948
949 In order to increase the efficiency of the C{prob} member
950 function, M{Tr[r]/(Nr[r]*N)} is precomputed for each value of M{r}
951 when the C{HeldoutProbDist} is created.
952
953 @type _estimate: C{list} of C{float}
954 @ivar _estimate: A list mapping from M{r}, the number of
955 times that a sample occurs in the base distribution, to the
956 probability estimate for that sample. C{_estimate[M{r}]} is
957 calculated by finding the average frequency in the heldout
958 distribution of all samples that occur M{r} times in the base
959 distribution. In particular, C{_estimate[M{r}]} =
960 M{Tr[r]/(Nr[r]*N)}.
961 @type _max_r: C{int}
962 @ivar _max_r: The maximum number of times that any sample occurs
963 in the base distribution. C{_max_r} is used to decide how
964 large C{_estimate} must be.
965 """
966 SUM_TO_ONE = False
967 - def __init__(self, base_fdist, heldout_fdist, bins=None):
968 """
969 Use the heldout estimate to create a probability distribution
970 for the experiment used to generate C{base_fdist} and
971 C{heldout_fdist}.
972
973 @type base_fdist: C{FreqDist}
974 @param base_fdist: The base frequency distribution.
975 @type heldout_fdist: C{FreqDist}
976 @param heldout_fdist: The heldout frequency distribution.
977 @type bins: C{int}
978 @param bins: The number of sample values that can be generated
979 by the experiment that is described by the probability
980 distribution. This value must be correctly set for the
981 probabilities of the sample values to sum to one. If
982 C{bins} is not specified, it defaults to C{freqdist.B()}.
983 """
984
985 self._base_fdist = base_fdist
986 self._heldout_fdist = heldout_fdist
987
988
989 self._max_r = base_fdist[base_fdist.max()]
990
991
992 Tr = self._calculate_Tr()
993 Nr = [base_fdist.Nr(r, bins) for r in range(self._max_r+1)]
994 N = heldout_fdist.N()
995
996
997
998 self._estimate = self._calculate_estimate(Tr, Nr, N)
999
1001 """
1002 @return: the list M{Tr}, where M{Tr[r]} is the total count in
1003 C{heldout_fdist} for all samples that occur M{r}
1004 times in C{base_fdist}.
1005 @rtype: C{list} of C{float}
1006 """
1007 Tr = [0.0] * (self._max_r+1)
1008 for sample in self._heldout_fdist:
1009 r = self._base_fdist[sample]
1010 Tr[r] += self._heldout_fdist[sample]
1011 return Tr
1012
1014 """
1015 @return: the list M{estimate}, where M{estimate[r]} is the
1016 probability estimate for any sample that occurs M{r} times
1017 in the base frequency distribution. In particular,
1018 M{estimate[r]} is M{Tr[r]/(N[r]*N)}. In the special case
1019 that M{N[r]=0}, M{estimate[r]} will never be used; so we
1020 define M{estimate[r]=None} for those cases.
1021 @rtype: C{list} of C{float}
1022 @type Tr: C{list} of C{float}
1023 @param Tr: the list M{Tr}, where M{Tr[r]} is the total count in
1024 the heldout distribution for all samples that occur M{r}
1025 times in base distribution.
1026 @type Nr: C{list} of C{float}
1027 @param Nr: The list M{Nr}, where M{Nr[r]} is the number of
1028 samples that occur M{r} times in the base distribution.
1029 @type N: C{int}
1030 @param N: The total number of outcomes recorded by the heldout
1031 frequency distribution.
1032 """
1033 estimate = []
1034 for r in range(self._max_r+1):
1035 if Nr[r] == 0: estimate.append(None)
1036 else: estimate.append(Tr[r]/(Nr[r]*N))
1037 return estimate
1038
1040 """
1041 @return: The base frequency distribution that this probability
1042 distribution is based on.
1043 @rtype: C{FreqDist}
1044 """
1045 return self._base_fdist
1046
1048 """
1049 @return: The heldout frequency distribution that this
1050 probability distribution is based on.
1051 @rtype: C{FreqDist}
1052 """
1053 return self._heldout_fdist
1054
1056 return self._base_fdist.keys()
1057
1058 - def prob(self, sample):
1059
1060 r = self._base_fdist[sample]
1061 return self._estimate[r]
1062
1064
1065
1066
1067 return self._base_fdist.max()
1068
1070 raise NotImplementedError()
1071
1073 """
1074 @rtype: C{string}
1075 @return: A string representation of this C{ProbDist}.
1076 """
1077 s = '<HeldoutProbDist: %d base samples; %d heldout samples>'
1078 return s % (self._base_fdist.N(), self._heldout_fdist.N())
1079
1081 """
1082 The cross-validation estimate for the probability distribution of
1083 the experiment used to generate a set of frequency distribution.
1084 The X{cross-validation estimate} for the probability of a sample
1085 is found by averaging the held-out estimates for the sample in
1086 each pair of frequency distributions.
1087 """
1088 SUM_TO_ONE = False
1090 """
1091 Use the cross-validation estimate to create a probability
1092 distribution for the experiment used to generate
1093 C{freqdists}.
1094
1095 @type freqdists: C{list} of C{FreqDist}
1096 @param freqdists: A list of the frequency distributions
1097 generated by the experiment.
1098 @type bins: C{int}
1099 @param bins: The number of sample values that can be generated
1100 by the experiment that is described by the probability
1101 distribution. This value must be correctly set for the
1102 probabilities of the sample values to sum to one. If
1103 C{bins} is not specified, it defaults to C{freqdist.B()}.
1104 """
1105 self._freqdists = freqdists
1106
1107
1108
1109 self._heldout_probdists = []
1110 for fdist1 in freqdists:
1111 for fdist2 in freqdists:
1112 if fdist1 is not fdist2:
1113 probdist = HeldoutProbDist(fdist1, fdist2, bins)
1114 self._heldout_probdists.append(probdist)
1115
1117 """
1118 @rtype: C{list} of C{FreqDist}
1119 @return: The list of frequency distributions that this
1120 C{ProbDist} is based on.
1121 """
1122 return self._freqdists
1123
1125
1126 return set(sum([fd.keys() for fd in self._freqdists], []))
1127
1128 - def prob(self, sample):
1129
1130
1131 prob = 0.0
1132 for heldout_probdist in self._heldout_probdists:
1133 prob += heldout_probdist.prob(sample)
1134 return prob/len(self._heldout_probdists)
1135
1137 raise NotImplementedError()
1138
1140 """
1141 @rtype: C{string}
1142 @return: A string representation of this C{ProbDist}.
1143 """
1144 return '<CrossValidationProbDist: %d-way>' % len(self._freqdists)
1145
1147 """
1148 The Witten-Bell estimate of a probability distribution. This distribution
1149 allocates uniform probability mass to as yet unseen events by using the
1150 number of events that have only been seen once. The probability mass
1151 reserved for unseen events is equal to:
1152
1153 - M{T / (N + T)}
1154
1155 where M{T} is the number of observed event types and M{N} is the total
1156 number of observed events. This equates to the maximum likelihood estimate
1157 of a new type event occuring. The remaining probability mass is discounted
1158 such that all probability estimates sum to one, yielding:
1159
1160 - M{p = T / Z (N + T)}, if count = 0
1161 - M{p = c / (N + T)}, otherwise
1162 """
1163
1164 - def __init__(self, freqdist, bins=None):
1165 """
1166 Creates a distribution of Witten-Bell probability estimates. This
1167 distribution allocates uniform probability mass to as yet unseen
1168 events by using the number of events that have only been seen once.
1169 The probability mass reserved for unseen events is equal to:
1170
1171 - M{T / (N + T)}
1172
1173 where M{T} is the number of observed event types and M{N} is the total
1174 number of observed events. This equates to the maximum likelihood
1175 estimate of a new type event occuring. The remaining probability mass
1176 is discounted such that all probability estimates sum to one,
1177 yielding:
1178
1179 - M{p = T / Z (N + T)}, if count = 0
1180 - M{p = c / (N + T)}, otherwise
1181
1182 The parameters M{T} and M{N} are taken from the C{freqdist} parameter
1183 (the C{B()} and C{N()} values). The normalising factor M{Z} is
1184 calculated using these values along with the C{bins} parameter.
1185
1186 @param freqdist: The frequency counts upon which to base the
1187 estimation.
1188 @type freqdist: C{FreqDist}
1189 @param bins: The number of possible event types. This must be
1190 at least as large as the number of bins in the
1191 C{freqdist}. If C{None}, then it's assumed to be
1192 equal to that of the C{freqdist}
1193 @type bins: C{Int}
1194 """
1195 assert bins == None or bins >= freqdist.B(),\
1196 'Bins parameter must not be less than freqdist.B()'
1197 if bins == None:
1198 bins = freqdist.B()
1199 self._freqdist = freqdist
1200 self._T = self._freqdist.B()
1201 self._Z = bins - self._freqdist.B()
1202 self._N = self._freqdist.N()
1203
1204 if self._N==0:
1205
1206 self._P0 = 1.0 / self._Z
1207 else:
1208 self._P0 = self._T / float(self._Z * (self._N + self._T))
1209
1210 - def prob(self, sample):
1211
1212 c = self._freqdist[sample]
1213 if c == 0:
1214 return self._P0
1215 else:
1216 return c / float(self._N + self._T)
1217
1219 return self._freqdist.max()
1220
1222 return self._freqdist.keys()
1223
1225 return self._freqdist
1226
1228 raise NotImplementedError()
1229
1231 """
1232 @rtype: C{string}
1233 @return: A string representation of this C{ProbDist}.
1234 """
1235 return '<WittenBellProbDist based on %d samples>' % self._freqdist.N()
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1280 """
1281 The Good-Turing estimate of a probability distribution. This method
1282 calculates the probability mass to assign to events with zero or low
1283 counts based on the number of events with higher counts. It does so by
1284 using the smoothed count M{c*}:
1285
1286 - M{c* = (c + 1) N(c + 1) / N(c)} for c >= 1
1287 - M{things with frequency zero in training} = N(1) for c == 0
1288
1289 where M{c} is the original count, M{N(i)} is the number of event types
1290 observed with count M{i}. We can think the count of unseen as the count
1291 of frequency one.
1292 (see Jurafsky & Martin 2nd Edition, p101)
1293
1294 """
1295 - def __init__(self, freqdist, bins=None):
1296 """
1297 @param freqdist: The frequency counts upon which to base the
1298 estimation.
1299 @type freqdist: C{FreqDist}
1300 @param bins: The number of possible event types. This must be
1301 at least as large as the number of bins in the
1302 C{freqdist}. If C{None}, then it's assumed to be
1303 equal to that of the C{freqdist}
1304 @type bins: C{Int}
1305 """
1306 assert bins == None or bins >= freqdist.B(),\
1307 'Bins parameter must not be less than freqdist.B()'
1308 if bins == None:
1309 bins = freqdist.B()
1310 self._freqdist = freqdist
1311 self._bins = bins
1312
1313 - def prob(self, sample):
1314 count = self._freqdist[sample]
1315
1316
1317 if count == 0 and self._freqdist.N() != 0:
1318 p0 = 1.0 * self._freqdist.Nr(1) / self._freqdist.N()
1319 if self._bins == self._freqdist.B():
1320 p0 = 0.0
1321 else:
1322 p0 = p0 / (1.0 * self._bins - self._freqdist.B())
1323
1324 nc = self._freqdist.Nr(count)
1325 ncn = self._freqdist.Nr(count + 1)
1326
1327
1328 if nc == 0 or self._freqdist.N() == 0:
1329 return 0
1330
1331 return 1.0 * (count + 1) * ncn / (nc * self._freqdist.N())
1332
1334 return self._freqdist.max()
1335
1337 return self._freqdist.keys()
1338
1340 """
1341 @return: The probability mass transferred from the
1342 seen samples to the unseen samples.
1343 @rtype: C{float}
1344 """
1345 return 1.0 * self._freqdist.Nr(1) / self._freqdist.N()
1346
1348 return self._freqdist
1349
1351 """
1352 @rtype: C{string}
1353 @return: A string representation of this C{ProbDist}.
1354 """
1355 return '<GoodTuringProbDist based on %d samples>' % self._freqdist.N()
1356
1357
1358
1359
1360
1361
1362
1364 """
1365 SimpleGoodTuring ProbDist approximates from frequency to freqency of
1366 frequency into a linear line under log space by linear regression.
1367 Details of Simple Good-Turing algorithm can be found in:
1368 (1) Bill Gale and Geoffrey Sampson's joint paper
1369 "Good Turing Smoothing Without Tear", published in
1370 Journal of Quantitative Linguistics, vol. 2 pp. 217-237, 1995
1371 (2) Jurafsky & Martin's Book "Speech and Language Processing"
1372 2e Chap 4.5 p103 (log(Nc) = a + b*log(c))
1373 (3) Website maintained by Geoffrey Sampson:
1374 http://www.grsampson.net/RGoodTur.html
1375
1376 Given a set of pair (xi, yi), where the xi denotes the freqency and
1377 yi denotes the freqency of freqency, we want to minimize their
1378 square variation. E(x) and E(y) represent the mean of xi and yi.
1379
1380 -Slope: b = sigma ((xi-E(x)*(yi-E(y))) / sigma ((xi-E(x))*(xi-E(x)))
1381 -Intercept: a = E(y)- b * E(x)
1382
1383 """
1384 - def __init__(self, freqdist, bins=None):
1385 """
1386 @param freqdist: The frequency counts upon which to base the
1387 estimation.
1388 @type freqdist: C{FreqDist}
1389 @param bins: The number of possible event types. This must be
1390 at least as large as the number of bins in the
1391 C{freqdist}. If C{None}, then it's assumed to be
1392 equal to that of the C{freqdist}
1393 @type bins: C{Int}
1394 """
1395 assert bins == None or bins >= freqdist.B(),\
1396 'Bins parameter must not be less than freqdist.B()'
1397 if bins == None:
1398 bins = freqdist.B()
1399 self._freqdist = freqdist
1400 self._bins = bins
1401 r, nr = self._r_Nr()
1402 self.find_best_fit(r, nr)
1403 self._switch(r, nr)
1404 self._renormalize(r, nr)
1405
1407 """
1408 Split the frequency distribution in two list (r, Nr), where Nr(r) > 0
1409 """
1410 r, nr = [], []
1411 b, i = 0, 0
1412 while b != self._freqdist.B():
1413 nr_i = self._freqdist.Nr(i)
1414 if nr_i > 0:
1415 b += nr_i
1416 r.append(i)
1417 nr.append(nr_i)
1418 i += 1
1419 return (r, nr)
1420
1422 """
1423 Use simple linear regression to tune parameters self._slope and
1424 self._intercept in the log-log space based on count and Nr(count)
1425 (Work in log space to avoid floating point underflow.)
1426 """
1427
1428
1429
1430
1431
1432 if not r or not nr:
1433
1434 return
1435
1436 zr = []
1437 for j in range(len(r)):
1438 if j > 0:
1439 i = r[j-1]
1440 else:
1441 i = 0
1442 if j != len(r) - 1:
1443 k = r[j+1]
1444 else:
1445 k = 2 * r[j] - i
1446 zr_ = 2.0 * nr[j] / (k - i)
1447 zr.append(zr_)
1448
1449 log_r = [math.log(i) for i in r]
1450 log_zr = [math.log(i) for i in zr]
1451
1452 xy_cov = x_var = 0.0
1453 x_mean = 1.0 * sum(log_r) / len(log_r)
1454 y_mean = 1.0 * sum(log_zr) / len(log_zr)
1455 for (x, y) in zip(log_r, log_zr):
1456 xy_cov += (x - x_mean) * (y - y_mean)
1457 x_var += (x - x_mean)**2
1458 if x_var != 0:
1459 self._slope = xy_cov / x_var
1460 else:
1461 self._slope = 0.0
1462 self._intercept = y_mean - self._slope * x_mean
1463
1465 """
1466 Calculate the r frontier where we must switch from Nr to Sr
1467 when estimating E[Nr].
1468 """
1469 for i, r_ in enumerate(r):
1470 if len(r) == i + 1 or r[i+1] != r_ + 1:
1471
1472 self._switch_at = r_
1473 break
1474
1475 Sr = self.smoothedNr
1476 smooth_r_star = (r_ + 1) * Sr(r_+1) / Sr(r_)
1477 unsmooth_r_star = 1.0 * (r_ + 1) * nr[i+1] / nr[i]
1478
1479 std = math.sqrt(self._variance(r_, nr[i], nr[i+1]))
1480 if abs(unsmooth_r_star-smooth_r_star) <= 1.96 * std:
1481 self._switch_at = r_
1482 break
1483
1485 r = float(r)
1486 nr = float(nr)
1487 nr_1 = float(nr_1)
1488 return (r + 1.0)**2 * (nr_1 / nr**2) * (1.0 + nr_1 / nr)
1489
1491 """
1492 It is necessary to renormalize all the probability estimates to
1493 ensure a proper probability distribution results. This can be done
1494 by keeping the estimate of the probability mass for unseen items as
1495 N(1)/N and renormalizing all the estimates for previously seen items
1496 (as Gale and Sampson (1995) propose). (See M&S P.213, 1999)
1497 """
1498 prob_cov = 0.0
1499 for r_, nr_ in zip(r, nr):
1500 prob_cov += nr_ * self._prob_measure(r_)
1501 if prob_cov:
1502 self._renormal = (1 - self._prob_measure(0)) / prob_cov
1503
1505 """
1506 @return: The number of samples with count r.
1507 @rtype: C{float}
1508 @param r: The amount of freqency.
1509 @type r: C{int}
1510 """
1511
1512
1513
1514
1515
1516
1517 return math.exp(self._intercept + self._slope * math.log(r))
1518
1519 - def prob(self, sample):
1520 """
1521 @param sample: sample of the event
1522 @type sample: C{string}
1523 @return: The sample's probability.
1524 @rtype: C{float}
1525 """
1526 count = self._freqdist[sample]
1527 p = self._prob_measure(count)
1528 if count == 0:
1529 if self._bins == self._freqdist.B():
1530 p = 0.0
1531 else:
1532 p = p / (1.0 * self._bins - self._freqdist.B())
1533 else:
1534 p = p * self._renormal
1535 return p
1536
1538 if count == 0 and self._freqdist.N() == 0 :
1539 return 1.0
1540 elif count == 0 and self._freqdist.N() != 0:
1541 return 1.0 * self._freqdist.Nr(1) / self._freqdist.N()
1542
1543 if self._switch_at > count:
1544 Er_1 = 1.0 * self._freqdist.Nr(count+1)
1545 Er = 1.0 * self._freqdist.Nr(count)
1546 else:
1547 Er_1 = self.smoothedNr(count+1)
1548 Er = self.smoothedNr(count)
1549
1550 r_star = (count + 1) * Er_1 / Er
1551 return r_star / self._freqdist.N()
1552
1554 prob_sum = 0.0
1555 for i in range(0, len(self._Nr)):
1556 prob_sum += self._Nr[i] * self._prob_measure(i) / self._renormal
1557 print "Probability Sum:", prob_sum
1558
1559
1561 """
1562 This function returns the total mass of probability transfers from the
1563 seen samples to the unseen samples.
1564 """
1565 return 1.0 * self.smoothedNr(1) / self._freqdist.N()
1566
1568 return self._freqdist.max()
1569
1571 return self._freqdist.keys()
1572
1574 return self._freqdist
1575
1577 """
1578 @rtype: C{string}
1579 @return: A string representation of this C{ProbDist}.
1580 """
1581 return '<SimpleGoodTuringProbDist based on %d samples>'\
1582 % self._freqdist.N()
1583
1584
1586 """
1587 An mutable probdist where the probabilities may be easily modified. This
1588 simply copies an existing probdist, storing the probability values in a
1589 mutable dictionary and providing an update method.
1590 """
1591
1592 - def __init__(self, prob_dist, samples, store_logs=True):
1593 """
1594 Creates the mutable probdist based on the given prob_dist and using
1595 the list of samples given. These values are stored as log
1596 probabilities if the store_logs flag is set.
1597
1598 @param prob_dist: the distribution from which to garner the
1599 probabilities
1600 @type prob_dist: ProbDist
1601 @param samples: the complete set of samples
1602 @type samples: sequence of any
1603 @param store_logs: whether to store the probabilities as logarithms
1604 @type store_logs: bool
1605 """
1606 try:
1607 import numpy
1608 except ImportError:
1609 print "Error: Please install numpy; for instructions see http://www.nltk.org/"
1610 exit()
1611 self._samples = samples
1612 self._sample_dict = dict((samples[i], i) for i in range(len(samples)))
1613 self._data = numpy.zeros(len(samples), numpy.float64)
1614 for i in range(len(samples)):
1615 if store_logs:
1616 self._data[i] = prob_dist.logprob(samples[i])
1617 else:
1618 self._data[i] = prob_dist.prob(samples[i])
1619 self._logs = store_logs
1620
1622
1623 return self._samples
1624
1625 - def prob(self, sample):
1626
1627 i = self._sample_dict.get(sample)
1628 if i != None:
1629 if self._logs:
1630 return 2**(self._data[i])
1631 else:
1632 return self._data[i]
1633 else:
1634 return 0.0
1635
1637
1638 i = self._sample_dict.get(sample)
1639 if i != None:
1640 if self._logs:
1641 return self._data[i]
1642 else:
1643 return math.log(self._data[i], 2)
1644 else:
1645 return float('-inf')
1646
1647 - def update(self, sample, prob, log=True):
1648 """
1649 Update the probability for the given sample. This may cause the object
1650 to stop being the valid probability distribution - the user must
1651 ensure that they update the sample probabilities such that all samples
1652 have probabilities between 0 and 1 and that all probabilities sum to
1653 one.
1654
1655 @param sample: the sample for which to update the probability
1656 @type sample: C{any}
1657 @param prob: the new probability
1658 @type prob: C{float}
1659 @param log: is the probability already logged
1660 @type log: C{bool}
1661 """
1662 i = self._sample_dict.get(sample)
1663 assert i != None
1664 if self._logs:
1665 if log: self._data[i] = prob
1666 else: self._data[i] = math.log(prob, 2)
1667 else:
1668 if log: self._data[i] = 2**(prob)
1669 else: self._data[i] = prob
1670
1671
1672
1673
1674
1676 if (not isinstance(test_pdist, ProbDistI) or
1677 not isinstance(actual_pdist, ProbDistI)):
1678 raise ValueError('expected a ProbDist.')
1679
1680 return sum(actual_pdist.prob(s) * math.log(test_pdist.prob(s), 2)
1681 for s in actual_pdist.keys())
1682
1684 probs = [pdist.prob(s) for s in pdist.samples()]
1685 return -sum([p * math.log(p,2) for p in probs])
1686
1687
1688
1689
1690
1692 """
1693 A collection of frequency distributions for a single experiment
1694 run under different conditions. Conditional frequency
1695 distributions are used to record the number of times each sample
1696 occurred, given the condition under which the experiment was run.
1697 For example, a conditional frequency distribution could be used to
1698 record the frequency of each word (type) in a document, given its
1699 length. Formally, a conditional frequency distribution can be
1700 defined as a function that maps from each condition to the
1701 C{FreqDist} for the experiment under that condition.
1702
1703 The frequency distribution for each condition is accessed using
1704 the indexing operator:
1705
1706 >>> cfdist[3]
1707 <FreqDist with 73 outcomes>
1708 >>> cfdist[3].freq('the')
1709 0.4
1710 >>> cfdist[3]['dog']
1711 2
1712
1713 When the indexing operator is used to access the frequency
1714 distribution for a condition that has not been accessed before,
1715 C{ConditionalFreqDist} creates a new empty C{FreqDist} for that
1716 condition.
1717
1718 Conditional frequency distributions are typically constructed by
1719 repeatedly running an experiment under a variety of conditions,
1720 and incrementing the sample outcome counts for the appropriate
1721 conditions. For example, the following code will produce a
1722 conditional frequency distribution that encodes how often each
1723 word type occurs, given the length of that word type:
1724
1725 >>> cfdist = ConditionalFreqDist()
1726 >>> for word in tokenize.whitespace(sent):
1727 ... condition = len(word)
1728 ... cfdist[condition].inc(word)
1729
1730 An equivalent way to do this is with the initializer:
1731
1732 >>> cfdist = ConditionalFreqDist((len(word), word) for word in tokenize.whitespace(sent))
1733
1734 """
1735 - def __init__(self, cond_samples=None):
1736 """
1737 Construct a new empty conditional frequency distribution. In
1738 particular, the count for every sample, under every condition,
1739 is zero.
1740
1741 @param cond_samples: The samples to initialize the conditional
1742 frequency distribution with
1743 @type cond_samples: Sequence of (condition, sample) tuples
1744 """
1745 self._fdists = {}
1746 if cond_samples:
1747 for (cond, sample) in cond_samples:
1748 self[cond].inc(sample)
1749
1751 """
1752 @return: The frequency distribution that encodes the frequency
1753 of each sample outcome, given that the experiment was run
1754 under the given condition. If the frequency distribution for
1755 the given condition has not been accessed before, then this
1756 will create a new empty C{FreqDist} for that condition.
1757 @rtype: C{FreqDist}
1758 @param condition: The condition under which the experiment was run.
1759 @type condition: any
1760 """
1761
1762 if condition not in self._fdists:
1763 self._fdists[condition] = FreqDist()
1764 return self._fdists[condition]
1765
1767 """
1768 @return: A list of the conditions that have been accessed for
1769 this C{ConditionalFreqDist}. Use the indexing operator to
1770 access the frequency distribution for a given condition.
1771 Note that the frequency distributions for some conditions
1772 may contain zero sample outcomes.
1773 @rtype: C{list}
1774 """
1775 return sorted(self._fdists.keys())
1776
1778 """
1779 @return: The number of conditions that have been accessed
1780 for this C{ConditionalFreqDist}.
1781 @rtype: C{int}
1782 """
1783 return len(self._fdists)
1784
1786 """
1787 @return: The total number of sample outcomes that have been
1788 recorded by this C{ConditionalFreqDist}.
1789 @rtype: C{int}
1790 """
1791 return sum(fdist.N() for fdist in self._fdists.values())
1792
1793 - def plot(self, *args, **kwargs):
1794 """
1795 Plot the given samples from the conditional frequency distribution.
1796 For a cumulative plot, specify cumulative=True.
1797 (Requires Matplotlib to be installed.)
1798
1799 @param samples: The samples to plot
1800 @type samples: C{list}
1801 @param title: The title for the graph
1802 @type title: C{str}
1803 @param conditions: The conditions to plot (default is all)
1804 @type conditions: C{list}
1805 """
1806 try:
1807 import pylab
1808 except ImportError:
1809 raise ValueError('The plot function requires the matplotlib package (aka pylab).'
1810 'See http://matplotlib.sourceforge.net/')
1811
1812 cumulative = _get_kwarg(kwargs, 'cumulative', False)
1813 conditions = _get_kwarg(kwargs, 'conditions', self.conditions())
1814 title = _get_kwarg(kwargs, 'title', '')
1815 samples = _get_kwarg(kwargs, 'samples',
1816 sorted(set(v for c in conditions for v in self[c])))
1817 if not "linewidth" in kwargs:
1818 kwargs["linewidth"] = 2
1819
1820 for condition in conditions:
1821 if cumulative:
1822 freqs = list(self[condition]._cumulative_frequencies(samples))
1823 ylabel = "Cumulative Counts"
1824 legend_loc = 'lower right'
1825 else:
1826 freqs = [self[condition][sample] for sample in samples]
1827 ylabel = "Counts"
1828 legend_loc = 'upper right'
1829
1830 kwargs['label'] = condition
1831 pylab.plot(freqs, *args, **kwargs)
1832
1833 pylab.legend(loc=legend_loc)
1834 pylab.grid(True, color="silver")
1835 pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90)
1836 if title:
1837 pylab.title(title)
1838 pylab.xlabel("Samples")
1839 pylab.ylabel(ylabel)
1840 pylab.show()
1841
1843 """
1844 Tabulate the given samples from the conditional frequency distribution.
1845
1846 @param samples: The samples to plot
1847 @type samples: C{list}
1848 @param title: The title for the graph
1849 @type title: C{str}
1850 @param conditions: The conditions to plot (default is all)
1851 @type conditions: C{list}
1852 """
1853
1854 cumulative = _get_kwarg(kwargs, 'cumulative', False)
1855 conditions = _get_kwarg(kwargs, 'conditions', self.conditions())
1856 samples = _get_kwarg(kwargs, 'samples',
1857 sorted(set(v for c in conditions for v in self[c])))
1858
1859 condition_size = max(len(str(c)) for c in conditions)
1860 print ' ' * condition_size,
1861 for s in samples:
1862 print "%4s" % str(s),
1863 print
1864 for c in conditions:
1865 print "%*s" % (condition_size, str(c)),
1866 if cumulative:
1867 freqs = list(self[c]._cumulative_frequencies(samples))
1868 else:
1869 freqs = [self[c][sample] for sample in samples]
1870
1871 for f in freqs:
1872 print "%4d" % f,
1873 print
1874
1880 return not (self == other)
1886 if not isinstance(other, ConditionalFreqDist): return False
1887 return self <= other and self != other
1894
1896 """
1897 @return: A string representation of this
1898 C{ConditionalProbDist}.
1899 @rtype: C{string}
1900 """
1901 return '<ConditionalProbDist with %d conditions>' % self.__len__()
1902
1903
1905 """
1906 @return: A string representation of this
1907 C{ConditionalFreqDist}.
1908 @rtype: C{string}
1909 """
1910 n = len(self._fdists)
1911 return '<ConditionalFreqDist with %d conditions>' % n
1912
1914 """
1915 A collection of probability distributions for a single experiment
1916 run under different conditions. Conditional probability
1917 distributions are used to estimate the likelihood of each sample,
1918 given the condition under which the experiment was run. For
1919 example, a conditional probability distribution could be used to
1920 estimate the probability of each word type in a document, given
1921 the length of the word type. Formally, a conditional probability
1922 distribution can be defined as a function that maps from each
1923 condition to the C{ProbDist} for the experiment under that
1924 condition.
1925 """
1927 raise AssertionError, 'ConditionalProbDistI is an interface'
1928
1930 """
1931 @return: The probability distribution for the experiment run
1932 under the given condition.
1933 @rtype: C{ProbDistI}
1934 @param condition: The condition whose probability distribution
1935 should be returned.
1936 @type condition: any
1937 """
1938 raise AssertionError
1939
1941 """
1942 @return: The number of conditions that are represented by
1943 this C{ConditionalProbDist}.
1944 @rtype: C{int}
1945 """
1946 raise AssertionError
1947
1949 """
1950 @return: A list of the conditions that are represented by
1951 this C{ConditionalProbDist}. Use the indexing operator to
1952 access the probability distribution for a given condition.
1953 @rtype: C{list}
1954 """
1955 raise AssertionError
1956
1957
1958
1959
1960
1961
1963 """
1964 A conditional probability distribution modelling the experiments
1965 that were used to generate a conditional frequency distribution.
1966 A C{ConditoinalProbDist} is constructed from a
1967 C{ConditionalFreqDist} and a X{C{ProbDist} factory}:
1968
1969 - The B{C{ConditionalFreqDist}} specifies the frequency
1970 distribution for each condition.
1971 - The B{C{ProbDist} factory} is a function that takes a
1972 condition's frequency distribution, and returns its
1973 probability distribution. A C{ProbDist} class's name (such as
1974 C{MLEProbDist} or C{HeldoutProbDist}) can be used to specify
1975 that class's constructor.
1976
1977 The first argument to the C{ProbDist} factory is the frequency
1978 distribution that it should model; and the remaining arguments are
1979 specified by the C{factory_args} parameter to the
1980 C{ConditionalProbDist} constructor. For example, the following
1981 code constructs a C{ConditionalProbDist}, where the probability
1982 distribution for each condition is an C{ELEProbDist} with 10 bins:
1983
1984 >>> cpdist = ConditionalProbDist(cfdist, ELEProbDist, 10)
1985 >>> print cpdist['run'].max()
1986 'NN'
1987 >>> print cpdist['run'].prob('NN')
1988 0.0813
1989 """
1990 - def __init__(self, cfdist, probdist_factory,
1991 *factory_args, **factory_kw_args):
1992 """
1993 Construct a new conditional probability distribution, based on
1994 the given conditional frequency distribution and C{ProbDist}
1995 factory.
1996
1997 @type cfdist: L{ConditionalFreqDist}
1998 @param cfdist: The C{ConditionalFreqDist} specifying the
1999 frequency distribution for each condition.
2000 @type probdist_factory: C{class} or C{function}
2001 @param probdist_factory: The function or class that maps
2002 a condition's frequency distribution to its probability
2003 distribution. The function is called with the frequency
2004 distribution as its first argument,
2005 C{factory_args} as its remaining arguments, and
2006 C{factory_kw_args} as keyword arguments.
2007 @type factory_args: (any)
2008 @param factory_args: Extra arguments for C{probdist_factory}.
2009 These arguments are usually used to specify extra
2010 properties for the probability distributions of individual
2011 conditions, such as the number of bins they contain.
2012 @type factory_kw_args: (any)
2013 @param factory_kw_args: Extra keyword arguments for C{probdist_factory}.
2014 """
2015 self._probdist_factory = probdist_factory
2016 self._cfdist = cfdist
2017 self._factory_args = factory_args
2018 self._factory_kw_args = factory_kw_args
2019
2020 self._pdists = {}
2021 for c in cfdist.conditions():
2022 pdist = probdist_factory(cfdist[c], *factory_args,
2023 **factory_kw_args)
2024 self._pdists[c] = pdist
2025
2027 return condition in self._pdists
2028
2030 if condition not in self._pdists:
2031
2032
2033
2034 pdist = self._probdist_factory(FreqDist(), *self._factory_args,
2035 **self._factory_kw_args)
2036 self._pdists[condition] = pdist
2037
2038 return self._pdists[condition]
2039
2041 return self._pdists.keys()
2042
2044 return len(self._pdists)
2045
2047 """
2048 An alternative ConditionalProbDist that simply wraps a dictionary of
2049 ProbDists rather than creating these from FreqDists.
2050 """
2051
2053 """
2054 @param probdist_dict: a dictionary containing the probdists indexed
2055 by the conditions
2056 @type probdist_dict: dict any -> probdist
2057 """
2058 self._dict = probdist_dict
2059
2061
2062
2063 return self._dict[condition]
2064
2066
2067 return self._dict.keys()
2068
2069
2070
2071
2072
2073
2074 _ADD_LOGS_MAX_DIFF = math.log(1e-30, 2)
2075
2077 """
2078 Given two numbers C{logx}=M{log(x)} and C{logy}=M{log(y)}, return
2079 M{log(x+y)}. Conceptually, this is the same as returning
2080 M{log(2**(C{logx})+2**(C{logy}))}, but the actual implementation
2081 avoids overflow errors that could result from direct computation.
2082 """
2083 if (logx < logy + _ADD_LOGS_MAX_DIFF):
2084 return logy
2085 if (logy < logx + _ADD_LOGS_MAX_DIFF):
2086 return logx
2087 base = min(logx, logy)
2088 return base + math.log(2**(logx-base) + 2**(logy-base), 2)
2089
2091 if len(logs) == 0:
2092
2093
2094 return _NINF
2095 else:
2096 return reduce(add_logs, logs[1:], logs[0])
2097
2098
2099
2100
2101
2103 """
2104 A mix-in class to associate probabilities with other classes
2105 (trees, rules, etc.). To use the C{ProbabilisticMixIn} class,
2106 define a new class that derives from an existing class and from
2107 ProbabilisticMixIn. You will need to define a new constructor for
2108 the new class, which explicitly calls the constructors of both its
2109 parent classes. For example:
2110
2111 >>> class A:
2112 ... def __init__(self, x, y): self.data = (x,y)
2113 ...
2114 >>> class ProbabilisticA(A, ProbabilisticMixIn):
2115 ... def __init__(self, x, y, **prob_kwarg):
2116 ... A.__init__(self, x, y)
2117 ... ProbabilisticMixIn.__init__(self, **prob_kwarg)
2118
2119 See the documentation for the ProbabilisticMixIn
2120 L{constructor<__init__>} for information about the arguments it
2121 expects.
2122
2123 You should generally also redefine the string representation
2124 methods, the comparison methods, and the hashing method.
2125 """
2127 """
2128 Initialize this object's probability. This initializer should
2129 be called by subclass constructors. C{prob} should generally be
2130 the first argument for those constructors.
2131
2132 @kwparam prob: The probability associated with the object.
2133 @type prob: C{float}
2134 @kwparam logprob: The log of the probability associated with
2135 the object.
2136 @type logprob: C{float}
2137 """
2138 if 'prob' in kwargs:
2139 if 'logprob' in kwargs:
2140 raise TypeError('Must specify either prob or logprob '
2141 '(not both)')
2142 else:
2143 ProbabilisticMixIn.set_prob(self, kwargs['prob'])
2144 elif 'logprob' in kwargs:
2145 ProbabilisticMixIn.set_logprob(self, kwargs['logprob'])
2146 else:
2147 self.__prob = self.__logprob = None
2148
2150 """
2151 Set the probability associated with this object to C{prob}.
2152 @param prob: The new probability
2153 @type prob: C{float}
2154 """
2155 self.__prob = prob
2156 self.__logprob = None
2157
2159 """
2160 Set the log probability associated with this object to
2161 C{logprob}. I.e., set the probability associated with this
2162 object to C{2**(logprob)}.
2163 @param logprob: The new log probability
2164 @type logprob: C{float}
2165 """
2166 self.__logprob = logprob
2167 self.__prob = None
2168
2170 """
2171 @return: The probability associated with this object.
2172 @rtype: C{float}
2173 """
2174 if self.__prob is None:
2175 if self.__logprob is None: return None
2176 self.__prob = 2**(self.__logprob)
2177 return self.__prob
2178
2180 """
2181 @return: C{log(p)}, where C{p} is the probability associated
2182 with this object.
2183
2184 @rtype: C{float}
2185 """
2186 if self.__logprob is None:
2187 if self.__prob is None: return None
2188 self.__logprob = math.log(self.__prob, 2)
2189 return self.__logprob
2190
2193 raise ValueError, '%s is immutable' % self.__class__.__name__
2195 raise ValueError, '%s is immutable' % self.__class__.__name__
2196
2197
2198
2206
2207
2208
2209
2210
2212 """
2213 Create a new frequency distribution, with random samples. The
2214 samples are numbers from 1 to C{numsamples}, and are generated by
2215 summing two numbers, each of which has a uniform distribution.
2216 """
2217 import random
2218 fdist = FreqDist()
2219 for x in range(numoutcomes):
2220 y = (random.randint(1, (1+numsamples)/2) +
2221 random.randint(0, numsamples/2))
2222 fdist.inc(y)
2223 return fdist
2224
2226 """
2227 Return the true probability distribution for the experiment
2228 C{_create_rand_fdist(numsamples, x)}.
2229 """
2230 fdist = FreqDist()
2231 for x in range(1, (1+numsamples)/2+1):
2232 for y in range(0, numsamples/2+1):
2233 fdist.inc(x+y)
2234 return MLEProbDist(fdist)
2235
2236 -def demo(numsamples=6, numoutcomes=500):
2237 """
2238 A demonstration of frequency distributions and probability
2239 distributions. This demonstration creates three frequency
2240 distributions with, and uses them to sample a random process with
2241 C{numsamples} samples. Each frequency distribution is sampled
2242 C{numoutcomes} times. These three frequency distributions are
2243 then used to build six probability distributions. Finally, the
2244 probability estimates of these distributions are compared to the
2245 actual probability of each sample.
2246
2247 @type numsamples: C{int}
2248 @param numsamples: The number of samples to use in each demo
2249 frequency distributions.
2250 @type numoutcomes: C{int}
2251 @param numoutcomes: The total number of outcomes for each
2252 demo frequency distribution. These outcomes are divided into
2253 C{numsamples} bins.
2254 @rtype: C{None}
2255 """
2256
2257
2258 fdist1 = _create_rand_fdist(numsamples, numoutcomes)
2259 fdist2 = _create_rand_fdist(numsamples, numoutcomes)
2260 fdist3 = _create_rand_fdist(numsamples, numoutcomes)
2261
2262
2263 pdists = [
2264 MLEProbDist(fdist1),
2265 LidstoneProbDist(fdist1, 0.5, numsamples),
2266 HeldoutProbDist(fdist1, fdist2, numsamples),
2267 HeldoutProbDist(fdist2, fdist1, numsamples),
2268 CrossValidationProbDist([fdist1, fdist2, fdist3], numsamples),
2269 GoodTuringProbDist(fdist1),
2270 SimpleGoodTuringProbDist(fdist1),
2271 SimpleGoodTuringProbDist(fdist1, 7),
2272 _create_sum_pdist(numsamples),
2273 ]
2274
2275
2276 vals = []
2277 for n in range(1,numsamples+1):
2278 vals.append(tuple([n, fdist1.freq(n)] +
2279 [pdist.prob(n) for pdist in pdists]))
2280
2281
2282 print ('%d samples (1-%d); %d outcomes were sampled for each FreqDist' %
2283 (numsamples, numsamples, numoutcomes))
2284 print '='*9*(len(pdists)+2)
2285 FORMATSTR = ' FreqDist '+ '%8s '*(len(pdists)-1) + '| Actual'
2286 print FORMATSTR % tuple(`pdist`[1:9] for pdist in pdists[:-1])
2287 print '-'*9*(len(pdists)+2)
2288 FORMATSTR = '%3d %8.6f ' + '%8.6f '*(len(pdists)-1) + '| %8.6f'
2289 for val in vals:
2290 print FORMATSTR % val
2291
2292
2293 zvals = zip(*vals)
2294 def sum(lst): return reduce(lambda x,y:x+y, lst, 0)
2295 sums = [sum(val) for val in zvals[1:]]
2296 print '-'*9*(len(pdists)+2)
2297 FORMATSTR = 'Total ' + '%8.6f '*(len(pdists)) + '| %8.6f'
2298 print FORMATSTR % tuple(sums)
2299 print '='*9*(len(pdists)+2)
2300
2301
2302 if len(`str(fdist1)`) < 70:
2303 print ' fdist1:', str(fdist1)
2304 print ' fdist2:', str(fdist2)
2305 print ' fdist3:', str(fdist3)
2306 print
2307
2308 print 'Generating:'
2309 for pdist in pdists:
2310 fdist = FreqDist(pdist.generate() for i in range(5000))
2311 print '%20s %s' % (pdist.__class__.__name__[:20], str(fdist)[:55])
2312 print
2313
2326
2327 if __name__ == '__main__':
2328 demo(6, 10)
2329 demo(5, 5000)
2330 gt_demo()
2331
2332 __all__ = ['ConditionalFreqDist', 'ConditionalProbDist',
2333 'ConditionalProbDistI', 'CrossValidationProbDist',
2334 'DictionaryConditionalProbDist', 'DictionaryProbDist', 'ELEProbDist',
2335 'FreqDist', 'GoodTuringProbDist', 'SimpleGoodTuringProbDist', 'HeldoutProbDist',
2336 'ImmutableProbabilisticMixIn', 'LaplaceProbDist', 'LidstoneProbDist',
2337 'MLEProbDist', 'MutableProbDist', 'ProbDistI', 'ProbabilisticMixIn',
2338 'UniformProbDist', 'WittenBellProbDist', 'add_logs',
2339 'log_likelihood', 'sum_logs', 'entropy']
2340