1
2
3
4
5
6
7
8
9
10
11
12
13
14 """
15 The Punkt sentence tokenizer. The algorithm for this tokenizer is
16 described in Kiss & Strunk (2006)::
17
18 Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence
19 Boundary Detection. Computational Linguistics 32: 485-525.
20 """
21
22
23
24
25
26 import re
27 import math
28
29 from nltk.compat import defaultdict
30 from nltk.probability import FreqDist
31
32 from api import TokenizerI
33
34
35
36
37
38
39
40
41 _ORTHO_BEG_UC = 1 << 1
"""Orthographic context: beginning of a sentence with upper case."""
43
44 _ORTHO_MID_UC = 1 << 2
"""Orthographic context: middle of a sentence with upper case."""
46
47 _ORTHO_UNK_UC = 1 << 3
"""Orthographic context: unknown position in a sentence with upper case."""
49
50 _ORTHO_BEG_LC = 1 << 4
"""Orthographic context: beginning of a sentence with lower case."""
52
53 _ORTHO_MID_LC = 1 << 5
"""Orthographic context: middle of a sentence with lower case."""
55
56 _ORTHO_UNK_LC = 1 << 6
"""Orthographic context: unknown position in a sentence with lower case."""
58
59 _ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC
"""Orthographic context: occurs with upper case."""
61
62 _ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC
"""Orthographic context: occurs with lower case."""
64
65 _ORTHO_MAP = {
66 ('initial', 'upper'): _ORTHO_BEG_UC,
67 ('internal', 'upper'): _ORTHO_MID_UC,
68 ('unknown', 'upper'): _ORTHO_UNK_UC,
69 ('initial', 'lower'): _ORTHO_BEG_LC,
70 ('internal', 'lower'): _ORTHO_MID_LC,
71 ('unknown', 'lower'): _ORTHO_UNK_LC,
72 }
73 """A map from context position and first-letter case to the
74 appropriate orthographic context flag."""
84 """
85 Stores variables, mostly regular expressions, which may be
86 language-dependent for correct application of the algorithm.
87 An extension of this class may modify its properties to suit
88 a language other than English; an instance can then be passed
89 as an argument to PunktSentenceTokenizer and PunktTrainer
90 constructors.
91 """
92
93 __slots__ = ('_re_period_context', '_re_word_tokenizer')
94
100
103
104 sent_end_chars = ('.', '?', '!')
105 """Characters which are candidates for sentence boundaries"""
106
107 @property
110
111 internal_punctuation = ',:;'
112 """sentence internal punctuation, which indicates an abbreviation if
113 preceded by a period-final token."""
114
115 re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)',
116 re.MULTILINE)
117 """Used to realign punctuation that should be included in a sentence
118 although it follows the period (or ?, !)."""
119
120 _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]"
121 """Excludes some characters from starting word tokens"""
122
123 _re_non_word_chars = r"(?:[?!)\";}\]\*:@\'\({\[])"
124 """Characters that cannot appear within words"""
125
126 _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)"
127 """Hyphen and ellipsis are multi-character punctuation"""
128
129 _word_tokenize_fmt = r'''(
130 %(MultiChar)s
131 |
132 (?=%(WordStart)s)\S+? # Accept word characters until end is found
133 (?= # Sequences marking a word's end
134 \s| # White-space
135 $| # End-of-string
136 %(NonWord)s|%(MultiChar)s| # Punctuation
137 ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word
138 )
139 |
140 \S
141 )'''
142 """Format of a regular expression to split punctuation from words,
143 excluding period."""
144
160
164
165 _period_context_fmt = r"""
166 \S* # some word material
167 %(SentEndChars)s # a potential sentence ending
168 (?=(?P<after_tok>
169 %(NonWord)s # either other punctuation
170 |
171 \s+(?P<next_tok>\S+) # or whitespace and some other token
172 ))"""
173 """Format of a regular expression to find contexts including possible
174 sentence boundaries. Matches token which the possible sentence boundary
175 ends, and matches the following token within a lookahead expression."""
176
178 """Compiles and returns a regular expression to find contexts
179 including possible sentence boundaries."""
180 try:
181 return self._re_period_context
182 except:
183 self._re_period_context = re.compile(
184 self._period_context_fmt %
185 {
186 'NonWord': self._re_non_word_chars,
187 'SentEndChars': self._re_sent_end_chars,
188 },
189 re.UNICODE | re.VERBOSE)
190 return self._re_period_context
191
192
193 _re_non_punct = re.compile(r'[^\W\d]', re.UNICODE)
194 """Matches token types that are not merely punctuation. (Types for
195 numeric tokens are changed to ##number## and hence contain alpha.)"""
206
208 self._lang_vars = lang_vars
209
212
213
214
215
216
217
218
219
220
def _pair_iter(it):
    """
    Yields pairs of tokens from the given iterator such that each input
    token will appear as the first element in a yielded tuple. The last
    pair will have None as its second element.

    An empty input yields nothing at all.
    """
    it = iter(it)
    # Pull the first element with a for/break rather than it.next() or
    # next(it): this works on every Python version and never lets a
    # StopIteration escape the generator body when the input is empty.
    for prev in it:
        break
    else:
        return
    for el in it:
        yield (prev, el)
        prev = el
    yield (prev, None)
233
239 """Stores data used to perform sentence boundary detection with punkt."""
240
242 self.abbrev_types = set()
243 """A set of word types for known abbreviations."""
244
245 self.collocations = set()
246 """A set of word type tuples for known common collocations
247 where the first word ends in a period. E.g., ('S.', 'Bach')
248 is a common collocation in a text that discusses 'Johann
249 S. Bach'. These count as negative evidence for sentence
250 boundaries."""
251
252 self.sent_starters = set()
253 """A set of word types for words that often appear at the
254 beginning of sentences."""
255
256 self.ortho_context = defaultdict(int)
257 """A dictionary mapping word types to the set of orthographic
258 contexts that word type appears in. Contexts are represented
259 by adding orthographic context flags: ..."""
260
262 self.abbrev_types = set()
263
266
268 self.sent_starters = set()
269
271 self.ortho_context = defaultdict(int)
272
273 - def add_ortho_context(self, typ, flag):
274 self.ortho_context[typ] |= flag
275
281 """Stores a token of text with annotations produced during
282 sentence boundary detection."""
283
284 _properties = [
285 'parastart', 'linestart',
286 'sentbreak', 'abbr', 'ellipsis'
287 ]
288 __slots__ = ['tok', 'type', 'period_final'] + _properties
289
299
300
301
302
303
304 _RE_ELLIPSIS = re.compile(r'\.\.+$')
305 _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$')
306 _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE)
307 _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE)
308
309
310
311
312
314 """Returns a case-normalized representation of the token."""
315 return self._RE_NUMERIC.sub('##number##', tok.lower())
316
317 @property
319 """
320 The type with its final period removed if it has one.
321 """
322 if len(self.type) > 1 and self.type[-1] == '.':
323 return self.type[:-1]
324 return self.type
325
326 @property
328 """
329 The type with its final period removed if it is marked as a
330 sentence break.
331 """
332 if self.sentbreak:
333 return self.type_no_period
334 return self.type
335
336 @property
338 """True if the token's first character is uppercase."""
339 return self.tok[0].isupper()
340
341 @property
343 """True if the token's first character is lowercase."""
344 return self.tok[0].islower()
345
346 @property
353
354 @property
356 """True if the token text is that of an ellipsis."""
357 return self._RE_ELLIPSIS.match(self.tok)
358
359 @property
361 """True if the token text is that of a number."""
362 return self.type.startswith('##number##')
363
364 @property
366 """True if the token text is that of an initial."""
367 return self._RE_INITIAL.match(self.tok)
368
369 @property
371 """True if the token text is all alphabetic."""
372 return self._RE_ALPHA.match(self.tok)
373
374 @property
378
379
380
381
382
384 """
385 A string representation of the token that can reproduce it
386 with eval(), which lists all the token's non-default
387 annotations.
388 """
389 if self.type != self.tok:
390 typestr = ' type=%s,' % repr(self.type)
391 else:
392 typestr = ''
393
394 propvals = ', '.join(
395 '%s=%s' % (p, repr(getattr(self, p)))
396 for p in self._properties
397 if getattr(self, p)
398 )
399
400 return '%s(%s,%s %s)' % (self.__class__.__name__,
401 repr(self.tok), typestr, propvals)
402
404 """
405 A string representation akin to that used by Kiss and Strunk.
406 """
407 res = self.tok
408 if self.abbr:
409 res += '<A>'
410 if self.ellipsis:
411 res += '<E>'
412 if self.sentbreak:
413 res += '<S>'
414 return res
415
421 """
422 Includes common components of PunktTrainer and PunktSentenceTokenizer.
423 """
424
427 self._params = params
428 self._lang_vars = lang_vars
429 self._Token = token_cls
430 """The collection of parameters that determines the behavior
431 of the punkt tokenizer."""
432
433
434
435
436
438 """
439 Divide the given text into tokens, using the punkt word
440 segmentation regular expression, and generate the resulting list
441 of tokens augmented as three-tuples with two boolean values for whether
442 the given token occurs at the start of a paragraph or a new line,
443 respectively.
444 """
445 parastart = False
446 for line in plaintext.split('\n'):
447 if line.strip():
448 line_toks = iter(self._lang_vars.word_tokenize(line))
449
450 yield self._Token(line_toks.next(),
451 parastart=parastart, linestart=True)
452 parastart = False
453
454 for t in line_toks:
455 yield self._Token(t)
456 else:
457 parastart = True
458
459
460
461
462
463
465 """
466 Perform the first pass of annotation, which makes decisions
        based purely on the word type of each word:
468
469 - '?', '!', and '.' are marked as sentence breaks.
470 - sequences of two or more periods are marked as ellipsis.
471 - any word ending in '.' that's a known abbreviation is
472 marked as an abbreviation.
473 - any other word ending in '.' is marked as a sentence break.
474
475 Return these annotations as a tuple of three sets:
476
477 - sentbreak_toks: The indices of all sentence breaks.
478 - abbrev_toks: The indices of all abbreviations.
479 - ellipsis_toks: The indices of all ellipsis marks.
480 """
481 for aug_tok in tokens:
482 self._first_pass_annotation(aug_tok)
483 yield aug_tok
484
505
512 """Learns parameters used in Punkt sentence boundary detection."""
513
516
517 _PunktBaseClass.__init__(self, lang_vars=lang_vars,
518 token_cls=token_cls)
519
520 self._type_fdist = FreqDist()
521 """A frequency distribution giving the frequency of each
522 case-normalized token type in the training data."""
523
524 self._num_period_toks = 0
525 """The number of words ending in period in the training data."""
526
527 self._collocation_fdist = FreqDist()
528 """A frequency distribution giving the frequency of all
529 bigrams in the training data where the first word ends in a
530 period. Bigrams are encoded as tuples of word types.
531 Especially common collocations are extracted from this
532 frequency distribution, and stored in
533 L{_params}.L{collocations <PunktParameters.collocations>}."""
534
535 self._sent_starter_fdist = FreqDist()
536 """A frequency distribution giving the frequency of all words
537 that occur at the training data at the beginning of a sentence
538 (after the first pass of annotation). Especially common
539 sentence starters are extracted from this frequency
540 distribution, and stored in L{_params}.L{sent_starters
541 <PunktParameters.sent_starters>}.
542 """
543
544 self._sentbreak_count = 0
545 """The total number of sentence breaks identified in training, used for
546 calculating the frequent sentence starter heuristic."""
547
548 self._finalized = True
549 """A flag as to whether the training has been finalized by finding
550 collocations and sentence starters, or whether finalize_training()
551 still needs to be called."""
552
553 if train_text:
554 self.train(train_text, verbose, finalize=True)
555
557 """
558 Calculates and returns parameters for sentence boundary detection as
559 derived from training."""
560 if not self._finalized:
561 self.finalize_training()
562 return self._params
563
564
565
566
567
568 ABBREV = 0.3
569 """cut-off value whether a 'token' is an abbreviation"""
570
571 IGNORE_ABBREV_PENALTY = False
572 """allows the disabling of the abbreviation penalty heuristic, which
573 exponentially disadvantages words that are found at times without a
574 final period."""
575
576 ABBREV_BACKOFF = 5
577 """upper cut-off for Mikheev's(2002) abbreviation detection algorithm"""
578
579 COLLOCATION = 7.88
580 """minimal log-likelihood value that two tokens need to be considered
581 as a collocation"""
582
583 SENT_STARTER = 30
584 """minimal log-likelihood value that a token requires to be considered
585 as a frequent sentence starter"""
586
587 INCLUDE_ALL_COLLOCS = False
588 """this includes as potential collocations all word pairs where the first
589 word ends in a period. It may be useful in corpora where there is a lot
590 of variation that makes abbreviations like Mr difficult to identify."""
591
592 INCLUDE_ABBREV_COLLOCS = False
593 """this includes as potential collocations all word pairs where the first
594 word is an abbreviation. Such collocations override the orthographic
595 heuristic, but not the sentence starter heuristic. This is overridden by
596 INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
597 and ordinals are considered."""
598 """"""
599
600 MIN_COLLOC_FREQ = 1
601 """this sets a minimum bound on the number of times a bigram needs to
602 appear before it can be considered a collocation, in addition to log
603 likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""
604
605
606
607
608
609 - def train(self, text, verbose=False, finalize=True):
610 """
611 Collects training data from a given text. If finalize is True, it
612 will determine all the parameters for sentence boundary detection. If
613 not, this will be delayed until get_params() or finalize_training() is
614 called. If verbose is True, abbreviations found will be listed.
615 """
616
617
618 self._train_tokens(self._tokenize_words(text), verbose)
619 if finalize:
620 self.finalize_training(verbose)
621
622 - def train_tokens(self, tokens, verbose=False, finalize=True):
623 """
624 Collects training data from a given list of tokens.
625 """
626 self._train_tokens((self._Token(t) for t in tokens), verbose)
627 if finalize:
628 self.finalize_training(verbose)
629
691
694
696 """
697 Uses data that has been gathered in training to determine likely
698 collocations and sentence starters.
699 """
700 self._params.clear_sent_starters()
701 for typ, ll in self._find_sent_starters():
702 self._params.sent_starters.add(typ)
703 if verbose:
704 print (' Sent Starter: [%6.4f] %r' % (ll, typ))
705
706 self._params.clear_collocations()
707 for (typ1, typ2), ll in self._find_collocations():
708 self._params.collocations.add( (typ1,typ2) )
709 if verbose:
710 print (' Collocation: [%6.4f] %r+%r' %
711 (ll, typ1, typ2))
712
713 self._finalized = True
714
715
716
717
718
719 - def freq_threshold(self, ortho_thresh=2, type_thresh=2, colloc_thres=2,
720 sentstart_thresh=2):
721 """
722 Allows memory use to be reduced after much training by removing data
723 about rare tokens that are unlikely to have a statistical effect with
724 further training. Entries occurring above the given thresholds will be
725 retained.
726 """
727 if ortho_thresh > 1:
728 old_oc = self._params.ortho_context
729 self._params.clear_ortho_context()
730 for tok, count in self._type_fdist.iteritems():
731 if count >= ortho_thresh:
732 self._params.ortho_context[tok] = old_oc[tok]
733
734 self._type_fdist = self._freq_threshold(self._type_fdist, type_thresh)
735 self._collocation_fdist = self._freq_threshold(
736 self._collocation_fdist, colloc_thres)
737 self._sent_starter_fdist = self._freq_threshold(
738 self._sent_starter_fdist, sentstart_thresh)
739
741 """
        Returns a FreqDist containing only data with counts at or above a
        given threshold, as well as a mapping (None -> count_removed).
744 """
745
746
747 res = FreqDist()
748 num_removed = 0
749 for tok, count in fdist.iteritems():
750 if count < threshold:
751 num_removed += 1
752 else:
753 res.inc(tok, count)
754 res.inc(None, num_removed)
755 return res
756
757
758
759
760
804
805
806
807
808
810 """
811 (Re)classifies each given token if
812 - it is period-final and not a known abbreviation; or
813 - it is not period-final and is otherwise a known abbreviation
814 by checking whether its previous classification still holds according
815 to the heuristics of section 3.
816 Yields triples (abbr, score, is_add) where abbr is the type in question,
817 score is its log-likelihood with penalties applied, and is_add specifies
818 whether the present type is a candidate for inclusion or exclusion as an
819 abbreviation, such that:
820 - (is_add and score >= 0.3) suggests a new abbreviation; and
821 - (not is_add and score < 0.3) suggests excluding an abbreviation.
822 """
823
824
825
826
827 for typ in types:
828
829
830 if not _re_non_punct.search(typ) or typ == '##number##':
831 continue
832
833 if typ.endswith('.'):
834 if typ in self._params.abbrev_types:
835 continue
836 typ = typ[:-1]
837 is_add = True
838 else:
839 if typ not in self._params.abbrev_types:
840 continue
841 is_add = False
842
843
844
845 num_periods = typ.count('.') + 1
846 num_nonperiods = len(typ) - num_periods + 1
847
848
849
850
851
852
853 count_with_period = self._type_fdist[typ + '.']
854 count_without_period = self._type_fdist[typ]
855 ll = self._dunning_log_likelihood(
856 count_with_period + count_without_period,
857 self._num_period_toks, count_with_period,
858 self._type_fdist.N())
859
860
861
862
863
864
865 f_length = math.exp(-num_nonperiods)
866 f_periods = num_periods
867 f_penalty = (int(self.IGNORE_ABBREV_PENALTY)
868 or math.pow(num_nonperiods, -count_without_period))
869 score = ll * f_length * f_periods * f_penalty
870
871 yield typ, score, is_add
872
874 """
875 Recalculates abbreviations given type frequencies, despite no prior
876 determination of abbreviations.
877 This fails to include abbreviations otherwise found as "rare".
878 """
879 self._params.clear_abbrevs()
880 tokens = (typ for typ in self._type_fdist if typ and typ.endswith('.'))
881 for abbr, score, is_add in self._reclassify_abbrev_types(tokens):
882 if score >= self.ABBREV:
883 self._params.abbrev_types.add(abbr)
884
885
886
887
889 """
890 A word type is counted as a rare abbreviation if...
891 - it's not already marked as an abbreviation
892 - it occurs fewer than ABBREV_BACKOFF times
893 - either it is followed by a sentence-internal punctuation
894 mark, *or* it is followed by a lower-case word that
895 sometimes appears with upper case, but never occurs with
896 lower case at the beginning of sentences.
897 """
898 if cur_tok.abbr or not cur_tok.sentbreak:
899 return False
900
901
902
903 typ = cur_tok.type_no_sentperiod
904
905
906
907 count = self._type_fdist[typ] + self._type_fdist[typ[:-1]]
908 if (typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF):
909 return False
910
911
912
913
914 if next_tok.tok[:1] in self._lang_vars.internal_punctuation:
915 return True
916
917
918
919
920
921
922
923 elif next_tok.first_lower:
924 typ2 = next_tok.type_no_sentperiod
925 typ2ortho_context = self._params.ortho_context[typ2]
926 if ( (typ2ortho_context & _ORTHO_BEG_UC) and
927 not (typ2ortho_context & _ORTHO_MID_UC) ):
928 return True
929
930
931
932
933
934
935 @staticmethod
937 """
938 A function that calculates the modified Dunning log-likelihood
939 ratio scores for abbreviation candidates. The details of how
940 this works is available in the paper.
941 """
942 p1 = float(count_b) / N
943 p2 = 0.99
944
945 null_hypo = (float(count_ab) * math.log(p1) +
946 (count_a - count_ab) * math.log(1.0 - p1))
947 alt_hypo = (float(count_ab) * math.log(p2) +
948 (count_a - count_ab) * math.log(1.0 - p2))
949
950 likelihood = null_hypo - alt_hypo
951
952 return (-2.0 * likelihood)
953
954 @staticmethod
956 """
957 A function that will just compute log-likelihood estimate, in
        the original paper it's described in algorithm 6 and 7.
959
960 This *should* be the original Dunning log-likelihood values,
961 unlike the previous log_l function where it used modified
962 Dunning log-likelihood values
963 """
964 import math
965
966 p = 1.0 * count_b / N
967 p1 = 1.0 * count_ab / count_a
968 p2 = 1.0 * (count_b - count_ab) / (N - count_a)
969
970 summand1 = (count_ab * math.log(p) +
971 (count_a - count_ab) * math.log(1.0 - p))
972
973 summand2 = ((count_b - count_ab) * math.log(p) +
974 (N - count_a - count_b + count_ab) * math.log(1.0 - p))
975
976 if count_a == count_ab:
977 summand3 = 0
978 else:
979 summand3 = (count_ab * math.log(p1) +
980 (count_a - count_ab) * math.log(1.0 - p1))
981
982 if count_b == count_ab:
983 summand4 = 0
984 else:
985 summand4 = ((count_b - count_ab) * math.log(p2) +
986 (N - count_a - count_b + count_ab) * math.log(1.0 - p2))
987
988 likelihood = summand1 + summand2 - summand3 - summand4
989
990 return (-2.0 * likelihood)
991
992
993
994
995
1007
1009 """
1010 Generates likely collocations and their log-likelihood.
1011 """
1012 for types, col_count in self._collocation_fdist.iteritems():
1013 try:
1014 typ1, typ2 = types
1015 except TypeError:
1016
1017 continue
1018 if typ2 in self._params.sent_starters:
1019 continue
1020
1021 typ1_count = self._type_fdist[typ1]+self._type_fdist[typ1+'.']
1022 typ2_count = self._type_fdist[typ2]+self._type_fdist[typ2+'.']
1023 if (typ1_count > 1 and typ2_count > 1
1024 and self.MIN_COLLOC_FREQ <
1025 col_count <= min(typ1_count, typ2_count)):
1026
1027 ll = self._col_log_likelihood(typ1_count, typ2_count,
1028 col_count, self._type_fdist.N())
1029
1030 if (ll >= self.COLLOCATION and
1031 (float(self._type_fdist.N())/typ1_count >
1032 float(typ2_count)/col_count)):
1033 yield (typ1, typ2), ll
1034
1035
1036
1037
1038
1040 """
        Returns True given a token and the token that precedes it if it
1042 seems clear that the token is beginning a sentence.
1043 """
1044
1045
1046
1047 return ( prev_tok.sentbreak and
1048 not (prev_tok.is_number or prev_tok.is_initial) and
1049 cur_tok.is_alpha )
1050
1052 """
1053 Uses collocation heuristics for each candidate token to
1054 determine if it frequently starts sentences.
1055 """
1056 for (typ, typ_at_break_count) in self._sent_starter_fdist.iteritems():
1057 if not typ:
1058 continue
1059
1060 typ_count = self._type_fdist[typ]+self._type_fdist[typ+'.']
1061 if typ_count < typ_at_break_count:
1062
1063 continue
1064
1065 ll = self._col_log_likelihood(self._sentbreak_count, typ_count,
1066 typ_at_break_count,
1067 self._type_fdist.N())
1068
1069 if (ll >= self.SENT_STARTER and
1070 float(self._type_fdist.N())/self._sentbreak_count >
1071 float(typ_count)/typ_at_break_count):
1072
1073 yield typ, ll
1074
1076 """
1077 Returns the number of sentence breaks marked in a given set of
1078 augmented tokens.
1079 """
1080 return sum(1 for aug_tok in tokens if aug_tok.sentbreak)
1081
1089 """
1090 A sentence tokenizer which uses an unsupervised algorithm to build
1091 a model for abbreviation words, collocations, and words that start
1092 sentences; and then uses that model to find sentence boundaries.
1093 This approach has been shown to work well for many European
1094 languages.
1095 """
1098 """
1099 train_text can either be the sole training text for this sentence
1100 boundary detector, or can be a PunktParameters object.
1101 """
1102 _PunktBaseClass.__init__(self, lang_vars=lang_vars,
1103 token_cls=token_cls)
1104
1105 if train_text:
1106 self._params = self.train(train_text, verbose)
1107
1108 - def train(self, train_text, verbose=False):
1109 """
1110 Derives parameters from a given training text, or uses the parameters
1111 given. Repeated calls to this method destroy previous parameters. For
1112 incremental training, instantiate a separate PunktTrainer instance.
1113 """
1114 if type(train_text) not in (type(''), type(u'')):
1115 return train_text
1116 return PunktTrainer(train_text, lang_vars=self._lang_vars,
1117 token_cls=self._Token).get_params()
1118
1119
1120
1121
1122
1123 - def tokenize(self, text, realign_boundaries=False):
1124 """
1125 Given a text, returns a list of the sentences in that text.
1126 """
1127 return list(self.sentences_from_text(text, realign_boundaries))
1128
1130 """
1131 Given a text, returns a list of the (start, end) spans of sentences
1132 in the text.
1133 """
1134 return [(sl.start, sl.stop) for sl in self._slices_from_text(text)]
1135
1136 - def sentences_from_text(self, text, realign_boundaries=False):
1137 """
1138 Given a text, generates the sentences in that text by only
1139 testing candidate sentence breaks. If realign_boundaries is
1140 True, includes in the sentence closing punctuation that
1141 follows the period.
1142 """
1143 sents = [text[sl] for sl in self._slices_from_text(text)]
1144 if realign_boundaries:
1145 sents = self._realign_boundaries(sents)
1146 return sents
1147
1148 - def _slices_from_text(self, text):
1149 last_break = 0
1150 for match in self._lang_vars.period_context_re().finditer(text):
1151 context = match.group() + match.group('after_tok')
1152 if self.text_contains_sentbreak(context):
1153 yield slice(last_break, match.end())
1154 if match.group('next_tok'):
1155
1156 last_break = match.start('next_tok')
1157 else:
1158
1159 last_break = match.end()
1160 yield slice(last_break, len(text))
1161
1163 """
1164 Attempts to realign punctuation that falls after the period but
1165 should otherwise be included in the same sentence.
1166
1167 For example: "(Sent1.) Sent2." will otherwise be split as::
1168
            ["(Sent1.", ") Sent2."].
1170
1171 This method will produce::
1172
1173 ["(Sent1.)", "Sent2."].
1174 """
1175 realign = 0
1176 for s1, s2 in _pair_iter(sents):
1177 s1 = s1[realign:]
1178 if not s2:
1179 if s1:
1180 yield s1
1181 continue
1182
1183 m = self._lang_vars.re_boundary_realignment.match(s2)
1184 if m:
1185 yield s1 + m.group(0).strip()
1186 realign = m.end()
1187 else:
1188 realign = 0
1189 if s1:
1190 yield s1
1191
1193 """
1194 Returns True if the given text includes a sentence break.
1195 """
1196 found = False
1197 for t in self._annotate_tokens(self._tokenize_words(text)):
1198 if found:
1199 return True
1200 if t.sentbreak:
1201 found = True
1202 return False
1203
1205 """
1206 Given a text, generates the sentences in that text. Annotates all
1207 tokens, rather than just those with possible sentence breaks. Should
1208 produce the same results as L{sentences_from_text}.
1209 """
1210 tokens = self._annotate_tokens(self._tokenize_words(text))
1211 return self._build_sentence_list(text, tokens)
1212
1214 """
1215 Given a sequence of tokens, generates lists of tokens, each list
1216 corresponding to a sentence.
1217 """
1218 tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens))
1219 sentence = []
1220 for aug_tok in tokens:
1221 sentence.append(aug_tok.tok)
1222 if aug_tok.sentbreak:
1223 yield sentence
1224 sentence = []
1225 if sentence:
1226 yield sentence
1227
1229 """
1230 Given a set of tokens augmented with markers for line-start and
1231 paragraph-start, returns an iterator through those tokens with full
1232 annotation including predicted sentence breaks.
1233 """
1234
1235
1236 tokens = self._annotate_first_pass(tokens)
1237
1238
1239
1240
1241 tokens = self._annotate_second_pass(tokens)
1242
1243
1244
1245
1246
1247 return tokens
1248
1250 """
1251 Given the original text and the list of augmented word tokens,
1252 construct and return a tokenized list of sentence strings.
1253 """
1254
1255
1256
1257
1258
1259 pos = 0
1260
1261
1262 WS_REGEXP = re.compile(r'\s*')
1263
1264 sentence = ''
1265 for aug_tok in tokens:
1266 tok = aug_tok.tok
1267
1268
1269 ws = WS_REGEXP.match(text, pos).group()
1270 pos += len(ws)
1271
1272
1273
1274
1275
1276
1277 if text[pos:pos+len(tok)] != tok:
1278 pat = '\s*'.join(re.escape(c) for c in tok)
1279 m = re.compile(pat).match(text,pos)
1280 if m: tok = m.group()
1281
1282
1283 assert text[pos:pos+len(tok)] == tok
1284 pos += len(tok)
1285
1286
1287
1288
1289 if sentence:
1290 sentence += ws + tok
1291 else:
1292 sentence += tok
1293
1294
1295 if aug_tok.sentbreak:
1296 yield sentence
1297 sentence = ''
1298
1299
1300 if sentence:
1301 yield sentence
1302
1303
1304 - def dump(self, tokens):
1305 print 'writing to /tmp/punkt.new...'
1306 out = open('/tmp/punkt.new', 'w')
1307 for aug_tok in tokens:
1308 if aug_tok.parastart:
1309 out.write('\n\n')
1310 elif aug_tok.linestart:
1311 out.write('\n')
1312 else:
1313 out.write(' ')
1314
1315 out.write(str(aug_tok))
1316 out.close()
1317
1318
1319
1320
1321
1322 PUNCTUATION = tuple(';:,.!?')
1323
1324
1325
1326
1327
1329 """
1330 Performs a token-based classification (section 4) over the given
1331 tokens, making use of the orthographic heuristic (4.1.1), collocation
1332 heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).
1333 """
1334 for t1, t2 in _pair_iter(tokens):
1335 self._second_pass_annotation(t1, t2)
1336 yield t1
1337
1339 """
1340 Performs token-based classification over a pair of contiguous tokens
1341 returning an updated augmented token for the first of them.
1342 """
1343
1344 if not aug_tok2:
1345 return
1346
1347 tok = aug_tok1.tok
1348 if not aug_tok1.period_final:
1349
1350 return
1351
1352 typ = aug_tok1.type_no_period
1353 next_tok = aug_tok2.tok
1354 next_typ = aug_tok2.type_no_sentperiod
1355 tok_is_initial = aug_tok1.is_initial
1356
1357
1358
1359
1360
1361
1362
1363 if (typ, next_typ) in self._params.collocations:
1364 aug_tok1.sentbreak = False
1365 aug_tok1.abbr = True
1366 return
1367
1368
1369
1370
1371 if ( (aug_tok1.abbr or aug_tok1.ellipsis) and
1372 (not tok_is_initial) ):
1373
1374
1375
1376 is_sent_starter = self._ortho_heuristic(aug_tok2)
1377 if is_sent_starter == True:
1378 aug_tok1.sentbreak = True
1379 return
1380
1381
1382
1383
1384
1385 if ( aug_tok2.first_upper and
1386 next_typ in self._params.sent_starters):
1387 aug_tok1.sentbreak = True
1388 return
1389
1390
1391
1392
1393 if tok_is_initial or typ == '##number##':
1394
1395
1396
1397
1398 is_sent_starter = self._ortho_heuristic(aug_tok2)
1399
1400 if is_sent_starter == False:
1401 aug_tok1.sentbreak = False
1402 aug_tok1.abbr = True
1403 return
1404
1405
1406
1407
1408 if ( is_sent_starter == 'unknown' and tok_is_initial and
1409 aug_tok2.first_upper and
1410 not (self._params.ortho_context[next_typ] & _ORTHO_LC) ):
1411 aug_tok1.sentbreak = False
1412 aug_tok1.abbr = True
1413 return
1414
1415 return
1416
1418 """
1419 Decide whether the given token is the first token in a sentence.
1420 """
1421
1422 if aug_tok.tok in self.PUNCTUATION:
1423 return False
1424
1425 ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod]
1426
1427
1428
1429
1430 if ( aug_tok.first_upper and
1431 (ortho_context & _ORTHO_LC) and
1432 not (ortho_context & _ORTHO_MID_UC) ):
1433 return True
1434
1435
1436
1437
1438
1439 if ( aug_tok.first_lower and
1440 ((ortho_context & _ORTHO_UC) or
1441 not (ortho_context & _ORTHO_BEG_LC)) ):
1442 return False
1443
1444
1445 return 'unknown'
1446
1447
def main(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
    """
    Builds a punkt model from the given text and applies it to that same
    text, printing one sentence per line.

    A trainer of class ``train_cls`` is trained with INCLUDE_ALL_COLLOCS
    switched on; its parameters are handed to a tokenizer of class
    ``tok_cls``, which splits the text with boundary realignment enabled.
    """
    # Strips carriage returns and whitespace at the start of each line.
    leading_junk = re.compile(r'(?:\r|^\s+)', re.MULTILINE)

    def cleanup(s):
        # Flatten a sentence onto a single line for display.
        return leading_junk.sub('', s).replace('\n', ' ')

    trainer = train_cls()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)
    sbd = tok_cls(trainer.get_params())
    for sentence in sbd.sentences_from_text(text, realign_boundaries=True):
        print(cleanup(sentence))
1457
1458
1459 if __name__ == '__main__':
1460 import sys
1461 main(sys.stdin.read())
1462