Package nltk :: Package tokenize :: Module punkt
[hide private]
[frames] | no frames]

Source Code for Module nltk.tokenize.punkt

   1  # Natural Language Toolkit: Punkt sentence tokenizer 
   2  # 
   3  # Copyright (C) 2001-2011 NLTK Project 
   4  # Algorithm: Kiss & Strunk (2006) 
   5  # Author: Willy <willy@csse.unimelb.edu.au> (original Python port) 
   6  #         Steven Bird <sb@csse.unimelb.edu.au> (additions) 
   7  #         Edward Loper <edloper@gradient.cis.upenn.edu> (rewrite) 
   8  #         Joel Nothman <jnothman@student.usyd.edu.au> (almost rewrite) 
   9  # URL: <http://www.nltk.org/> 
  10  # For license information, see LICENSE.TXT 
  11  # 
  12  # $Id: probability.py 4865 2007-07-11 22:6:07Z edloper $ 
  13   
  14  """ 
  15  The Punkt sentence tokenizer.  The algorithm for this tokenizer is 
  16  described in Kiss & Strunk (2006):: 
  17   
  18    Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence 
  19      Boundary Detection.  Computational Linguistics 32: 485-525. 
  20  """ 
  21   
  22  # TODO: Make orthographic heuristic less susceptible to overtraining 
  23  # TODO: Frequent sentence starters optionally exclude always-capitalised words 
  24  # FIXME: Problem with ending string with e.g. '!!!' -> '!! !' 
  25   
  26  import re 
  27  import math 
  28   
  29  from nltk.compat import defaultdict 
  30  from nltk.probability import FreqDist 
  31   
  32  from api import TokenizerI 
  33   
  34  ###################################################################### 
  35  #{ Orthographic Context Constants 
  36  ###################################################################### 
  37  # The following constants are used to describe the orthographic 
  38  # contexts in which a word can occur.  BEG=beginning, MID=middle, 
  39  # UNK=unknown, UC=uppercase, LC=lowercase, NC=no case. 
  40   
  41  _ORTHO_BEG_UC    = 1 << 1 
  42  """Orthogaphic context: beginning of a sentence with upper case.""" 
  43   
  44  _ORTHO_MID_UC    = 1 << 2 
  45  """Orthogaphic context: middle of a sentence with upper case.""" 
  46   
  47  _ORTHO_UNK_UC    = 1 << 3 
  48  """Orthogaphic context: unknown position in a sentence with upper case.""" 
  49   
  50  _ORTHO_BEG_LC    = 1 << 4 
  51  """Orthogaphic context: beginning of a sentence with lower case.""" 
  52   
  53  _ORTHO_MID_LC    = 1 << 5 
  54  """Orthogaphic context: middle of a sentence with lower case.""" 
  55   
  56  _ORTHO_UNK_LC    = 1 << 6 
  57  """Orthogaphic context: unknown position in a sentence with lower case.""" 
  58   
  59  _ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC 
  60  """Orthogaphic context: occurs with upper case.""" 
  61   
  62  _ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC 
  63  """Orthogaphic context: occurs with lower case.""" 
  64   
  65  _ORTHO_MAP = { 
  66          ('initial',  'upper'): _ORTHO_BEG_UC, 
  67          ('internal', 'upper'): _ORTHO_MID_UC, 
  68          ('unknown',  'upper'): _ORTHO_UNK_UC, 
  69          ('initial',  'lower'): _ORTHO_BEG_LC, 
  70          ('internal', 'lower'): _ORTHO_MID_LC, 
  71          ('unknown',  'lower'): _ORTHO_UNK_LC, 
  72  } 
  73  """A map from context position and first-letter case to the 
  74  appropriate orthographic context flag.""" 
#} (end orthographic context constants)
######################################################################

######################################################################
#{ Language-dependent variables
######################################################################

class PunktLanguageVars(object):
    """
    Stores variables, mostly regular expressions, which may be
    language-dependent for correct application of the algorithm.
    An extension of this class may modify its properties to suit
    a language other than English; an instance can then be passed
    as an argument to PunktSentenceTokenizer and PunktTrainer
    constructors.
    """

    # The only per-instance state is the two lazily compiled patterns.
    __slots__ = ('_re_period_context', '_re_word_tokenizer')

    def __getstate__(self):
        # All modifications to the class are performed by inheritance.
        # Non-default parameters to be pickled must be defined in the inherited
        # class.
        return 1

    def __setstate__(self, state):
        # Nothing to restore: the compiled patterns are rebuilt on demand.
        return 1

    sent_end_chars = ('.', '?', '!')
    """Characters which are candidates for sentence boundaries"""

    @property
    def _re_sent_end_chars(self):
        """A regex character class matching any of sent_end_chars."""
        return '[%s]' % re.escape(''.join(self.sent_end_chars))

    internal_punctuation = ',:;'  # might want to extend this..
    """sentence internal punctuation, which indicates an abbreviation if
    preceded by a period-final token."""

    re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)',
                                         re.MULTILINE)
    """Used to realign punctuation that should be included in a sentence
    although it follows the period (or ?, !)."""

    _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]"
    """Excludes some characters from starting word tokens"""

    _re_non_word_chars = r"(?:[?!)\";}\]\*:@\'\({\[])"
    """Characters that cannot appear within words"""

    _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)"
    """Hyphen and ellipsis are multi-character punctuation"""

    _word_tokenize_fmt = r'''(
        %(MultiChar)s
        |
        (?=%(WordStart)s)\S+?  # Accept word characters until end is found
        (?= # Sequences marking a word's end
            \s|                                 # White-space
            $|                                  # End-of-string
            %(NonWord)s|%(MultiChar)s|          # Punctuation
            ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word
        )
        |
        \S
    )'''
    """Format of a regular expression to split punctuation from words,
    excluding period."""

    def _word_tokenizer_re(self):
        """Compiles and returns a regular expression for word tokenization"""
        try:
            return self._re_word_tokenizer
        except AttributeError:
            # First use on this instance: compile once and cache in the slot.
            self._re_word_tokenizer = re.compile(
                self._word_tokenize_fmt %
                {
                    'NonWord': self._re_non_word_chars,
                    'MultiChar': self._re_multi_char_punct,
                    'WordStart': self._re_word_start,
                },
                re.UNICODE | re.VERBOSE
            )
            return self._re_word_tokenizer

    def word_tokenize(self, s):
        """Tokenize a string to split off punctuation other than periods"""
        return self._word_tokenizer_re().findall(s)

    _period_context_fmt = r"""
        \S*                          # some word material
        %(SentEndChars)s             # a potential sentence ending
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
            |
            \s+(?P<next_tok>\S+)     # or whitespace and some other token
        ))"""
    """Format of a regular expression to find contexts including possible
    sentence boundaries. Matches token which the possible sentence boundary
    ends, and matches the following token within a lookahead expression."""

    def period_context_re(self):
        """Compiles and returns a regular expression to find contexts
        including possible sentence boundaries."""
        try:
            return self._re_period_context
        except AttributeError:
            # Only an unset slot means "not compiled yet"; a bare except
            # here would also hide unrelated errors.
            self._re_period_context = re.compile(
                self._period_context_fmt %
                {
                    'NonWord': self._re_non_word_chars,
                    'SentEndChars': self._re_sent_end_chars,
                },
                re.UNICODE | re.VERBOSE)
            return self._re_period_context
# Module-level pattern matching one character that is a word character but
# not a digit.  It is searched for in token types by PunktToken.is_non_punct
# and by PunktTrainer._reclassify_abbrev_types.
_re_non_punct = re.compile(r'[^\W\d]', re.UNICODE)
"""Matches token types that are not merely punctuation. (Types for
numeric tokens are changed to ##number## and hence contain alpha.)"""
#}
######################################################################


######################################################################
#{ Punkt Word Tokenizer
######################################################################

class PunktWordTokenizer(TokenizerI):
    """
    Word tokenizer that delegates to the word_tokenize() method of a
    language-variables object.
    """
    # Retained for backward compatibility

    def __init__(self, lang_vars=None):
        """
        @param lang_vars: object providing word_tokenize(text); when omitted
            a new PunktLanguageVars is created for this tokenizer.
        """
        # Build the default per instance: a default written in the
        # signature is created once at definition time and then shared
        # by every tokenizer.
        if lang_vars is None:
            lang_vars = PunktLanguageVars()
        self._lang_vars = lang_vars

    def tokenize(self, text):
        """Return the list of word tokens found in text."""
        return self._lang_vars.word_tokenize(text)
212
213 #} 214 ###################################################################### 215 216 217 #//////////////////////////////////////////////////////////// 218 #{ Helper Functions 219 #//////////////////////////////////////////////////////////// 220 221 -def _pair_iter(it):
222 """ 223 Yields pairs of tokens from the given iterator such that each input 224 token will appear as the first element in a yielded tuple. The last 225 pair will have None as its second element. 226 """ 227 it = iter(it) 228 prev = it.next() 229 for el in it: 230 yield (prev, el) 231 prev = el 232 yield (prev, None)
233
######################################################################
#{ Punkt Parameters
######################################################################

class PunktParameters(object):
    """Stores data used to perform sentence boundary detection with punkt."""

    def __init__(self):
        self.abbrev_types = set()
        """A set of word types for known abbreviations."""

        self.collocations = set()
        """A set of word type tuples for known common collocations
        where the first word ends in a period. E.g., ('S.', 'Bach')
        is a common collocation in a text that discusses 'Johann
        S. Bach'. These count as negative evidence for sentence
        boundaries."""

        self.sent_starters = set()
        """A set of word types for words that often appear at the
        beginning of sentences."""

        self.ortho_context = defaultdict(int)
        """A dictionary mapping word types to the set of orthographic
        contexts that word type appears in. Contexts are represented
        by adding orthographic context flags: ..."""

    def clear_abbrevs(self):
        """Replace the abbreviation set with a new empty set."""
        self.abbrev_types = set()

    def clear_collocations(self):
        """Replace the collocation set with a new empty set."""
        self.collocations = set()

    def clear_sent_starters(self):
        """Replace the sentence-starter set with a new empty set."""
        self.sent_starters = set()

    def clear_ortho_context(self):
        """Replace the orthographic-context table with a new empty one."""
        self.ortho_context = defaultdict(int)

    def add_ortho_context(self, typ, flag):
        """OR the given context flag into the flags recorded for typ."""
        seen_so_far = self.ortho_context[typ]
        self.ortho_context[typ] = seen_so_far | flag
275
######################################################################
#{ PunktToken
######################################################################

class PunktToken(object):
    """Stores a token of text with annotations produced during
    sentence boundary detection."""

    # Annotation attributes; each starts out as None on a new token.
    _properties = [
        'parastart', 'linestart',
        'sentbreak', 'abbr', 'ellipsis'
    ]
    __slots__ = ['tok', 'type', 'period_final'] + _properties

    def __init__(self, tok, **params):
        """
        @param tok: the token text.
        @param params: initial values for attributes of the token, set
            after every annotation has been reset to None.
        """
        self.tok = tok
        self.type = self._get_type(tok)
        self.period_final = tok.endswith('.')

        for p in self._properties:
            setattr(self, p, None)
        # items() rather than the Python-2-only iteritems(); the keyword
        # dict is tiny, so building a list under Python 2 costs nothing.
        for k, v in params.items():
            setattr(self, k, v)

    #////////////////////////////////////////////////////////////
    #{ Regular expressions for properties
    #////////////////////////////////////////////////////////////
    # Note: [A-Za-z] is approximated by [^\W\d] in the general case.
    _RE_ELLIPSIS = re.compile(r'\.\.+$')
    _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$')
    _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE)
    _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE)

    #////////////////////////////////////////////////////////////
    #{ Derived properties
    #////////////////////////////////////////////////////////////

    def _get_type(self, tok):
        """Returns a case-normalized representation of the token."""
        return self._RE_NUMERIC.sub('##number##', tok.lower())

    @property
    def type_no_period(self):
        """
        The type with its final period removed if it has one.
        """
        if len(self.type) > 1 and self.type[-1] == '.':
            return self.type[:-1]
        return self.type

    @property
    def type_no_sentperiod(self):
        """
        The type with its final period removed if it is marked as a
        sentence break.
        """
        if self.sentbreak:
            return self.type_no_period
        return self.type

    @property
    def first_upper(self):
        """True if the token's first character is uppercase."""
        return self.tok[0].isupper()

    @property
    def first_lower(self):
        """True if the token's first character is lowercase."""
        return self.tok[0].islower()

    @property
    def first_case(self):
        """'lower', 'upper' or 'none', by the case of the first character."""
        if self.first_lower:
            return 'lower'
        elif self.first_upper:
            return 'upper'
        return 'none'

    @property
    def is_ellipsis(self):
        """True if the token text is that of an ellipsis."""
        return self._RE_ELLIPSIS.match(self.tok)

    @property
    def is_number(self):
        """True if the token text is that of a number."""
        return self.type.startswith('##number##')

    @property
    def is_initial(self):
        """True if the token text is that of an initial."""
        return self._RE_INITIAL.match(self.tok)

    @property
    def is_alpha(self):
        """True if the token text is all alphabetic."""
        return self._RE_ALPHA.match(self.tok)

    @property
    def is_non_punct(self):
        """True if the token is either a number or is alphabetic."""
        return _re_non_punct.search(self.type)

    #////////////////////////////////////////////////////////////
    #{ String representation
    #////////////////////////////////////////////////////////////

    def __repr__(self):
        """
        A string representation of the token that can reproduce it
        with eval(), which lists all the token's non-default
        annotations.
        """
        if self.type != self.tok:
            typestr = ' type=%s,' % repr(self.type)
        else:
            typestr = ''

        propvals = ', '.join(
            '%s=%s' % (p, repr(getattr(self, p)))
            for p in self._properties
            if getattr(self, p)
        )

        return '%s(%s,%s %s)' % (self.__class__.__name__,
                                 repr(self.tok), typestr, propvals)

    def __str__(self):
        """
        A string representation akin to that used by Kiss and Strunk.
        """
        res = self.tok
        if self.abbr:
            res += '<A>'
        if self.ellipsis:
            res += '<E>'
        if self.sentbreak:
            res += '<S>'
        return res
415
######################################################################
#{ Punkt base class
######################################################################

class _PunktBaseClass(object):
    """
    Includes common components of PunktTrainer and PunktSentenceTokenizer.
    """

    def __init__(self, lang_vars=None, token_cls=PunktToken, params=None):
        """
        @param lang_vars: language-dependent variables; a new
            PunktLanguageVars is created when omitted.
        @param token_cls: callable used to build a token from its text.
        @param params: parameter store; a new PunktParameters is created
            when omitted.
        """
        # Build the mutable defaults per instance.  Defaults written in
        # the signature are evaluated once at definition time, so every
        # object constructed without arguments would share (and mutate)
        # the same PunktParameters.
        if lang_vars is None:
            lang_vars = PunktLanguageVars()
        if params is None:
            params = PunktParameters()

        self._params = params
        """The collection of parameters that determines the behavior
        of the punkt tokenizer."""
        self._lang_vars = lang_vars
        self._Token = token_cls

    #////////////////////////////////////////////////////////////
    #{ Word tokenization
    #////////////////////////////////////////////////////////////

    def _tokenize_words(self, plaintext):
        """
        Divide the given text into tokens, using the punkt word
        segmentation regular expression, and generate the resulting
        tokens.  The first token of every non-blank line is created with
        linestart=True and with parastart set to whether a blank line
        has been seen since the previous non-blank line.
        """
        parastart = False
        for line in plaintext.split('\n'):
            if not line.strip():
                # A blank line: the next non-blank line opens a paragraph.
                parastart = True
                continue

            first_on_line = True
            for t in self._lang_vars.word_tokenize(line):
                if first_on_line:
                    yield self._Token(t, parastart=parastart,
                                      linestart=True)
                    parastart = False
                    first_on_line = False
                else:
                    yield self._Token(t)

    #////////////////////////////////////////////////////////////
    #{ Annotation Procedures
    #////////////////////////////////////////////////////////////

    def _annotate_first_pass(self, tokens):
        """
        Perform the first pass of annotation, which makes decisions
        based purely on the word type of each word:

          - '?', '!', and '.' are marked as sentence breaks.
          - sequences of two or more periods are marked as ellipsis.
          - any word ending in '.' that's a known abbreviation is
            marked as an abbreviation.
          - any other word ending in '.' is marked as a sentence break.

        The annotations are written onto the tokens themselves; each
        token is yielded again after it has been annotated.
        """
        for aug_tok in tokens:
            self._first_pass_annotation(aug_tok)
            yield aug_tok

    def _first_pass_annotation(self, aug_tok):
        """
        Performs type-based annotation on a single token.
        """
        tok = aug_tok.tok

        if tok in self._lang_vars.sent_end_chars:
            aug_tok.sentbreak = True
        elif aug_tok.is_ellipsis:
            aug_tok.ellipsis = True
        elif aug_tok.period_final and not tok.endswith('..'):
            # Look the word up without its final period; for a hyphenated
            # word also try the part after the last hyphen.
            if (tok[:-1].lower() in self._params.abbrev_types or
                tok[:-1].lower().split('-')[-1] in self._params.abbrev_types):

                aug_tok.abbr = True
            else:
                aug_tok.sentbreak = True

        return
505
506 ###################################################################### 507 #{ Punkt Trainer 508 ###################################################################### 509 510 511 -class PunktTrainer(_PunktBaseClass):
512 """Learns parameters used in Punkt sentence boundary detection.""" 513
def __init__(self, train_text=None, verbose=False,
             lang_vars=None, token_cls=PunktToken):
    """
    @param train_text: optional text to train on immediately.
    @param verbose: passed on to train() when train_text is given.
    @param lang_vars: language-dependent variables; a new
        PunktLanguageVars is created when omitted.
    @param token_cls: callable used to build a token from its text.
    """
    if lang_vars is None:
        lang_vars = PunktLanguageVars()

    # Hand the base class a brand-new PunktParameters.  Relying on its
    # default would make every trainer accumulate abbreviations,
    # collocations and sentence starters in one shared object.
    _PunktBaseClass.__init__(self, lang_vars=lang_vars,
                             token_cls=token_cls,
                             params=PunktParameters())

    self._type_fdist = FreqDist()
    """A frequency distribution giving the frequency of each
    case-normalized token type in the training data."""

    self._num_period_toks = 0
    """The number of words ending in period in the training data."""

    self._collocation_fdist = FreqDist()
    """A frequency distribution giving the frequency of all
    bigrams in the training data where the first word ends in a
    period. Bigrams are encoded as tuples of word types.
    Especially common collocations are extracted from this
    frequency distribution, and stored in
    L{_params}.L{collocations <PunktParameters.collocations>}."""

    self._sent_starter_fdist = FreqDist()
    """A frequency distribution giving the frequency of all words
    that occur at the training data at the beginning of a sentence
    (after the first pass of annotation). Especially common
    sentence starters are extracted from this frequency
    distribution, and stored in L{_params}.L{sent_starters
    <PunktParameters.sent_starters>}.
    """

    self._sentbreak_count = 0
    """The total number of sentence breaks identified in training, used for
    calculating the frequent sentence starter heuristic."""

    self._finalized = True
    """A flag as to whether the training has been finalized by finding
    collocations and sentence starters, or whether finalize_training()
    still needs to be called."""

    if train_text:
        self.train(train_text, verbose, finalize=True)
555
def get_params(self):
    """
    Return the parameter object built up by training, first running
    finalize_training() if training has not been finalized yet.
    """
    if self._finalized:
        return self._params
    self.finalize_training()
    return self._params
#////////////////////////////////////////////////////////////
#{ Customization Variables
#////////////////////////////////////////////////////////////

ABBREV = 0.3
"""cut-off value whether a 'token' is an abbreviation"""

IGNORE_ABBREV_PENALTY = False
"""allows the disabling of the abbreviation penalty heuristic, which
exponentially disadvantages words that are found at times without a
final period."""

ABBREV_BACKOFF = 5
"""upper cut-off for Mikheev's(2002) abbreviation detection algorithm"""

COLLOCATION = 7.88
"""minimal log-likelihood value that two tokens need to be considered
as a collocation"""

SENT_STARTER = 30
"""minimal log-likelihood value that a token requires to be considered
as a frequent sentence starter"""

INCLUDE_ALL_COLLOCS = False
"""this includes as potential collocations all word pairs where the first
word ends in a period. It may be useful in corpora where there is a lot
of variation that makes abbreviations like Mr difficult to identify."""

INCLUDE_ABBREV_COLLOCS = False
"""this includes as potential collocations all word pairs where the first
word is an abbreviation. Such collocations override the orthographic
heuristic, but not the sentence starter heuristic. This is overridden by
INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials
and ordinals are considered."""

MIN_COLLOC_FREQ = 1
"""this sets a minimum bound on the number of times a bigram needs to
appear before it can be considered a collocation, in addition to log
likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True."""

#////////////////////////////////////////////////////////////
#{ Training..
#////////////////////////////////////////////////////////////
def train(self, text, verbose=False, finalize=True):
    """
    Tokenize text and feed the tokens to the trainer.

    When finalize is true, finalize_training() is run straight away;
    otherwise that step is left to a later finalize_training() or
    get_params() call.  verbose is forwarded to both training steps.
    """
    # Break the text into tokens; record which token indices correspond to
    # line starts and paragraph starts; and determine their types.
    word_stream = self._tokenize_words(text)
    self._train_tokens(word_stream, verbose)
    if not finalize:
        return
    self.finalize_training(verbose)
621
def train_tokens(self, tokens, verbose=False, finalize=True):
    """
    Train on already tokenized input: every element of tokens is turned
    into a token object with the configured token class before training.
    finalize_training() is run afterwards when finalize is true.
    """
    wrapped = (self._Token(t) for t in tokens)
    self._train_tokens(wrapped, verbose)
    if not finalize:
        return
    self.finalize_training(verbose)
629
def _train_tokens(self, tokens, verbose):
    """
    Update the trainer's statistics and parameters from a sequence of
    token objects.

    In order: count token types, (re)classify abbreviations, run the
    first annotation pass, record orthographic contexts, add to the
    sentence-break count, and finally collect rare abbreviations,
    sentence-starter counts and collocation counts from adjacent token
    pairs.  Marks the trainer as not finalized.  If verbose is true,
    abbreviation changes are printed.
    """
    self._finalized = False

    # Ensure tokens are a list
    tokens = list(tokens)

    # Find the frequency of each case-normalized type.  (Don't
    # strip off final periods.)  Also keep track of the number of
    # tokens that end in periods.
    for aug_tok in tokens:
        self._type_fdist.inc(aug_tok.type)
        if aug_tok.period_final:
            self._num_period_toks += 1

    # Look for new abbreviations, and for types that no longer are
    unique_types = self._unique_types(tokens)
    for abbr, score, is_add in self._reclassify_abbrev_types(unique_types):
        if score >= self.ABBREV:
            if is_add:
                self._params.abbrev_types.add(abbr)
                if verbose:
                    print (' Abbreviation: [%6.4f] %s' %
                           (score, abbr))
        else:
            if not is_add:
                self._params.abbrev_types.remove(abbr)
                if verbose:
                    print (' Removed abbreviation: [%6.4f] %s' %
                           (score, abbr))

    # Make a preliminary pass through the document, marking likely
    # sentence breaks, abbreviations, and ellipsis tokens.
    tokens = list(self._annotate_first_pass(tokens))

    # Check what contexts each word type can appear in, given the
    # case of its first letter.
    self._get_orthography_data(tokens)

    # We need total number of sentence breaks to find sentence starters
    self._sentbreak_count += self._get_sentbreak_count(tokens)

    # The remaining heuristics relate to pairs of tokens where the first
    # ends in a period.
    for aug_tok1, aug_tok2 in _pair_iter(tokens):
        # aug_tok2 is None for the final token, which has no successor.
        if not aug_tok1.period_final or not aug_tok2:
            continue

        # Is the first token a rare abbreviation?
        if self._is_rare_abbrev_type(aug_tok1, aug_tok2):
            self._params.abbrev_types.add(aug_tok1.type_no_period)
            if verbose:
                print (' Rare Abbrev: %s' % aug_tok1.type)

        # Does second token have a high likelihood of starting a sentence?
        if self._is_potential_sent_starter(aug_tok2, aug_tok1):
            self._sent_starter_fdist.inc(aug_tok2.type)

        # Is this bigram a potential collocation?
        if self._is_potential_collocation(aug_tok1, aug_tok2):
            self._collocation_fdist.inc(
                (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod))
691
692 - def _unique_types(self, tokens):
693 return set(aug_tok.type for aug_tok in tokens)
694
def finalize_training(self, verbose=False):
    """
    Rebuild the sentence-starter and collocation sets of the parameters
    from the statistics gathered so far, then mark training as
    finalized.  If verbose is true, each accepted entry is printed.
    """
    params = self._params

    params.clear_sent_starters()
    for starter, ll in self._find_sent_starters():
        params.sent_starters.add(starter)
        if verbose:
            print (' Sent Starter: [%6.4f] %r' % (ll, starter))

    params.clear_collocations()
    for (first, second), ll in self._find_collocations():
        params.collocations.add((first, second))
        if verbose:
            print (' Collocation: [%6.4f] %r+%r' %
                   (ll, first, second))

    self._finalized = True
714 715 #//////////////////////////////////////////////////////////// 716 #{ Overhead reduction 717 #//////////////////////////////////////////////////////////// 718
def freq_threshold(self, ortho_thresh=2, type_thresh=2, colloc_thres=2,
                   sentstart_thresh=2):
    """
    Reduce memory use after much training by dropping data about rare
    entries.  Orthographic context is kept only for types whose count
    is at least ortho_thresh (skipped when ortho_thresh is 1 or less);
    the type, collocation and sentence-starter distributions are each
    passed through _freq_threshold with their own threshold.
    """
    if ortho_thresh > 1:
        previous = self._params.ortho_context
        self._params.clear_ortho_context()
        frequent = (tok for tok, count in self._type_fdist.iteritems()
                    if count >= ortho_thresh)
        for tok in frequent:
            self._params.ortho_context[tok] = previous[tok]

    prune = self._freq_threshold
    self._type_fdist = prune(self._type_fdist, type_thresh)
    self._collocation_fdist = prune(self._collocation_fdist, colloc_thres)
    self._sent_starter_fdist = prune(self._sent_starter_fdist,
                                     sentstart_thresh)
739
def _freq_threshold(self, fdist, threshold):
    """
    Returns a new FreqDist holding only the entries of fdist whose
    count is at least threshold.  The number of entries that were left
    out is added to the result under the key None.
    """
    # We assume that there is more data below the threshold than above it
    # and so create a new FreqDist rather than working in place.
    res = FreqDist()
    num_removed = 0
    for tok, count in fdist.iteritems():
        if count < threshold:
            num_removed += 1
        else:
            res.inc(tok, count)
    res.inc(None, num_removed)
    return res
756 757 #//////////////////////////////////////////////////////////// 758 #{ Orthographic data 759 #//////////////////////////////////////////////////////////// 760
def _get_orthography_data(self, tokens):
    """
    Record, for every token type, the orthographic contexts it occurs
    in: the combination of its first-letter case with its position
    (sentence-initial, sentence-internal, or unknown).
    """
    # Position of the *current* token: 'initial', 'internal' or 'unknown'.
    position = 'internal'

    for aug_tok in list(tokens):
        # A paragraph break is a good sign of a sentence break, but
        # stay cautious (keep 'unknown') if that is what the previous
        # token left us with.
        if aug_tok.parastart and position != 'unknown':
            position = 'initial'

        # A line start that would otherwise count as sentence-internal
        # is downgraded to 'unknown'.
        if aug_tok.linestart and position == 'internal':
            position = 'unknown'

        # Case-normalized type, without the period if the token is a
        # sentence break.
        typ = aug_tok.type_no_sentperiod

        # Update the orthographic context table; tokens whose first
        # character has no case have no entry in the map.
        flag = _ORTHO_MAP.get((position, aug_tok.first_case), 0)
        if flag:
            self._params.add_ortho_context(typ, flag)

        # Decide the position of the next token.
        if aug_tok.sentbreak:
            if aug_tok.is_number or aug_tok.is_initial:
                position = 'unknown'
            else:
                position = 'initial'
        elif aug_tok.ellipsis or aug_tok.abbr:
            position = 'unknown'
        else:
            position = 'internal'
804 805 #//////////////////////////////////////////////////////////// 806 #{ Abbreviations 807 #//////////////////////////////////////////////////////////// 808
def _reclassify_abbrev_types(self, types):
    """
    (Re)classifies each given token if
      - it is period-final and not a known abbreviation; or
      - it is not period-final and is otherwise a known abbreviation
    by checking whether its previous classification still holds according
    to the heuristics of section 3.
    Yields triples (abbr, score, is_add) where abbr is the type in question,
    score is its log-likelihood with penalties applied, and is_add specifies
    whether the present type is a candidate for inclusion or exclusion as an
    abbreviation, such that:
      - (is_add and score >= 0.3)    suggests a new abbreviation; and
      - (not is_add and score < 0.3) suggests excluding an abbreviation.
    """
    # (While one could recalculate abbreviations from all .-final tokens at
    # every iteration, in cases requiring efficiency, the number of tokens
    # in the present training document will be much less.)

    for typ in types:
        # Check some basic conditions, to rule out words that are
        # clearly not abbrev_types.
        if not _re_non_punct.search(typ) or typ == '##number##':
            continue

        if typ.endswith('.'):
            # NOTE(review): this membership test uses the period-final
            # form, while the candidate yielded below has its final
            # period stripped -- confirm which form abbrev_types holds.
            if typ in self._params.abbrev_types:
                continue
            typ = typ[:-1]
            is_add = True
        else:
            if typ not in self._params.abbrev_types:
                continue
            is_add = False

        # Count how many periods & nonperiods are in the
        # candidate.  (typ no longer carries its final period here,
        # hence the +1 adjustments.)
        num_periods = typ.count('.') + 1
        num_nonperiods = len(typ) - num_periods + 1

        # Let <a> be the candidate without the period, and <b>
        # be the period.  Find a log likelihood ratio that
        # indicates whether <ab> occurs as a single unit (high
        # value of ll), or as two independent units <a> and
        # <b> (low value of ll).
        count_with_period = self._type_fdist[typ + '.']
        count_without_period = self._type_fdist[typ]
        ll = self._dunning_log_likelihood(
            count_with_period + count_without_period,
            self._num_period_toks, count_with_period,
            self._type_fdist.N())

        # Apply three scaling factors to 'tweak' the basic log
        # likelihood ratio:
        #   F_length: long word -> less likely to be an abbrev
        #   F_periods: more periods -> more likely to be an abbrev
        #   F_penalty: penalize occurrences w/o a period
        f_length = math.exp(-num_nonperiods)
        f_periods = num_periods
        # With IGNORE_ABBREV_PENALTY set the factor is 1; otherwise the
        # 'or' falls through to the power-based penalty.
        f_penalty = (int(self.IGNORE_ABBREV_PENALTY)
                or math.pow(num_nonperiods, -count_without_period))
        score = ll * f_length * f_periods * f_penalty

        yield typ, score, is_add
872
def find_abbrev_types(self):
    """
    Discard the current abbreviation set and rebuild it from the type
    frequencies alone, scoring every period-final type.
    This fails to include abbreviations otherwise found as "rare".
    """
    self._params.clear_abbrevs()

    # Skip falsy keys (such as None) and types without a final period.
    period_final = (typ for typ in self._type_fdist
                    if typ and typ.endswith('.'))

    scored = self._reclassify_abbrev_types(period_final)
    for candidate, score, is_add in scored:
        if score >= self.ABBREV:
            self._params.abbrev_types.add(candidate)
884 885 # This function combines the work done by the original code's 886 # functions `count_orthography_context`, `get_orthography_count`, 887 # and `get_rare_abbreviations`.
def _is_rare_abbrev_type(self, cur_tok, next_tok):
    """
    A word type is counted as a rare abbreviation if...
      - it's not already marked as an abbreviation
      - it occurs fewer than ABBREV_BACKOFF times
      - either it is followed by a sentence-internal punctuation
        mark, *or* it is followed by a lower-case word that
        sometimes appears with upper case at the beginning of a
        sentence, but never with upper case sentence-internally.

    Returns True for a rare abbreviation, False when an early check
    rules the token out, and None (falsy) otherwise.
    """
    # Only tokens the first pass marked as sentence breaks are candidates.
    if cur_tok.abbr or not cur_tok.sentbreak:
        return False

    # Find the case-normalized type of the token.  If it's
    # a sentence-final token, strip off the period.
    typ = cur_tok.type_no_sentperiod

    # Proceed only if the type hasn't been categorized as an
    # abbreviation already, and is sufficiently rare...
    # NOTE(review): typ already has its sentence-final period removed
    # above, so typ[:-1] drops one more character -- confirm intent.
    count = self._type_fdist[typ] + self._type_fdist[typ[:-1]]
    if (typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF):
        return False

    # Record this token as an abbreviation if the next
    # token is a sentence-internal punctuation mark.
    # [XX] :1 or check the whole thing??
    if next_tok.tok[:1] in self._lang_vars.internal_punctuation:
        return True

    # Record this type as an abbreviation if the next
    # token...  (i) starts with a lower case letter,
    # (ii) sometimes occurs with an uppercase letter,
    # and (iii) never occurs with an uppercase letter
    # sentence-internally.
    # [xx] should the check for (ii) be modified??
    elif next_tok.first_lower:
        typ2 = next_tok.type_no_sentperiod
        typ2ortho_context = self._params.ortho_context[typ2]
        if ( (typ2ortho_context & _ORTHO_BEG_UC) and
             not (typ2ortho_context & _ORTHO_MID_UC) ):
            return True
929 930 #//////////////////////////////////////////////////////////// 931 #{ Log Likelihoods 932 #//////////////////////////////////////////////////////////// 933 934 # helper for _reclassify_abbrev_types: 935 @staticmethod
936 - def _dunning_log_likelihood(count_a, count_b, count_ab, N):
937 """ 938 A function that calculates the modified Dunning log-likelihood 939 ratio scores for abbreviation candidates. The details of how 940 this works is available in the paper. 941 """ 942 p1 = float(count_b) / N 943 p2 = 0.99 944 945 null_hypo = (float(count_ab) * math.log(p1) + 946 (count_a - count_ab) * math.log(1.0 - p1)) 947 alt_hypo = (float(count_ab) * math.log(p2) + 948 (count_a - count_ab) * math.log(1.0 - p2)) 949 950 likelihood = null_hypo - alt_hypo 951 952 return (-2.0 * likelihood)
953 954 @staticmethod
955 - def _col_log_likelihood(count_a, count_b, count_ab, N):
956 """ 957 A function that will just compute log-likelihood estimate, in 958 the original paper it's decribed in algorithm 6 and 7. 959 960 This *should* be the original Dunning log-likelihood values, 961 unlike the previous log_l function where it used modified 962 Dunning log-likelihood values 963 """ 964 import math 965 966 p = 1.0 * count_b / N 967 p1 = 1.0 * count_ab / count_a 968 p2 = 1.0 * (count_b - count_ab) / (N - count_a) 969 970 summand1 = (count_ab * math.log(p) + 971 (count_a - count_ab) * math.log(1.0 - p)) 972 973 summand2 = ((count_b - count_ab) * math.log(p) + 974 (N - count_a - count_b + count_ab) * math.log(1.0 - p)) 975 976 if count_a == count_ab: 977 summand3 = 0 978 else: 979 summand3 = (count_ab * math.log(p1) + 980 (count_a - count_ab) * math.log(1.0 - p1)) 981 982 if count_b == count_ab: 983 summand4 = 0 984 else: 985 summand4 = ((count_b - count_ab) * math.log(p2) + 986 (N - count_a - count_b + count_ab) * math.log(1.0 - p2)) 987 988 likelihood = summand1 + summand2 - summand3 - summand4 989 990 return (-2.0 * likelihood)
991 992 #//////////////////////////////////////////////////////////// 993 #{ Collocation Finder 994 #//////////////////////////////////////////////////////////// 995
996 - def _is_potential_collocation(self, aug_tok1, aug_tok2):
997 """ 998 Returns True if the pair of tokens may form a collocation given 999 log-likelihood statistics. 1000 """ 1001 return ((self.INCLUDE_ALL_COLLOCS or 1002 (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr) or 1003 (aug_tok1.sentbreak and 1004 (aug_tok1.is_number or aug_tok1.is_initial))) 1005 and aug_tok1.is_non_punct 1006 and aug_tok2.is_non_punct)
1007
1008 - def _find_collocations(self):
1009 """ 1010 Generates likely collocations and their log-likelihood. 1011 """ 1012 for types, col_count in self._collocation_fdist.iteritems(): 1013 try: 1014 typ1, typ2 = types 1015 except TypeError: 1016 # types may be None after calling freq_threshold() 1017 continue 1018 if typ2 in self._params.sent_starters: 1019 continue 1020 1021 typ1_count = self._type_fdist[typ1]+self._type_fdist[typ1+'.'] 1022 typ2_count = self._type_fdist[typ2]+self._type_fdist[typ2+'.'] 1023 if (typ1_count > 1 and typ2_count > 1 1024 and self.MIN_COLLOC_FREQ < 1025 col_count <= min(typ1_count, typ2_count)): 1026 1027 ll = self._col_log_likelihood(typ1_count, typ2_count, 1028 col_count, self._type_fdist.N()) 1029 # Filter out the not-so-collocative 1030 if (ll >= self.COLLOCATION and 1031 (float(self._type_fdist.N())/typ1_count > 1032 float(typ2_count)/col_count)): 1033 yield (typ1, typ2), ll
1034 1035 #//////////////////////////////////////////////////////////// 1036 #{ Sentence-Starter Finder 1037 #//////////////////////////////////////////////////////////// 1038
1039 - def _is_potential_sent_starter(self, cur_tok, prev_tok):
1040 """ 1041 Returns True given a token and the token that preceds it if it 1042 seems clear that the token is beginning a sentence. 1043 """ 1044 # If a token (i) is preceeded by a sentece break that is 1045 # not a potential ordinal number or initial, and (ii) is 1046 # alphabetic, then it is a a sentence-starter. 1047 return ( prev_tok.sentbreak and 1048 not (prev_tok.is_number or prev_tok.is_initial) and 1049 cur_tok.is_alpha )
1050
1051 - def _find_sent_starters(self):
1052 """ 1053 Uses collocation heuristics for each candidate token to 1054 determine if it frequently starts sentences. 1055 """ 1056 for (typ, typ_at_break_count) in self._sent_starter_fdist.iteritems(): 1057 if not typ: 1058 continue 1059 1060 typ_count = self._type_fdist[typ]+self._type_fdist[typ+'.'] 1061 if typ_count < typ_at_break_count: 1062 # needed after freq_threshold 1063 continue 1064 1065 ll = self._col_log_likelihood(self._sentbreak_count, typ_count, 1066 typ_at_break_count, 1067 self._type_fdist.N()) 1068 1069 if (ll >= self.SENT_STARTER and 1070 float(self._type_fdist.N())/self._sentbreak_count > 1071 float(typ_count)/typ_at_break_count): 1072 1073 yield typ, ll
1074
1075 - def _get_sentbreak_count(self, tokens):
1076 """ 1077 Returns the number of sentence breaks marked in a given set of 1078 augmented tokens. 1079 """ 1080 return sum(1 for aug_tok in tokens if aug_tok.sentbreak)
1081
######################################################################
#{ Punkt Sentence Tokenizer
######################################################################


class PunktSentenceTokenizer(_PunktBaseClass,TokenizerI):
    """
    A sentence tokenizer which uses an unsupervised algorithm to build
    a model for abbreviation words, collocations, and words that start
    sentences; and then uses that model to find sentence boundaries.
    This approach has been shown to work well for many European
    languages.

    The model is either learned from a training text or supplied as a
    ready-made parameters object; see __init__ and train.
    """
def __init__(self, train_text=None, verbose=False,
             lang_vars=PunktLanguageVars(), token_cls=PunktToken):
    """
    train_text can either be the sole training text for this sentence
    boundary detector, or can be a PunktParameters object.

    lang_vars and token_cls are forwarded unchanged to the base class
    initialiser.  verbose is forwarded to train().  When train_text is
    false (None or an empty string) train() is not called and
    self._params is not assigned here.
    """
    # NOTE(review): the default lang_vars instance is created once, when
    # this def statement is executed, so every tokenizer built without an
    # explicit lang_vars shares that one object.
    _PunktBaseClass.__init__(self, lang_vars=lang_vars,
                             token_cls=token_cls)

    if train_text:
        self._params = self.train(train_text, verbose)
1107
def train(self, train_text, verbose=False):
    """
    Derives parameters from a given training text, or uses the parameters
    given.  Repeated calls to this method destroy previous parameters.  For
    incremental training, instantiate a separate PunktTrainer instance.

    If train_text is a string (a str or unicode object, including
    instances of their subclasses) a PunktTrainer is run over it with this
    tokenizer's language variables and token class, and the resulting
    parameters are returned.  Any other object is assumed to be a
    parameters object already and is returned unchanged.

    verbose is accepted for interface compatibility; this method does not
    use it.
    """
    # isinstance() rather than an exact type() comparison, so that string
    # subclasses are trained on instead of being mistaken for parameters.
    if not isinstance(train_text, (type(''), type(u''))):
        return train_text
    return PunktTrainer(train_text, lang_vars=self._lang_vars,
                        token_cls=self._Token).get_params()
1118 1119 #//////////////////////////////////////////////////////////// 1120 #{ Tokenization 1121 #//////////////////////////////////////////////////////////// 1122
def tokenize(self, text, realign_boundaries=False):
    """
    Split text into sentences and return them as a list.

    realign_boundaries is passed straight on to sentences_from_text().
    """
    sentences = self.sentences_from_text(text, realign_boundaries)
    return list(sentences)
1128
def span_tokenize(self, text):
    """
    Return the list of (start, end) character spans of the sentences
    found in text.
    """
    spans = []
    for piece in self._slices_from_text(text):
        spans.append((piece.start, piece.stop))
    return spans
1135
def sentences_from_text(self, text, realign_boundaries=False):
    """
    Given a text, produce the sentences in that text by only testing
    candidate sentence breaks.  If realign_boundaries is True,
    includes in the sentence closing punctuation that follows the
    period.

    Without realignment the result is a list of strings; with
    realignment it is whatever _realign_boundaries() returns for that
    list.
    """
    pieces = []
    for span in self._slices_from_text(text):
        pieces.append(text[span])

    if not realign_boundaries:
        return pieces
    return self._realign_boundaries(pieces)
1147
1148 - def _slices_from_text(self, text):
1149 last_break = 0 1150 for match in self._lang_vars.period_context_re().finditer(text): 1151 context = match.group() + match.group('after_tok') 1152 if self.text_contains_sentbreak(context): 1153 yield slice(last_break, match.end()) 1154 if match.group('next_tok'): 1155 # next sentence starts after whitespace 1156 last_break = match.start('next_tok') 1157 else: 1158 # next sentence starts at following punctuation 1159 last_break = match.end() 1160 yield slice(last_break, len(text))
1161
def _realign_boundaries(self, sents):
    """
    Attempts to realign punctuation that falls after the period but
    should otherwise be included in the same sentence.

    For example: "(Sent1.) Sent2." will otherwise be split as::

        ["(Sent1.", ") Sent2."].

    This method will produce::

        ["(Sent1.)", "Sent2."].

    The sentences are generated lazily, and sentences that end up
    empty are not yielded.
    """
    # Number of leading characters of the upcoming sentence that were
    # already handed to the sentence before it.
    consumed = 0
    for current, following in _pair_iter(sents):
        current = current[consumed:]

        if not following:
            if current:
                yield current
            continue

        moved = self._lang_vars.re_boundary_realignment.match(following)
        if not moved:
            consumed = 0
            if current:
                yield current
        else:
            yield current + moved.group(0).strip()
            consumed = moved.end()
1191
def text_contains_sentbreak(self, text):
    """
    Returns True if the given text includes a sentence break.

    A break on the very last token is ignored: True is returned only
    when a token marked as a sentence break is followed by at least
    one more token.
    """
    annotated = self._annotate_tokens(self._tokenize_words(text))
    break_pending = False  # the previous token was a sentence break
    for aug_tok in annotated:
        if break_pending:
            return True
        break_pending = bool(aug_tok.sentbreak)
    return False
1203
def sentences_from_text_legacy(self, text):
    """
    Given a text, generates the sentences in that text.  Annotates all
    tokens, rather than just those with possible sentence breaks.  Should
    produce the same results as L{sentences_from_text}.
    """
    words = self._tokenize_words(text)
    annotated = self._annotate_tokens(words)
    return self._build_sentence_list(text, annotated)
1212
def sentences_from_tokens(self, tokens):
    """
    Given a sequence of tokens, generates lists of tokens, each list
    corresponding to a sentence.

    Every input token is wrapped in the tokenizer's token class and
    annotated.  A sentence is closed after each token marked as a
    sentence break, and trailing tokens without a closing break form a
    final sentence.
    """
    wrapped = (self._Token(raw) for raw in tokens)
    current = []
    for aug_tok in self._annotate_tokens(wrapped):
        current.append(aug_tok.tok)
        if not aug_tok.sentbreak:
            continue
        yield current
        current = []

    if current:
        yield current
1227
1228 - def _annotate_tokens(self, tokens):
1229 """ 1230 Given a set of tokens augmented with markers for line-start and 1231 paragraph-start, returns an iterator through those tokens with full 1232 annotation including predicted sentence breaks. 1233 """ 1234 # Make a preliminary pass through the document, marking likely 1235 # sentence breaks, abbreviations, and ellipsis tokens. 1236 tokens = self._annotate_first_pass(tokens) 1237 1238 # Make a second pass through the document, using token context 1239 # information to change our preliminary decisions about where 1240 # sentence breaks, abbreviations, and ellipsis occurs. 1241 tokens = self._annotate_second_pass(tokens) 1242 1243 ## [XX] TESTING 1244 #tokens = list(tokens) 1245 #self.dump(tokens) 1246 1247 return tokens
1248
1249 - def _build_sentence_list(self, text, tokens):
1250 """ 1251 Given the original text and the list of augmented word tokens, 1252 construct and return a tokenized list of sentence strings. 1253 """ 1254 # Most of the work here is making sure that we put the right 1255 # pieces of whitespace back in all the right places. 1256 1257 # Our position in the source text, used to keep track of which 1258 # whitespace to add: 1259 pos = 0 1260 1261 # A regular expression that finds pieces of whitespace: 1262 WS_REGEXP = re.compile(r'\s*') 1263 1264 sentence = '' 1265 for aug_tok in tokens: 1266 tok = aug_tok.tok 1267 1268 # Find the whitespace before this token, and update pos. 1269 ws = WS_REGEXP.match(text, pos).group() 1270 pos += len(ws) 1271 1272 # Some of the rules used by the punkt word tokenizer 1273 # strip whitespace out of the text, resulting in tokens 1274 # that contain whitespace in the source text. If our 1275 # token doesn't match, see if adding whitespace helps. 1276 # If so, then use the version with whitespace. 1277 if text[pos:pos+len(tok)] != tok: 1278 pat = '\s*'.join(re.escape(c) for c in tok) 1279 m = re.compile(pat).match(text,pos) 1280 if m: tok = m.group() 1281 1282 # Move our position pointer to the end of the token. 1283 assert text[pos:pos+len(tok)] == tok 1284 pos += len(tok) 1285 1286 # Add this token. If it's not at the beginning of the 1287 # sentence, then include any whitespace that separated it 1288 # from the previous token. 1289 if sentence: 1290 sentence += ws + tok 1291 else: 1292 sentence += tok 1293 1294 # If we're at a sentence break, then start a new sentence. 1295 if aug_tok.sentbreak: 1296 yield sentence 1297 sentence = '' 1298 1299 # If the last sentence is emtpy, discard it. 1300 if sentence: 1301 yield sentence
1302 1303 # [XX] TESTING
def dump(self, tokens, filename='/tmp/punkt.new'):
    """
    Debugging aid: write the string form of each augmented token to a
    file, after announcing the file name on standard output.

    A token flagged as a paragraph start is preceded by a blank line,
    one flagged as a line start by a newline, and any other token by a
    single space.

    filename is the file to (over)write; it defaults to the location
    this method has always used.  The file is closed even if writing
    fails part-way through.
    """
    print('writing to %s...' % filename)
    out = open(filename, 'w')
    try:
        for aug_tok in tokens:
            if aug_tok.parastart:
                out.write('\n\n')
            elif aug_tok.linestart:
                out.write('\n')
            else:
                out.write(' ')

            out.write(str(aug_tok))
    finally:
        # Release the handle whether or not a write raised.
        out.close()
#////////////////////////////////////////////////////////////
#{ Customization Variables
#////////////////////////////////////////////////////////////

# Tokens that the orthographic heuristic never accepts as the first
# token of a sentence.
PUNCTUATION = (';', ':', ',', '.', '!', '?')

#////////////////////////////////////////////////////////////
#{ Annotation Procedures
#////////////////////////////////////////////////////////////
def _annotate_second_pass(self, tokens):
    """
    Performs a token-based classification (section 4) over the given
    tokens, making use of the orthographic heuristic (4.1.1), collocation
    heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).

    Each token is re-annotated in place, using the token that follows
    it as context, and is then yielded.
    """
    for current, following in _pair_iter(tokens):
        self._second_pass_annotation(current, following)
        yield current
1337
1338 - def _second_pass_annotation(self, aug_tok1, aug_tok2):
1339 """ 1340 Performs token-based classification over a pair of contiguous tokens 1341 returning an updated augmented token for the first of them. 1342 """ 1343 # Is it the last token? We can't do anything then. 1344 if not aug_tok2: 1345 return 1346 1347 tok = aug_tok1.tok 1348 if not aug_tok1.period_final: 1349 # We only care about words ending in periods. 1350 return 1351 1352 typ = aug_tok1.type_no_period 1353 next_tok = aug_tok2.tok 1354 next_typ = aug_tok2.type_no_sentperiod 1355 tok_is_initial = aug_tok1.is_initial 1356 1357 # [4.1.2. Collocational Heuristic] If there's a 1358 # collocation between the word before and after the 1359 # period, then label tok as an abbreviation and NOT 1360 # a sentence break. Note that collocations with 1361 # frequent sentence starters as their second word are 1362 # excluded in training. 1363 if (typ, next_typ) in self._params.collocations: 1364 aug_tok1.sentbreak = False 1365 aug_tok1.abbr = True 1366 return 1367 1368 # [4.2. Token-Based Reclassification of Abbreviations] If 1369 # the token is an abbreviation or an ellipsis, then decide 1370 # whether we should *also* classify it as a sentbreak. 1371 if ( (aug_tok1.abbr or aug_tok1.ellipsis) and 1372 (not tok_is_initial) ): 1373 # [4.1.1. Orthographic Heuristic] Check if there's 1374 # orthogrpahic evidence about whether the next word 1375 # starts a sentence or not. 1376 is_sent_starter = self._ortho_heuristic(aug_tok2) 1377 if is_sent_starter == True: 1378 aug_tok1.sentbreak = True 1379 return 1380 1381 # [4.1.3. Frequent Sentence Starter Heruistic] If the 1382 # next word is capitalized, and is a member of the 1383 # frequent-sentence-starters list, then label tok as a 1384 # sentence break. 1385 if ( aug_tok2.first_upper and 1386 next_typ in self._params.sent_starters): 1387 aug_tok1.sentbreak = True 1388 return 1389 1390 # [4.3. 
Token-Based Detection of Initials and Ordinals] 1391 # Check if any initials or ordinals tokens that are marked 1392 # as sentbreaks should be reclassified as abbreviations. 1393 if tok_is_initial or typ == '##number##': 1394 1395 # [4.1.1. Orthographic Heuristic] Check if there's 1396 # orthogrpahic evidence about whether the next word 1397 # starts a sentence or not. 1398 is_sent_starter = self._ortho_heuristic(aug_tok2) 1399 1400 if is_sent_starter == False: 1401 aug_tok1.sentbreak = False 1402 aug_tok1.abbr = True 1403 return 1404 1405 # Special heuristic for initials: if orthogrpahic 1406 # heuristc is unknown, and next word is always 1407 # capitalized, then mark as abbrev (eg: J. Bach). 1408 if ( is_sent_starter == 'unknown' and tok_is_initial and 1409 aug_tok2.first_upper and 1410 not (self._params.ortho_context[next_typ] & _ORTHO_LC) ): 1411 aug_tok1.sentbreak = False 1412 aug_tok1.abbr = True 1413 return 1414 1415 return
1416
1417 - def _ortho_heuristic(self, aug_tok):
1418 """ 1419 Decide whether the given token is the first token in a sentence. 1420 """ 1421 # Sentences don't start with punctuation marks: 1422 if aug_tok.tok in self.PUNCTUATION: 1423 return False 1424 1425 ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod] 1426 1427 # If the word is capitalized, occurs at least once with a 1428 # lower case first letter, and never occurs with an upper case 1429 # first letter sentence-internally, then it's a sentence starter. 1430 if ( aug_tok.first_upper and 1431 (ortho_context & _ORTHO_LC) and 1432 not (ortho_context & _ORTHO_MID_UC) ): 1433 return True 1434 1435 # If the word is lower case, and either (a) we've seen it used 1436 # with upper case, or (b) we've never seen it used 1437 # sentence-initially with lower case, then it's not a sentence 1438 # starter. 1439 if ( aug_tok.first_lower and 1440 ((ortho_context & _ORTHO_UC) or 1441 not (ortho_context & _ORTHO_BEG_LC)) ): 1442 return False 1443 1444 # Otherwise, we're not sure. 1445 return 'unknown'
1446
def main(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer):
    """
    Builds a punkt model and applies it to the same text.

    A train_cls instance is trained on text with INCLUDE_ALL_COLLOCS
    switched on, and its parameters are handed to tok_cls.  Every
    sentence found in text (with boundary realignment) is then printed
    on one line, with carriage returns and leading whitespace on each
    line removed and the remaining newlines replaced by spaces.
    """
    margin_re = re.compile(r'(?:\r|^\s+)', re.MULTILINE)

    def cleanup(sentence):
        # Flatten one sentence onto a single output line.
        return margin_re.sub('', sentence).replace('\n', ' ')

    trainer = train_cls()
    trainer.INCLUDE_ALL_COLLOCS = True
    trainer.train(text)

    tokenizer = tok_cls(trainer.get_params())
    for sentence in tokenizer.sentences_from_text(text,
                                                  realign_boundaries=True):
        print(cleanup(sentence))
if __name__ == '__main__':
    # Script entry point: train on, and then segment, whatever text is
    # supplied on standard input.
    import sys
    stdin_text = sys.stdin.read()
    main(stdin_text)