Package nltk :: Package corpus :: Package reader :: Module bracket_parse
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.bracket_parse

  1  # Natural Language Toolkit: Penn Treebank Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Steven Bird <sb@ldc.upenn.edu> 
  5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
  6  # URL: <http://www.nltk.org/> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  import sys 
 10   
 11  from nltk.tree import Tree 
 12   
 13  from util import * 
 14  from api import * 
 15   
 16  """ 
 17  Corpus reader for corpora that consist of parenthesis-delineated parse trees. 
 18  """ 
 19   
 20  # we use [^\s()]+ instead of \S+? to avoid matching () 
     # TAGWORD matches a two-field leaf "(tag word)" and captures both fields,
     # tag first and word second.
 21  TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)') 
     # WORD matches the same two-field leaf but captures only the word.
 22  WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)') 
     # EMPTY_BRACKETS matches two opening parens, each optionally preceded by
     # whitespace; _normalize uses it to detect an extra unlabeled bracket pair
     # wrapped around a parse.
 23  EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(') 
 24   
class BracketParseCorpusReader(SyntaxCorpusReader):
    """
    Reader for corpora that consist of parenthesis-delineated parse
    trees.
    """
    def __init__(self, root, fileids, comment_char=None,
                 detect_blocks='unindented_paren', encoding=None,
                 tag_mapping_function=None):
        """
        @param root: The root directory for this corpus.
        @param fileids: A list or regexp specifying the fileids in this corpus.
        @param comment_char: The character which can appear at the start of
            a line to indicate that the rest of the line is a comment.
        @param detect_blocks: The method that is used to find blocks
            in the corpus; can be 'unindented_paren' (every unindented
            parenthesis starts a new parse), 'sexpr' (brackets are
            matched) or 'blankline' (blocks are read with
            C{read_blankline_block}).
        @param encoding: Passed through unchanged to
            C{CorpusReader.__init__}.
        @param tag_mapping_function: A callable applied to each tag by
            L{_tag} when C{simplify_tags} is true.  If it is left as
            C{None}, requesting simplified tags fails with a
            C{TypeError} because C{None} is not callable.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tag_mapping_function = tag_mapping_function

    def _read_block(self, stream):
        """
        Read one block from C{stream} using the strategy selected by
        the C{detect_blocks} constructor argument.

        @return: Whatever the selected block reader returns; for
            'unindented_paren' this is a list of strings with comment
            lines blanked out.
        @raise AssertionError: If C{detect_blocks} is not one of the
            supported values.
        """
        if self._detect_blocks == 'sexpr':
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == 'blankline':
            return read_blankline_block(stream)
        elif self._detect_blocks == 'unindented_paren':
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r'^\(')
            # Strip any comments out of the tokens.
            if self._comment_char:
                comment_re = '(?m)^%s.*' % re.escape(self._comment_char)
                toks = [re.sub(comment_re, '', tok) for tok in toks]
            return toks
        else:
            # Raised explicitly (rather than via ``assert``) so that the
            # check is not stripped when Python runs with -O; the exception
            # type and message are the same as before.
            raise AssertionError('bad block type')

    def _normalize(self, t):
        """
        Rewrite the bracketed string C{t} into the form expected by
        the tree parser and by the C{TAGWORD}/C{WORD} patterns.

        @return: The normalized string.
        """
        # If there's an empty set of brackets surrounding the actual
        # parse, then strip them off (drop the first and last character
        # of the whitespace-stripped string).
        if EMPTY_BRACKETS.match(t):
            t = t.strip()[1:-1]
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        """
        Parse the bracketed string C{t} into a C{Tree}, attempting
        recovery when the tree parser rejects it.

        Recovery first tries appending one to four closing parens (only
        when the parser reported mismatched parens) and otherwise falls
        back to a flat tree labelled 'S' over the tagged words.
        Progress messages are written to C{sys.stderr}.
        """
        # NOTE(review): this listing had its whitespace collapsed; confirm
        # the leading spaces inside the "Recovered ..." messages against
        # the upstream file.
        try:
            return Tree.parse(self._normalize(t))

        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ('mismatched parens',):
                for n in range(1, 5):
                    try:
                        v = Tree.parse(self._normalize(t + ')' * n))
                    except ValueError:
                        # Still unbalanced; try one more closing paren.
                        continue
                    sys.stderr.write(" Recovered by adding %d close "
                                     "paren(s)\n" % n)
                    return v
            # Try something else:
            sys.stderr.write(" Recovered by returning a flat parse.\n")
            return Tree('S', self._tag(t))

    def _tag(self, t, simplify_tags=False):
        """
        Extract the tagged words from the bracketed string C{t}.

        @param simplify_tags: If true, pass every tag through the
            C{tag_mapping_function} given to the constructor.
        @return: A list of C{(word, tag)} tuples, one per two-field
            leaf found in the normalized string.
        """
        # TAGWORD yields (tag, word); callers expect (word, tag).
        tagged_sent = [(word, tag) for (tag, word)
                       in TAGWORD.findall(self._normalize(t))]
        if simplify_tags:
            tagged_sent = [(word, self._tag_mapping_function(tag))
                           for (word, tag) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """
        @return: The list of words (second field of every two-field
            leaf) found in the normalized form of C{t}.
        """
        return WORD.findall(self._normalize(t))
105
class AlpinoCorpusReader(BracketParseCorpusReader):
    """
    Reader for the Alpino Dutch Treebank.
    """
    def __init__(self, root, encoding=None, tag_mapping_function=None):
        """
        @param root: The root directory for this corpus.
        @param encoding: Forwarded to C{BracketParseCorpusReader.__init__}.
        @param tag_mapping_function: Forwarded to
            C{BracketParseCorpusReader.__init__}.

        The fileid pattern is fixed to C{alpino\\.xml} and blocks are
        detected with the 'blankline' method.
        """
        # Raw string: the pattern contains a backslash escape for the
        # regexp engine, not for the Python string literal.
        BracketParseCorpusReader.__init__(
            self, root, r'alpino\.xml',
            detect_blocks='blankline',
            encoding=encoding,
            tag_mapping_function=tag_mapping_function)

    def _normalize(self, t):
        """
        Convert one Alpino XML block into bracketed (s-expression)
        notation.

        @return: The converted string, or the empty string when C{t}
            does not start with C{<alpino_ds}.
        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        # NOTE(review): this listing had its whitespace collapsed; confirm
        # the number of leading spaces in the three <node> patterns below
        # against the upstream file.
        # Opening <node> carrying a cat attribute -> "(cat"
        t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
        # Self-closing <node/> with pos and word attributes -> "(pos word)"
        t = re.sub(r' <node .*? pos="(\w+)".*? word="([^"]+)".*/>', r"(\1 \2)", t)
        # Closing </node> -> ")"
        t = re.sub(r" </node>", r")", t)
        # Drop the <sentence> element and the <alpino_ds> wrapper tags.
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t
126