1
2
3
4
5
6
7
8
9 import sys
10
11 from nltk.tree import Tree
12
13 from util import *
14 from api import *
15
16 """
17 Corpus reader for corpora that consist of parenthesis-delineated parse trees.
18 """
19
20
# Matches one leaf of the form "(TAG word)": tag and word are separated by a
# single space and neither may contain whitespace or parentheses.
# Group 1 is the tag, group 2 is the word.
21 TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)')
# Same "(TAG word)" leaf shape as TAGWORD, but only the word is captured.
22 WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)')
# Matches two opening parentheses, each optionally preceded by whitespace.
# NOTE(review): no use of this pattern is visible in this extract.
23 EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(')
24
26 """
27 Reader for corpora that consist of parenthesis-delineated parse
28 trees.
29 """
def __init__(self, root, fileids, comment_char=None,
             detect_blocks='unindented_paren', encoding=None,
             tag_mapping_function=None):
    """
    Create a reader over a corpus of bracketed parse trees.

    @param root: The root directory for this corpus.
    @param fileids: A list or regexp specifying the fileids in this corpus.
    @param comment_char: The character which can appear at the start of
        a line to indicate that the rest of the line is a comment.
    @param detect_blocks: The method that is used to find blocks
        in the corpus; can be 'unindented_paren' (every unindented
        parenthesis starts a new parse) or 'sexpr' (brackets are
        matched).
    @param encoding: Forwarded unchanged to C{CorpusReader.__init__}.
    @param tag_mapping_function: Stored on the reader; C{_tag} calls it
        on every tag when asked to simplify tags.
    """
    # The base reader receives the location and encoding arguments.
    CorpusReader.__init__(self, root, fileids, encoding)

    # The remaining options are only recorded here for later use.
    self._tag_mapping_function = tag_mapping_function
    self._detect_blocks = detect_blocks
    self._comment_char = comment_char
47
# NOTE(review): the `def` line that owns this body is not present in this
# extract; the body reads `self` and a `stream` argument.
# Dispatch on the block-detection mode stored by __init__.
49 if self._detect_blocks == 'sexpr':
50 return read_sexpr_block(stream, comment_char=self._comment_char)
51 elif self._detect_blocks == 'blankline':
52 return read_blankline_block(stream)
53 elif self._detect_blocks == 'unindented_paren':
54
# Blocks are delimited by lines that begin with an opening parenthesis.
55 toks = read_regexp_block(stream, start_re=r'^\(')
56
# When a comment character is configured, blank out every line that starts
# with it; (?m) makes ^ match at the start of each line inside a block.
57 if self._comment_char:
58 toks = [re.sub('(?m)^%s.*'%re.escape(self._comment_char),
59 '', tok)
60 for tok in toks]
61 return toks
62 else:
# NOTE(review): `assert` statements are removed under `python -O`; in that
# mode an unrecognised block type would reach the end of this body without
# raising.
63 assert 0, 'bad block type'
64
75
# NOTE(review): the `def` line that owns this body is not present in this
# extract; the body reads `self` and a text argument `t`.
# First attempt: parse the normalized text exactly as given.
77 try:
78 return Tree.parse(self._normalize(t))
79
# NOTE(review): `except ValueError, e` is Python 2-only syntax; Python 3
# requires `except ValueError as e`.
80 except ValueError, e:
81 sys.stderr.write("Bad tree detected; trying to recover...\n")
82
# Only for the specific 'mismatched parens' error: retry with 1 to 4 closing
# parentheses appended and return the first variant that parses.
83 if e.args == ('mismatched parens',):
84 for n in range(1, 5):
85 try:
86 v = Tree.parse(self._normalize(t+')'*n))
87 sys.stderr.write(" Recovered by adding %d close "
88 "paren(s)\n" % n)
89 return v
# A failed retry is ignored so that the next paren count can be tried.
90 except ValueError: pass
91
# Last resort: give up on structure and build a flat tree labelled 'S' whose
# children are whatever self._tag(t) returns.
92 sys.stderr.write(" Recovered by returning a flat parse.\n")
93
94 return Tree('S', self._tag(t))
95
def _tag(self, t, simplify_tags=False):
    """
    Collect the (word, tag) pairs found in one bracketed parse.

    @param t: The text of a bracketed parse; it is normalized with
        C{self._normalize} before the leaves are extracted.
    @param simplify_tags: If true, each tag is replaced by the result
        of calling C{self._tag_mapping_function} on it.
    @return: A list of (word, tag) tuples, in the order in which
        C{TAGWORD} matches the leaves.
    """
    # TAGWORD captures (tag, word); flip each pair so the word comes first.
    pairs = []
    for tag, word in TAGWORD.findall(self._normalize(t)):
        pairs.append((word, tag))

    if not simplify_tags:
        return pairs

    # Map every tag through the reader's configured mapping function.
    simplified = []
    for word, tag in pairs:
        simplified.append((word, self._tag_mapping_function(tag)))
    return simplified
102
105
107 """
108 Reader for the Alpino Dutch Treebank.
109 """
110 - def __init__(self, root, encoding=None, tag_mapping_function=None):
115
# NOTE(review): the `def` line that owns this body is not present in this
# extract; the body reads and rebinds a text argument `t`.
# Text that does not begin with "<alpino_ds" is discarded as an empty string.
117 if t[:10] != "<alpino_ds":
118 return ""
119
# Rewrite the XML node elements into bracket notation, in this order:
#   1. a <node ...> tag carrying a cat attribute becomes an opening "(CAT";
#   2. a self-closing <node .../> with pos and word attributes becomes the
#      leaf "(POS word)";
#   3. a closing </node> tag becomes ")".
120 t = re.sub(r' <node .*? cat="(\w+)".*>', r"(\1", t)
121 t = re.sub(r' <node .*? pos="(\w+)".*? word="([^"]+)".*/>', r"(\1 \2)", t)
122 t = re.sub(r" </node>", r")", t)
# Remove the <sentence>...</sentence> element and the opening/closing
# alpino_ds tags, leaving only the bracketed text.
123 t = re.sub(r"<sentence>.*</sentence>", r"", t)
124 t = re.sub(r"</?alpino_ds.*>", r"", t)
125 return t
126