1
2
3
4
5
6
7
8
9 """
10 A reader for corpora that contain chunked (and optionally tagged)
11 documents.
12 """
13
14 import os.path, codecs
15
16 import nltk
17 from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
18 from nltk.tree import Tree
19 from nltk.tokenize import *
20 from util import *
21 from api import *
22
24 """
25 Reader for chunked (and optionally tagged) corpora. Paragraphs
26 are split using a block reader. They are then tokenized into
27 sentences using a sentence tokenizer. Finally, these sentences
28 are parsed into chunk trees using a string-to-chunktree conversion
29 function. Each of these steps can be performed using a default
30 function or a custom function. By default, paragraphs are split
31 on blank lines; sentences are listed one per line; and sentences
32 are parsed into chunk trees using L{nltk.chunk.tagstr2tree}.
33 """
39 """
40 @param root: The root directory for this corpus.
41 @param fileids: A list or regexp specifying the fileids in this corpus.
42 """
43 CorpusReader.__init__(self, root, fileids, encoding)
44
45 self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader)
46 """Arguments for corpus views generated by this corpus: a tuple
47 (str2chunktree, sent_tokenizer, para_block_reader)"""
48
49 - def raw(self, fileids=None):
57
def words(self, fileids=None):
    """
    Read the given file(s) as one flat sequence of word and
    punctuation strings.

    @param fileids: The file identifier(s) to read; forwarded
        unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of words
        and punctuation symbols.
    @rtype: C{list} of C{str}
    """
    # Positional view flags, in order:
    # tagged=0, group_by_sent=0, group_by_para=0, chunked=0.
    views = []
    for path, encoding in self.abspaths(fileids, True):
        views.append(
            ChunkedCorpusView(path, encoding, 0, 0, 0, 0, *self._cv_args))
    return concat(views)
66
def sents(self, fileids=None):
    """
    Read the given file(s) as a sequence of sentences, each one a
    list of word strings.

    @param fileids: The file identifier(s) to read; forwarded
        unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of
        sentences or utterances, each encoded as a list of word
        strings.
    @rtype: C{list} of (C{list} of C{str})
    """
    # Positional view flags, in order:
    # tagged=0, group_by_sent=1, group_by_para=0, chunked=0.
    views = []
    for path, encoding in self.abspaths(fileids, True):
        views.append(
            ChunkedCorpusView(path, encoding, 0, 1, 0, 0, *self._cv_args))
    return concat(views)
76
def paras(self, fileids=None):
    """
    Read the given file(s) as a sequence of paragraphs, each one a
    list of sentences, each sentence a list of word strings.

    @param fileids: The file identifier(s) to read; forwarded
        unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of
        paragraphs, each encoded as a list of sentences, which are
        in turn encoded as lists of word strings.
    @rtype: C{list} of (C{list} of (C{list} of C{str}))
    """
    # Positional view flags, in order:
    # tagged=0, group_by_sent=1, group_by_para=1, chunked=0.
    views = []
    for path, encoding in self.abspaths(fileids, True):
        views.append(
            ChunkedCorpusView(path, encoding, 0, 1, 1, 0, *self._cv_args))
    return concat(views)
86
def tagged_words(self, fileids=None):
    """
    Read the given file(s) as one flat sequence of tagged words.

    @param fileids: The file identifier(s) to read; forwarded
        unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of tagged
        words and punctuation symbols, encoded as tuples
        C{(word,tag)}.
    @rtype: C{list} of C{(str,str)}
    """
    # Positional view flags, in order:
    # tagged=1, group_by_sent=0, group_by_para=0, chunked=0.
    return concat([
        ChunkedCorpusView(path, encoding, 1, 0, 0, 0, *self._cv_args)
        for (path, encoding) in self.abspaths(fileids, True)
    ])
96
def tagged_sents(self, fileids=None):
    """
    Read the given file(s) as a sequence of tagged sentences.

    @param fileids: The file identifier(s) to read; forwarded
        unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of
        sentences, each encoded as a list of C{(word,tag)} tuples.
    @rtype: C{list} of (C{list} of C{(str,str)})
    """
    # Positional view flags, in order:
    # tagged=1, group_by_sent=1, group_by_para=0, chunked=0.
    return concat([
        ChunkedCorpusView(path, encoding, 1, 1, 0, 0, *self._cv_args)
        for (path, encoding) in self.abspaths(fileids, True)
    ])
106
def tagged_paras(self, fileids=None):
    """
    Read the given file(s) as a sequence of paragraphs of tagged
    sentences.

    @param fileids: The file identifier(s) to read; forwarded
        unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of
        paragraphs, each encoded as a list of sentences, which are
        in turn encoded as lists of C{(word,tag)} tuples.
    @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
    """
    # Positional view flags, in order:
    # tagged=1, group_by_sent=1, group_by_para=1, chunked=0.
    return concat([
        ChunkedCorpusView(path, encoding, 1, 1, 1, 0, *self._cv_args)
        for (path, encoding) in self.abspaths(fileids, True)
    ])
116
def chunked_words(self, fileids=None):
    """
    Read the given file(s) as one flat sequence of words and chunks.

    @param fileids: The file identifier(s) to read; forwarded
        unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of tagged
        words and chunks.  Words are encoded as C{(word, tag)}
        tuples (if the corpus has tags) or word strings (if the
        corpus has no tags).  Chunks are encoded as depth-one
        trees over C{(word,tag)} tuples or word strings.
    @rtype: C{list} of (C{(str,str)} and L{Tree})
    """
    # Positional view flags, in order:
    # tagged=1, group_by_sent=0, group_by_para=0, chunked=1.
    return concat([
        ChunkedCorpusView(path, encoding, 1, 0, 0, 1, *self._cv_args)
        for (path, encoding) in self.abspaths(fileids, True)
    ])
128
def chunked_sents(self, fileids=None):
    """
    Read the given file(s) as a sequence of chunked sentences.

    @param fileids: The file identifier(s) to read; forwarded
        unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of
        sentences, each encoded as a shallow C{Tree}.  The leaves
        of these trees are encoded as C{(word, tag)} tuples (if
        the corpus has tags) or word strings (if the corpus has no
        tags).
    @rtype: C{list} of L{Tree}
    """
    # Positional view flags, in order:
    # tagged=1, group_by_sent=1, group_by_para=0, chunked=1.
    return concat([
        ChunkedCorpusView(path, encoding, 1, 1, 0, 1, *self._cv_args)
        for (path, encoding) in self.abspaths(fileids, True)
    ])
140
def chunked_paras(self, fileids=None):
    """
    Read the given file(s) as a sequence of paragraphs of chunked
    sentences.

    @param fileids: The file identifier(s) to read; forwarded
        unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of
        paragraphs, each encoded as a list of sentences, which are
        in turn encoded as a shallow C{Tree}.  The leaves of these
        trees are encoded as C{(word, tag)} tuples (if the corpus
        has tags) or word strings (if the corpus has no tags).
    @rtype: C{list} of (C{list} of L{Tree})
    """
    # Positional view flags, in order:
    # tagged=1, group_by_sent=1, group_by_para=1, chunked=1.
    return concat([
        ChunkedCorpusView(path, encoding, 1, 1, 1, 1, *self._cv_args)
        for (path, encoding) in self.abspaths(fileids, True)
    ])
152
156
def __init__(self, fileid, encoding, tagged, group_by_sent,
             group_by_para, chunked, str2chunktree, sent_tokenizer,
             para_block_reader):
    """
    Set up a view over a single chunked corpus file.

    @param fileid: The file to view; handed to
        C{StreamBackedCorpusView.__init__}.
    @param encoding: Handed to C{StreamBackedCorpusView.__init__}
        as its C{encoding} keyword.
    @param tagged: If false, each sentence is passed through
        C{self._untag} when blocks are read.
    @param group_by_sent: If true, each sentence is kept as one
        item; otherwise its items are spliced into the paragraph.
    @param group_by_para: If true, each paragraph is kept as one
        item; otherwise its items are spliced into the block.
    @param chunked: If false, each sentence is replaced by the
        result of its C{leaves()} method.
    @param str2chunktree: Callable applied to each sentence string.
    @param sent_tokenizer: Object whose C{tokenize} method splits a
        paragraph string into sentence strings.
    @param para_block_reader: Callable that takes the stream and
        returns an iterable of paragraph strings.
    """
    StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

    # Pipeline stages: stream -> paragraphs -> sentences -> chunk trees.
    self._para_block_reader = para_block_reader
    self._sent_tokenizer = sent_tokenizer
    self._str2chunktree = str2chunktree

    # Switches that decide the shape of what block reading returns.
    self._tagged, self._chunked = tagged, chunked
    self._group_by_sent, self._group_by_para = group_by_sent, group_by_para
169
def read_block(self, stream):
    """
    Read paragraphs from C{stream} and convert them to this view's
    output shape.

    Each paragraph string produced by C{self._para_block_reader} is
    split into sentence strings with C{self._sent_tokenizer.tokenize},
    and each sentence string is converted with
    C{self._str2chunktree}.  The C{_tagged}, C{_chunked},
    C{_group_by_sent} and C{_group_by_para} flags then decide how
    much structure is kept.

    @param stream: The stream handed to C{self._para_block_reader}.
    @return: The items read, nested according to the grouping flags.
    @rtype: C{list}
    """
    block = []
    for para_str in self._para_block_reader(stream):
        para = []
        for sent_str in self._sent_tokenizer.tokenize(para_str):
            sent = self._str2chunktree(sent_str)

            # If tags were not requested, strip them.
            if not self._tagged:
                sent = self._untag(sent)

            # If chunks were not requested, flatten to the leaves.
            if not self._chunked:
                sent = sent.leaves()

            # Keep the sentence whole, or splice its items into `para`.
            if self._group_by_sent:
                para.append(sent)
            else:
                para.extend(sent)

        # Keep the paragraph whole, or splice its items into `block`.
        if self._group_by_para:
            block.append(para)
        else:
            block.extend(para)

    return block
199
209