1
2
3
4
5
6
7
8
9
10 """
11 A reader for corpora whose documents contain part-of-speech-tagged words.
12 """
13
14 import os
15
16 from nltk.tag import str2tuple
17 from nltk.tokenize import *
18
19 from api import *
20 from util import *
21 from timit import read_timit_block
22
24 """
25 Reader for simple part-of-speech tagged corpora. Paragraphs are
26 assumed to be split using blank lines. Sentences and words can be
27 tokenized using the default tokenizers, or by custom tokenizers
28 specified as parameters to the constructor. Words are parsed
29 using L{nltk.tag.str2tuple}. By default, C{'/'} is used as the
30 separator. I.e., words should have the form::
31
32 word1/tag1 word2/tag2 word3/tag3 ...
33
34 But custom separators may be specified as parameters to the
35 constructor. Part of speech tags are case-normalized to upper
36 case.
37 """
44 """
45 Construct a new Tagged Corpus reader for a set of documents
46 located at the given root directory. Example usage:
47
48 >>> root = '/...path to corpus.../'
49 >>> reader = TaggedCorpusReader(root, '.*', '.txt')
50
51 @param root: The root directory for this corpus.
52 @param fileids: A list or regexp specifying the fileids in this corpus.
53 """
54 CorpusReader.__init__(self, root, fileids, encoding)
55 self._sep = sep
56 self._word_tokenizer = word_tokenizer
57 self._sent_tokenizer = sent_tokenizer
58 self._para_block_reader = para_block_reader
59 self._tag_mapping_function = tag_mapping_function
60
61 - def raw(self, fileids=None):
69
def words(self, fileids=None):
    """
    Build an untagged, flat view of the requested file(s).

    One L{TaggedCorpusView} is created for each C{(path, encoding)}
    pair produced by C{self.abspaths(fileids, True)}, with the
    C{tagged}, C{group_by_sent} and C{group_by_para} flags all
    switched off; the per-file views are then combined with
    C{concat}.

    @param fileids: passed through unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of words
        and punctuation symbols.
    @rtype: C{list} of C{str}
    """
    views = []
    for (path, file_encoding) in self.abspaths(fileids, True):
        # Positional flags are (tagged, group_by_sent, group_by_para);
        # the trailing None means no tag mapping function is supplied.
        view = TaggedCorpusView(path, file_encoding,
                                False, False, False,
                                self._sep, self._word_tokenizer,
                                self._sent_tokenizer,
                                self._para_block_reader,
                                None)
        views.append(view)
    return concat(views)
83
def sents(self, fileids=None):
    """
    Build an untagged, sentence-grouped view of the requested file(s).

    One L{TaggedCorpusView} is created for each C{(path, encoding)}
    pair produced by C{self.abspaths(fileids, True)}, with only the
    C{group_by_sent} flag switched on; the per-file views are then
    combined with C{concat}.

    @param fileids: passed through unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of
        sentences or utterances, each encoded as a list of word
        strings.
    @rtype: C{list} of (C{list} of C{str})
    """
    views = []
    for (path, file_encoding) in self.abspaths(fileids, True):
        # Positional flags are (tagged, group_by_sent, group_by_para);
        # the trailing None means no tag mapping function is supplied.
        view = TaggedCorpusView(path, file_encoding,
                                False, True, False,
                                self._sep, self._word_tokenizer,
                                self._sent_tokenizer,
                                self._para_block_reader,
                                None)
        views.append(view)
    return concat(views)
98
def paras(self, fileids=None):
    """
    Build an untagged view of the requested file(s), grouped by
    paragraph and by sentence.

    One L{TaggedCorpusView} is created for each C{(path, encoding)}
    pair produced by C{self.abspaths(fileids, True)}, with the
    C{group_by_sent} and C{group_by_para} flags switched on and
    C{tagged} switched off; the per-file views are then combined
    with C{concat}.

    @param fileids: passed through unchanged to C{self.abspaths}.
    @return: the given file(s) as a list of
        paragraphs, each encoded as a list of sentences, which are
        in turn encoded as lists of word strings.
    @rtype: C{list} of (C{list} of (C{list} of C{str}))
    """
    views = []
    for (path, file_encoding) in self.abspaths(fileids, True):
        # Positional flags are (tagged, group_by_sent, group_by_para);
        # the trailing None means no tag mapping function is supplied.
        view = TaggedCorpusView(path, file_encoding,
                                False, True, True,
                                self._sep, self._word_tokenizer,
                                self._sent_tokenizer,
                                self._para_block_reader,
                                None)
        views.append(view)
    return concat(views)
113
115 """
116 @return: the given file(s) as a list of tagged
117 words and punctuation symbols, encoded as tuples
118 C{(word,tag)}.
119 @rtype: C{list} of C{(str,str)}
120 """
121 if simplify_tags:
122 tag_mapping_function = self._tag_mapping_function
123 else:
124 tag_mapping_function = None
125 return concat([TaggedCorpusView(fileid, enc,
126 True, False, False,
127 self._sep, self._word_tokenizer,
128 self._sent_tokenizer,
129 self._para_block_reader,
130 tag_mapping_function)
131 for (fileid, enc) in self.abspaths(fileids, True)])
132
134 """
135 @return: the given file(s) as a list of
136 sentences, each encoded as a list of C{(word,tag)} tuples.
137
138 @rtype: C{list} of (C{list} of C{(str,str)})
139 """
140 if simplify_tags:
141 tag_mapping_function = self._tag_mapping_function
142 else:
143 tag_mapping_function = None
144 return concat([TaggedCorpusView(fileid, enc,
145 True, True, False,
146 self._sep, self._word_tokenizer,
147 self._sent_tokenizer,
148 self._para_block_reader,
149 tag_mapping_function)
150 for (fileid, enc) in self.abspaths(fileids, True)])
151
153 """
154 @return: the given file(s) as a list of
155 paragraphs, each encoded as a list of sentences, which are
156 in turn encoded as lists of C{(word,tag)} tuples.
157 @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
158 """
159 if simplify_tags:
160 tag_mapping_function = self._tag_mapping_function
161 else:
162 tag_mapping_function = None
163 return concat([TaggedCorpusView(fileid, enc,
164 True, True, True,
165 self._sep, self._word_tokenizer,
166 self._sent_tokenizer,
167 self._para_block_reader,
168 tag_mapping_function)
169 for (fileid, enc) in self.abspaths(fileids, True)])
170
173 """
174 A reader for part-of-speech tagged corpora whose documents are
175 divided into categories based on their file identifiers.
176 """
178 """
179 Initialize the corpus reader. Categorization arguments
180 (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
181 the L{CategorizedCorpusReader constructor
182 <CategorizedCorpusReader.__init__>}. The remaining arguments
183 are passed to the L{TaggedCorpusReader constructor
184 <TaggedCorpusReader.__init__>}.
185 """
186 CategorizedCorpusReader.__init__(self, kwargs)
187 TaggedCorpusReader.__init__(self, *args, **kwargs)
188
189 - def _resolve(self, fileids, categories):
196 - def raw(self, fileids=None, categories=None):
199 - def words(self, fileids=None, categories=None):
202 - def sents(self, fileids=None, categories=None):
205 - def paras(self, fileids=None, categories=None):
208 - def tagged_words(self, fileids=None, categories=None, simplify_tags=False):
211 - def tagged_sents(self, fileids=None, categories=None, simplify_tags=False):
214 - def tagged_paras(self, fileids=None, categories=None, simplify_tags=False):
217
219 """
220 A specialized corpus view for tagged documents. It can be
221 customized via flags to divide the tagged corpus documents up by
222 sentence or paragraph, and to include or omit part of speech tags.
223 C{TaggedCorpusView} objects are typically created by
224 L{TaggedCorpusReader} (not directly by nltk users).
225 """
def __init__(self, corpus_file, encoding, tagged, group_by_sent,
             group_by_para, sep, word_tokenizer, sent_tokenizer,
             para_block_reader, tag_mapping_function=None):
    """
    Store the view's configuration, then initialise the underlying
    stream-backed view for C{corpus_file}.

    @param corpus_file: forwarded to
        C{StreamBackedCorpusView.__init__}.
    @param encoding: forwarded to C{StreamBackedCorpusView.__init__}
        as its C{encoding} keyword argument.
    @param tagged: if false, each C{(word, tag)} pair is reduced to
        the bare word when a block is read.
    @param group_by_sent: if true, each sentence is kept as its own
        list; otherwise its tokens are spliced into the paragraph.
    @param group_by_para: if true, each paragraph is kept as its own
        list; otherwise its contents are spliced into the block.
    @param sep: separator handed to C{str2tuple} when splitting a
        token into word and tag.
    @param word_tokenizer: object whose C{tokenize} method splits a
        sentence string into tagged-word strings.
    @param sent_tokenizer: object whose C{tokenize} method splits a
        paragraph string into sentence strings.
    @param para_block_reader: callable that takes the stream and
        returns an iterable of paragraph strings.
    @param tag_mapping_function: optional callable applied to every
        tag; skipped when it is C{None} (or otherwise false).
    """
    # How raw text is cut up into paragraphs, sentences and tokens.
    self._para_block_reader = para_block_reader
    self._sent_tokenizer = sent_tokenizer
    self._word_tokenizer = word_tokenizer
    self._sep = sep

    # What shape the tokens take once they have been parsed.
    self._tag_mapping_function = tag_mapping_function
    self._tagged = tagged
    self._group_by_sent = group_by_sent
    self._group_by_para = group_by_para

    # The base class is initialised last, exactly as before, so all
    # configuration attributes already exist when it runs.
    StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)
238
240 """Reads one paragraph at a time."""
241 block = []
242 for para_str in self._para_block_reader(stream):
243 para = []
244 for sent_str in self._sent_tokenizer.tokenize(para_str):
245 sent = [str2tuple(s, self._sep) for s in
246 self._word_tokenizer.tokenize(sent_str)]
247 if self._tag_mapping_function:
248 sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent]
249 if not self._tagged:
250 sent = [w for (w,t) in sent]
251 if self._group_by_sent:
252 para.append(sent)
253 else:
254 para.extend(sent)
255 if self._group_by_para:
256 block.append(para)
257 else:
258 block.extend(para)
259 return block
260
261
263 """
264 A corpus reader for the MAC_MORPHO corpus. Each line contains a
265 single tagged word, using '_' as a separator. Sentence boundaries
266 are based on the end-sentence tag ('_.'). Paragraph information
267 is not included in the corpus, so each paragraph returned by
268 L{self.paras()} and L{self.tagged_paras()} contains a single
269 sentence.
270 """
271 - def __init__(self, root, fileids, encoding=None, tag_mapping_function=None):
279
282
284 """
285 A corpus reader for tagged sentences that are included in the TIMIT corpus.
286 """
290
292 raise NotImplementedError('use sents() instead')
293
295 raise NotImplementedError('use tagged_sents() instead')
296