1
2
3
4
5
6
7
8
9
10 """
11 A reader for corpora that consist of plaintext documents.
12 """
13
14 import codecs
15
16 import nltk.data
17 from nltk.tokenize import *
18
19 from util import *
20 from api import *
21
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents.  Paragraphs
    are assumed to be split using blank lines.  Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the L{CorpusView} class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
       L{PlaintextCorpusReader} may specify alternative corpus view
       classes (e.g., to skip the preface sections of documents.)"""

    def __init__(self, root, fileids,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        r"""
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt')

        @param root: The root directory for this corpus.
        @param fileids: A list or regexp specifying the fileids in this
            corpus.
        @param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        @param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.  May be C{None}, in which case L{sents} and
            L{paras} raise C{ValueError}.
        @param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        @param encoding: Forwarded unchanged to the L{CorpusReader}
            constructor.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def raw(self, fileids=None, sourced=False):
        """
        @param fileids: A single file identifier, a list of file
            identifiers, or C{None} for every file in the corpus.
        @param sourced: Forwarded to C{self.open()} for each file.
        @return: the given file(s) as a single string.
        @rtype: C{str}
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid, sourced)
            try:
                contents.append(stream.read())
            finally:
                # Release the handle as soon as the text has been read
                # instead of leaving it to the garbage collector.  The
                # lookup is defensive in case a stream wrapper does not
                # expose close().
                close = getattr(stream, 'close', None)
                if close is not None:
                    close()
        return concat(contents)

    def words(self, fileids=None, sourced=False):
        """
        @param fileids: The file(s) to read, as accepted by
            C{self.abspaths()}.
        @param sourced: If true, each corpus view is constructed with
            C{source} set to its file identifier.
        @return: the given file(s) as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return self._views(self._read_word_block, fileids, sourced)

    def sents(self, fileids=None, sourced=False):
        """
        @param fileids: The file(s) to read, as accepted by
            C{self.abspaths()}.
        @param sourced: If true, each corpus view is constructed with
            C{source} set to its file identifier.
        @return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        @raise ValueError: If this reader has no sentence tokenizer.
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')
        return self._views(self._read_sent_block, fileids, sourced)

    def paras(self, fileids=None, sourced=False):
        """
        @param fileids: The file(s) to read, as accepted by
            C{self.abspaths()}.
        @param sourced: If true, each corpus view is constructed with
            C{source} set to its file identifier.
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        @raise ValueError: If this reader has no sentence tokenizer.
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')
        return self._views(self._read_para_block, fileids, sourced)

    def _views(self, block_reader, fileids, sourced):
        """
        Build one L{CorpusView} per requested file, all using the same
        block reader, and concatenate them.

        @param block_reader: The function that each view uses to read
            one block from its stream.
        @param fileids: The file(s) to read, as accepted by
            C{self.abspaths()}.
        @param sourced: If true, pass C{source=fileid} to each view;
            otherwise the C{source} argument is not supplied at all.
        @return: The concatenation of the per-file corpus views.
        """
        views = []
        for (path, enc, fileid) in self.abspaths(fileids, True, True):
            if sourced:
                view = self.CorpusView(path, block_reader,
                                       encoding=enc, source=fileid)
            else:
                # Keep the unsourced call free of a C{source} keyword so
                # custom CorpusView classes that do not accept it still
                # work.
                view = self.CorpusView(path, block_reader, encoding=enc)
            views.append(view)
        return concat(views)

    def _read_word_block(self, stream):
        """
        Read up to 20 lines from C{stream} and return all of their
        word tokens as one flat list.
        """
        words = []
        for _ in range(20):  # Read 20 lines at a time.
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        """
        Read one block of paragraphs from C{stream} and return their
        sentences as a flat list, each sentence being a list of words.
        """
        sents = []
        for para in self._para_block_reader(stream):
            for sent in self._sent_tokenizer.tokenize(para):
                sents.append(self._word_tokenizer.tokenize(sent))
        return sents

    def _read_para_block(self, stream):
        """
        Read one block of paragraphs from C{stream} and return them as
        a list of paragraphs, each a list of sentences, each a list of
        words.
        """
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([self._word_tokenizer.tokenize(sent)
                          for sent in self._sent_tokenizer.tokenize(para)])
        return paras
154
155
156 -class CategorizedPlaintextCorpusReader(CategorizedCorpusReader,
157 PlaintextCorpusReader):
158 """
159 A reader for plaintext corpora whose documents are divided into
160 categories based on their file identifiers.
161 """
def __init__(self, *args, **kwargs):
    """
    Initialize the corpus reader.  Categorization arguments
    (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
    the L{CategorizedCorpusReader constructor
    <CategorizedCorpusReader.__init__>}.  The remaining arguments
    are passed to the L{PlaintextCorpusReader constructor
    <PlaintextCorpusReader.__init__>}.
    """
    # The keyword dictionary itself is handed over (not expanded with
    # **), so both constructors see the same dict object.
    # NOTE(review): presumably CategorizedCorpusReader removes the
    # categorization keys from it before the rest are forwarded below;
    # confirm against CategorizedCorpusReader.__init__.
    CategorizedCorpusReader.__init__(self, kwargs)
    PlaintextCorpusReader.__init__(self, *args, **kwargs)
173
174 - def _resolve(self, fileids, categories):
175 if fileids is not None and categories is not None:
176 raise ValueError('Specify fileids or categories, not both')
177 if categories is not None:
178 return self.fileids(categories)
179 else:
180 return fileids
181 - def raw(self, fileids=None, categories=None):
184 - def words(self, fileids=None, categories=None):
187 - def sents(self, fileids=None, categories=None):
190 - def paras(self, fileids=None, categories=None):
193
194
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    """
    A categorized plaintext corpus reader whose sentence tokenizer is
    always the one lazily loaded from
    C{tokenizers/punkt/portuguese.pickle}.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Arguments are handled as in
        L{CategorizedPlaintextCorpusReader.__init__}, except that the
        C{sent_tokenizer} keyword is always replaced by the Portuguese
        tokenizer; a value supplied by the caller is overwritten.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        kwargs['sent_tokenizer'] = nltk.data.LazyLoader(
            'tokenizers/punkt/portuguese.pickle')
        PlaintextCorpusReader.__init__(self, *args, **kwargs)
200
202
203 """
204 Reader for Europarl corpora that consist of plaintext documents.
205 Documents are divided into chapters instead of paragraphs as
206 for regular plaintext documents. Chapters are separated using blank
207 lines. Everything is inherited from L{PlaintextCorpusReader} except
208 that:
209 - Since the corpus is pre-processed and pre-tokenized, the
210 word tokenizer should just split the line at whitespaces.
211 - For the same reason, the sentence tokenizer should just
212 split the paragraph at line breaks.
213 - There is a new 'chapters()' method that returns chapters
214 instead of paragraphs.
215 - The 'paras()' method inherited from PlaintextCorpusReader is
216 made non-functional to remove any confusion between chapters
217 and paragraphs for Europarl.
218 """
219
225
231
237
239 """
240 @return: the given file(s) as a list of
241 chapters, each encoded as a list of sentences, which are
242 in turn encoded as lists of word strings.
243 @rtype: C{list} of (C{list} of (C{list} of C{str}))
244 """
245 return concat([self.CorpusView(fileid, self._read_para_block,
246 encoding=enc)
247 for (fileid, enc) in self.abspaths(fileids, True)])
248
def paras(self, fileids=None):
    """
    Unsupported for the Europarl corpus reader, which exposes chapters
    rather than paragraphs.

    @param fileids: Ignored.
    @raise NotImplementedError: Always.
    """
    raise NotImplementedError('The Europarl corpus reader does not support paragraphs. Please use chapters() instead.')
251