Package nltk :: Package corpus :: Package reader :: Module plaintext
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.plaintext

  1  # Natural Language Toolkit: Plaintext Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Steven Bird <sb@ldc.upenn.edu> 
  5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
  6  #         Nitin Madnani <nmadnani@umiacs.umd.edu> 
  7  # URL: <http://www.nltk.org/> 
  8  # For license information, see LICENSE.TXT 
  9   
 10  """ 
 11  A reader for corpora that consist of plaintext documents. 
 12  """ 
 13   
 14  import codecs 
 15   
 16  import nltk.data 
 17  from nltk.tokenize import * 
 18   
 19  from util import * 
 20  from api import * 
 21   
class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents.  Paragraphs
    are assumed to be split using blank lines.  Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the L{CorpusView} class variable.
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
       L{PlaintextCorpusReader} may specify alternative corpus view
       classes (e.g., to skip the preface sections of documents.)"""

    def __init__(self, root, fileids,
                 word_tokenizer=WordPunctTokenizer(),
                 sent_tokenizer=nltk.data.LazyLoader(
                     'tokenizers/punkt/english.pickle'),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        r"""
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt')

        @param root: The root directory for this corpus.
        @param fileids: A list or regexp specifying the fileids in this corpus.
        @param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        @param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.  May be C{None}, in which case L{sents}
            and L{paras} raise C{ValueError}.
        @param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
        @param encoding: Passed through unchanged to
            C{CorpusReader.__init__}.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def raw(self, fileids=None, sourced=False):
        """
        @param fileids: A single fileid, a list of fileids, or C{None}
            to read every file in the corpus.
        @param sourced: Forwarded to C{self.open()} for each file.
        @return: the given file(s) as a single string.
        @rtype: C{str}
        """
        if fileids is None: fileids = self._fileids
        elif isinstance(fileids, basestring): fileids = [fileids]
        contents = []
        for f in fileids:
            stream = self.open(f, sourced)
            # Close every stream we open, even if read() fails, so that
            # reading a large corpus does not leak file handles.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def words(self, fileids=None, sourced=False):
        """
        @param fileids: The file(s) to read; forwarded to
            C{self.abspaths()}.
        @param sourced: If true, each corpus view is constructed with
            C{source=fileid}.
        @return: the given file(s) as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return self._concat_views(fileids, self._read_word_block, sourced)

    def sents(self, fileids=None, sourced=False):
        """
        @param fileids: The file(s) to read; forwarded to
            C{self.abspaths()}.
        @param sourced: If true, each corpus view is constructed with
            C{source=fileid}.
        @return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        @raise ValueError: If this reader has no sentence tokenizer.
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')
        return self._concat_views(fileids, self._read_sent_block, sourced)

    def paras(self, fileids=None, sourced=False):
        """
        @param fileids: The file(s) to read; forwarded to
            C{self.abspaths()}.
        @param sourced: If true, each corpus view is constructed with
            C{source=fileid}.
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        @raise ValueError: If this reader has no sentence tokenizer.
        """
        if self._sent_tokenizer is None:
            raise ValueError('No sentence tokenizer for this corpus')
        return self._concat_views(fileids, self._read_para_block, sourced)

    def _concat_views(self, fileids, block_reader, sourced):
        """
        Build one L{CorpusView} per requested file, each using
        C{block_reader}, and return their concatenation.
        """
        views = []
        for (path, enc, fileid) in self.abspaths(fileids, True, True):
            view_kwargs = {'encoding': enc}
            # Only pass ``source`` when it was asked for, so that a
            # custom CorpusView class that does not accept that keyword
            # keeps working for unsourced reads.
            if sourced:
                view_kwargs['source'] = fileid
            views.append(self.CorpusView(path, block_reader, **view_kwargs))
        return concat(views)

    def _read_word_block(self, stream):
        """
        Read the next 20 lines from C{stream} and return the tokens
        produced by the word tokenizer as one flat list.
        """
        words = []
        for i in range(20): # Read 20 lines at a time.
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
        """
        Read the next paragraph block(s) from C{stream} and return
        their sentences as a flat list, each sentence being a list of
        word tokens.
        """
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend([self._word_tokenizer.tokenize(sent)
                          for sent in self._sent_tokenizer.tokenize(para)])
        return sents

    def _read_para_block(self, stream):
        """
        Read the next paragraph block(s) from C{stream} and return one
        list of tokenized sentences per paragraph.
        """
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([self._word_tokenizer.tokenize(sent)
                          for sent in self._sent_tokenizer.tokenize(para)])
        return paras
154 155
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader,
                                       PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to
        the L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}.  The remaining arguments
        are passed to the L{PlaintextCorpusReader constructor
        <PlaintextCorpusReader.__init__>}.
        """
        # The dict itself (not **kwargs) is handed over on purpose;
        # presumably CategorizedCorpusReader removes the categorization
        # keys from it in place so that only the remaining arguments
        # reach PlaintextCorpusReader -- verify against that class.
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Turn a (fileids, categories) pair into the fileids argument
        expected by the L{PlaintextCorpusReader} methods.

        @return: C{self.fileids(categories)} if categories are given,
            otherwise C{fileids} unchanged (possibly C{None}).
        @raise ValueError: If both C{fileids} and C{categories} are
            given.
        """
        if fileids is not None and categories is not None:
            raise ValueError('Specify fileids or categories, not both')
        if categories is not None:
            return self.fileids(categories)
        else:
            return fileids

    # Each accessor below selects files by fileid or by category and
    # then defers to PlaintextCorpusReader.  ``sourced`` is forwarded so
    # that the option offered by the base class is also usable here; it
    # comes last so existing positional calls keep their meaning.
    def raw(self, fileids=None, categories=None, sourced=False):
        """
        @return: the selected file(s) as a single string.
        @raise ValueError: If both C{fileids} and C{categories} are
            given.
        """
        return PlaintextCorpusReader.raw(
            self, self._resolve(fileids, categories), sourced)

    def words(self, fileids=None, categories=None, sourced=False):
        """
        @return: the selected file(s) as a list of words and
            punctuation symbols.
        @raise ValueError: If both C{fileids} and C{categories} are
            given.
        """
        return PlaintextCorpusReader.words(
            self, self._resolve(fileids, categories), sourced)

    def sents(self, fileids=None, categories=None, sourced=False):
        """
        @return: the selected file(s) as a list of sentences, each
            encoded as a list of word strings.
        @raise ValueError: If both C{fileids} and C{categories} are
            given.
        """
        return PlaintextCorpusReader.sents(
            self, self._resolve(fileids, categories), sourced)

    def paras(self, fileids=None, categories=None, sourced=False):
        """
        @return: the selected file(s) as a list of paragraphs, each
            encoded as a list of sentences, which are in turn encoded
            as lists of word strings.
        @raise ValueError: If both C{fileids} and C{categories} are
            given.
        """
        return PlaintextCorpusReader.paras(
            self, self._resolve(fileids, categories), sourced)
193 194 # is there a better way?
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    """
    A categorized plaintext corpus reader whose sentence tokenizer is
    always the one loaded from C{'tokenizers/punkt/portuguese.pickle'}.
    """
    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Arguments are handled as in
        L{CategorizedPlaintextCorpusReader.__init__}, except that any
        C{sent_tokenizer} keyword argument is replaced by the
        Portuguese tokenizer.
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        portuguese_punkt = nltk.data.LazyLoader(
            'tokenizers/punkt/portuguese.pickle')
        # Override whatever sentence tokenizer the caller may have given.
        kwargs.update(sent_tokenizer=portuguese_punkt)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)
200
class EuroparlCorpusReader(PlaintextCorpusReader):

    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of paragraphs as
    for regular plaintext documents.  Chapters are separated using blank
    lines.  Everything is inherited from L{PlaintextCorpusReader} except
    that:
      - Since the corpus is pre-processed and pre-tokenized, the
        word tokenizer should just split the line at whitespaces.
      - For the same reason, the sentence tokenizer should just
        split the paragraph at line breaks.
      - There is a new 'chapters()' method that returns chapters
        instead of paragraphs.
      - The 'paras()' method inherited from PlaintextCorpusReader is
        made non-functional to remove any confusion between chapters
        and paragraphs for Europarl.
    """

    def _read_word_block(self, stream):
        """
        Read the next 20 lines from C{stream} and return their
        whitespace-separated tokens as one flat list.
        """
        words = []
        for i in range(20): # Read 20 lines at a time.
            words.extend(stream.readline().split())
        return words

    def _read_sent_block(self, stream):
        """
        Read the next block(s) from C{stream} and return their lines
        as a flat list of sentences, each split at whitespace.
        """
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend([sent.split() for sent in para.splitlines()])
        return sents

    def _read_para_block(self, stream):
        """
        Read the next block(s) from C{stream} and return one list of
        whitespace-split lines per block.  Used by L{chapters}.
        """
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([sent.split() for sent in para.splitlines()])
        return paras

    def chapters(self, fileids=None):
        """
        @param fileids: The file(s) to read; forwarded to
            C{self.abspaths()}.
        @return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        return concat([self.CorpusView(path, self._read_para_block,
                                       encoding=enc)
                       for (path, enc) in self.abspaths(fileids, True)])

    def paras(self, fileids=None, sourced=False):
        """
        Not supported for Europarl; use L{chapters} instead.

        C{sourced} is accepted only so that the signature matches
        L{PlaintextCorpusReader.paras}; both arguments are ignored.

        @raise NotImplementedError: Always.
        """
        raise NotImplementedError('The Europarl corpus reader does not support paragraphs. Please use chapters() instead.')
251