Package nltk :: Package corpus :: Package reader :: Module tagged
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.tagged

  1  # Natural Language Toolkit: Tagged Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  #         Steven Bird <sb@ldc.upenn.edu> 
  6  #         Jacob Perkins <japerk@gmail.com> 
  7  # URL: <http://www.nltk.org/> 
  8  # For license information, see LICENSE.TXT 
  9   
 10  """ 
 11  A reader for corpora whose documents contain part-of-speech-tagged words. 
 12  """        
 13   
 14  import os 
 15   
 16  from nltk.tag import str2tuple 
 17  from nltk.tokenize import * 
 18   
 19  from api import * 
 20  from util import * 
 21  from timit import read_timit_block 
 22   
class TaggedCorpusReader(CorpusReader):
    """
    Reader for simple part-of-speech tagged corpora.  Paragraphs are
    assumed to be split using blank lines.  Sentences and words can be
    tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.  Words are parsed
    using L{nltk.tag.str2tuple}.  By default, C{'/'} is used as the
    separator.  I.e., words should have the form::

       word1/tag1  word2/tag2  word3/tag3  ...

    But custom separators may be specified as parameters to the
    constructor.  Part of speech tags are case-normalized to upper
    case.
    """
    # NOTE: the default tokenizer instances below are created once, when
    # the ``def`` statement is executed, and are shared by every reader
    # that does not supply its own.
    def __init__(self, root, fileids,
                 sep='/', word_tokenizer=WhitespaceTokenizer(),
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block,
                 encoding=None,
                 tag_mapping_function=None):
        """
        Construct a new Tagged Corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/...path to corpus.../'
            >>> reader = TaggedCorpusReader(root, '.*[.]txt')

        @param root: The root directory for this corpus.
        @param fileids: A list or regexp specifying the fileids in this corpus.
        @param sep: The string separating a word from its tag; it is
            the third positional parameter, so a file-extension
            pattern belongs in C{fileids}, not here.
        @param word_tokenizer: Object whose C{tokenize()} method splits
            a sentence string into C{word<sep>tag} tokens.
        @param sent_tokenizer: Object whose C{tokenize()} method splits
            a paragraph string into sentence strings.
        @param para_block_reader: Function that is called with a stream
            and returns a list of paragraph strings.
        @param encoding: Passed through to the C{CorpusReader}
            constructor.
        @param tag_mapping_function: Optional callable applied to every
            tag by the C{tagged_*} methods when C{simplify_tags} is
            true.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function

    def _concat_views(self, fileids, tagged, group_by_sent, group_by_para,
                      tag_mapping_function=None):
        """
        Build one L{TaggedCorpusView} per selected file and concatenate
        them.  Shared implementation of the six public accessors.

        @param fileids: File selection, handed to C{self.abspaths()}.
        @param tagged: If true, items are C{(word, tag)} tuples;
            otherwise bare words.
        @param group_by_sent: If true, words are grouped into sentences.
        @param group_by_para: If true, sentences are grouped into
            paragraphs.
        @param tag_mapping_function: Optional callable applied to each
            tag, or C{None} to leave tags unchanged.
        """
        return concat([TaggedCorpusView(fileid, enc,
                                        tagged, group_by_sent, group_by_para,
                                        self._sep, self._word_tokenizer,
                                        self._sent_tokenizer,
                                        self._para_block_reader,
                                        tag_mapping_function)
                       for (fileid, enc) in self.abspaths(fileids, True)])

    def _select_tag_mapping(self, simplify_tags):
        """
        @return: the reader's tag mapping function if C{simplify_tags}
            is true, otherwise C{None}.
        """
        if simplify_tags:
            return self._tag_mapping_function
        return None

    def raw(self, fileids=None):
        """
        @return: the given file(s) as a single string.
        @rtype: C{str}
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]
        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close every stream explicitly so that reading many files
            # does not leave their handles open until garbage collection.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def words(self, fileids=None):
        """
        @return: the given file(s) as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return self._concat_views(fileids, False, False, False)

    def sents(self, fileids=None):
        """
        @return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        """
        return self._concat_views(fileids, False, True, False)

    def paras(self, fileids=None):
        """
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        return self._concat_views(fileids, False, True, True)

    def tagged_words(self, fileids=None, simplify_tags=False):
        """
        @param simplify_tags: If true, each tag is passed through the
            reader's tag mapping function.
        @return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}
        """
        return self._concat_views(fileids, True, False, False,
                                  self._select_tag_mapping(simplify_tags))

    def tagged_sents(self, fileids=None, simplify_tags=False):
        """
        @param simplify_tags: If true, each tag is passed through the
            reader's tag mapping function.
        @return: the given file(s) as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.

        @rtype: C{list} of (C{list} of C{(str,str)})
        """
        return self._concat_views(fileids, True, True, False,
                                  self._select_tag_mapping(simplify_tags))

    def tagged_paras(self, fileids=None, simplify_tags=False):
        """
        @param simplify_tags: If true, each tag is passed through the
            reader's tag mapping function.
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
        """
        return self._concat_views(fileids, True, True, True,
                                  self._select_tag_mapping(simplify_tags))
170
class CategorizedTaggedCorpusReader(CategorizedCorpusReader,
                                    TaggedCorpusReader):
    """
    Part-of-speech tagged corpus reader whose documents are grouped
    into categories according to their file identifiers.  Every
    accessor accepts either C{fileids} or C{categories} (not both).
    """
    def __init__(self, *args, **kwargs):
        """
        Set up the reader.  The categorization arguments
        (C{cat_pattern}, C{cat_map}, and C{cat_file}) go to the
        L{CategorizedCorpusReader constructor
        <CategorizedCorpusReader.__init__>}; everything else goes to
        the L{TaggedCorpusReader constructor
        <TaggedCorpusReader.__init__>}.
        """
        # The keyword dictionary itself is handed over (not unpacked).
        # NOTE(review): presumably the categorized constructor removes
        # its own keys from the dict before the rest is forwarded
        # below -- confirm against its definition.
        CategorizedCorpusReader.__init__(self, kwargs)
        TaggedCorpusReader.__init__(self, *args, **kwargs)

    def _resolve(self, fileids, categories):
        """
        Turn a C{fileids}/C{categories} pair into a file selection.

        @return: C{fileids} unchanged when no categories are given,
            otherwise C{self.fileids(categories)}.
        @raise ValueError: if both arguments are supplied.
        """
        if categories is None:
            return fileids
        if fileids is not None:
            raise ValueError('Specify fileids or categories, not both')
        return self.fileids(categories)

    def raw(self, fileids=None, categories=None):
        """Category-aware version of L{TaggedCorpusReader.raw}."""
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.raw(self, selected)

    def words(self, fileids=None, categories=None):
        """Category-aware version of L{TaggedCorpusReader.words}."""
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.words(self, selected)

    def sents(self, fileids=None, categories=None):
        """Category-aware version of L{TaggedCorpusReader.sents}."""
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.sents(self, selected)

    def paras(self, fileids=None, categories=None):
        """Category-aware version of L{TaggedCorpusReader.paras}."""
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.paras(self, selected)

    def tagged_words(self, fileids=None, categories=None, simplify_tags=False):
        """Category-aware version of L{TaggedCorpusReader.tagged_words}."""
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.tagged_words(self, selected, simplify_tags)

    def tagged_sents(self, fileids=None, categories=None, simplify_tags=False):
        """Category-aware version of L{TaggedCorpusReader.tagged_sents}."""
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.tagged_sents(self, selected, simplify_tags)

    def tagged_paras(self, fileids=None, categories=None, simplify_tags=False):
        """Category-aware version of L{TaggedCorpusReader.tagged_paras}."""
        selected = self._resolve(fileids, categories)
        return TaggedCorpusReader.tagged_paras(self, selected, simplify_tags)
217
class TaggedCorpusView(StreamBackedCorpusView):
    """
    Corpus view specialised for tagged documents.  Flags control
    whether the content is grouped by sentence and/or paragraph and
    whether part of speech tags are kept or dropped.
    C{TaggedCorpusView} objects are typically created by
    L{TaggedCorpusReader} (not directly by nltk users).
    """
    def __init__(self, corpus_file, encoding, tagged, group_by_sent,
                 group_by_para, sep, word_tokenizer, sent_tokenizer,
                 para_block_reader, tag_mapping_function=None):
        """
        @param corpus_file: Forwarded to the base view constructor.
        @param encoding: Forwarded to the base view constructor.
        @param tagged: If true, yield C{(word, tag)} tuples; otherwise
            yield bare words.
        @param group_by_sent: If true, group words into sentence lists.
        @param group_by_para: If true, group each paragraph's content
            into its own list.
        @param sep: Separator handed to C{str2tuple}.
        @param word_tokenizer: Object whose C{tokenize()} splits a
            sentence string into tokens.
        @param sent_tokenizer: Object whose C{tokenize()} splits a
            paragraph string into sentence strings.
        @param para_block_reader: Function called with the stream,
            returning paragraph strings.
        @param tag_mapping_function: Optional callable applied to each
            tag.
        """
        # Tokenization settings.
        self._sep = sep
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._tag_mapping_function = tag_mapping_function
        # Output-shape flags.
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        # Base initialisation is kept last, after all state is in place.
        StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding)

    def read_block(self, stream):
        """Reads one paragraph at a time."""
        result = []
        for para_text in self._para_block_reader(stream):
            para_items = []
            for sent_text in self._sent_tokenizer.tokenize(para_text):
                tokens = self._word_tokenizer.tokenize(sent_text)
                items = [str2tuple(token, self._sep) for token in tokens]
                # Optionally rewrite tags before they are (possibly) dropped.
                if self._tag_mapping_function:
                    items = [(word, self._tag_mapping_function(tag))
                             for (word, tag) in items]
                if not self._tagged:
                    items = [word for (word, tag) in items]
                # Nest the sentence, or splice its items into the paragraph.
                if self._group_by_sent:
                    para_items.append(items)
                else:
                    para_items.extend(items)
            # Nest the paragraph, or splice its items into the block.
            if self._group_by_para:
                result.append(para_items)
            else:
                result.extend(para_items)
        return result
# TODO: needs to implement simplified tags
class MacMorphoCorpusReader(TaggedCorpusReader):
    """
    Corpus reader for the MAC_MORPHO corpus.  Every line holds one
    tagged word, with '_' separating word and tag.  Sentence
    boundaries are based on the end-sentence tag ('_.').  The corpus
    carries no paragraph information, so each paragraph returned by
    L{self.paras()} and L{self.tagged_paras()} contains a single
    sentence.
    """
    def __init__(self, root, fileids, encoding=None, tag_mapping_function=None):
        """
        @param root: The root directory for this corpus.
        @param fileids: A list or regexp specifying the fileids in this
            corpus.
        @param encoding: Forwarded to the L{TaggedCorpusReader}
            constructor.
        @param tag_mapping_function: Forwarded to the
            L{TaggedCorpusReader} constructor.
        """
        line_words = LineTokenizer()
        line_sents = RegexpTokenizer('.*\n')
        TaggedCorpusReader.__init__(
            self, root, fileids,
            sep='_',
            word_tokenizer=line_words,
            sent_tokenizer=line_sents,
            para_block_reader=self._read_block,
            encoding=encoding,
            tag_mapping_function=tag_mapping_function)

    def _read_block(self, stream):
        """
        Paragraph block reader for this corpus: delegates to
        C{read_regexp_block} with start pattern C{.*} and end pattern
        C{.*_\\.} (a line carrying the end-sentence tag).
        """
        start_re = r'.*'
        end_re = r'.*_\.'
        return read_regexp_block(stream, start_re, end_re)
282
class TimitTaggedCorpusReader(TaggedCorpusReader):
    """
    A corpus reader for tagged sentences that are included in the TIMIT corpus.
    Paragraph-level access is not supported; use the sentence-level
    accessors instead.
    """
    def __init__(self, *args, **kwargs):
        """
        Forward all arguments to the L{TaggedCorpusReader} constructor,
        with C{para_block_reader} fixed to C{read_timit_block}.
        Callers must not supply C{para_block_reader} themselves: it
        would then be given twice and Python raises C{TypeError}.
        """
        TaggedCorpusReader.__init__(
            self, para_block_reader=read_timit_block, *args, **kwargs)

    # The two overrides below accept the same arguments as the base-class
    # methods so that a call such as ``reader.paras(fileid)`` reports the
    # intended NotImplementedError instead of a TypeError about arguments.
    def paras(self, fileids=None):
        """
        Not supported for this corpus.

        @raise NotImplementedError: always.
        """
        raise NotImplementedError('use sents() instead')

    def tagged_paras(self, fileids=None, simplify_tags=False):
        """
        Not supported for this corpus.

        @raise NotImplementedError: always.
        """
        raise NotImplementedError('use tagged_sents() instead')
296