Package nltk :: Package corpus :: Package reader :: Module chunked
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.chunked

  1  # Natural Language Toolkit: Chunked Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Steven Bird <sb@ldc.upenn.edu> 
  5  #         Edward Loper <edloper@gradient.cis.upenn.edu> 
  6  # URL: <http://www.nltk.org/> 
  7  # For license information, see LICENSE.TXT 
  8   
  9  """ 
 10  A reader for corpora that contain chunked (and optionally tagged) 
 11  documents. 
 12  """ 
 13   
 14  import os.path, codecs 
 15   
 16  import nltk 
 17  from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader 
 18  from nltk.tree import Tree 
 19  from nltk.tokenize import * 
 20  from util import * 
 21  from api import * 
 22   
class ChunkedCorpusReader(CorpusReader):
    """
    Reader for chunked (and optionally tagged) corpora.  Paragraphs
    are split using a block reader.  They are then tokenized into
    sentences using a sentence tokenizer.  Finally, these sentences
    are parsed into chunk trees using a string-to-chunktree conversion
    function.  Each of these steps can be performed using a default
    function or a custom function.  By default, paragraphs are split
    on blank lines; sentences are listed one per line; and sentences
    are parsed into chunk trees using L{nltk.chunk.tagstr2tree}.
    """
    def __init__(self, root, fileids, extension='',
                 str2chunktree=nltk.chunk.tagstr2tree,
                 sent_tokenizer=RegexpTokenizer('\n', gaps=True),
                 para_block_reader=read_blankline_block,
                 encoding=None):
        """
        @param root: The root directory for this corpus.
        @param fileids: A list or regexp specifying the fileids in this corpus.
        @param extension: Accepted for interface compatibility; this
            constructor does not use it.
        @param str2chunktree: Function that converts one sentence
            string into a chunk tree.
        @param sent_tokenizer: Object whose C{tokenize()} method splits
            a paragraph string into sentence strings.
        @param para_block_reader: Function that reads paragraph strings
            from a stream.
        @param encoding: Passed through to C{CorpusReader.__init__}.
        """
        CorpusReader.__init__(self, root, fileids, encoding)

        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader)
        """Arguments for corpus views generated by this corpus: a tuple
           (str2chunktree, sent_tokenizer, para_block_reader)"""

    def raw(self, fileids=None):
        """
        @return: the given file(s) as a single string.
        @rtype: C{str}
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]

        contents = []
        for fileid in fileids:
            stream = self.open(fileid)
            # Close every stream explicitly, even if the read fails, so
            # that reading many files does not leak file handles.
            try:
                contents.append(stream.read())
            finally:
                stream.close()
        return concat(contents)

    def words(self, fileids=None):
        """
        @return: the given file(s) as a list of words
            and punctuation symbols.
        @rtype: C{list} of C{str}
        """
        return self._chunked_views(fileids, tagged=False,
                                   group_by_sent=False,
                                   group_by_para=False, chunked=False)

    def sents(self, fileids=None):
        """
        @return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        @rtype: C{list} of (C{list} of C{str})
        """
        return self._chunked_views(fileids, tagged=False,
                                   group_by_sent=True,
                                   group_by_para=False, chunked=False)

    def paras(self, fileids=None):
        """
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        @rtype: C{list} of (C{list} of (C{list} of C{str}))
        """
        return self._chunked_views(fileids, tagged=False,
                                   group_by_sent=True,
                                   group_by_para=True, chunked=False)

    def tagged_words(self, fileids=None):
        """
        @return: the given file(s) as a list of tagged
            words and punctuation symbols, encoded as tuples
            C{(word,tag)}.
        @rtype: C{list} of C{(str,str)}
        """
        return self._chunked_views(fileids, tagged=True,
                                   group_by_sent=False,
                                   group_by_para=False, chunked=False)

    def tagged_sents(self, fileids=None):
        """
        @return: the given file(s) as a list of
            sentences, each encoded as a list of C{(word,tag)} tuples.

        @rtype: C{list} of (C{list} of C{(str,str)})
        """
        return self._chunked_views(fileids, tagged=True,
                                   group_by_sent=True,
                                   group_by_para=False, chunked=False)

    def tagged_paras(self, fileids=None):
        """
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of C{(word,tag)} tuples.
        @rtype: C{list} of (C{list} of (C{list} of C{(str,str)}))
        """
        return self._chunked_views(fileids, tagged=True,
                                   group_by_sent=True,
                                   group_by_para=True, chunked=False)

    def chunked_words(self, fileids=None):
        """
        @return: the given file(s) as a list of tagged
            words and chunks.  Words are encoded as C{(word, tag)}
            tuples (if the corpus has tags) or word strings (if the
            corpus has no tags).  Chunks are encoded as depth-one
            trees over C{(word,tag)} tuples or word strings.
        @rtype: C{list} of (C{(str,str)} and L{Tree})
        """
        return self._chunked_views(fileids, tagged=True,
                                   group_by_sent=False,
                                   group_by_para=False, chunked=True)

    def chunked_sents(self, fileids=None):
        """
        @return: the given file(s) as a list of
            sentences, each encoded as a shallow C{Tree}.  The leaves
            of these trees are encoded as C{(word, tag)} tuples (if
            the corpus has tags) or word strings (if the corpus has no
            tags).
        @rtype: C{list} of L{Tree}
        """
        return self._chunked_views(fileids, tagged=True,
                                   group_by_sent=True,
                                   group_by_para=False, chunked=True)

    def chunked_paras(self, fileids=None):
        """
        @return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as a shallow C{Tree}.  The leaves of these
            trees are encoded as C{(word, tag)} tuples (if the corpus
            has tags) or word strings (if the corpus has no tags).
        @rtype: C{list} of (C{list} of L{Tree})
        """
        return self._chunked_views(fileids, tagged=True,
                                   group_by_sent=True,
                                   group_by_para=True, chunked=True)

    def _chunked_views(self, fileids, tagged, group_by_sent, group_by_para,
                       chunked):
        """
        Build one L{ChunkedCorpusView} per requested file, configured
        with the given flags and this reader's C{_cv_args}, and return
        their concatenation.

        @param fileids: Forwarded unchanged to C{self.abspaths}.
        @param tagged: If false, the views strip tags from the leaves.
        @param group_by_sent: If true, the views keep each sentence as
            a separate item instead of flattening it.
        @param group_by_para: If true, the views keep each paragraph as
            a separate item instead of flattening it.
        @param chunked: If false, the views replace each chunk tree by
            its leaves.
        """
        return concat([ChunkedCorpusView(path, enc, tagged, group_by_sent,
                                         group_by_para, chunked,
                                         *self._cv_args)
                       for (path, enc) in self.abspaths(fileids, True)])

    def _read_block(self, stream):
        """
        Read one blank-line-delimited block from C{stream} and parse
        each item with L{nltk.chunk.tagstr2tree}.
        """
        # NOTE(review): this always uses the default reader and parser,
        # not the str2chunktree / para_block_reader given to __init__;
        # nothing in this class calls it -- confirm whether it is still
        # needed before changing it.
        return [nltk.chunk.tagstr2tree(t) for t in
                read_blankline_block(stream)]
156
class ChunkedCorpusView(StreamBackedCorpusView):
    """
    Corpus view over a single chunked corpus file.  Each block read
    from the stream is split into paragraphs, then sentences, and each
    sentence is parsed into a chunk tree; the flags given to the
    constructor decide whether tags, chunks, sentence grouping and
    paragraph grouping are kept.
    """
    def __init__(self, fileid, encoding, tagged, group_by_sent,
                 group_by_para, chunked, str2chunktree, sent_tokenizer,
                 para_block_reader):
        """
        @param fileid: Forwarded to C{StreamBackedCorpusView.__init__}.
        @param encoding: Forwarded to C{StreamBackedCorpusView.__init__}.
        @param tagged: If false, tags are stripped from the leaves.
        @param group_by_sent: If true, each sentence is kept as one item.
        @param group_by_para: If true, each paragraph is kept as one item.
        @param chunked: If false, chunk trees are replaced by their leaves.
        @param str2chunktree: Function parsing a sentence string into
            a chunk tree.
        @param sent_tokenizer: Object whose C{tokenize()} splits a
            paragraph string into sentence strings.
        @param para_block_reader: Function reading paragraph strings
            from a stream.
        """
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)

        # Processing pipeline.
        self._para_block_reader = para_block_reader
        self._sent_tokenizer = sent_tokenizer
        self._str2chunktree = str2chunktree

        # Output-shape flags.
        self._tagged = tagged
        self._chunked = chunked
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para

    def read_block(self, stream):
        """
        Read the next block of paragraphs from C{stream} and return
        its contents as a list, shaped according to this view's flags.
        """
        result = []
        for paragraph_text in self._para_block_reader(stream):
            paragraph = []
            sentence_texts = self._sent_tokenizer.tokenize(paragraph_text)
            for sentence_text in sentence_texts:
                parsed = self._str2chunktree(sentence_text)

                # Drop the tags first, then the chunk structure.
                if not self._tagged:
                    parsed = self._untag(parsed)
                if not self._chunked:
                    parsed = parsed.leaves()

                # Keep the sentence whole, or flatten it into the paragraph.
                if self._group_by_sent:
                    paragraph.append(parsed)
                else:
                    paragraph.extend(parsed)

            # Keep the paragraph whole, or flatten it into the block.
            if self._group_by_para:
                result.append(paragraph)
            else:
                result.extend(paragraph)

        return result

    def _untag(self, tree):
        """
        Replace, in place, every tuple leaf of C{tree} by its first
        element, recursing into subtrees.

        @return: the same C{tree} object, after modification.
        @raise ValueError: if a child is neither a L{Tree} nor a tuple.
        """
        for index, node in enumerate(tree):
            if isinstance(node, Tree):
                self._untag(node)
                continue
            if not isinstance(node, tuple):
                raise ValueError('expected child to be Tree or tuple')
            tree[index] = node[0]
        return tree
209