Package nltk :: Package corpus :: Package reader :: Module ipipan
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.ipipan

  1  # Natural Language Toolkit: IPI PAN Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Konrad Goluchowski <kodie@mimuw.edu.pl> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  # import functools 
  9  from nltk.compat import * 
 10   
 11  from nltk.corpus.reader.util import StreamBackedCorpusView, concat 
 12  from nltk.corpus.reader.api import CorpusReader 
def _parse_args(fun):
    """Wrap a corpus-access method so that its arguments are normalised.

    The returned wrapper:

    - discards any caller-supplied ``tags`` keyword, so only the decorated
      method itself decides whether tags are requested;
    - substitutes ``self.fileids()`` when ``fileids`` is missing or empty;
    - forwards all remaining keyword arguments unchanged.

    :param fun: the method to wrap; it is called as
        ``fun(self, fileids, **kwargs)``.
    :return: the wrapper, carrying the name, docstring and module of ``fun``.
    """
    # Imported here because the module-level ``import functools`` is
    # commented out in this file.
    import functools

    @functools.wraps(fun)
    def decorator(self, fileids=None, **kwargs):
        kwargs.pop('tags', None)
        if not fileids:
            fileids = self.fileids()
        return fun(self, fileids, **kwargs)

    return decorator
# Assumes Python >=2.5
# def _parse_args(fun):
#     @functools.wraps(fun)
#     def decorator(self, fileids=None, **kwargs):
#         kwargs.pop('tags', None)
#         if not fileids:
#             fileids = self.fileids()
#         return fun(self, fileids, **kwargs)
#     return decorator


class IPIPANCorpusReader(CorpusReader):
    """Corpus reader designed to work with corpus created by IPI PAN.
    See http://korpus.pl/en/ for more details about IPI PAN corpus.

    The corpus includes information about text domain, channel and categories.
    You can access possible values using ipipan.domains(), ipipan.channels() and
    ipipan.categories(). You can also use this metadata to filter files, e.g.:
    ipipan.fileids(channels='prasa')
    ipipan.fileids(categories='publicystyczny')
    Only one of these filters may be given per call.

    The reader supports methods: words, sents, paras and their tagged versions.
    You can get part of speech instead of full tag by giving "simplify_tags=True"
    parameter, e.g.:
    ipipan.tagged_sents(simplify_tags=True)

    You can also get all disambiguated tags by specifying parameter
    "one_tag=False", e.g.:
    ipipan.tagged_paras(one_tag=False)

    You can get all tags that were assigned by a morphological analyzer specifying
    parameter "disamb_only=False", e.g.
    ipipan.tagged_words(disamb_only=False)

    The IPIPAN Corpus contains tags indicating if there is a space between two
    tokens. To add special "no space" markers, you should specify parameter
    "append_no_space=True", e.g.
    ipipan.tagged_words(append_no_space=True)
    As a result in place where there should be no space between two tokens new
    pair ('', 'no-space') will be inserted (for tagged data) and just '' for
    methods without tags.

    The corpus reader can also try to append spaces between words. To enable this
    option, specify parameter "append_space=True", e.g.
    ipipan.words(append_space=True)
    As a result either ' ' or (' ', 'space') will be inserted between tokens.

    By default, xml entities like &quot; and &amp; are replaced by corresponding
    characters. You can turn off this feature, specifying parameter
    "replace_xmlentities=False", e.g.
    ipipan.words(replace_xmlentities=False)
    """

    def __init__(self, root, fileids):
        """Initialise the reader by delegating to ``CorpusReader.__init__``.

        :param root: passed through to ``CorpusReader`` unchanged.
        :param fileids: passed through to ``CorpusReader`` unchanged.
        """
        CorpusReader.__init__(self, root, fileids, None, None)

    def raw(self, fileids=None):
        """Return the concatenated contents of the selected files.

        :param fileids: files to read; all of ``self.fileids()`` when missing
            or empty.
        :return: one string made of every file's content, in listing order.
        """
        if not fileids:
            fileids = self.fileids()
        contents = []
        for path in self._list_morph_files(fileids):
            # Close each file as soon as it has been read instead of
            # leaving the handle to the garbage collector.
            with open(path, 'r') as stream:
                contents.append(stream.read())
        return ''.join(contents)

    def channels(self, fileids=None):
        """Return the distinct ``channel`` header values of the given files."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, 'channel')

    def domains(self, fileids=None):
        """Return the distinct ``domain`` header values of the given files."""
        if not fileids:
            fileids = self.fileids()
        return self._parse_header(fileids, 'domain')

    def categories(self, fileids=None):
        """Return the ``keyTerm`` header values of the given files.

        Each value is passed through ``_map_category`` before being returned.
        """
        if not fileids:
            fileids = self.fileids()
        return [self._map_category(cat)
                for cat in self._parse_header(fileids, 'keyTerm')]

    def fileids(self, channels=None, domains=None, categories=None):
        """Return file ids, optionally filtered by one kind of header value.

        :param channels: a channel name or a collection of channel names.
        :param domains: a domain name or a collection of domain names.
        :param categories: a category name or a collection of category names.
        :return: every file id of the corpus when no filter is given,
            otherwise the ids of files whose header matches the filter.
        :raise ValueError: if more than one of the three filters is given.
        """
        given = [value for value in (channels, domains, categories)
                 if value is not None]
        # The filters are mutually exclusive, so reject any combination of
        # two or more rather than only the case where all three are set.
        if len(given) > 1:
            raise ValueError('You can specify only one of channels, domains '
                             'and categories parameter at once')
        if not given:
            return CorpusReader.fileids(self)
        if isinstance(channels, basestring):
            channels = [channels]
        if isinstance(domains, basestring):
            domains = [domains]
        if isinstance(categories, basestring):
            categories = [categories]
        # Dispatch on "is not None" so that an empty filter selects no
        # files instead of falling through to another filter's branch.
        if channels is not None:
            return self._list_morph_files_by('channel', channels)
        elif domains is not None:
            return self._list_morph_files_by('domain', domains)
        else:
            return self._list_morph_files_by('keyTerm', categories,
                                             map=self._map_category)

    @_parse_args
    def sents(self, fileids=None, **kwargs):
        """Return the sentences of the given files, without tags."""
        return concat([self._view(fileid,
                                  mode=IPIPANCorpusView.SENTS_MODE,
                                  tags=False, **kwargs)
                       for fileid in self._list_morph_files(fileids)])

    @_parse_args
    def paras(self, fileids=None, **kwargs):
        """Return the paragraphs of the given files, without tags."""
        return concat([self._view(fileid,
                                  mode=IPIPANCorpusView.PARAS_MODE,
                                  tags=False, **kwargs)
                       for fileid in self._list_morph_files(fileids)])

    @_parse_args
    def words(self, fileids=None, **kwargs):
        """Return the words of the given files, without tags."""
        return concat([self._view(fileid, tags=False, **kwargs)
                       for fileid in self._list_morph_files(fileids)])

    @_parse_args
    def tagged_sents(self, fileids=None, **kwargs):
        """Return the sentences of the given files, with tags."""
        return concat([self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE,
                                  **kwargs)
                       for fileid in self._list_morph_files(fileids)])

    @_parse_args
    def tagged_paras(self, fileids=None, **kwargs):
        """Return the paragraphs of the given files, with tags."""
        return concat([self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE,
                                  **kwargs)
                       for fileid in self._list_morph_files(fileids)])

    @_parse_args
    def tagged_words(self, fileids=None, **kwargs):
        """Return the words of the given files, with tags."""
        return concat([self._view(fileid, **kwargs)
                       for fileid in self._list_morph_files(fileids)])

    def _list_morph_files(self, fileids):
        """Return the absolute paths of ``fileids`` as a list."""
        return list(self.abspaths(fileids))

    def _list_header_files(self, fileids):
        """Return the header path that accompanies each morph file.

        The header path is the morph path with ``morph.xml`` replaced by
        ``header.xml``.
        """
        return [f.replace('morph.xml', 'header.xml')
                for f in self._list_morph_files(fileids)]

    def _parse_header(self, fileids, tag):
        """Return the distinct values of ``tag`` over the headers of ``fileids``.

        :return: a list without duplicates, in no particular order.
        """
        values = set()
        for f in self._list_header_files(fileids):
            values.update(self._get_tag(f, tag))
        return list(values)

    def _list_morph_files_by(self, tag, values, map=None):
        """Return the file ids whose header has a ``tag`` value in ``values``.

        :param tag: name of the header element to inspect.
        :param values: collection of accepted values.
        :param map: optional callable applied to every header value before
            the membership test. The name shadows the builtin but is kept
            because callers pass it by keyword.
        :return: a list of matching file ids, in no particular order.
        """
        fileids = self.fileids()
        ret_fileids = set()
        for f in fileids:
            fp = self.abspath(f).replace('morph.xml', 'header.xml')
            values_list = self._get_tag(fp, tag)
            for value in values_list:
                if map is not None:
                    value = map(value)
                if value in values:
                    ret_fileids.add(f)
        return list(ret_fileids)

    def _get_tag(self, f, tag):
        """Return the text of every ``tag`` element found in file ``f``.

        The file is scanned with plain string searches. Each returned value
        starts one character after ``'<' + tag`` (that is, after the ``>`` or
        the blank that follows the tag name) and ends just before the
        closing tag, so for an element with attributes the value still
        contains the attribute text up to ``>``.

        :param f: path of the file to scan.
        :param tag: element name to look for.
        :return: list of values in document order.
        """
        with open(f, 'r') as stream:
            header = stream.read()
        opening = '<' + tag
        closing = '</' + tag + '>'
        tags = []
        tag_end = 0
        while True:
            tag_pos = header.find(opening, tag_end)
            if tag_pos < 0:
                return tags
            tag_end = header.find(closing, tag_pos)
            if tag_end < 0:
                # Unterminated element: stop here rather than record a
                # slice that runs to the end of the file.
                return tags
            tags.append(header[tag_pos + len(tag) + 2:tag_end])

    def _map_category(self, cat):
        """Return the part of ``cat`` after its first ``>``, or ``cat`` itself
        when it contains none."""
        pos = cat.find('>')
        if pos == -1:
            return cat
        else:
            return cat[pos + 1:]

    def _view(self, filename, **kwargs):
        """Validate the reading options and build an ``IPIPANCorpusView``.

        Recognised keywords and their defaults: ``tags`` (True), ``mode``
        (0), ``simplify_tags`` (False), ``one_tag`` (True), ``disamb_only``
        (True), ``append_no_space`` (False), ``append_space`` (False) and
        ``replace_xmlentities`` (True).

        :param filename: path of the file the view reads.
        :raise ValueError: on an unknown keyword, when both ``one_tag`` and
            ``disamb_only`` are false, or when a tag option is combined with
            ``tags=False``.
        """
        tags = kwargs.pop('tags', True)
        mode = kwargs.pop('mode', 0)
        simplify_tags = kwargs.pop('simplify_tags', False)
        one_tag = kwargs.pop('one_tag', True)
        disamb_only = kwargs.pop('disamb_only', True)
        append_no_space = kwargs.pop('append_no_space', False)
        append_space = kwargs.pop('append_space', False)
        replace_xmlentities = kwargs.pop('replace_xmlentities', True)

        if len(kwargs) > 0:
            # Format a plain list so the message reads the same on
            # Python 2 and Python 3.
            raise ValueError('Unexpected arguments: %s' % sorted(kwargs))
        if not one_tag and not disamb_only:
            raise ValueError('You cannot specify both one_tag=False and '
                             'disamb_only=False')
        if not tags and (simplify_tags or not one_tag or not disamb_only):
            raise ValueError('You cannot specify simplify_tags, one_tag or '
                             'disamb_only with functions other than tagged_*')

        return IPIPANCorpusView(filename,
                                tags=tags, mode=mode,
                                simplify_tags=simplify_tags,
                                one_tag=one_tag, disamb_only=disamb_only,
                                append_no_space=append_no_space,
                                append_space=append_space,
                                replace_xmlentities=replace_xmlentities
                                )
229
class IPIPANCorpusView(StreamBackedCorpusView):
    """Stream-backed view over one file of the IPI PAN corpus.

    Depending on ``mode``, each block read from the stream is the list of
    tokens of one sentence (``WORDS_MODE``), a one-element list holding one
    sentence (``SENTS_MODE``) or a one-element list holding one paragraph,
    itself a list of sentences (``PARAS_MODE``).
    """

    WORDS_MODE = 0
    SENTS_MODE = 1
    PARAS_MODE = 2

    def __init__(self, filename, startpos=0, **kwargs):
        """Create the view and record the reading options.

        :param filename: passed to ``StreamBackedCorpusView.__init__``.
        :param startpos: passed to ``StreamBackedCorpusView.__init__``.
        :param kwargs: reading options ``tags``, ``disamb_only``, ``mode``,
            ``simplify_tags``, ``one_tag``, ``append_no_space``,
            ``append_space`` and ``replace_xmlentities``; keywords other
            than these are ignored.
        """
        StreamBackedCorpusView.__init__(self, filename, None, startpos, None)
        self.in_sentence = False
        self.position = 0

        self.show_tags = kwargs.pop('tags', True)
        self.disamb_only = kwargs.pop('disamb_only', True)
        self.mode = kwargs.pop('mode', IPIPANCorpusView.WORDS_MODE)
        self.simplify_tags = kwargs.pop('simplify_tags', False)
        self.one_tag = kwargs.pop('one_tag', True)
        self.append_no_space = kwargs.pop('append_no_space', False)
        self.append_space = kwargs.pop('append_space', False)
        self.replace_xmlentities = kwargs.pop('replace_xmlentities', True)

    def read_block(self, stream):
        """Read the next sentence or paragraph from ``stream``.

        Input is consumed line by line, and each line is recognised by its
        prefix (``<chunk``, ``<tok``, ``<orth``, ``<lex``, ``<ns/>`` and the
        matching closing tags). Reading stops at the end of a sentence in
        ``WORDS_MODE`` and ``SENTS_MODE``, and at the end of a paragraph in
        ``PARAS_MODE``. Before returning, the stream is positioned just
        after the last consumed line.

        :param stream: the stream to read from.
        :return: ``sentence`` in ``WORDS_MODE``, ``[sentence]`` in
            ``SENTS_MODE``, ``[sentences]`` in ``PARAS_MODE``, or ``[]`` when
            no data is left.
        """
        sentence = []
        sentences = []
        space = False
        no_space = False

        tags = set()

        lines = self._read_data(stream)

        while True:

            # we may have only part of last line
            if len(lines) <= 1:
                self._seek(stream)
                lines = self._read_data(stream)

            if lines == ['']:
                assert not sentences
                return []

            line = lines.pop()
            # +1 accounts for the newline removed by split('\n').
            self.position += len(line) + 1

            if line.startswith('<chunk type="s"'):
                self.in_sentence = True
            elif line.startswith('<chunk type="p"'):
                pass
            elif line.startswith('<tok'):
                if self.append_space and space and not no_space:
                    self._append_space(sentence)
                space = True
                no_space = False
                orth = ""
                tags = set()
            elif line.startswith('</chunk'):
                if self.in_sentence:
                    self.in_sentence = False
                    self._seek(stream)
                    if self.mode == self.SENTS_MODE:
                        return [sentence]
                    elif self.mode == self.WORDS_MODE:
                        if self.append_space:
                            self._append_space(sentence)
                        return sentence
                    else:
                        sentences.append(sentence)
                        # Start a fresh list for the next sentence;
                        # otherwise every sentence of the paragraph would
                        # be the same, ever-growing list object.
                        sentence = []
                        # As in SENTS_MODE, a sentence does not begin with
                        # a space token.
                        space = False
                elif self.mode == self.PARAS_MODE:
                    self._seek(stream)
                    return [sentences]
            elif line.startswith('<orth'):
                # Strip the surrounding '<orth>' and '</orth>'.
                orth = line[6:-7]
                if self.replace_xmlentities:
                    orth = orth.replace('&quot;', '"').replace('&amp;', '&')
            elif line.startswith('<lex'):
                if not self.disamb_only or line.find('disamb=') != -1:
                    tag = line[line.index('<ctag') + 6:line.index('</ctag')]
                    tags.add(tag)
            elif line.startswith('</tok'):
                if self.show_tags:
                    if self.simplify_tags:
                        # Keep only the part of each tag before the first ':'.
                        tags = [t.split(':')[0] for t in tags]
                    if not self.one_tag or not self.disamb_only:
                        sentence.append((orth, tuple(tags)))
                    else:
                        sentence.append((orth, tags.pop()))
                else:
                    sentence.append(orth)
            elif line.startswith('<ns/>'):
                if self.append_space:
                    no_space = True
                if self.append_no_space:
                    if self.show_tags:
                        sentence.append(('', 'no-space'))
                    else:
                        sentence.append('')
            elif line.startswith('</cesAna'):
                pass

    def _read_data(self, stream):
        """Read up to 4096 characters and return them as a stack of lines.

        The current stream offset is stored in ``self.position`` first. The
        lines are returned in reverse order so that ``pop()`` yields them in
        reading order; the line left at index 0 may be incomplete.
        """
        self.position = stream.tell()
        buff = stream.read(4096)
        lines = buff.split('\n')
        lines.reverse()
        return lines

    def _seek(self, stream):
        """Move ``stream`` to the offset stored in ``self.position``."""
        stream.seek(self.position)

    def _append_space(self, sentence):
        """Append a space token to ``sentence``, tagged ``'space'`` when tags
        are shown."""
        if self.show_tags:
            sentence.append((' ', 'space'))
        else:
            sentence.append(' ')
345