Package nltk :: Package corpus :: Package reader :: Module nps_chat
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.nps_chat

 1  # Natural Language Toolkit: NPS Chat Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2011 NLTK Project 
 4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
 5  # URL: <http://www.nltk.org/> 
 6  # For license information, see LICENSE.TXT 
 7   
 8  import re 
 9  import textwrap 
10   
11  from nltk.compat import * 
12  from nltk.util import LazyConcatenation 
13  from nltk.internals import ElementWrapper 
14   
15  from util import * 
16  from api import * 
17  from xmldocs import * 
class NPSChatCorpusReader(XMLCorpusReader):
    """Corpus reader for the XML files of the NPS Chat Corpus.

    Posts are read from ``Session/Posts/Post`` elements.  The tokens of
    a post are the ``t`` children of its ``terminals`` element; each
    ``t`` carries the token text in its ``word`` attribute and the
    part-of-speech tag in its ``pos`` attribute.
    """

    def __init__(self, root, fileids, wrap_etree=False, tag_mapping_function=None):
        """Create a reader for the given corpus files.

        :param root: Forwarded unchanged to ``XMLCorpusReader.__init__``.
        :param fileids: Forwarded unchanged to ``XMLCorpusReader.__init__``.
        :param wrap_etree: Forwarded unchanged to
            ``XMLCorpusReader.__init__``.  ``xml_posts`` reads
            ``self._wrap_etree`` (presumably set by the base class from
            this argument -- confirm) to decide whether to wrap elements.
        :param tag_mapping_function: Optional callable taking a POS tag
            and returning its simplified form.  It is applied only when
            a tagged accessor is called with ``simplify_tags=True``.
            When it is None, tags are returned unchanged.
        """
        XMLCorpusReader.__init__(self, root, fileids, wrap_etree)
        self._tag_mapping_function = tag_mapping_function

    def xml_posts(self, fileids=None):
        """Return the ``Session/Posts/Post`` elements of the given files.

        One ``XMLCorpusView`` is built per file and the views are joined
        with ``concat``.  When ``self._wrap_etree`` is true, each element
        is passed through ``_wrap_elt``.
        """
        # Pass the wrapping handler only when wrapping was requested, so
        # the unwrapped case calls XMLCorpusView with exactly two
        # positional arguments, as before.
        extra_args = []
        if self._wrap_etree:
            extra_args.append(self._wrap_elt)
        return concat([XMLCorpusView(path, 'Session/Posts/Post', *extra_args)
                       for path in self.abspaths(fileids)])

    def posts(self, fileids=None):
        """Return the posts of the given files, each as a list of words.

        Every ``Session/Posts/Post/terminals`` element is converted with
        ``_elt_to_words``.
        """
        return concat([XMLCorpusView(path, 'Session/Posts/Post/terminals',
                                     self._elt_to_words)
                       for path in self.abspaths(fileids)])

    def tagged_posts(self, fileids=None, simplify_tags=False):
        """Return the posts of the given files as lists of (word, tag).

        :param simplify_tags: If true, each tag is passed through the
            ``tag_mapping_function`` given to the constructor (when one
            was given).
        """
        # The view calls its handler as handler(elt, context); the
        # closure carries the simplify_tags flag along.
        def reader(elt, handler):
            return self._elt_to_tagged_words(elt, handler, simplify_tags)

        return concat([XMLCorpusView(path, 'Session/Posts/Post/terminals',
                                     reader)
                       for path in self.abspaths(fileids)])

    def words(self, fileids=None):
        """Return the words of the given files as one flat lazy sequence."""
        return LazyConcatenation(self.posts(fileids))

    def tagged_words(self, fileids=None, simplify_tags=False):
        """Return the (word, tag) pairs of the given files as one flat
        lazy sequence.  ``simplify_tags`` is as for ``tagged_posts``.
        """
        return LazyConcatenation(self.tagged_posts(fileids, simplify_tags))

    def _wrap_elt(self, elt, handler):
        """Element handler: wrap ``elt`` in an ``ElementWrapper``.

        ``handler`` is unused; it is accepted because this method is
        passed to ``XMLCorpusView`` as an element handler.
        """
        return ElementWrapper(elt)

    def _elt_to_words(self, elt, handler):
        """Element handler: return the words of a ``terminals`` element.

        Usernames are shortened with ``_simplify_username``.  ``handler``
        is unused.
        """
        return [self._simplify_username(token.attrib['word'])
                for token in elt.findall('t')]

    def _elt_to_tagged_words(self, elt, handler, simplify_tags=False):
        """Element handler: return the (word, tag) pairs of a
        ``terminals`` element.

        :param simplify_tags: If true and a tag mapping function was
            configured, each tag is replaced by the function's result.
            Without a mapping function the tags are left unchanged
            instead of failing on an attempt to call None.
        """
        mapper = None
        if simplify_tags:
            mapper = self._tag_mapping_function

        tagged_post = []
        for token in elt.findall('t'):
            word = self._simplify_username(token.attrib['word'])
            tag = token.attrib['pos']
            if mapper is not None:
                tag = mapper(tag)
            tagged_post.append((word, tag))
        return tagged_post

    @staticmethod
    def _simplify_username(word):
        """Shorten a token that contains ``'User'``.

        Everything up to and including the first ``'User'`` is replaced
        by ``'U'`` (for example ``'10-19-20sUser7'`` becomes ``'U7'``).
        Tokens without ``'User'`` are returned unchanged.
        """
        if 'User' in word:
            word = 'U' + word.split('User', 1)[1]
        return word
72