1
2
3
4
5
6
7
8 import re
9 import textwrap
10
11 from nltk.compat import *
12 from nltk.util import LazyConcatenation
13 from nltk.internals import ElementWrapper
14
15 from util import *
16 from api import *
17 from xmldocs import *
20
21 - def __init__(self, root, fileids, wrap_etree=False, tag_mapping_function=None):
24
def xml_posts(self, fileids=None):
    """
    Return one concatenated view over the ``Session/Posts/Post``
    elements of every file resolved by ``self.abspaths(fileids)``.

    When ``self._wrap_etree`` is truthy, ``self._wrap_elt`` is handed to
    each ``XMLCorpusView`` as its third argument (presumably the
    per-element handler -- confirm against ``XMLCorpusView``); otherwise
    the views are built with only the file and the tag path.
    """
    post_path = 'Session/Posts/Post'

    # Plain views: no per-element callable is supplied.
    if not self._wrap_etree:
        plain_views = []
        for path in self.abspaths(fileids):
            plain_views.append(XMLCorpusView(path, post_path))
        return concat(plain_views)

    # Wrapped views: every matched element goes through self._wrap_elt.
    wrapped_views = []
    for path in self.abspaths(fileids):
        wrapped_views.append(XMLCorpusView(path, post_path, self._wrap_elt))
    return concat(wrapped_views)
33
def posts(self, fileids=None):
    """
    Return one concatenated view over the
    ``Session/Posts/Post/terminals`` elements of every file resolved by
    ``self.abspaths(fileids)``, with ``self._elt_to_words`` passed to
    each ``XMLCorpusView`` as its third argument.
    """
    terminals_path = 'Session/Posts/Post/terminals'
    views = []
    for path in self.abspaths(fileids):
        views.append(XMLCorpusView(path, terminals_path, self._elt_to_words))
    return concat(views)
38
def tagged_posts(self, fileids=None, simplify_tags=False):
    """
    Return one concatenated view over the
    ``Session/Posts/Post/terminals`` elements of every file resolved by
    ``self.abspaths(fileids)``.

    Each view is given a small closure that forwards its two arguments
    to ``self._elt_to_tagged_words`` together with ``simplify_tags``, so
    the caller's flag reaches the element conversion step.
    """
    def tagged_reader(elt, handler):
        # Bind simplify_tags here; the view only supplies two arguments.
        return self._elt_to_tagged_words(elt, handler, simplify_tags)

    terminals_path = 'Session/Posts/Post/terminals'
    views = []
    for path in self.abspaths(fileids):
        views.append(XMLCorpusView(path, terminals_path, tagged_reader))
    return concat(views)
45
46 - def words(self, fileids=None):
48
51
54
58
60 tagged_post = [(self._simplify_username(t.attrib['word']),
61 t.attrib['pos']) for t in elt.findall('t')]
62 if simplify_tags:
63 tagged_post = [(w, self._tag_mapping_function(t))
64 for (w,t) in tagged_post]
65 return tagged_post
66
67 @staticmethod
69 if 'User' in word:
70 word = 'U' + word.split('User', 1)[1]
71 return word
72