Package nltk :: Package corpus :: Package reader :: Module switchboard
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.switchboard

  1  # Natural Language Toolkit: Switchboard Corpus Reader 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Edward Loper <edloper@gradient.cis.upenn.edu> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  import re 
  9   
 10  from nltk.tag import str2tuple 
 11   
 12  from util import * 
 13  from api import * 
 14   
class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, C{speaker} and C{id}, are provided to retrieve the
    speaker identifier and utterance id.  Note that utterance ids
    are only unique within a given discourse.
    """

    def __init__(self, words, speaker, id):
        """
        @param words: The words of the utterance; either plain strings
            or C{(word, tag)} tuples.
        @param speaker: The speaker identifier.
        @param id: The utterance id.  It is converted with C{int()},
            so a numeric string is accepted.
        """
        list.__init__(self, words)
        self.speaker = speaker
        self.id = int(id)

    def __repr__(self):
        """
        @return: A string of the form C{<speaker.id: 'text'>}, where
            the text is the space-joined words (rendered as
            C{word/tag} when the elements are tuples).
        """
        if not self:
            text = ''
        elif isinstance(self[0], tuple):
            # Tagged turn: only the first element is inspected to
            # decide that every element is a (word, tag) pair.
            text = ' '.join('%s/%s' % w for w in self)
        else:
            text = ' '.join(self)
        return '<%s.%s: %r>' % (self.speaker, self.id, text)

class SwitchboardCorpusReader(CorpusReader):
    """
    Corpus reader that exposes the C{'tagged'} file of the corpus as
    words, turns or discourses, each with or without part-of-speech
    tags.  Every accessor returns a C{StreamBackedCorpusView} over
    that file.

    Each non-empty line of the file is expected to look like
    C{speaker.id: word/tag word/tag ...} (see C{_UTTERANCE_RE} and
    C{_SEP}).
    """

    _FILES = ['tagged']
    # Use the "tagged" file even for non-tagged data methods, since
    # it's tokenized.

    def __init__(self, root, tag_mapping_function=None):
        """
        @param root: The root directory of the corpus; passed through
            to C{CorpusReader.__init__}.
        @param tag_mapping_function: Optional callable applied to each
            tag when a C{tagged_*} method is called with
            C{simplify_tags=True}.  When it is C{None}, tags are
            returned unchanged.
        """
        CorpusReader.__init__(self, root, self._FILES)
        self._tag_mapping_function = tag_mapping_function

    def words(self):
        """
        @return: A corpus view over the individual words of the corpus.
        """
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      self._words_block_reader)

    def tagged_words(self, simplify_tags=False):
        """
        @param simplify_tags: If true, pass each tag through the
            reader's tag mapping function (if one was supplied).
        @return: A corpus view over C{(word, tag)} tuples.
        """
        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, simplify_tags)
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      tagged_words_block_reader)

    def turns(self):
        """
        @return: A corpus view over L{SwitchboardTurn} objects whose
            elements are plain words.
        """
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      self._turns_block_reader)

    def tagged_turns(self, simplify_tags=False):
        """
        @param simplify_tags: If true, pass each tag through the
            reader's tag mapping function (if one was supplied).
        @return: A corpus view over L{SwitchboardTurn} objects whose
            elements are C{(word, tag)} tuples.
        """
        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, simplify_tags)
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      tagged_turns_block_reader)

    def discourses(self):
        """
        @return: A corpus view over discourses, each a list of
            untagged L{SwitchboardTurn} objects.
        """
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      self._discourses_block_reader)

    def tagged_discourses(self, simplify_tags=False):
        """
        @param simplify_tags: If true, pass each tag through the
            reader's tag mapping function (if one was supplied).
        @return: A corpus view over discourses, each a list of tagged
            L{SwitchboardTurn} objects.
        """
        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream,
                                                        simplify_tags)
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      tagged_discourses_block_reader)

    def _read_discourse(self, stream, include_tag, simplify_tags=False):
        """
        Read one block from C{stream} with C{read_blankline_block} and
        parse every non-blank line of it as an utterance.

        @return: The list of L{SwitchboardTurn} objects for the block.
        """
        return [self._parse_utterance(u, include_tag=include_tag,
                                      simplify_tags=simplify_tags)
                for b in read_blankline_block(stream)
                for u in b.split('\n') if u.strip()]

    def _discourses_block_reader(self, stream):
        # returns at most 1 discourse.  (The other methods depend on this.)
        return [self._read_discourse(stream, include_tag=False)]

    def _tagged_discourses_block_reader(self, stream, simplify_tags=False):
        # returns at most 1 discourse.  (The other methods depend on this.)
        return [self._read_discourse(stream, include_tag=True,
                                     simplify_tags=simplify_tags)]

    def _turns_block_reader(self, stream):
        # The single discourse read from the stream is itself the
        # list of turns.
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, simplify_tags=False):
        return self._tagged_discourses_block_reader(stream, simplify_tags)[0]

    def _words_block_reader(self, stream):
        # Flatten the turns into one plain list of words.  A nested
        # comprehension is linear, unlike sum(turns, []).
        discourse = self._discourses_block_reader(stream)[0]
        return [word for turn in discourse for word in turn]

    def _tagged_words_block_reader(self, stream, simplify_tags=False):
        discourse = self._tagged_discourses_block_reader(stream,
                                                         simplify_tags)[0]
        return [word for turn in discourse for word in turn]

    # Matches "<speaker>.<id>: <text>".  A raw string keeps the regex
    # escapes from being read as (invalid) string escapes.
    _UTTERANCE_RE = re.compile(r'(\w+)\.(\d+)\:\s*(.*)')
    _SEP = '/'

    def _parse_utterance(self, utterance, include_tag, simplify_tags=False):
        """
        Parse a single utterance line into a L{SwitchboardTurn}.

        @param utterance: A line of the form
            C{speaker.id: word/tag word/tag ...}.
        @param include_tag: If true, the turn contains C{(word, tag)}
            tuples; otherwise it contains only the words.
        @param simplify_tags: If true (and C{include_tag} is true),
            each tag is passed through the reader's tag mapping
            function.  If no mapping function was supplied the tags
            are left unchanged.
        @raise ValueError: If the line does not match
            C{_UTTERANCE_RE}.
        """
        m = self._UTTERANCE_RE.match(utterance)
        if m is None:
            raise ValueError('Bad utterance %r' % utterance)
        speaker, utt_id, text = m.groups()
        words = [str2tuple(s, self._SEP) for s in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif simplify_tags and self._tag_mapping_function is not None:
            map_tag = self._tag_mapping_function
            words = [(w, map_tag(t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, utt_id)
