| Home | Trees | Indices | Help |
|
|---|
|
|
# Natural Language Toolkit: Switchboard Corpus Reader
#
# Copyright (C) 2001-2011 NLTK Project
# Author: Edward Loper <edloper@gradient.cis.upenn.edu>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

# NOTE(review): this module was recovered from a rendered Epydoc source
# listing in which the code had been collapsed onto two physical lines and
# every ``class``/``def`` header line had been dropped.  The headers below
# are rebuilt from the surviving bodies and call sites.  Anything marked
# NOTE(review) had no surviving text and must be confirmed against the
# upstream NLTK source before this file is relied upon.

import re

from nltk.tag import str2tuple

from util import *
from api import *


class SwitchboardTurn(list):
    """
    A specialized list object used to encode switchboard utterances.
    The elements of the list are the words in the utterance; and two
    attributes, C{speaker} and C{id}, are provided to retrieve the
    speaker identifier and utterance id.  Note that utterance ids
    are only unique within a given discourse.
    """

    # NOTE(review): listing lines 23-34 (this class's body) were missing
    # from the extract.  The constructor is rebuilt from the docstring above
    # and from the call ``SwitchboardTurn(words, speaker, id)`` made by
    # ``_parse_utterance``.  The original body was longer than this
    # constructor, so other members may be missing -- confirm upstream.
    def __init__(self, words, speaker, id):
        """
        @param words: The words (or C{(word, tag)} tuples) of the utterance.
        @param speaker: The speaker identifier.
        @param id: The utterance id.
        """
        list.__init__(self, words)
        self.speaker = speaker
        # The utterance regexp only captures digits for the id, so it is
        # presumably stored as an int -- TODO confirm against upstream.
        self.id = int(id)


class SwitchboardCorpusReader(CorpusReader):
    """
    Corpus reader for the Switchboard corpus.  All access methods read
    the single file C{'tagged'} and parse one utterance per non-blank
    line.

    NOTE(review): the class header was missing from the extract; the
    base class is taken from the C{CorpusReader.__init__} call in the
    constructor and the class name from the module title.
    """

    # Use the "tagged" file even for non-tagged data methods, since
    # it's tokenized.
    _FILES = ['tagged']

    def __init__(self, root, tag_mapping_function=None):
        """
        @param root: The corpus root, passed through to
            C{CorpusReader.__init__}.
        @param tag_mapping_function: A callable applied to each tag when
            a tagged access method is called with C{simplify_tags} set.
            NOTE(review): the C{None} default is an assumption; the
            header line was missing from the extract.
        """
        CorpusReader.__init__(self, root, self._FILES)
        self._tag_mapping_function = tag_mapping_function

    # NOTE(review): listing lines 45-47 had no surviving text.  Rebuilt by
    # analogy with tagged_words(); confirm against upstream.
    def words(self):
        """
        @return: A corpus view over the C{'tagged'} file, read with
            C{_words_block_reader}.
        """
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      self._words_block_reader)

    def tagged_words(self, simplify_tags=False):
        """
        @param simplify_tags: If true, pass every tag through the
            reader's tag mapping function.
        @return: A corpus view over the C{'tagged'} file, read with
            C{_tagged_words_block_reader}.
        """
        # The view calls its block reader with the stream only, so close
        # over simplify_tags here.
        def tagged_words_block_reader(stream):
            return self._tagged_words_block_reader(stream, simplify_tags)
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      tagged_words_block_reader)

    # NOTE(review): listing lines 55-57 had no surviving text.  Rebuilt by
    # analogy with tagged_turns(); confirm against upstream.
    def turns(self):
        """
        @return: A corpus view over the C{'tagged'} file, read with
            C{_turns_block_reader}.
        """
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      self._turns_block_reader)

    def tagged_turns(self, simplify_tags=False):
        """
        @param simplify_tags: If true, pass every tag through the
            reader's tag mapping function.
        @return: A corpus view over the C{'tagged'} file, read with
            C{_tagged_turns_block_reader}.
        """
        def tagged_turns_block_reader(stream):
            return self._tagged_turns_block_reader(stream, simplify_tags)
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      tagged_turns_block_reader)

    # NOTE(review): listing lines 65-67 had no surviving text.  Rebuilt by
    # analogy with tagged_discourses(); confirm against upstream.
    def discourses(self):
        """
        @return: A corpus view over the C{'tagged'} file, read with
            C{_discourses_block_reader}.
        """
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      self._discourses_block_reader)

    def tagged_discourses(self, simplify_tags=False):
        """
        @param simplify_tags: If true, pass every tag through the
            reader's tag mapping function.
        @return: A corpus view over the C{'tagged'} file, read with
            C{_tagged_discourses_block_reader}.
        """
        def tagged_discourses_block_reader(stream):
            return self._tagged_discourses_block_reader(stream,
                                                        simplify_tags)
        return StreamBackedCorpusView(self.abspath('tagged'),
                                      tagged_discourses_block_reader)

    def _discourses_block_reader(self, stream):
        """
        Read one blank-line-delimited block from C{stream} and parse each
        non-blank line of it as an untagged utterance.

        @return: A one-element list holding the list of parsed turns.
        """
        # returns at most 1 discourse.  (The other methods depend on this.)
        return [[self._parse_utterance(u, include_tag=False)
                 for b in read_blankline_block(stream)
                 for u in b.split('\n') if u.strip()]]

    def _tagged_discourses_block_reader(self, stream, simplify_tags=False):
        """
        Tagged counterpart of C{_discourses_block_reader}: every word is
        kept as a C{(word, tag)} tuple.

        @return: A one-element list holding the list of parsed turns.
        """
        # returns at most 1 discourse.  (The other methods depend on this.)
        return [[self._parse_utterance(u, include_tag=True,
                                       simplify_tags=simplify_tags)
                 for b in read_blankline_block(stream)
                 for u in b.split('\n') if u.strip()]]

    # NOTE(review): the names of the next three helpers were not in the
    # extract.  _tagged_turns_block_reader is fixed by its call site in
    # tagged_turns(); the other two are named by analogy.
    def _turns_block_reader(self, stream):
        """
        @return: The turns of the single discourse read from C{stream}.
        """
        return self._discourses_block_reader(stream)[0]

    def _tagged_turns_block_reader(self, stream, simplify_tags=False):
        """
        @return: The tagged turns of the single discourse read from
            C{stream}.
        """
        return self._tagged_discourses_block_reader(stream, simplify_tags)[0]

    def _words_block_reader(self, stream):
        """
        @return: A flat list of the words of every turn in the single
            discourse read from C{stream}.
        """
        # Flatten in one pass; sum(turns, []) copies the accumulator on
        # every addition and is quadratic in the number of words.
        turns = self._discourses_block_reader(stream)[0]
        return [word for turn in turns for word in turn]

    # NOTE(review): listing lines 97-99 had no surviving text.  This helper
    # is required by tagged_words(); the body is rebuilt by analogy with
    # _words_block_reader().  Confirm against upstream.
    def _tagged_words_block_reader(self, stream, simplify_tags=False):
        """
        @return: A flat list of the C{(word, tag)} tuples of every turn in
            the single discourse read from C{stream}.
        """
        turns = self._tagged_discourses_block_reader(stream,
                                                     simplify_tags)[0]
        return [word for turn in turns for word in turn]

    # "<speaker>.<id>: <text>".  Raw string so the regexp escapes are not
    # read as (invalid) string escape sequences.
    _UTTERANCE_RE = re.compile(r'(\w+)\.(\d+)\:\s*(.*)')
    _SEP = '/'

    def _parse_utterance(self, utterance, include_tag, simplify_tags=False):
        """
        Parse one line of the C{'tagged'} file into a L{SwitchboardTurn}.

        @param utterance: A line matching C{_UTTERANCE_RE}.
        @param include_tag: If false, keep only the word of each token;
            if true, keep C{(word, tag)} tuples.
        @param simplify_tags: If true (and C{include_tag} is true), map
            each tag through the reader's tag mapping function.
        @return: The parsed turn.
        @raise ValueError: If C{utterance} does not match
            C{_UTTERANCE_RE}.
        """
        m = self._UTTERANCE_RE.match(utterance)
        if m is None:
            raise ValueError('Bad utterance %r' % utterance)
        speaker, utt_id, text = m.groups()
        # Each whitespace-separated token is split on _SEP by str2tuple.
        words = [str2tuple(s, self._SEP) for s in text.split()]
        if not include_tag:
            words = [w for (w, t) in words]
        elif simplify_tags:
            words = [(w, self._tag_mapping_function(t)) for (w, t) in words]
        return SwitchboardTurn(words, speaker, utt_id)
| Home | Trees | Indices | Help |
|
|---|
| Generated by Epydoc 3.0.1 on Mon Apr 11 14:40:13 2011 | http://epydoc.sourceforge.net |