1
2
3
4
5
6
7
8
9
10
11 """
12 Read tokens, phonemes and audio data from the NLTK TIMIT Corpus.
13
14 This corpus contains a selected portion of the TIMIT corpus.
15
16 - 16 speakers from 8 dialect regions
17 - 1 male and 1 female from each dialect region
18 - total 130 sentences (10 sentences per speaker. Note that some
19 sentences are shared among other speakers, especially sa1 and sa2
20 are spoken by all speakers.)
21 - total 160 recordings of sentences (10 recordings per speaker)
22 - audio format: NIST Sphere, single channel, 16kHz sampling,
23 16 bit sample, PCM encoding
24
25
26 Module contents
27 ===============
28
29 The timit corpus reader provides 4 functions and 4 data items.
30
31 - utterances
32
33 List of utterances in the corpus. There are total 160 utterances,
34 each of which corresponds to a unique utterance of a speaker.
35 Here's an example of an utterance identifier in the list::
36
37 dr1-fvmh0/sx206
38 - _---- _---
39 | | | | |
40 | | | | |
41 | | | | `--- sentence number
42 | | | `----- sentence type (a:all, i:shared, x:exclusive)
43 | | `--------- speaker ID
44 | `------------ sex (m:male, f:female)
45 `-------------- dialect region (1..8)
46
47 - speakers
48
49 List of speaker IDs. An example of speaker ID::
50
51 dr1-fvmh0
52
53 Note that if you split an item ID on the slash ('/') and take the first
54 element of the result, you will get a speaker ID.
55
56 >>> itemid = 'dr1-fvmh0/sx206'
57 >>> spkrid,sentid = itemid.split('/')
58 >>> spkrid
59 'dr1-fvmh0'
60
61 The second element of the result is a sentence ID.
62
63 - dictionary()
64
65 Phonetic dictionary of words contained in this corpus. This is a Python
66 dictionary from words to phoneme lists.
67
68 - spkrinfo()
69
70 Speaker information table. It's a Python dictionary from speaker IDs to
71 records of 10 fields. Speaker IDs are the same as the ones in timit.speakers.
72 Each record is a dictionary from field names to values, and the fields are
73 as follows::
74
75 id speaker ID as defined in the original TIMIT speaker info table
76 sex speaker gender (M:male, F:female)
77 dr speaker dialect region (1:new england, 2:northern,
78 3:north midland, 4:south midland, 5:southern, 6:new york city,
79 7:western, 8:army brat (moved around))
80 use corpus type (TRN:training, TST:test)
81 in this sample corpus only TRN is available
82 recdate recording date
83 birthdate speaker birth date
84 ht speaker height
85 race speaker race (WHT:white, BLK:black, AMR:american indian,
86 SPN:spanish-american, ORN:oriental,???:unknown)
87 edu speaker education level (HS:high school, AS:associate degree,
88 BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA),
89 PHD:doctorate degree (PhD,JD,MD), ??:unknown)
90 comments comments by the recorder
91
92 The 4 functions are as follows.
93
94 - tokenized(sentences=items, offset=False)
95
96 Given a list of items, returns an iterator of a list of word lists,
97 each of which corresponds to an item (sentence). If offset is set to True,
98 each element of the word list is a tuple of word(string), start offset and
99 end offset, where offset is represented as a number of 16kHz samples.
100
101 - phonetic(sentences=items, offset=False)
102
103 Given a list of items, returns an iterator of a list of phoneme lists,
104 each of which corresponds to an item (sentence). If offset is set to True,
105 each element of the phoneme list is a tuple of phoneme(string), start offset
106 and end offset, where offset is represented as a number of 16kHz samples.
107
108 - audiodata(item, start=0, end=None)
109
110 Given an item, returns a chunk of audio samples formatted into a string.
111 When the function is called, if start and end are omitted, the entire
112 samples of the recording will be returned. If only end is omitted,
113 samples from the start offset to the end of the recording will be returned.
114
115 - play(data)
116
117 Play the given audio samples. The audio samples can be obtained from the
118 timit.audiodata function.
119
120 """
121
122 import sys
123 import os
124 import re
125 import tempfile
126 import time
127
128 from nltk.tree import Tree
129 from nltk.internals import import_from_stdlib
130
131 from util import *
132 from api import *
133
135 """
136 Reader for the TIMIT corpus (or any other corpus with the same
137 file layout and use of file formats). The corpus root directory
138 should contain the following files:
139
140 - timitdic.txt: dictionary of standard transcriptions
141 - spkrinfo.txt: table of speaker information
142
143 In addition, the root directory should contain one subdirectory
144 for each speaker, containing four files for each utterance:
145
146 - <utterance-id>.txt: text content of utterances
147 - <utterance-id>.wrd: tokenized text content of utterances
148 - <utterance-id>.phn: phonetic transcription of utterances
149 - <utterance-id>.wav: utterance sound file
150 """
151
152 _FILE_RE = (r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' +
153 r'timitdic\.txt|spkrinfo\.txt')
154 """A regexp matching fileids that are used by this corpus reader."""
155 _UTTERANCE_RE = r'\w+-\w+/\w+\.txt'
156
157 - def __init__(self, root, encoding=None):
178
180 """
181 Return a list of file identifiers for the files that make up
182 this corpus.
183
184 @param filetype: If specified, then C{filetype} indicates that
185 only the files that have the given type should be
186 returned. Accepted values are: C{txt}, C{wrd}, C{phn},
187 C{wav}, or C{metadata}.
188 """
189 if filetype is None:
190 return CorpusReader.fileids(self)
191 elif filetype in ('txt', 'wrd', 'phn', 'wav'):
192 return ['%s.%s' % (u, filetype) for u in self._utterances]
193 elif filetype == 'metadata':
194 return ['timitdic.txt', 'spkrinfo.txt']
195 else:
196 raise ValueError('Bad value for filetype: %r' % filetype)
197
def utteranceids(self, dialect=None, sex=None, spkrid=None,
                 sent_type=None, sentid=None):
    """
    @return: A list of the utterance identifiers for all
        utterances in this corpus, or for the given speaker, dialect
        region, gender, sentence type, or sentence id, if
        specified.  Every filter that is given must match, so the
        filters narrow the result cumulatively.  The order of
        C{self._utterances} is preserved.

    Each filter may be a single string or a collection of strings.
    Utterance identifiers are matched positionally, using the layout
    C{dr1-fvmh0/sx206}.

    @param dialect: Dialect region character(s), compared with
        character 2 of the identifier (C{'1'} in the example).
    @param sex: Sex character(s), compared with character 4 of the
        identifier (C{'f'} in the example).
    @param spkrid: Speaker id(s), compared with the first nine
        characters of the identifier (C{'dr1-fvmh0'}).
    @param sent_type: Sentence type character(s), compared with
        character 11 of the identifier (C{'x'} in the example).
    @param sentid: Sentence id(s), compared with everything after
        the slash (C{'sx206'} in the example).
    """
    # Python 2 names the common string base class ``basestring``;
    # fall back to ``str`` on interpreters that do not define it.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # A bare string means "exactly this value": wrap it so that the
    # ``in`` tests below do list membership, not substring matching.
    if isinstance(dialect, string_types):
        dialect = [dialect]
    if isinstance(sex, string_types):
        sex = [sex]
    if isinstance(spkrid, string_types):
        spkrid = [spkrid]
    if isinstance(sent_type, string_types):
        sent_type = [sent_type]
    if isinstance(sentid, string_types):
        sentid = [sentid]

    # Work on a copy so the reader's own list is never mutated.
    utterances = self._utterances[:]
    if dialect is not None:
        utterances = [u for u in utterances if u[2] in dialect]
    if sex is not None:
        utterances = [u for u in utterances if u[4] in sex]
    if spkrid is not None:
        utterances = [u for u in utterances if u[:9] in spkrid]
    if sent_type is not None:
        utterances = [u for u in utterances if u[11] in sent_type]
    if sentid is not None:
        # Compare against sentid itself (not spkrid): the sentence id
        # is the part of the identifier that follows the slash.
        utterances = [u for u in utterances if u[10:] in sentid]
    return utterances
224
226 """
227 @return: A dictionary giving the 'standard' transcription for
228 each word.
229 """
230 _transcriptions = {}
231 for line in self.open('timitdic.txt'):
232 if not line.strip() or line[0] == ';': continue
233 m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line)
234 if not m: raise ValueError('Bad line: %r' % line)
235 _transcriptions[m.group(1)] = m.group(2).split()
236 return _transcriptions
237
240
243
246
248 """
249 @return: A list of all utterances associated with a given
250 speaker.
251 """
252 return [utterance for utterance in self._utterances
253 if utterance.startswith(speaker+'/')]
254
256 """
257 @return: The C{SpeakerInfo} record for the given speaker; an utterance id may be given instead, in which case its speaker is looked up.
258 """
259 if speaker in self._utterances:
260 speaker = self.spkrid(speaker)
261
262 if self._speakerinfo is None:
263 self._speakerinfo = {}
264 for line in self.open('spkrinfo.txt'):
265 if not line.strip() or line[0] == ';': continue
266 rec = line.strip().split(None, 9)
267 key = "dr%s-%s%s" % (rec[2],rec[1].lower(),rec[0].lower())
268 self._speakerinfo[key] = SpeakerInfo(*rec)
269
270 return self._speakerinfo[speaker]
271
272 - def phones(self, utterances=None):
276
284
285 - def words(self, utterances=None):
289
294
295 - def sents(self, utterances=None):
299
305
330
331
332
333
def wav(self, utterance, start=0, end=None):
    """
    Return the audio for an utterance, packaged as the contents of a
    wave file.

    @param utterance: The utterance id; C{'.wav'} is appended to it
        to form the name of the file that is opened.
    @param start: Number of frames to skip at the beginning of the
        recording.
    @param end: Frame index at which to stop; C{None} means the end
        of the recording.
    @return: The contents of a wave file that has the same
        parameters as the source recording and holds the frames
        from C{start} up to C{end}.
    """
    wave = import_from_stdlib('wave')

    w = wave.open(self.open(utterance+'.wav'), 'rb')

    # Wave_read objects have no read() method, so the whole-recording
    # case is handled as the slice that runs to the last frame.  This
    # also makes end=None valid together with a non-zero start.
    if end is None:
        end = w.getnframes()

    # Skip the frames before start, then read the ones requested.
    w.readframes(start)
    frames = w.readframes(end-start)
    params = w.getparams()

    # Wrap the selected frames in a wave container by writing them to
    # a temporary file and reading the finished file back.
    tf = tempfile.TemporaryFile()
    try:
        out = wave.open(tf, 'w')
        out.setparams(params)
        out.writeframes(frames)
        # Closing the writer finalizes the header; it does not close
        # tf, because the wave module did not open that file itself.
        out.close()

        tf.seek(0)
        return tf.read()
    finally:
        tf.close()
362
363 - def audiodata(self, utterance, start=0, end=None):
371
373 if utterances is None: utterances = self._utterances
374 if isinstance(utterances, basestring): utterances = [utterances]
375 return ['%s%s' % (u, extension) for u in utterances]
376
def play(self, utterance, start=0, end=None):
    """
    Play the given audio sample.

    Playback is attempted with C{ossaudiodev} first and with
    C{pygame} second; whichever module can be imported first is
    used.  If neither can be imported, a message is written to
    C{sys.stderr}.

    @param utterance: The utterance id of the sample to play
    @param start: Passed through to C{audiodata} / C{wav}.
    @param end: Passed through to C{audiodata} / C{wav}.
    """
    # Method 1: ossaudiodev.
    try:
        import ossaudiodev
        try:
            dsp = ossaudiodev.open('w')
            try:
                dsp.setfmt(ossaudiodev.AFMT_S16_LE)
                dsp.channels(1)
                dsp.speed(16000)
                dsp.write(self.audiodata(utterance, start, end))
            finally:
                # Release the device even when configuring it or
                # writing to it fails.
                dsp.close()
        except IOError:
            # sys.exc_info() fetches the exception without relying on
            # either the "except X, e" or the "except X as e" spelling.
            e = sys.exc_info()[1]
            sys.stderr.write("can't acquire the audio device; please "
                             "activate your audio device.\n")
            sys.stderr.write("system error message: " + str(e) + "\n")
        # ossaudiodev is available, so it has handled (or reported on)
        # this request; do not try pygame as well.
        return
    except ImportError:
        pass

    # Method 2: pygame.
    try:
        import pygame.mixer
        import StringIO
        pygame.mixer.init(16000)
        f = StringIO.StringIO(self.wav(utterance, start, end))
        pygame.mixer.Sound(f).play()
        # Block until the mixer has finished playing.
        while pygame.mixer.get_busy():
            time.sleep(0.01)
        return
    except ImportError:
        pass

    # Neither playback module could be imported.
    sys.stderr.write("you must install pygame or ossaudiodev "
                     "for audio playback.\n")
416
417
def __init__(self, id, sex, dr, use, recdate, birthdate,
             ht, race, edu, comments=None):
    """
    Store the fields of one speaker record as attributes of the same
    names.

    @param id: speaker ID
    @param sex: speaker sex
    @param dr: speaker dialect region
    @param use: corpus type
    @param recdate: recording date
    @param birthdate: speaker birth date
    @param ht: speaker height
    @param race: speaker race
    @param edu: speaker education level
    @param comments: comments for the record; C{None} when absent
    """
    # Attribute name / value pairs, in the order of the parameters.
    record = (
        ('id', id),
        ('sex', sex),
        ('dr', dr),
        ('use', use),
        ('recdate', recdate),
        ('birthdate', birthdate),
        ('ht', ht),
        ('race', race),
        ('edu', edu),
        ('comments', comments),
    )
    for field, value in record:
        setattr(self, field, value)
431
433 attribs = 'id sex dr use recdate birthdate ht race edu comments'
434 args = ['%s=%r' % (attr, getattr(self, attr))
435 for attr in attribs.split()]
436 return 'SpeakerInfo(%s)' % (', '.join(args))
437
439 """
440 Block reader for timit tagged sentences, which are preceded by a sentence
441 number that will be ignored.
442 """
443 line = stream.readline()
444 if not line: return []
445 n, sent = line.split(' ', 1)
446 return [sent]
447