Package nltk :: Package corpus :: Package reader :: Module cmudict
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.cmudict

 1  # Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader 
 2  # 
 3  # Copyright (C) 2001-2011 NLTK Project 
 4  # Author: Steven Bird <sb@ldc.upenn.edu> 
 5  # URL: <http://www.nltk.org/> 
 6  # For license information, see LICENSE.TXT 
 7   
 8  """ 
 9  The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6] 
10  ftp://ftp.cs.cmu.edu/project/speech/dict/ 
11  Copyright 1998 Carnegie Mellon University 
12   
13  File Format: Each line consists of an uppercased word, a counter 
14  (for alternative pronunciations), and a transcription.  Vowels are 
15  marked for stress (1=primary, 2=secondary, 0=no stress).  E.g.: 
16  NATURAL 1 N AE1 CH ER0 AH0 L 
17   
18  The dictionary contains 127069 entries.  Of these, 119400 words are assigned 
19  a unique pronunciation, 6830 words have two pronunciations, and 839 words have 
20  three or more pronunciations.  Many of these are fast-speech variants. 
21   
22  Phonemes: There are 39 phonemes, as shown below: 
23       
24  Phoneme Example Translation    Phoneme Example Translation 
25  ------- ------- -----------    ------- ------- ----------- 
26  AA      odd     AA D           AE      at      AE T 
27  AH      hut     HH AH T        AO      ought   AO T 
28  AW      cow     K AW           AY      hide    HH AY D 
29  B       be      B IY           CH      cheese  CH IY Z 
30  D       dee     D IY           DH      thee    DH IY 
31  EH      Ed      EH D           ER      hurt    HH ER T 
32  EY      ate     EY T           F       fee     F IY 
33  G       green   G R IY N       HH      he      HH IY 
34  IH      it      IH T           IY      eat     IY T 
35  JH      gee     JH IY          K       key     K IY 
36  L       lee     L IY           M       me      M IY 
37  N       knee    N IY           NG      ping    P IH NG 
38  OW      oat     OW T           OY      toy     T OY 
39  P       pee     P IY           R       read    R IY D 
40  S       sea     S IY           SH      she     SH IY 
41  T       tea     T IY           TH      theta   TH EY T AH 
42  UH      hood    HH UH D        UW      two     T UW 
43  V       vee     V IY           W       we      W IY 
44  Y       yield   Y IY L D       Z       zee     Z IY 
45  ZH      seizure S IY ZH ER 
46  """ 
47   
48  import codecs 
49   
50  from nltk.util import Index 
51   
52  from util import * 
53  from api import * 
54   
class CMUDictCorpusReader(CorpusReader):
    """
    Corpus reader for the Carnegie Mellon Pronouncing Dictionary.

    Every accessor is built on top of L{entries()}, which parses the
    corpus files with C{read_cmudict_block}.
    """

    def entries(self):
        """
        @return: the cmudict lexicon as a list of entries
            containing (word, transcriptions) tuples.
        """
        # One corpus view per (file, encoding) pair; the views are then
        # concatenated into a single sequence.
        views = [StreamBackedCorpusView(fileid, read_cmudict_block,
                                        encoding=enc)
                 for fileid, enc in self.abspaths(None, True)]
        return concat(views)

    def raw(self, fileids=None):
        """
        @param fileids: a single file identifier, a list of file
            identifiers, or C{None} (the default) to use all of this
            reader's files.
        @return: the cmudict lexicon as a raw string.
        """
        # `fileids` used to be read here without ever being bound, so
        # every call raised UnboundLocalError.  It is now an optional
        # parameter; calling raw() with no arguments still works.
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, basestring):
            fileids = [fileids]
        return concat([self.open(f).read() for f in fileids])

    def words(self):
        """
        @return: a list of all words defined in the cmudict lexicon.
            There is one item per entry, so a word that has several
            entries appears once for each of them.
        """
        return [word.lower() for (word, _) in self.entries()]

    def dict(self):
        """
        @return: the cmudict lexicon as a dictionary, whose keys are
            lowercase words and whose values are lists of pronunciations.
        """
        # Inside a method body the name `dict` resolves to the builtin,
        # not to this method: class-level names are not part of a
        # method's enclosing scope.
        return dict(Index(self.entries()))
85
def read_cmudict_block(stream):
    """
    Read up to 100 entries from C{stream}, one entry per non-blank line.

    Each line is split on whitespace.  The first field is the word, the
    second field (the pronunciation counter) is discarded, and the
    remaining fields form the transcription.  Blank or whitespace-only
    lines are skipped.

    @param stream: an open text stream; only its C{readline()} method
        is used.
    @return: a list of (word, transcription) tuples, where the word is
        lowercased and the transcription is a list of strings.  The
        list holds fewer than 100 items (possibly none) only when the
        end of the stream was reached.
    """
    entries = []
    while len(entries) < 100:  # Read 100 at a time.
        # NOTE(review): readline() is kept instead of iterating over the
        # stream -- presumably the caller relies on the stream position
        # after each block; confirm against StreamBackedCorpusView.
        line = stream.readline()
        if line == '':
            return entries  # end of file.
        pieces = line.split()
        if not pieces:
            # Blank line: nothing to index.  Without this guard,
            # pieces[0] below would raise IndexError.
            continue
        entries.append((pieces[0].lower(), pieces[2:]))
    return entries
94