Package nltk :: Package corpus :: Package reader :: Module ycoe
[hide private]
[frames | no frames]

Source Code for Module nltk.corpus.reader.ycoe

  1  # -*- coding: iso-8859-1 -*- 
  2   
  3  # Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE) 
  4  # 
  5  # Copyright (C) 2001-2011 NLTK Project 
  6  # Author: Selina Dennis <selina@tranzfusion.net> 
  7  # URL: <http://www.nltk.org/> 
  8  # For license information, see LICENSE.TXT 
  9   
 10  """ 
 11  Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old 
 12  English Prose (YCOE), a 1.5 million word syntactically-annotated 
 13  corpus of Old English prose texts. The corpus is distributed by the 
 14  Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included 
 15  with NLTK. 
 16   
 17  The YCOE corpus is divided into 100 files, each representing 
 18  an Old English prose text. The tags used within each text comply 
 19  with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm 
 20  """ 
 21   
 22  import os 
 23  import re 
 24   
 25  from nltk.tokenize import RegexpTokenizer 
 26  from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader 
 27  from nltk.corpus.reader.tagged import TaggedCorpusReader 
 28  from string import split 
 29   
 30  from util import * 
 31  from api import * 
 32   
class YCOECorpusReader(CorpusReader):
    """
    Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
    English Prose (YCOE), a 1.5 million word syntactically-annotated
    corpus of Old English prose texts.

    The reader wraps two sub-readers: one over the ``psd`` subdirectory
    of the corpus root (parsed files) and one over the ``pos``
    subdirectory (tagged files).  A *document identifier* is a file
    identifier with its four-character ``.psd`` / ``.pos`` extension
    removed; most accessors take document identifiers.
    """

    def __init__(self, root, encoding=None):
        """
        Build the two sub-readers and check that they cover the same
        set of documents.

        :param root: The corpus root; it must contain ``psd`` and ``pos``
            subdirectories.
        :param encoding: Encoding handed to the base reader and to the
            sub-readers.
        :raise ValueError: If the documents found under ``psd`` and
            ``pos`` are not the same.
        """
        # Initialise the base reader with an empty file list first:
        # self.root is needed below to locate the two subdirectories.
        CorpusReader.__init__(self, root, [], encoding)

        self._psd_reader = YCOEParseCorpusReader(
            self.root.join('psd'), '.*', '.psd', encoding=encoding)
        # The third positional parameter of YCOETaggedCorpusReader is
        # ``encoding``, so the encoding is passed by keyword rather than
        # slipping the '.pos' extension into that slot.
        self._pos_reader = YCOETaggedCorpusReader(
            self.root.join('pos'), '.*', encoding=encoding)

        # Make sure we have a consistent set of items:
        documents = set(f[:-4] for f in self._psd_reader.fileids())
        if set(f[:-4] for f in self._pos_reader.fileids()) != documents:
            raise ValueError('Items in "psd" and "pos" '
                             'subdirectories do not match.')

        fileids = sorted(['%s.psd' % doc for doc in documents] +
                         ['%s.pos' % doc for doc in documents])
        # Re-initialise the base reader now that the file list is known.
        CorpusReader.__init__(self, root, fileids, encoding)
        self._documents = sorted(documents)

    def documents(self, fileids=None):
        """
        Return a list of document identifiers for all documents in
        this corpus, or for the documents with the given file(s) if
        specified.

        :param fileids: A single file identifier, a collection of file
            identifiers, or None for the whole corpus.
        :raise KeyError: If any given file identifier is not part of
            this corpus.
        """
        if fileids is None:
            return self._documents
        if isinstance(fileids, basestring):
            fileids = [fileids]
        for f in fileids:
            if f not in self._fileids:
                # Report the offending identifier, not the whole argument.
                raise KeyError('File id %s not found' % (f,))
        # Strip off the '.pos' and '.psd' extensions.
        return sorted(set(f[:-4] for f in fileids))

    def fileids(self, documents=None):
        """
        Return a list of file identifiers for the files that make up
        this corpus, or that store the given document(s) if specified.

        :param documents: A single document identifier, a collection of
            document identifiers, or None for the whole corpus.
        """
        if documents is None:
            return self._fileids
        elif isinstance(documents, basestring):
            documents = [documents]
        return sorted(set(['%s.pos' % doc for doc in documents] +
                          ['%s.psd' % doc for doc in documents]))

    def _getfileids(self, documents, subcorpus):
        """
        Helper that selects the appropriate fileids for a given set of
        documents from a given subcorpus (pos or psd).

        :param documents: A single document identifier, a collection of
            document identifiers, or None for all documents.
        :param subcorpus: The extension to append, ``'pos'`` or ``'psd'``.
        :raise ValueError: If a given identifier is not a known document
            identifier (with a dedicated message when it looks like a
            file identifier).
        """
        if documents is None:
            documents = self._documents
        else:
            if isinstance(documents, basestring):
                documents = [documents]
            for document in documents:
                if document not in self._documents:
                    if document[-4:] in ('.pos', '.psd'):
                        raise ValueError(
                            'Expected a document identifier, not a file '
                            'identifier.  (Use corpus.documents() to get '
                            'a list of document identifiers.)')
                    else:
                        raise ValueError('Document identifier %s not found'
                                         % document)
        return ['%s.%s' % (d, subcorpus) for d in documents]

    # Delegate to one of our two sub-readers:
    def words(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``words()``."""
        return self._pos_reader.words(self._getfileids(documents, 'pos'))

    def sents(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``sents()``."""
        return self._pos_reader.sents(self._getfileids(documents, 'pos'))

    def paras(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``paras()``."""
        return self._pos_reader.paras(self._getfileids(documents, 'pos'))

    def tagged_words(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``tagged_words()``."""
        return self._pos_reader.tagged_words(self._getfileids(documents, 'pos'))

    def tagged_sents(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``tagged_sents()``."""
        return self._pos_reader.tagged_sents(self._getfileids(documents, 'pos'))

    def tagged_paras(self, documents=None):
        """Delegate to the ``pos`` sub-reader's ``tagged_paras()``."""
        return self._pos_reader.tagged_paras(self._getfileids(documents, 'pos'))

    def parsed_sents(self, documents=None):
        """Delegate to the ``psd`` sub-reader's ``parsed_sents()``."""
        return self._psd_reader.parsed_sents(self._getfileids(documents, 'psd'))

123 124
class YCOEParseCorpusReader(BracketParseCorpusReader):
    """Bracket parse corpus reader specialised for YCOE files.

    It behaves like the standard bracket parse corpus reader, except
    that (CODE ...) and (ID ...) nodes are stripped out before parsing.
    """

    def _parse(self, t):
        """Parse the bracketed text ``t`` after removing CODE/ID nodes.

        Returns None when only an empty pair of parentheses remains once
        the CODE and ID nodes have been removed; otherwise returns
        whatever the base class parser produces for the cleaned text.
        """
        cleaned = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t)
        only_empty_parens = re.match(r'\s*\(\s*\)\s*$', cleaned)
        if only_empty_parens:
            return None
        return BracketParseCorpusReader._parse(self, cleaned)

132
class YCOETaggedCorpusReader(TaggedCorpusReader):
    """Tagged corpus reader configured for the YCOE ``.pos`` files.

    Sentences are delimited by a regular expression used in "gaps" mode:
    whitespace that follows ``/.``, and any token ending in ``_CODE`` or
    ``_ID`` (with its surrounding whitespace), are treated as separators.
    """

    def __init__(self, root, items, encoding=None):
        """
        :param root: Root directory handed to ``TaggedCorpusReader``.
        :param items: File identifiers (or a pattern) handed to
            ``TaggedCorpusReader``.
        :param encoding: Accepted for interface compatibility.
            NOTE(review): this value is not forwarded to
            ``TaggedCorpusReader.__init__``, so it currently has no
            effect.  Confirm how callers use the third positional
            argument before forwarding it.
        """
        gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*'
        sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True)
        # sep='_': presumably the word/tag separator in the .pos files.
        TaggedCorpusReader.__init__(self, root, items, sep='_',
                                    sent_tokenizer=sent_tokenizer)


#: A list of all documents and their titles in ycoe.
documents = {
    'coadrian.o34': 'Adrian and Ritheus',
    'coaelhom.o3': 'Ælfric, Supplemental Homilies',
    'coaelive.o3': 'Ælfric\'s Lives of Saints',
    'coalcuin': 'Alcuin De virtutibus et vitiis',
    'coalex.o23': 'Alexander\'s Letter to Aristotle',
    'coapollo.o3': 'Apollonius of Tyre',
    'coaugust': 'Augustine',
    'cobede.o2': 'Bede\'s History of the English Church',
    'cobenrul.o3': 'Benedictine Rule',
    'coblick.o23': 'Blickling Homilies',
    'coboeth.o2': 'Boethius\' Consolation of Philosophy',
    'cobyrhtf.o3': 'Byrhtferth\'s Manual',
    'cocanedgD': 'Canons of Edgar (D)',
    'cocanedgX': 'Canons of Edgar (X)',
    'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
    'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
    'cochad.o24': 'Saint Chad',
    'cochdrul': 'Chrodegang of Metz, Rule',
    'cochristoph': 'Saint Christopher',
    'cochronA.o23': 'Anglo-Saxon Chronicle A',
    'cochronC': 'Anglo-Saxon Chronicle C',
    'cochronD': 'Anglo-Saxon Chronicle D',
    'cochronE.o34': 'Anglo-Saxon Chronicle E',
    'cocura.o2': 'Cura Pastoralis',
    'cocuraC': 'Cura Pastoralis (Cotton)',
    'codicts.o34': 'Dicts of Cato',
    'codocu1.o1': 'Documents 1 (O1)',
    'codocu2.o12': 'Documents 2 (O1/O2)',
    'codocu2.o2': 'Documents 2 (O2)',
    'codocu3.o23': 'Documents 3 (O2/O3)',
    'codocu3.o3': 'Documents 3 (O3)',
    'codocu4.o24': 'Documents 4 (O2/O4)',
    'coeluc1': 'Honorius of Autun, Elucidarium 1',
    # Was a copy of the coeluc1 title; this is the second part.
    'coeluc2': 'Honorius of Autun, Elucidarium 2',
    'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
    'coeuphr': 'Saint Euphrosyne',
    'coeust': 'Saint Eustace and his companions',
    'coexodusP': 'Exodus (P)',
    'cogenesiC': 'Genesis (C)',
    'cogregdC.o24': 'Gregory\'s Dialogues (C)',
    'cogregdH.o23': 'Gregory\'s Dialogues (H)',
    'coherbar': 'Pseudo-Apuleius, Herbarium',
    'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)',
    'coinspolX': 'Wulfstan\'s Institute of Polity (X)',
    'cojames': 'Saint James',
    'colacnu.o23': 'Lacnunga',
    'colaece.o2': 'Leechdoms',
    'colaw1cn.o3': 'Laws, Cnut I',
    'colaw2cn.o3': 'Laws, Cnut II',
    'colaw5atr.o3': 'Laws, Æthelred V',
    'colaw6atr.o3': 'Laws, Æthelred VI',
    'colawaf.o2': 'Laws, Alfred',
    'colawafint.o2': 'Alfred\'s Introduction to Laws',
    'colawger.o34': 'Laws, Gerefa',
    'colawine.ox2': 'Laws, Ine',
    'colawnorthu.o3': 'Northumbra Preosta Lagu',
    'colawwllad.o4': 'Laws, William I, Lad',
    'coleofri.o4': 'Leofric',
    'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
    'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
    'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
    'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
    'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
    'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
    'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
    'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
    'comargaC.o34': 'Saint Margaret (C)',
    'comargaT': 'Saint Margaret (T)',
    'comart1': 'Martyrology, I',
    'comart2': 'Martyrology, II',
    'comart3.o23': 'Martyrology, III',
    'comarvel.o23': 'Marvels of the East',
    'comary': 'Mary of Egypt',
    'coneot': 'Saint Neot',
    'conicodA': 'Gospel of Nicodemus (A)',
    'conicodC': 'Gospel of Nicodemus (C)',
    'conicodD': 'Gospel of Nicodemus (D)',
    'conicodE': 'Gospel of Nicodemus (E)',
    'coorosiu.o2': 'Orosius',
    'cootest.o3': 'Heptateuch',
    'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
    'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
    'coprefcura.o2': 'Preface to the Cura Pastoralis',
    'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
    'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
    'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
    'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
    'corood': 'History of the Holy Rood-Tree',
    'cosevensl': 'Seven Sleepers',
    'cosolilo': 'St. Augustine\'s Soliloquies',
    'cosolsat1.o4': 'Solomon and Saturn I',
    'cosolsat2': 'Solomon and Saturn II',
    'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
    'coverhom': 'Vercelli Homilies',
    'coverhomE': 'Vercelli Homilies (E)',
    'coverhomL': 'Vercelli Homilies (L)',
    'covinceB': 'Saint Vincent (Bodley 343)',
    'covinsal': 'Vindicta Salvatoris',
    'cowsgosp.o3': 'West-Saxon Gospels',
    'cowulf.o34': 'Wulfstan\'s Homilies'
    }