1
2
3
4
5
6
7
8
9
"""
Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
English Prose (YCOE), a 1.5 million word syntactically-annotated
corpus of Old English prose texts. The corpus is distributed by the
Oxford Text Archive (http://www.ota.ahds.ac.uk/); it is not included
with NLTK.

The YCOE corpus is divided into 100 files, each representing
an Old English prose text. The tags used within each text comply
with the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm
"""
21
22 import os
23 import re
24
25 from nltk.tokenize import RegexpTokenizer
26 from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
27 from nltk.corpus.reader.tagged import TaggedCorpusReader
28 from string import split
29
30 from util import *
31 from api import *
32
34 """
35 Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old
36 English Prose (YCOE), a 1.5 million word syntactically-annotated
37 corpus of Old English prose texts.
38 """
def __init__(self, root, encoding=None):
    """
    Build a reader over the YCOE corpus stored under ``root``.

    One sub-reader is created over the ``psd`` subdirectory and one
    over the ``pos`` subdirectory.  Both must list the same set of
    documents; the combined ``.psd`` and ``.pos`` file identifiers are
    then registered with ``CorpusReader.__init__``.

    :param root: Root of the corpus, passed to ``CorpusReader.__init__``.
    :param encoding: Encoding passed to ``CorpusReader.__init__`` and
        to both sub-readers.
    :raise ValueError: If the ``psd`` and ``pos`` subdirectories do not
        contain the same documents.
    """
    # Initialise once with an empty file list: the sub-readers below
    # are rooted through ``self.root``, which is only used after this
    # call.  The real file list is installed by the second call.
    CorpusReader.__init__(self, root, [], encoding)

    # NOTE(review): the third positional argument ('.psd') is bound to
    # whichever parameter follows the file pattern in this reader's
    # constructor, which is not visible here -- confirm it is intended.
    self._psd_reader = YCOEParseCorpusReader(
        self.root.join('psd'), '.*', '.psd', encoding=encoding)
    # The ``__init__(self, root, items, encoding=None)`` defined further
    # down in this file appears to be this reader's constructor, so its
    # third parameter is the encoding.  Pass the caller's encoding by
    # keyword instead of the '.pos' extension that used to land there.
    self._pos_reader = YCOETaggedCorpusReader(
        self.root.join('pos'), '.*', encoding=encoding)

    # Make sure both subdirectories describe the same documents.  A
    # document identifier is a file identifier minus its 4-character
    # extension.
    documents = set(f[:-4] for f in self._psd_reader.fileids())
    pos_documents = set(f[:-4] for f in self._pos_reader.fileids())
    if pos_documents != documents:
        raise ValueError('Items in "psd" and "pos" '
                         'subdirectories do not match.')

    fileids = sorted(['%s.psd' % doc for doc in documents] +
                     ['%s.pos' % doc for doc in documents])
    CorpusReader.__init__(self, root, fileids, encoding)
    self._documents = sorted(documents)
57
def documents(self, fileids=None):
    """
    Return a list of document identifiers for all documents in
    this corpus, or for the documents with the given file(s) if
    specified.

    :param fileids: A single file identifier, a sequence of file
        identifiers, or None to select every document.
    :return: ``self._documents`` when ``fileids`` is None; otherwise a
        sorted list of the distinct document identifiers of the given
        files.
    :raise KeyError: If a given file identifier is not one of this
        corpus's file identifiers.
    """
    if fileids is None:
        return self._documents
    if isinstance(fileids, basestring):
        fileids = [fileids]
    for f in fileids:
        if f not in self._fileids:
            # Name the offending identifier only.  Formatting the whole
            # ``fileids`` argument reported every id, and raised
            # TypeError instead of KeyError for a multi-element tuple.
            raise KeyError('File id %s not found' % (f,))
    # A document identifier is the file identifier without its
    # 4-character extension.
    return sorted(set(f[:-4] for f in fileids))
73
def fileids(self, documents=None):
    """
    List the file identifiers of this corpus.

    With no argument, the corpus's own file identifier list is
    returned.  Otherwise ``documents`` is one document identifier or
    an iterable of them, and the result is the sorted list of the
    distinct ``.pos`` and ``.psd`` file identifiers built from them.
    """
    if documents is None:
        return self._fileids
    if isinstance(documents, basestring):
        documents = [documents]
    # Two separate passes over ``documents``, '.pos' first and then
    # '.psd'.
    names = set('%s.pos' % doc for doc in documents)
    names.update('%s.psd' % doc for doc in documents)
    return sorted(names)
85
87 """
88 Helper that selects the appropriate fileids for a given set of
89 documents from a given subcorpus (pos or psd).
90 """
91 if documents is None:
92 documents = self._documents
93 else:
94 if isinstance(documents, basestring):
95 documents = [documents]
96 for document in documents:
97 if document not in self._documents:
98 if document[-4:] in ('.pos', '.psd'):
99 raise ValueError(
100 'Expected a document identifier, not a file '
101 'identifier. (Use corpus.documents() to get '
102 'a list of document identifiers.')
103 else:
104 raise ValueError('Document identifier %s not found'
105 % document)
106 return ['%s.%s' % (d, subcorpus) for d in documents]
107
108
109 - def words(self, documents=None):
111 - def sents(self, documents=None):
113 - def paras(self, documents=None):
123
124
126 """Specialized version of the standard bracket parse corpus reader
127 that strips out (CODE ...) and (ID ...) nodes."""
132
134 - def __init__(self, root, items, encoding=None):
139
140
#: Maps each YCOE document identifier to the title of its text.
#: The 'coeluc2' entry previously repeated the 'coeluc1' title
#: ("Elucidarium 1"); it is the second Elucidarium text.
documents = {
    'coadrian.o34': 'Adrian and Ritheus',
    'coaelhom.o3': 'Ælfric, Supplemental Homilies',
    'coaelive.o3': 'Ælfric\'s Lives of Saints',
    'coalcuin': 'Alcuin De virtutibus et vitiis',
    'coalex.o23': 'Alexander\'s Letter to Aristotle',
    'coapollo.o3': 'Apollonius of Tyre',
    'coaugust': 'Augustine',
    'cobede.o2': 'Bede\'s History of the English Church',
    'cobenrul.o3': 'Benedictine Rule',
    'coblick.o23': 'Blickling Homilies',
    'coboeth.o2': 'Boethius\' Consolation of Philosophy',
    'cobyrhtf.o3': 'Byrhtferth\'s Manual',
    'cocanedgD': 'Canons of Edgar (D)',
    'cocanedgX': 'Canons of Edgar (X)',
    'cocathom1.o3': 'Ælfric\'s Catholic Homilies I',
    'cocathom2.o3': 'Ælfric\'s Catholic Homilies II',
    'cochad.o24': 'Saint Chad',
    'cochdrul': 'Chrodegang of Metz, Rule',
    'cochristoph': 'Saint Christopher',
    'cochronA.o23': 'Anglo-Saxon Chronicle A',
    'cochronC': 'Anglo-Saxon Chronicle C',
    'cochronD': 'Anglo-Saxon Chronicle D',
    'cochronE.o34': 'Anglo-Saxon Chronicle E',
    'cocura.o2': 'Cura Pastoralis',
    'cocuraC': 'Cura Pastoralis (Cotton)',
    'codicts.o34': 'Dicts of Cato',
    'codocu1.o1': 'Documents 1 (O1)',
    'codocu2.o12': 'Documents 2 (O1/O2)',
    'codocu2.o2': 'Documents 2 (O2)',
    'codocu3.o23': 'Documents 3 (O2/O3)',
    'codocu3.o3': 'Documents 3 (O3)',
    'codocu4.o24': 'Documents 4 (O2/O4)',
    'coeluc1': 'Honorius of Autun, Elucidarium 1',
    'coeluc2': 'Honorius of Autun, Elucidarium 2',
    'coepigen.o3': 'Ælfric\'s Epilogue to Genesis',
    'coeuphr': 'Saint Euphrosyne',
    'coeust': 'Saint Eustace and his companions',
    'coexodusP': 'Exodus (P)',
    'cogenesiC': 'Genesis (C)',
    'cogregdC.o24': 'Gregory\'s Dialogues (C)',
    'cogregdH.o23': 'Gregory\'s Dialogues (H)',
    'coherbar': 'Pseudo-Apuleius, Herbarium',
    'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)',
    'coinspolX': 'Wulfstan\'s Institute of Polity (X)',
    'cojames': 'Saint James',
    'colacnu.o23': 'Lacnunga',
    'colaece.o2': 'Leechdoms',
    'colaw1cn.o3': 'Laws, Cnut I',
    'colaw2cn.o3': 'Laws, Cnut II',
    'colaw5atr.o3': 'Laws, Æthelred V',
    'colaw6atr.o3': 'Laws, Æthelred VI',
    'colawaf.o2': 'Laws, Alfred',
    'colawafint.o2': 'Alfred\'s Introduction to Laws',
    'colawger.o34': 'Laws, Gerefa',
    'colawine.ox2': 'Laws, Ine',
    'colawnorthu.o3': 'Northumbra Preosta Lagu',
    'colawwllad.o4': 'Laws, William I, Lad',
    'coleofri.o4': 'Leofric',
    'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth',
    'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)',
    'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)',
    'colwgeat': 'Ælfric\'s Letter to Wulfgeat',
    'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)',
    'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)',
    'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I',
    'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II',
    'comargaC.o34': 'Saint Margaret (C)',
    'comargaT': 'Saint Margaret (T)',
    'comart1': 'Martyrology, I',
    'comart2': 'Martyrology, II',
    'comart3.o23': 'Martyrology, III',
    'comarvel.o23': 'Marvels of the East',
    'comary': 'Mary of Egypt',
    'coneot': 'Saint Neot',
    'conicodA': 'Gospel of Nicodemus (A)',
    'conicodC': 'Gospel of Nicodemus (C)',
    'conicodD': 'Gospel of Nicodemus (D)',
    'conicodE': 'Gospel of Nicodemus (E)',
    'coorosiu.o2': 'Orosius',
    'cootest.o3': 'Heptateuch',
    'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I',
    'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II',
    'coprefcura.o2': 'Preface to the Cura Pastoralis',
    'coprefgen.o3': 'Ælfric\'s Preface to Genesis',
    'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints',
    'coprefsolilo': 'Preface to Augustine\'s Soliloquies',
    'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus',
    'corood': 'History of the Holy Rood-Tree',
    'cosevensl': 'Seven Sleepers',
    'cosolilo': 'St. Augustine\'s Soliloquies',
    'cosolsat1.o4': 'Solomon and Saturn I',
    'cosolsat2': 'Solomon and Saturn II',
    'cotempo.o3': 'Ælfric\'s De Temporibus Anni',
    'coverhom': 'Vercelli Homilies',
    'coverhomE': 'Vercelli Homilies (E)',
    'coverhomL': 'Vercelli Homilies (L)',
    'covinceB': 'Saint Vincent (Bodley 343)',
    'covinsal': 'Vindicta Salvatoris',
    'cowsgosp.o3': 'West-Saxon Gospels',
    'cowulf.o34': 'Wulfstan\'s Homilies',
}
243