1
2
3
4
5
6
7
8 import re
9 from nltk import defaultdict
10 from api import *
11
12
13
14
15
16
# Splits a primitive category into its name (group 1) and an optional
# bracketed, comma-separated subscript list such as "[sg,pl]" (group 2).
rePrim = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''')

# Splits off the leading primitive category, including any subscript list
# (group 1), from the remainder of the string (group 2).
reNextPrim = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''')

# Splits an application operator into its slash (group 1), up to two optional
# "." / "," modifier characters (groups 2 and 3) and the remainder of the
# string (group 4).
reApp = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''')

# Splits a lexicon line into an identifier (group 1), a separator that is
# either "::" or an arrow such as "=>" (group 2) and the category text
# (group 3).
reLex = re.compile(r'''([A-Za-z_]+)\s*(::|[-=]+>)\s*(.+)''')

# Captures everything before the first "#" on a line (group 1); the comment
# itself is matched but not captured.
reComm = re.compile('''([^#]*)(?:#.*)?''')
31
32
33
34
36 '''
37 Class representing a lexicon for CCG grammars.
38 primitives - The list of primitive categories for the lexicon
39 families - Families of categories
40 entries - A mapping of words to possible categories
41 '''
42 - def __init__(self,start,primitives,families,entries):
47
48
50 return self._entries[word]
51
52
55
56
57
59 st = ""
60 first = True
61 for ident in self._entries.keys():
62 if not first:
63 st = st + "\n"
64 st = st + ident + " => "
65
66 first = True
67 for cat in self._entries[ident]:
68 if not first:
69 st = st + " | "
70 else:
71 first = False
72 st = st + str(cat)
73 return st
74
75
76
77
78
79
80
81
def matchBrackets(string):
    '''
    Split off the bracketed expression at the start of `string`.

    The first character is taken to be the opening bracket without being
    checked. Returns a pair (bracketed, rest): `bracketed` runs up to and
    including the ')' that closes the first bracket, and `rest` is whatever
    follows it.

    Raises AssertionError if a bracket is never closed; the message quotes
    the text starting at the innermost unclosed bracket.
    '''
    # Positions of the brackets that are still open; index 0 is the
    # (unchecked) opening bracket of the whole expression.
    open_positions = [0]
    for pos in range(1, len(string)):
        char = string[pos]
        if char == '(':
            open_positions.append(pos)
        elif char == ')':
            open_positions.pop()
            if not open_positions:
                # The outermost bracket has just been closed.
                return ('(' + string[1:pos + 1], string[pos + 1:])
    raise AssertionError('Unmatched bracket in string \'' + string[open_positions[-1]:] + '\'')
96
97
98
103
104
107
108
def parseSubscripts(subscr):
    '''
    Turn a bracketed subscript string such as "[sg,pl]" into a list of names.

    An empty or None `subscr` yields an empty list.
    '''
    if not subscr:
        return []
    # Drop the enclosing brackets before splitting on commas.
    return subscr[1:-1].split(',')
113
114
116
117
def parsePrimitiveCategory(chunks, primitives, families, var=None):
    '''
    Build a category from the (name, subscripts) groups of a rePrim match.

    chunks     - pair of the category name and its bracketed subscript
                 string, or None when there are no subscripts
    primitives - collection of known primitive category names
    families   - mapping from a family name to a (category, variable) pair
    var        - the variable already in use for this parse, or None

    Returns a pair (category, var), where `var` is the variable to carry
    forward. Lookup order:
      - the name "var" without subscripts is a category variable: the given
        `var` is reused, or a new CCGVar is created when there is none
      - a family name yields the family's category; when a `var` is already
        in use the family's own variable is substituted by it
      - a primitive name yields a PrimitiveCategory with its subscripts

    Raises AssertionError when the name is neither a family nor a primitive.
    '''
    catstr = chunks[0]

    # "var" followed by subscripts is not a variable; it falls through to the
    # family / primitive lookup below.
    if catstr == "var" and chunks[1] is None:
        if var is None:
            var = CCGVar()
        return (var, var)

    if catstr in families:
        (cat, cvar) = families[catstr]
        if var is None:
            var = cvar
        else:
            cat = cat.substitute([(cvar, var)])
        return (cat, var)

    if catstr in primitives:
        subscrs = parseSubscripts(chunks[1])
        return (PrimitiveCategory(catstr, subscrs), var)

    raise AssertionError('String \'' + catstr + '\' is neither a family nor primitive category.')
137
138
141
142
143
def _parseOperand(cat_str, primitives, families, var):
    '''
    Parse one operand: a bracketed sub-category or a primitive category.

    Returns a pair (category, var).
    '''
    if cat_str.startswith('('):
        # Strip the enclosing brackets and parse the contents recursively.
        return augParseCategory(cat_str[1:-1], primitives, families, var)
    return parsePrimitiveCategory(rePrim.match(cat_str).groups(), primitives, families, var)


def augParseCategory(line, primitives, families, var=None):
    '''
    Parse the category described by `line`.

    line       - the category text
    primitives - collection of known primitive category names
    families   - mapping from a family name to a (category, variable) pair
    var        - the variable already in use for this parse, or None

    Returns a pair (category, var), where `var` is the variable that ended
    up being used (possibly still None).

    The text is read left to right as an operand followed by any number of
    "operator operand" pairs; each pair wraps the category built so far in a
    FunctionalCategory, so operators associate to the left.

    NOTE(review): nextCategory and parseApplication are not visible in this
    part of the file; they are assumed to split off the leading category
    text and to build the direction object handed to FunctionalCategory.
    '''
    (cat_str, rest) = nextCategory(line)
    (res, var) = _parseOperand(cat_str, primitives, families, var)

    while rest != "":
        # Slash, its two optional modifiers, then the remaining text.
        app = reApp.match(rest).groups()
        direction = parseApplication(app[0:3])

        (cat_str, rest) = nextCategory(app[3])
        (arg, var) = _parseOperand(cat_str, primitives, families, var)
        res = FunctionalCategory(res, arg, direction)

    return (res, var)
167
168
def parseLexicon(lex_str):
    '''
    Build a CCGLexicon from its textual description.

    Each line is handled after its '#' comment and surrounding whitespace
    have been removed:
      - empty lines are skipped
      - ':- A,B,...' declares primitive categories
      - 'ident :: category' defines a family
      - 'ident => category' (or any other separator reLex accepts) adds a
        category to the entries for the word `ident`

    The first declared primitive is passed to CCGLexicon as `start`.

    Raises IndexError when no primitive category was declared, and
    AttributeError when a line does not have the form reLex expects.
    '''
    primitives = []
    families = {}
    entries = defaultdict(list)

    for line in lex_str.splitlines():
        # Keep only the text in front of any comment.
        line = reComm.match(line).groups()[0].strip()
        if line == "":
            continue

        if line.startswith(':-'):
            # Primitive category declaration: a comma-separated list.
            primitives.extend(prim.strip() for prim in line[2:].strip().split(','))
            continue

        (ident, sep, catstr) = reLex.match(line).groups()
        (cat, var) = augParseCategory(catstr, primitives, families)
        if sep == '::':
            # Family definition, remembered together with its variable.
            families[ident] = (cat, var)
        else:
            # Word definition; a word may be given several categories.
            entries[ident].append(cat)

    return CCGLexicon(primitives[0], primitives, families, entries)
197
198
# Sample lexicon, parsed once when the module is imported.
openccg_tinytiny = parseLexicon('''
# Rather minimal lexicon based on the openccg `tinytiny' grammar.
# Only incorporates a subset of the morphological subcategories, however.
:- S,NP,N # Primitive categories
Det :: NP/N # Determiners
Pro :: NP
IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
IntransVpl :: S\\NP[pl] # Plural
TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
TransVpl :: S\\NP[pl]/NP # Plural

the => NP[sg]/N[sg]
the => NP[pl]/N[pl]

I => Pro
me => Pro
we => Pro
us => Pro

book => N[sg]
books => N[pl]

peach => N[sg]
peaches => N[pl]

policeman => N[sg]
policemen => N[pl]

boy => N[sg]
boys => N[pl]

sleep => IntransVsg
sleep => IntransVpl

eat => IntransVpl
eat => TransVpl
eats => IntransVsg
eats => TransVsg

see => TransVpl
sees => TransVsg
''')
241