Package nltk :: Package ccg :: Module lexicon
[hide private]
[frames] | [no frames]

Source Code for Module nltk.ccg.lexicon

  1  # Natural Language Toolkit: Combinatory Categorial Grammar 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Graeme Gange <ggange@csse.unimelb.edu.au> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  import re 
  9  from nltk import defaultdict 
 10  from api import * 
 11   
 12  #------------ 
 13  # Regular expressions used for parsing components of the lexicon 
 14  #------------ 
 15   
 16  # Parses a primitive category and subscripts 
 17  rePrim = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''') 
 18   
 19  # Separates the next primitive category from the remainder of the 
 20  # string 
 21  reNextPrim = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''') 
 22   
 23  # Separates the next application operator from the remainder 
 24  reApp = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''') 
 25   
 26  # Parses the definition of the category of either a word or a family 
 27  reLex = re.compile(r'''([A-Za-z_]+)\s*(::|[-=]+>)\s*(.+)''') 
 28   
 29  # Strips comments from a line 
 30  reComm = re.compile('''([^#]*)(?:#.*)?''') 
 31   
 32  #---------- 
 33  # Lexicons 
 34  #---------- 
class CCGLexicon(object):
    '''
    Class representing a lexicon for CCG grammars.

    primitives - The list of primitive categories for the lexicon
    families - Families of categories
    entries - A mapping of words to possible categories
    '''

    def __init__(self, start, primitives, families, entries):
        '''
        Store the components of the lexicon.

        start is wrapped in a PrimitiveCategory; primitives, families
        and entries are stored exactly as given.
        '''
        self._start = PrimitiveCategory(start)
        self._primitives = primitives
        self._families = families
        self._entries = entries

    def categories(self, word):
        '''Return all the possible categories for a word.'''
        return self._entries[word]

    def start(self):
        '''Return the target category for the parser.'''
        return self._start

    def __str__(self):
        '''
        String representation of the lexicon, used for debugging.

        Produces one line per word of the form "word => cat1 | cat2".
        A word with no categories still gets its own line ("word => ").
        '''
        lines = []
        for ident, cats in self._entries.items():
            # Building each line separately keeps the per-word separator
            # logic independent of the per-category separator logic.
            alternatives = " | ".join([str(cat) for cat in cats])
            lines.append(ident + " => " + alternatives)
        return "\n".join(lines)
74 75 76 #----------- 77 # Parsing lexicons 78 #----------- 79 80 # Separates the contents matching the first set of brackets 81 # from the rest of the input.
def matchBrackets(string):
    '''
    Separate the contents matching the first set of brackets from the
    rest of the input.

    The first character of string is skipped without being checked; the
    caller is expected to pass text that starts with '('.

    Returns a tuple (bracketed, rest) where bracketed includes its
    enclosing parentheses and rest is the text after the closing ')'.

    Raises AssertionError if the closing bracket is missing.
    '''
    rest = string[1:]
    pieces = ["("]

    while rest != "" and not rest.startswith(')'):
        if rest.startswith('('):
            # Nested group: consume it whole so that its ')' is not
            # mistaken for the one closing the current group.
            (part, rest) = matchBrackets(rest)
            pieces.append(part)
        else:
            pieces.append(rest[0])
            rest = rest[1:]
    if rest.startswith(')'):
        pieces.append(')')
        return (''.join(pieces), rest[1:])
    # Call form of raise: valid on both Python 2 and Python 3.
    raise AssertionError('Unmatched bracket in string \'' + string + '\'')
96 97 # Separates the string for the next portion of the category 98 # from the rest of the string
def nextCategory(string):
    '''
    Separate the string for the next portion of the category from the
    rest of the string.

    A leading '(' is handled by matchBrackets; anything else is split
    with reNextPrim. Either way the result is (next portion, remainder).
    '''
    if not string.startswith('('):
        return reNextPrim.match(string).groups()
    return matchBrackets(string)
103 104 # Parses an application operator
def parseApplication(app):
    '''
    Parse an application operator.

    The first element of app is passed to Direction as its first
    argument and the remaining elements as its second.
    '''
    operator = app[0]
    modifiers = app[1:]
    return Direction(operator, modifiers)
107 108 # Parses the subscripts for a primitive category
def parseSubscripts(subscr):
    '''
    Parse the subscripts for a primitive category.

    subscr is the bracketed subscript text (e.g. "[sg,pl]") or a false
    value when there are none. Returns the list of comma-separated
    entries between the first and last characters, or an empty list.
    '''
    if not subscr:
        return []
    inner = subscr[1:-1]
    return inner.split(',')
113 114 # Parse a primitive category
def parsePrimitiveCategory(chunks, primitives, families, var):
    '''
    Parse a primitive category.

    chunks is a (name, subscripts) pair, where subscripts may be None.
    primitives is the collection of known primitive names and families
    maps family names to (category, variable) pairs. var is the CCG
    variable in use so far, or None.

    Returns a tuple (category, var).

    Raises AssertionError if the name is neither a family nor a
    primitive category.
    '''
    catstr = chunks[0]

    # The special category 'var' (without subscripts) is replaced with
    # the CCG variable, creating one if none is in use yet.
    if catstr == "var" and chunks[1] is None:
        if var is None:
            var = CCGVar()
        return (var, var)

    if catstr in families:
        (cat, cvar) = families[catstr]
        if var is None:
            # Adopt the family's own variable.
            var = cvar
        else:
            # Rewrite the family's variable to the one already in use.
            cat = cat.substitute([(cvar, var)])
        return (cat, var)

    if catstr in primitives:
        subscrs = parseSubscripts(chunks[1])
        return (PrimitiveCategory(catstr, subscrs), var)

    # Call form of raise: valid on both Python 2 and Python 3.
    raise AssertionError('String \'' + catstr + '\' is neither a family nor primitive category.')
137 138 # parseCategory drops the 'var' from the tuple
def parseCategory(line, primitives, families):
    '''
    Parse a category string and return only the category.

    Delegates to augParseCategory and drops the 'var' from the tuple
    it returns.
    '''
    (cat, _var) = augParseCategory(line, primitives, families)
    return cat
141 142 # Parses a string representing a category, and returns 143 # a tuple with (possibly) the CCG variable for the category
def augParseCategory(line, primitives, families, var=None):
    '''
    Parse a string representing a category, and return a tuple with
    (possibly) the CCG variable for the category.

    The string is read left to right: the first operand is parsed, then
    each following "operator operand" pair wraps the result so far in a
    FunctionalCategory.

    Returns a tuple (category, var).

    Malformed input for which reApp or rePrim fails to match surfaces
    as an AttributeError from calling .groups() on None.
    '''
    (cat_str, rest) = nextCategory(line)
    (res, var) = _parseOperand(cat_str, primitives, families, var)

    while rest != "":
        app = reApp.match(rest).groups()
        direction = parseApplication(app[0:3])
        rest = app[3]

        (cat_str, rest) = nextCategory(rest)
        (arg, var) = _parseOperand(cat_str, primitives, families, var)
        res = FunctionalCategory(res, arg, direction)

    return (res, var)


def _parseOperand(text, primitives, families, var):
    # Parse one operand: a bracketed sub-category (brackets stripped and
    # parsed recursively) or a single primitive/family name.
    if text.startswith('('):
        return augParseCategory(text[1:-1], primitives, families, var)
    return parsePrimitiveCategory(rePrim.match(text).groups(), primitives, families, var)
167 168 # Takes an input string, and converts it into a lexicon for CCGs.
def parseLexicon(lex_str):
    '''
    Take an input string, and convert it into a lexicon for CCGs.

    Each line is stripped of comments and surrounding whitespace, then
    handled as one of:
      - a primitive-category line, ie  :- S, N, NP, VP
        (the first primitive listed becomes the target category)
      - a family definition, ie  Det :: NP/N
      - a word definition, ie  which => (N\\N)/(S/NP)

    Returns a CCGLexicon.
    '''
    primitives = []
    families = {}
    entries = defaultdict(list)
    for raw_line in lex_str.splitlines():
        # Strip comments and leading/trailing whitespace.
        line = reComm.match(raw_line).group(1).strip()
        if not line:
            continue

        if line.startswith(':-'):
            # A line of primitive categories.
            names = line[2:].strip().split(',')
            primitives.extend([name.strip() for name in names])
            continue

        # Either a family definition, or a word definition
        (ident, sep, catstr) = reLex.match(line).groups()
        (cat, var) = augParseCategory(catstr, primitives, families)
        if sep == '::':
            # Family definition
            families[ident] = (cat, var)
        else:
            # Word definition
            entries[ident].append(cat)
    return CCGLexicon(primitives[0], primitives, families, entries)


openccg_tinytiny = parseLexicon('''
    # Rather minimal lexicon based on the openccg `tinytiny' grammar.
    # Only incorporates a subset of the morphological subcategories, however.
    :- S,NP,N # Primitive categories
    Det :: NP/N # Determiners
    Pro :: NP
    IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular)
    IntransVpl :: S\\NP[pl] # Plural
    TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular)
    TransVpl :: S\\NP[pl]/NP # Plural

    the => NP[sg]/N[sg]
    the => NP[pl]/N[pl]

    I => Pro
    me => Pro
    we => Pro
    us => Pro

    book => N[sg]
    books => N[pl]

    peach => N[sg]
    peaches => N[pl]

    policeman => N[sg]
    policemen => N[pl]

    boy => N[sg]
    boys => N[pl]

    sleep => IntransVsg
    sleep => IntransVpl

    eat => IntransVpl
    eat => TransVpl
    eats => IntransVsg
    eats => TransVsg

    see => TransVpl
    sees => TransVsg
    ''')
197 198 199 openccg_tinytiny = parseLexicon(''' 200 # Rather minimal lexicon based on the openccg `tinytiny' grammar. 201 # Only incorporates a subset of the morphological subcategories, however. 202 :- S,NP,N # Primitive categories 203 Det :: NP/N # Determiners 204 Pro :: NP 205 IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular) 206 IntransVpl :: S\\NP[pl] # Plural 207 TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular) 208 TransVpl :: S\\NP[pl]/NP # Plural 209 210 the => NP[sg]/N[sg] 211 the => NP[pl]/N[pl] 212 213 I => Pro 214 me => Pro 215 we => Pro 216 us => Pro 217 218 book => N[sg] 219 books => N[pl] 220 221 peach => N[sg] 222 peaches => N[pl] 223 224 policeman => N[sg] 225 policemen => N[pl] 226 227 boy => N[sg] 228 boys => N[pl] 229 230 sleep => IntransVsg 231 sleep => IntransVpl 232 233 eat => IntransVpl 234 eat => TransVpl 235 eats => IntransVsg 236 eats => TransVsg 237 238 see => TransVpl 239 sees => TransVsg 240 ''') 241