Package nltk :: Package ccg :: Module chart
[hide private]
[frames] | no frames]

Source Code for Module nltk.ccg.chart

  1  # Natural Language Toolkit: Combinatory Categorial Grammar 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Graeme Gange <ggange@csse.unimelb.edu.au> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7   
  8  """ 
  9  The lexicon is constructed by calling 
 10    lexicon.parseLexicon(<lexicon string>). 
 11   
 12  In order to construct a parser, you also need a rule set. 
 13  The standard English rules are provided in chart as 
 14    chart.DefaultRuleSet 
 15   
 16  The parser can then be constructed by calling, for example: 
 17    parser = chart.CCGChartParser(<lexicon>, <ruleset>) 
 18   
 19  Parsing is then performed by running 
 20    parser.nbest_parse(<sentence>.split()) 
 21   
 22  While this returns a list of trees, the default representation 
 23  of the produced trees is not very enlightening, particularly 
 24  given that it uses the same tree class as the CFG parsers. 
 25  It is probably better to call: 
 26    chart.printCCGDerivation(<parse tree extracted from list>) 
 27  which should print a nice representation of the derivation. 
 28   
 29  This entire process is shown far more clearly in the demonstration: 
 30  python chart.py 
 31  """ 
 32   
 33  from nltk.parse.api import * 
 34  from nltk.parse.chart import AbstractChartRule, EdgeI, Chart 
 35  from nltk import Tree, defaultdict 
 36   
 37  import lexicon 
 38  from combinator import * 
 39   
 40  # Based on the EdgeI class from NLTK. 
 41  # A number of the properties of the EdgeI interface don't 
 42  # transfer well to CCGs, however. 
43 -class CCGEdge(EdgeI):
44 - def __init__(self,span,categ,rule):
45 self._span = span 46 self._categ = categ 47 self._rule = rule
48 49 # Accessors
50 - def lhs(self): return self._categ
51 - def span(self): return self._span
52 - def start(self): return self._span[0]
53 - def end(self): return self._span[1]
54 - def length(self): return self._span[1] - self.span[0]
55 - def rhs(self): return ()
56 - def dot(self): return 0
57 - def is_complete(self): return True
58 - def is_incomplete(self): return False
59 - def next(self): return None
60
61 - def categ(self):
62 return self._categ
63 - def rule(self):
64 return self._rule
65
66 - def __cmp__(self, other):
67 if not isinstance(other, CCGEdge): return -1 68 return cmp((self._span,self._categ,self._rule), 69 (other.span(),other.categ(),other.rule()))
70
71 - def __hash__(self):
72 return hash((self._span,self._categ,self._rule))
73
74 -class CCGLeafEdge(EdgeI):
75 ''' 76 Class representing leaf edges in a CCG derivation. 77 '''
78 - def __init__(self,pos,categ,leaf):
79 self._pos = pos 80 self._categ = categ 81 self._leaf = leaf
82 83 # Accessors
84 - def lhs(self): return self._categ
85 - def span(self): return (self._pos,self._pos+1)
86 - def start(self): return self._pos
87 - def end(self): return self._pos + 1
88 - def length(self): return 1
89 - def rhs(self): return self._leaf
90 - def dot(self): return 0
91 - def is_complete(self): return True
92 - def is_incomplete(self): return False
93 - def next(self): return None
94
95 - def categ(self):
96 return self._categ
97
98 - def leaf(self): return self._leaf
99
100 - def __cmp__(self, other):
101 if not isinstance(other, CCGLeafEdge): return -1 102 return cmp((self._span,self._categ,self._rule), 103 other.span(),other.categ(),other.rule())
104
105 - def __hash__(self):
106 return hash((self._pos,self._categ,self._leaf))
107
108 -class BinaryCombinatorRule(AbstractChartRule):
109 ''' 110 Class implementing application of a binary combinator to a chart. 111 Takes the directed combinator to apply. 112 ''' 113 NUMEDGES = 2
114 - def __init__(self,combinator):
115 self._combinator = combinator
116 117 # Apply a combinator
118 - def apply_iter(self, chart, grammar, left_edge, right_edge):
119 # The left & right edges must be touching. 120 if not (left_edge.end() == right_edge.start()): 121 return 122 123 # Check if the two edges are permitted to combine. 124 # If so, generate the corresponding edge. 125 if self._combinator.can_combine(left_edge.categ(),right_edge.categ()): 126 for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): 127 new_edge = CCGEdge(span=(left_edge.start(), right_edge.end()),categ=res,rule=self._combinator) 128 if chart.insert(new_edge,(left_edge,right_edge)): 129 yield new_edge
130 131 # The representation of the combinator (for printing derivations)
132 - def __str__(self):
133 return str(self._combinator)
134 135 # Type-raising must be handled slightly differently to the other rules, as the 136 # resulting rules only span a single edge, rather than both edges.
137 -class ForwardTypeRaiseRule(AbstractChartRule):
138 ''' 139 Class for applying forward type raising 140 ''' 141 NUMEDGES = 2 142
143 - def __init__(self):
144 self._combinator = ForwardT
145 - def apply_iter(self, chart, grammar, left_edge, right_edge):
146 if not (left_edge.end() == right_edge.start()): 147 return 148 149 for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): 150 new_edge = CCGEdge(span=left_edge.span(),categ=res,rule=self._combinator) 151 if chart.insert(new_edge,(left_edge,)): 152 yield new_edge
153 - def __str__(self):
154 return str(self._combinator)
155
156 -class BackwardTypeRaiseRule(AbstractChartRule):
157 ''' 158 Class for applying backward type raising. 159 ''' 160 NUMEDGES = 2 161
162 - def __init__(self):
163 self._combinator = BackwardT
164 - def apply_iter(self, chart, grammar, left_edge, right_edge):
165 if not (left_edge.end() == right_edge.start()): 166 return 167 168 for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): 169 new_edge = CCGEdge(span=right_edge.span(),categ=res,rule=self._combinator) 170 if chart.insert(new_edge,(right_edge,)): 171 yield new_edge
172 - def __str__(self):
173 return str(self._combinator)
174 175 176 # Common sets of combinators used for English derivations. 177 ApplicationRuleSet = [BinaryCombinatorRule(ForwardApplication), \ 178 BinaryCombinatorRule(BackwardApplication)] 179 CompositionRuleSet = [BinaryCombinatorRule(ForwardComposition), \ 180 BinaryCombinatorRule(BackwardComposition), \ 181 BinaryCombinatorRule(BackwardBx)] 182 SubstitutionRuleSet = [BinaryCombinatorRule(ForwardSubstitution), \ 183 BinaryCombinatorRule(BackwardSx)] 184 TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()] 185 186 # The standard English rule set. 187 DefaultRuleSet = ApplicationRuleSet + CompositionRuleSet + \ 188 SubstitutionRuleSet + TypeRaiseRuleSet 189
190 -class CCGChartParser(ParserI):
191 ''' 192 Chart parser for CCGs. 193 Based largely on the ChartParser class from NLTK. 194 '''
195 - def __init__(self, lexicon, rules, trace=0):
196 self._lexicon = lexicon 197 self._rules = rules 198 self._trace = trace
199
200 - def lexicon(self):
201 return self._lexicon
202 203 # Implements the CYK algorithm
204 - def nbest_parse(self, tokens, n=None):
205 tokens = list(tokens) 206 chart = CCGChart(list(tokens)) 207 lex = self._lexicon 208 209 # Initialize leaf edges. 210 for index in range(chart.num_leaves()): 211 for cat in lex.categories(chart.leaf(index)): 212 new_edge = CCGLeafEdge(index, cat, chart.leaf(index)) 213 chart.insert(new_edge, ()) 214 215 216 # Select a span for the new edges 217 for span in range(2,chart.num_leaves()+1): 218 for start in range(0,chart.num_leaves()-span+1): 219 # Try all possible pairs of edges that could generate 220 # an edge for that span 221 for part in range(1,span): 222 lstart = start 223 mid = start + part 224 rend = start + span 225 226 for left in chart.select(span=(lstart,mid)): 227 for right in chart.select(span=(mid,rend)): 228 # Generate all possible combinations of the two edges 229 for rule in self._rules: 230 edges_added_by_rule = 0 231 for newedge in rule.apply_iter(chart,lex,left,right): 232 edges_added_by_rule += 1 233 234 # Output the resulting parses 235 return chart.parses(lex.start())[:n]
236
237 -class CCGChart(Chart):
238 - def __init__(self, tokens):
239 Chart.__init__(self, tokens)
240 241 # Constructs the trees for a given parse. Unfortnunately, the parse trees need to be 242 # constructed slightly differently to those in the default Chart class, so it has to 243 # be reimplemented
244 - def _trees(self, edge, complete, memo, tree_class):
245 if edge in memo: 246 return memo[edge] 247 248 trees = [] 249 memo[edge] = [] 250 251 if isinstance(edge,CCGLeafEdge): 252 word = tree_class(edge.lhs(),[self._tokens[edge.start()]]) 253 leaf = tree_class((edge.lhs(),"Leaf"),[word]) 254 memo[edge] = leaf 255 return leaf 256 257 for cpl in self.child_pointer_lists(edge): 258 child_choices = [self._trees(cp, complete, memo, tree_class) 259 for cp in cpl] 260 if len(child_choices) > 0 and type(child_choices[0]) == type(""): 261 child_choices = [child_choices] 262 for children in self._choose_children(child_choices): 263 lhs = (edge.lhs(),str(edge.rule())) 264 trees.append(tree_class(lhs, children)) 265 266 memo[edge] = trees 267 return trees
268 269 #-------- 270 # Displaying derivations 271 #--------
272 -def printCCGDerivation(tree):
273 # Get the leaves and initial categories 274 leafcats = tree.pos() 275 leafstr = '' 276 catstr = '' 277 278 # Construct a string with both the leaf word and corresponding 279 # category aligned. 280 for (leaf, cat) in leafcats: 281 nextlen = 2 + max(len(leaf),len(str(cat))) 282 lcatlen = (nextlen - len(str(cat)))/2 283 rcatlen = lcatlen + (nextlen - len(str(cat)))%2 284 catstr += ' '*lcatlen + str(cat) + ' '*rcatlen 285 lleaflen = (nextlen - len(leaf))/2 286 rleaflen = lleaflen + (nextlen - len(leaf))%2 287 leafstr += ' '*lleaflen + leaf + ' '*rleaflen 288 print leafstr 289 print catstr 290 291 # Display the derivation steps 292 printCCGTree(0,tree)
293 294 # Prints the sequence of derivation steps.
295 -def printCCGTree(lwidth,tree):
296 rwidth = lwidth 297 298 # Is a leaf (word). 299 # Increment the span by the space occupied by the leaf. 300 if not isinstance(tree,Tree): 301 return 2 + lwidth + len(tree) 302 303 # Find the width of the current derivation step 304 for child in tree: 305 rwidth = max(rwidth,printCCGTree(rwidth,child)) 306 307 # Is a leaf node. 308 # Don't print anything, but account for the space occupied. 309 if not isinstance(tree.node,tuple): 310 return max(rwidth,2 + lwidth + len(str(tree.node)), 311 2 + lwidth + len(tree[0])) 312 313 (res,op) = tree.node 314 # Pad to the left with spaces, followed by a sequence of '-' 315 # and the derivation rule. 316 print lwidth*' ' + (rwidth-lwidth)*'-' + str(op) 317 # Print the resulting category on a new line. 318 respadlen = (rwidth - lwidth - len(str(res)))/2 + lwidth 319 print respadlen*' ' + str(res) 320 return rwidth
321 322 323 ### Demonstration code 324 325 # Construct the lexicon 326 lex = lexicon.parseLexicon(''' 327 :- S, NP, N, VP # Primitive categories, S is the target primitive 328 329 Det :: NP/N # Family of words 330 Pro :: NP 331 TV :: VP/NP 332 Modal :: (S\\NP)/VP # Backslashes need to be escaped 333 334 I => Pro # Word -> Category mapping 335 you => Pro 336 337 the => Det 338 339 # Variables have the special keyword 'var' 340 # '.' prevents permutation 341 # ',' prevents composition 342 and => var\\.,var/.,var 343 344 which => (N\\N)/(S/NP) 345 346 will => Modal # Categories can be either explicit, or families. 347 might => Modal 348 349 cook => TV 350 eat => TV 351 352 mushrooms => N 353 parsnips => N 354 bacon => N 355 ''') 356
357 -def demo():
358 parser = CCGChartParser(lex, DefaultRuleSet) 359 for parse in parser.nbest_parse("I might cook and eat the bacon".split(), 3): 360 printCCGDerivation(parse)
361 362 if __name__ == '__main__': 363 demo() 364