Package nltk :: Package sem :: Module boxer
[hide private]
[frames] | no frames]

Source Code for Module nltk.sem.boxer

   1  # Natural Language Toolkit: Interface to Boxer 
   2  # <http://svn.ask.it.usyd.edu.au/trac/candc/wiki/boxer> 
   3  # 
   4  # Author: Dan Garrette <dhgarrette@gmail.com> 
   5  # 
   6  # Copyright (C) 2001-2011 NLTK Project 
   7  # URL: <http://www.nltk.org/> 
   8  # For license information, see LICENSE.TXT 
   9   
  10  import os 
  11  import subprocess 
  12  from optparse import OptionParser 
  13  import tempfile 
  14  import operator 
  15   
  16  import nltk 
  17  from nltk.sem.logic import * 
  18  from nltk.sem.drt import * 
  19   
  20  """ 
  21  An interface to Boxer. 
  22   
  23  Usage: 
  24    Set the environment variable CANDCHOME to the bin directory of your CandC installation. 
  25    The models directory should be in the CandC root directory. 
  26    For example: 
  27       /path/to/candc/ 
  28          bin/ 
  29              candc 
  30              boxer 
  31          models/ 
  32              boxer/ 
  33  """ 
  34   
class Boxer(object):
    """
    This class is an interface to Johan Bos's program Boxer, a wide-coverage
    semantic parser that produces Discourse Representation Structures (DRSs).
    """

    def __init__(self, boxer_drs_interpreter=None, elimeq=False, bin_dir=None, verbose=False):
        """
        @param boxer_drs_interpreter: A class that converts from the
            C{AbstractBoxerDrs} object hierarchy to a different object.  The
            default is C{NltkDrtBoxerDrsInterpreter}, which converts to the NLTK
            DRT hierarchy.
        @param elimeq: When set to true, Boxer removes all equalities from the
            DRSs and discourse referents standing in the equality relation are
            unified, but only if this can be done in a meaning-preserving manner.
        @param bin_dir: Location handed to the binary lookup as C{path_to_bin}.
        @param verbose: Passed through to the binary lookup.
        """
        if boxer_drs_interpreter is None:
            boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter()
        self._boxer_drs_interpreter = boxer_drs_interpreter

        self._elimeq = elimeq

        self.set_bin_dir(bin_dir, verbose)

    def set_bin_dir(self, bin_dir, verbose=False):
        """
        Locate the C{candc} and C{boxer} binaries and derive the models
        directory, which is expected to be a sibling of the binary directory.

        @param bin_dir: Location handed to the binary lookup as C{path_to_bin}.
        @param verbose: Passed through to the binary lookup.
        """
        self._candc_bin = self._find_binary('candc', bin_dir, verbose)
        # Use dirname rather than chopping a fixed number of characters off
        # the path: the lookup may also return 'candc.exe'.
        candc_dir = os.path.dirname(self._candc_bin)
        self._candc_models_path = os.path.normpath(os.path.join(candc_dir, '../models'))
        self._boxer_bin = self._find_binary('boxer', bin_dir, verbose)

    def interpret(self, input, discourse_id=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        @param input: C{str} Input sentence to parse
        @param discourse_id: C{str} An identifier to be inserted to each occurrence-indexed predicate.
        @param question: C{boolean} Use the C{questions} model instead of the C{boxer} model.
        @param verbose: C{boolean} Print the commands run and their output.
        @return: the interpretation produced by the configured DRS interpreter
        @raise Exception: if no interpretation was produced for the input
        """
        discourse_ids = None
        if discourse_id is not None:
            discourse_ids = [discourse_id]
        d, = self.batch_interpret_multisentence([[input]], discourse_ids, question, verbose)
        if not d:
            raise Exception('Unable to interpret: "%s"' % input)
        return d

    def interpret_multisentence(self, input, discourse_id=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        @param input: C{list} of C{str} Input sentences to parse as a single discourse
        @param discourse_id: C{str} An identifier to be inserted to each occurrence-indexed predicate.
        @param question: C{boolean} Use the C{questions} model instead of the C{boxer} model.
        @param verbose: C{boolean} Print the commands run and their output.
        @return: the interpretation produced by the configured DRS interpreter
        @raise Exception: if no interpretation was produced for the input
        """
        discourse_ids = None
        if discourse_id is not None:
            discourse_ids = [discourse_id]
        d, = self.batch_interpret_multisentence([input], discourse_ids, question, verbose)
        if not d:
            raise Exception('Unable to interpret: "%s"' % input)
        return d

    def batch_interpret(self, inputs, discourse_ids=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        @param inputs: C{list} of C{str} Input sentences to parse as individual discourses
        @param discourse_ids: C{list} of C{str} Identifiers to be inserted to each occurrence-indexed predicate.
        @param question: C{boolean} Use the C{questions} model instead of the C{boxer} model.
        @param verbose: C{boolean} Print the commands run and their output.
        @return: C{list} with one interpretation (or C{None}) per input
        """
        discourses = [[sentence] for sentence in inputs]
        return self.batch_interpret_multisentence(discourses, discourse_ids, question, verbose)

    def batch_interpret_multisentence(self, inputs, discourse_ids=None, question=False, verbose=False):
        """
        Use Boxer to give a first order representation.

        @param inputs: C{list} of C{list} of C{str} Input discourses to parse
        @param discourse_ids: C{list} of C{str} Identifiers to be inserted to each occurrence-indexed predicate.
        @param question: C{boolean} Use the C{questions} model instead of the C{boxer} model.
        @param verbose: C{boolean} Print the commands run and their output.
        @return: C{list} with one interpretation per discourse, C{None} where
            Boxer produced no DRS for that discourse
        """
        if discourse_ids is not None:
            assert len(inputs) == len(discourse_ids)
            # all() is also well defined for an empty list of ids.
            assert all(disc_id is not None for disc_id in discourse_ids)
            use_disc_id = True
        else:
            discourse_ids = [str(i) for i in range(len(inputs))]
            use_disc_id = False

        # candc writes its output to this file and boxer reads it back.
        fd, temp_filename = tempfile.mkstemp(prefix='boxer-', suffix='.in', text=True)
        os.close(fd)  # only the name is needed; do not leak the descriptor
        try:
            self._call_candc(inputs, discourse_ids, question, temp_filename, verbose=verbose)
            boxer_out = self._call_boxer(temp_filename, verbose=verbose)
        finally:
            # Remove the temporary file even when one of the binaries fails.
            os.remove(temp_filename)

        drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id)
        return [drs_dict.get(disc_id, None) for disc_id in discourse_ids]

    def _call_candc(self, inputs, discourse_ids, question, filename, verbose=False):
        """
        Call the C{candc} binary with the given input.

        @param inputs: C{list} of C{list} of C{str} Input discourses to parse
        @param discourse_ids: C{list} of C{str} Identifiers to be inserted to each occurrence-indexed predicate.
        @param question: C{boolean} Use the C{questions} model instead of the C{boxer} model.
        @param filename: C{str} A filename for the output file
        @return: stdout
        """
        if question:
            model = 'questions'
        else:
            model = 'boxer'
        args = ['--models', os.path.join(self._candc_models_path, model),
                '--output', filename,
                '--candc-printer', 'boxer']

        # Each discourse is introduced by a <META> line carrying its id.
        lines = []
        for discourse, disc_id in zip(inputs, discourse_ids):
            lines.append("<META>'%s'" % disc_id)
            lines.extend(discourse)
        return self._call('\n'.join(lines), self._candc_bin, args, verbose)

    def _call_boxer(self, filename, verbose=False):
        """
        Call the C{boxer} binary with the given input.

        @param filename: C{str} A filename for the input file
        @return: stdout
        """
        if self._elimeq:
            elimeq = 'true'
        else:
            elimeq = 'false'
        args = ['--box', 'false',
                '--semantics', 'drs',
                '--flat', 'false',
                '--resolve', 'true',
                '--elimeq', elimeq,
                '--format', 'prolog',
                '--instantiate', 'true',
                '--input', filename]

        return self._call(None, self._boxer_bin, args, verbose)

    def _find_binary(self, name, bin_dir, verbose=False):
        """
        Locate one of the CandC binaries.

        @param name: base name of the binary (C{name + '.exe'} is also tried)
        @param bin_dir: passed to the lookup as C{path_to_bin}
        @return: whatever C{nltk.internals.find_binary} returns
        """
        return nltk.internals.find_binary(name,
            path_to_bin=bin_dir,
            env_vars=['CANDCHOME'],
            url='http://svn.ask.it.usyd.edu.au/trac/candc/',
            binary_names=[name, name + '.exe'],
            verbose=verbose)

    def _call(self, input_str, binary, args=None, verbose=False):
        """
        Call the binary with the given input.

        @param input_str: A string whose contents (plus a trailing newline)
            are used as stdin, or C{None} to give the process no input.
        @param binary: The location of the binary to call
        @param args: A list of command-line arguments.
        @return: stdout
        @raise Exception: if the process exits with a non-zero return code
        """
        if args is None:
            args = []
        else:
            args = list(args)
        cmd = [binary] + args

        if verbose:
            print('Calling: %s' % (binary,))
            print('Args: %s' % (args,))
            print('Input: %s' % (input_str,))
            print('Command: %s' % (binary + ' ' + ' '.join(args),))

        # Call via a subprocess.  The input is written to the child's stdin
        # directly instead of being spliced into an `echo "..." |` shell
        # command, so quotes, '$' and backticks in a sentence are passed
        # through verbatim and can never be executed by a shell.
        if input_str is None:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
        else:
            stdin_data = input_str + '\n'  # echo used to append this newline
            if not isinstance(stdin_data, bytes):
                stdin_data = stdin_data.encode('utf-8')
            p = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = p.communicate(stdin_data)

        if verbose:
            print('Return code: %s' % (p.returncode,))
            if stdout:
                print('stdout:\n%s\n' % (stdout,))
            if stderr:
                print('stderr:\n%s\n' % (stderr,))
        if p.returncode != 0:
            raise Exception('ERROR CALLING: %s %s\nReturncode: %d\n%s' % (binary, ' '.join(args), p.returncode, stderr))

        return stdout

    def _parse_to_drs_dict(self, boxer_out, use_disc_id):
        """
        Split Boxer's Prolog output into one DRS per discourse.

        An C{id(<discourse>,<drs>)} line must be followed by a C{sem(<drs>,}
        line, and the DRS term itself is read from the line four lines after
        that, which must end in C{').'}.

        @param boxer_out: C{str} stdout of the C{boxer} binary
        @param use_disc_id: C{boolean} whether the discourse id is handed to
            the DRS parser
        @return: C{dict} from discourse id to interpreted DRS
        """
        lines = boxer_out.split('\n')
        drs_dict = {}
        i = 0
        while i < len(lines):
            line = lines[i]
            if line.startswith('id('):
                comma_idx = line.index(',')
                discourse_id = line[3:comma_idx]
                # Slices instead of indexing so an empty id cannot raise.
                if discourse_id[:1] == "'" and discourse_id[-1:] == "'":
                    discourse_id = discourse_id[1:-1]
                drs_id = line[comma_idx+1:line.index(')')]

                i += 1
                line = lines[i]
                assert line.startswith('sem(%s,' % drs_id)

                i += 4
                line = lines[i]
                assert line.endswith(').')
                drs_input = line[:-2].strip()
                parsed = self._parse_drs(drs_input, discourse_id, use_disc_id)
                drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed)
            i += 1
        return drs_dict

    def _parse_drs(self, drs_string, discourse_id, use_disc_id):
        """
        Parse one Prolog DRS term with C{BoxerOutputDrsParser}.

        @param use_disc_id: when false the parser is created without a
            discourse id
        """
        if use_disc_id:
            parser = BoxerOutputDrsParser(discourse_id)
        else:
            parser = BoxerOutputDrsParser(None)
        return parser.parse(drs_string)
241 242
class BoxerOutputDrsParser(DrtParser):
    """
    Parser for the Prolog DRS terms printed by Boxer.  The result is a
    hierarchy of C{AbstractBoxerDrs} objects.
    """

    def __init__(self, discourse_id=None):
        """
        This class is used to parse the Prolog DRS output from Boxer into a
        hierarchy of python objects.

        @param discourse_id: identifier stored on every indexed condition
            this parser creates
        """
        DrtParser.__init__(self)
        self.discourse_id = discourse_id
        self.sentence_id_offset = None
        self.quote_chars = [("'", "'", "\\", False)]
        self._label_counter = None  # replaced by a fresh counter in parse()

    def parse(self, data, signature=None):
        """
        Parse C{data}, restarting the DRS label counter first.
        """
        self._label_counter = Counter(-1)
        return DrtParser.parse(self, data, signature)

    def get_all_symbols(self):
        """
        @return: the symbols that are tokens of their own
        """
        return ['(', ')', ',', '[', ']',':']

    def handle(self, tok, context):
        """
        Every expression handled by this parser is a DRS.
        """
        return self.handle_drs(tok)

    def attempt_adjuncts(self, expression, context):
        """
        Return C{expression} unchanged.
        """
        return expression

    def parse_condition(self, indices):
        """
        Parse a DRS condition

        @param indices: C{list} of C{int}
        @return: C{list} of C{AbstractDrs}
        """
        tok = self.token()
        accum = self.handle_condition(tok, indices)
        if accum is None:
            raise UnexpectedTokenException(tok)
        return accum

    def handle_drs(self, tok):
        """
        Handle a C{drs(...)}, C{merge(...)} or C{smerge(...)} term.

        @return: C{BoxerDrs}, or C{None} for any other token
        """
        if tok == 'drs':
            return self.parse_drs()
        elif tok in ['merge', 'smerge']:
            return self._handle_binary_expression(self._make_merge_expression)(None, [])
        return None

    def handle_condition(self, tok, indices):
        """
        Handle a DRS condition

        @param indices: C{list} of C{int}
        @return: C{list} of C{AbstractDrs}
        """
        if tok == 'not':
            return [self._handle_not()]

        # Handlers that consume one condition and return a single factory
        # taking (sent_index, word_indices).
        single_handlers = {
            'or': lambda: self._handle_binary_expression(self._make_or_expression),
            'imp': lambda: self._handle_binary_expression(self._make_imp_expression),
            'eq': self._handle_eq,
            'prop': self._handle_prop,
            'pred': self._handle_pred,
            'named': self._handle_named,
            'rel': self._handle_rel,
            'card': self._handle_card,
            'whq': self._handle_whq,
        }

        if tok == 'timex':
            # timex already yields a list of factories
            conds = self._handle_timex()
        elif tok in single_handlers:
            conds = [single_handlers[tok]()]
        else:
            conds = []

        # Instantiate every factory once per sentence the indices refer to.
        result = []
        for sent_index, word_indices in self._sent_and_word_indices(indices):
            for cond in conds:
                result.append(cond(sent_index, word_indices))
        return result

    def _handle_not(self):
        #not(drs(...))
        self.assertToken(self.token(), '(')
        drs = self.parse_Expression(None)
        self.assertToken(self.token(), ')')
        return BoxerNot(drs)

    def _handle_pred(self):
        #pred(_G3943, dog, n, 0)
        self.assertToken(self.token(), '(')
        variable = self.parse_variable()
        self.assertToken(self.token(), ',')
        name = self.token()
        self.assertToken(self.token(), ',')
        pos = self.token()
        self.assertToken(self.token(), ',')
        sense = int(self.token())
        self.assertToken(self.token(), ')')

        def _handle_pred_f(sent_index, word_indices):
            # An unindexed 'event' predicate with these pos/sense pairs is
            # represented by a BoxerEvent instead of a BoxerPred.
            is_event_marker = ((pos=='n' and sense==1) or (pos=='v' and sense==0))
            if name=='event' and sent_index is None and is_event_marker:
                return BoxerEvent(variable)
            else:
                return BoxerPred(self.discourse_id, sent_index, word_indices, variable, name, pos, sense)
        return _handle_pred_f

    def _handle_named(self):
        #named(x0, john, per, 0)
        self.assertToken(self.token(), '(')
        variable = self.parse_variable()
        self.assertToken(self.token(), ',')
        name = self.token()
        self.assertToken(self.token(), ',')
        type = self.token()
        self.assertToken(self.token(), ',')
        sense = int(self.token())
        self.assertToken(self.token(), ')')
        return lambda sent_index, word_indices: BoxerNamed(self.discourse_id, sent_index, word_indices, variable, name, type, sense)

    def _handle_rel(self):
        #rel(_G3993, _G3943, agent, 0)
        self.assertToken(self.token(), '(')
        var1 = self.parse_variable()
        self.assertToken(self.token(), ',')
        var2 = self.parse_variable()
        self.assertToken(self.token(), ',')
        rel = self.token()
        self.assertToken(self.token(), ',')
        sense = int(self.token())
        self.assertToken(self.token(), ')')
        return lambda sent_index, word_indices: BoxerRel(self.discourse_id, sent_index, word_indices, var1, var2, rel, sense)

    def _handle_timex(self):
        #timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX'))
        self.assertToken(self.token(), '(')
        arg = self.parse_variable()
        self.assertToken(self.token(), ',')
        new_conds = self._handle_time_expression(arg)
        self.assertToken(self.token(), ')')
        return new_conds

    def _handle_time_expression(self, arg):
        """
        Parse a C{date(...)} or C{time(...)} term.

        @return: C{list} of condition factories, or C{None} when the term is
            neither a date nor a time
        """
        #date([]: (+), []:'XXXX', [1004]:'04', []:'XX')
        tok = self.token()
        self.assertToken(self.token(), '(')
        if tok == 'date':
            conds = self._handle_date(arg)
        elif tok == 'time':
            conds = self._handle_time(arg)
        else:
            return None
        self.assertToken(self.token(), ')')

        def _head(sent_index, word_indices):
            return BoxerPred(self.discourse_id, sent_index, word_indices, arg, tok, 'n', 0)

        def _constant(cond):
            # Bind `cond` per factory.  A bare lambda in a loop would see
            # only the last value of the loop variable.
            return lambda sent_index, word_indices: cond

        return [_head] + [_constant(cond) for cond in conds]

    def _handle_date(self, arg):
        #[]: (+), []:'XXXX', [1004]:'04', []:'XX'
        conds = []
        (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
        self.assertToken(self.token(), '(')
        pol = self.token()
        self.assertToken(self.token(), ')')
        conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_pol_%s' % (pol), 'a', 0))
        self.assertToken(self.token(), ',')

        (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
        year = self.token()
        if year != 'XXXX':
            year = year.replace(':', '_')
            conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_year_%s' % (year), 'a', 0))
        self.assertToken(self.token(), ',')

        (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
        month = self.token()
        if month != 'XX':
            conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_month_%s' % (month), 'a', 0))
        self.assertToken(self.token(), ',')

        (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list())
        day = self.token()
        if day != 'XX':
            conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_day_%s' % (day), 'a', 0))

        return conds

    def _handle_time(self, arg):
        #time([1018]:'18', []:'XX', []:'XX')
        # NOTE(review): _make_atom is not defined in the part of this module
        # that is visible here -- confirm it exists on a base class.
        conds = []
        self._parse_index_list()
        hour = self.token()
        if hour != 'XX':
            conds.append(self._make_atom('r_hour_2',arg,hour))
        self.assertToken(self.token(), ',')

        self._parse_index_list()
        min = self.token()
        if min != 'XX':
            conds.append(self._make_atom('r_min_2',arg,min))
        self.assertToken(self.token(), ',')

        self._parse_index_list()
        sec = self.token()
        if sec != 'XX':
            conds.append(self._make_atom('r_sec_2',arg,sec))

        return conds

    def _handle_card(self):
        #card(_G18535, 28, ge)
        self.assertToken(self.token(), '(')
        variable = self.parse_variable()
        self.assertToken(self.token(), ',')
        value = self.token()
        self.assertToken(self.token(), ',')
        type = self.token()
        self.assertToken(self.token(), ')')
        return lambda sent_index, word_indices: BoxerCard(self.discourse_id, sent_index, word_indices, variable, value, type)

    def _handle_prop(self):
        #prop(_G15949, drs(...))
        self.assertToken(self.token(), '(')
        variable = self.parse_variable()
        self.assertToken(self.token(), ',')
        drs = self.parse_Expression(None)
        self.assertToken(self.token(), ')')
        return lambda sent_index, word_indices: BoxerProp(self.discourse_id, sent_index, word_indices, variable, drs)

    def _parse_index_list(self):
        """
        Parse an index prefix such as C{[1001,1002]:}.

        @return: C{list} of C{int}
        """
        #[1001,1002]:
        indices = []
        self.assertToken(self.token(), '[')
        while self.token(0) != ']':
            indices.append(self.parse_index())
            if self.token(0) == ',':
                self.token() #swallow ','
        self.token() #swallow ']'
        self.assertToken(self.token(), ':')
        return indices

    def parse_drs(self):
        """
        Parse the argument list of a C{drs} term into a C{BoxerDrs}.
        """
        #drs([[1001]:_G3943],
        #    [[1002]:pred(_G3943, dog, n, 0)]
        #   )
        label = self._label_counter.get()
        self.assertToken(self.token(), '(')
        self.assertToken(self.token(), '[')
        refs = set()
        while self.token(0) != ']':
            self._parse_index_list()  # indices of a referent are discarded
            refs.add(self.parse_variable())
            if self.token(0) == ',':
                self.token() #swallow ','
        self.token() #swallow ']'
        self.assertToken(self.token(), ',')
        self.assertToken(self.token(), '[')
        conds = []
        while self.token(0) != ']':
            indices = self._parse_index_list()
            conds.extend(self.parse_condition(indices))
            if self.token(0) == ',':
                self.token() #swallow ','
        self.token() #swallow ']'
        self.assertToken(self.token(), ')')
        return BoxerDrs(label, list(refs), conds)

    def _handle_binary_expression(self, make_callback):
        """
        Parse C{(drs1, drs2)} and return a factory that hands both DRSs to
        C{make_callback}.
        """
        self.assertToken(self.token(), '(')
        drs1 = self.parse_Expression(None)
        self.assertToken(self.token(), ',')
        drs2 = self.parse_Expression(None)
        self.assertToken(self.token(), ')')
        return lambda sent_index, word_indices: make_callback(sent_index, word_indices, drs1, drs2)

    def _handle_eq(self):
        #eq(x1, x2)
        self.assertToken(self.token(), '(')
        var1 = self.parse_variable()
        self.assertToken(self.token(), ',')
        var2 = self.parse_variable()
        self.assertToken(self.token(), ')')
        return lambda sent_index, word_indices: BoxerEq(self.discourse_id, sent_index, word_indices, var1, var2)

    def _handle_whq(self):
        """
        Parse a C{whq([answer types], drs, variable, drs)} term.
        """
        self.assertToken(self.token(), '(')
        self.assertToken(self.token(), '[')
        ans_types = []
        while self.token(0) != ']':
            cat = self.token()
            self.assertToken(self.token(), ':')
            if cat == 'des':
                ans_types.append(self.token())
            elif cat == 'num':
                ans_types.append('number')
                typ = self.token()
                if typ == 'cou':
                    ans_types.append('count')
                else:
                    ans_types.append(typ)
            else:
                ans_types.append(self.token())
        self.token() #swallow the ']'

        self.assertToken(self.token(), ',')
        d1 = self.parse_Expression(None)
        self.assertToken(self.token(), ',')
        ref = self.parse_variable()
        self.assertToken(self.token(), ',')
        d2 = self.parse_Expression(None)
        self.assertToken(self.token(), ')')
        return lambda sent_index, word_indices: BoxerWhq(self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2)

    def _make_merge_expression(self, sent_index, word_indices, drs1, drs2):
        # A merge keeps the first label and concatenates referents and conditions.
        return BoxerDrs(drs1.label, drs1.refs + drs2.refs, drs1.conds + drs2.conds)

    def _make_or_expression(self, sent_index, word_indices, drs1, drs2):
        return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2)

    def _make_imp_expression(self, sent_index, word_indices, drs1, drs2):
        # An implication is a BoxerDrs whose consequent is the second DRS.
        return BoxerDrs(drs1.label, drs1.refs, drs1.conds, drs2)

    def parse_variable(self):
        """
        Read a variable token of the form C{x<digits>}.

        @return: the numeric part as C{int}
        """
        var = self.token()
        assert re.match(r'^x\d+$', var)
        return int(var[1:])

    def parse_index(self):
        """
        @return: the next token as C{int}
        """
        return int(self.token())

    def _sent_and_word_indices(self, indices):
        """
        Split combined indices (C{sentence * 1000 + word}, both 1-based) into
        0-based sentence and word indices.

        @return: C{list} of (sent_index, word_indices) tuples
        """
        # '//' keeps the division integral regardless of how '/' is defined
        # for ints by the running interpreter.
        sent_indices = set((i // 1000)-1 for i in indices if i>=0)
        if sent_indices:
            pairs = []
            for sent_index in sent_indices:
                word_indices = [(i % 1000)-1 for i in indices if sent_index == (i // 1000)-1]
                pairs.append((sent_index, word_indices))
            return pairs
        else:
            # No non-negative index: report the words without a sentence.
            word_indices = [(i % 1000)-1 for i in indices]
            return [(None, word_indices)]
585 586
587 -class BoxerDrsParser(DrtParser):
588 """ 589 Reparse the str form of subclasses of C{AbstractBoxerDrs} 590 """
def __init__(self, discourse_id=None):
    """
    @param discourse_id: when not C{None}, this value replaces the discourse
        id read from the parsed text
    """
    DrtParser.__init__(self)
    # Remembered for handle(), which prefers it over the parsed id.
    self.discourse_id = discourse_id
594
595 - def get_all_symbols(self):
597
def attempt_adjuncts(self, expression, context):
    """
    Return C{expression} unchanged; C{context} is ignored.
    """
    result = expression
    return result
600
def _read_index_prefix(self):
    """
    Read the C{discourse id, sentence id, [word ids],} prefix shared by all
    indexed conditions, including the comma that follows it.

    @return: (disc_id, sent_id, word_ids)
    """
    parsed_disc_id = self.token()  # always consumed, even when overridden
    if self.discourse_id is not None:
        disc_id = self.discourse_id
    else:
        disc_id = parsed_disc_id
    self.assertNextToken(DrtTokens.COMMA)
    # The sentence id is written as 'None' for conditions that have no
    # sentence index, so it must be accepted for every condition type.
    sent_tok = self.token()
    if sent_tok == 'None':
        sent_id = None
    else:
        sent_id = int(sent_tok)
    self.assertNextToken(DrtTokens.COMMA)
    word_ids = [int(w) for w in self.handle_refs()]
    self.assertNextToken(DrtTokens.COMMA)
    return disc_id, sent_id, word_ids

def handle(self, tok, context):
    """
    Rebuild one C{AbstractBoxerDrs} object from its string form.

    @param tok: the functor that was just read (C{drs}, C{pred}, ...)
    @return: the reconstructed object
    @raise ParseException: if anything goes wrong while reading the arguments
    @raise AssertionError: if C{tok} is not a known functor
    """
    try:
        if tok == 'drs':
            self.assertNextToken(DrtTokens.OPEN)
            label = int(self.token())
            self.assertNextToken(DrtTokens.COMMA)
            refs = [int(r) for r in self.handle_refs()]
            self.assertNextToken(DrtTokens.COMMA)
            conds = self.handle_conds(None)
            self.assertNextToken(DrtTokens.CLOSE)
            return BoxerDrs(label, refs, conds)
        elif tok == 'pred':
            self.assertNextToken(DrtTokens.OPEN)
            disc_id, sent_id, word_ids = self._read_index_prefix()
            variable = int(self.token())
            self.assertNextToken(DrtTokens.COMMA)
            name = self.token()
            self.assertNextToken(DrtTokens.COMMA)
            pos = self.token()
            self.assertNextToken(DrtTokens.COMMA)
            sense = int(self.token())
            self.assertNextToken(DrtTokens.CLOSE)
            return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense)
        elif tok == 'named':
            self.assertNextToken(DrtTokens.OPEN)
            disc_id, sent_id, word_ids = self._read_index_prefix()
            variable = int(self.token())
            self.assertNextToken(DrtTokens.COMMA)
            name = self.token()
            self.assertNextToken(DrtTokens.COMMA)
            type = self.token()
            self.assertNextToken(DrtTokens.COMMA)
            sense = int(self.token())
            self.assertNextToken(DrtTokens.CLOSE)
            return BoxerNamed(disc_id, sent_id, word_ids, variable, name, type, sense)
        elif tok == 'rel':
            self.assertNextToken(DrtTokens.OPEN)
            disc_id, sent_id, word_ids = self._read_index_prefix()
            var1 = int(self.token())
            self.assertNextToken(DrtTokens.COMMA)
            var2 = int(self.token())
            self.assertNextToken(DrtTokens.COMMA)
            rel = self.token()
            self.assertNextToken(DrtTokens.COMMA)
            sense = int(self.token())
            self.assertNextToken(DrtTokens.CLOSE)
            return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense)
        elif tok == 'event':
            self.assertNextToken(DrtTokens.OPEN)
            var = int(self.token())
            self.assertNextToken(DrtTokens.CLOSE)
            return BoxerEvent(var)
        elif tok == 'prop':
            self.assertNextToken(DrtTokens.OPEN)
            disc_id, sent_id, word_ids = self._read_index_prefix()
            variable = int(self.token())
            self.assertNextToken(DrtTokens.COMMA)
            drs = self.parse_Expression(None)
            self.assertNextToken(DrtTokens.CLOSE)
            return BoxerProp(disc_id, sent_id, word_ids, variable, drs)
        elif tok == 'not':
            self.assertNextToken(DrtTokens.OPEN)
            drs = self.parse_Expression(None)
            self.assertNextToken(DrtTokens.CLOSE)
            return BoxerNot(drs)
        elif tok == 'imp':
            self.assertNextToken(DrtTokens.OPEN)
            drs1 = self.parse_Expression(None)
            self.assertNextToken(DrtTokens.COMMA)
            drs2 = self.parse_Expression(None)
            self.assertNextToken(DrtTokens.CLOSE)
            # An implication is a BoxerDrs carrying drs2 as its consequent.
            return BoxerDrs(drs1.label, drs1.refs, drs1.conds, drs2)
        elif tok == 'or':
            self.assertNextToken(DrtTokens.OPEN)
            disc_id, sent_id, word_ids = self._read_index_prefix()
            drs1 = self.parse_Expression(None)
            self.assertNextToken(DrtTokens.COMMA)
            drs2 = self.parse_Expression(None)
            self.assertNextToken(DrtTokens.CLOSE)
            return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2)
        elif tok == 'eq':
            self.assertNextToken(DrtTokens.OPEN)
            disc_id, sent_id, word_ids = self._read_index_prefix()
            var1 = int(self.token())
            self.assertNextToken(DrtTokens.COMMA)
            var2 = int(self.token())
            self.assertNextToken(DrtTokens.CLOSE)
            return BoxerEq(disc_id, sent_id, word_ids, var1, var2)
        elif tok == 'card':
            self.assertNextToken(DrtTokens.OPEN)
            disc_id, sent_id, word_ids = self._read_index_prefix()
            var = int(self.token())
            self.assertNextToken(DrtTokens.COMMA)
            value = self.token()
            self.assertNextToken(DrtTokens.COMMA)
            type = self.token()
            self.assertNextToken(DrtTokens.CLOSE)
            return BoxerCard(disc_id, sent_id, word_ids, var, value, type)
        elif tok == 'whq':
            self.assertNextToken(DrtTokens.OPEN)
            disc_id, sent_id, word_ids = self._read_index_prefix()
            ans_types = self.handle_refs()
            self.assertNextToken(DrtTokens.COMMA)
            drs1 = self.parse_Expression(None)
            self.assertNextToken(DrtTokens.COMMA)
            var = int(self.token())
            self.assertNextToken(DrtTokens.COMMA)
            drs2 = self.parse_Expression(None)
            self.assertNextToken(DrtTokens.CLOSE)
            return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2)
    except Exception as e:
        # Report every failure as a parse error at the current position.
        raise ParseException(self._currentIndex, str(e))
    # Only reached when no branch matched the functor.
    assert False, repr(tok)
754
def nullableIntToken(self):
    """
    Read the next token as an optional integer.

    @return: C{None} if the token is the text C{'None'}, otherwise the token
        converted to C{int}
    """
    t = self.token()
    # Convert only after the 'None' check.  Building [None, int(t)] first
    # would call int('None') and raise ValueError.
    if t == 'None':
        return None
    return int(t)
758
def get_next_token_variable(self, description):
    """
    Return the next token.  C{description} is not used.

    @raise ExpectedMoreTokensException: with the message
        C{'Variable expected.'} when no token is left
    """
    try:
        tok = self.token()
    except ExpectedMoreTokensException as e:
        # Keep the index, replace the message.
        raise ExpectedMoreTokensException(e.index, 'Variable expected.')
    return tok
764 765 766
class AbstractBoxerDrs(object):
    """
    Base class of the Boxer DRS object hierarchy.  Every method here is a
    neutral default that subclasses override where needed.
    """

    def variables(self):
        """
        @return: (set<variables>, set<events>, set<propositions>)
        """
        all_vars, event_vars, prop_vars = self._variables()
        # Events win over propositions, and both win over plain variables.
        plain_vars = all_vars - (event_vars | prop_vars)
        return (plain_vars, event_vars, prop_vars - event_vars)

    def variable_types(self):
        """
        @return: C{dict} from variable to C{'z'} (plain), C{'e'} (event) or
            C{'p'} (proposition)
        """
        vartypes = {}
        for type_code, group in zip(('z','e','p'), self.variables()):
            vartypes.update(dict.fromkeys(group, type_code))
        return vartypes

    def _variables(self):
        """
        @return: (set<variables>, set<events>, set<propositions>)
        """
        return (set(), set(), set())

    def atoms(self):
        """
        @return: an empty C{set}
        """
        return set()

    def clean(self):
        """
        @return: this object itself
        """
        return self

    def _clean_name(self, name):
        """
        @return: C{name} with every C{-} and C{'} replaced by C{_}
        """
        without_dashes = name.replace('-','_')
        return without_dashes.replace("'", "_")

    def renumber_sentences(self, f):
        """
        @return: this object itself; C{f} is not used
        """
        return self

    def __hash__(self):
        # The hash is derived from the string form of the object.
        return hash(str(self))
802
class BoxerDrs(AbstractBoxerDrs):
    """
    A DRS: a label, discourse referents and conditions.  When C{consequent}
    is set the object stands for the implication C{imp(drs, consequent)}.
    """

    def __init__(self, label, refs, conds, consequent=None):
        """
        @param label: label of the DRS
        @param refs: C{list} of discourse referents
        @param conds: C{list} of C{AbstractBoxerDrs} conditions
        @param consequent: optional DRS that this DRS implies
        """
        AbstractBoxerDrs.__init__(self)
        self.label = label
        self.refs = refs
        self.conds = conds
        self.consequent = consequent

    def _variables(self):
        """
        @return: (set<variables>, set<events>, set<propositions>) collected
            from all conditions and the consequent
        """
        variables = (set(), set(), set())
        parts = list(self.conds)
        if self.consequent is not None:
            parts.append(self.consequent)
        for part in parts:
            for collected, found in zip(variables, part._variables()):
                collected.update(found)
        return variables

    def atoms(self):
        """
        @return: C{set} of the atoms of all conditions and the consequent
        """
        atoms = set()
        for cond in self.conds:
            atoms.update(cond.atoms())
        if self.consequent is not None:
            atoms.update(self.consequent.atoms())
        return atoms

    def clean(self):
        """
        @return: a new C{BoxerDrs} whose conditions and consequent are cleaned
        """
        consequent = None
        if self.consequent is not None:
            consequent = self.consequent.clean()
        return BoxerDrs(self.label, self.refs, [c.clean() for c in self.conds], consequent)

    def renumber_sentences(self, f):
        """
        @param f: function applied to sentence indices by the conditions
        @return: a new C{BoxerDrs} with renumbered conditions and consequent
        """
        consequent = None
        if self.consequent is not None:
            consequent = self.consequent.renumber_sentences(f)
        return BoxerDrs(self.label, self.refs, [c.renumber_sentences(f) for c in self.conds], consequent)

    def __repr__(self):
        s = 'drs(%s, [%s], [%s])' % (self.label,
                                    ', '.join(map(str, self.refs)),
                                    ', '.join(map(str, self.conds)))
        if self.consequent is not None:
            s = 'imp(%s, %s)' % (s, self.consequent)
        return s

    def __eq__(self, other):
        # all() is true for two empty condition lists; a reduce() without
        # an initial value would raise TypeError there.
        return self.__class__ == other.__class__ and \
               self.label == other.label and \
               self.refs == other.refs and \
               len(self.conds) == len(other.conds) and \
               all(c1 == c2 for c1, c2 in zip(self.conds, other.conds)) and \
               self.consequent == other.consequent

    def __ne__(self, other):
        # Keep != consistent with ==.
        return not self == other

    # Defining __eq__ must not cost the class its string-based hash.
    __hash__ = AbstractBoxerDrs.__hash__
856
class BoxerNot(AbstractBoxerDrs):
    """
    Negation of a DRS.
    """

    def __init__(self, drs):
        """
        @param drs: the negated DRS
        """
        AbstractBoxerDrs.__init__(self)
        self.drs = drs

    def _variables(self):
        """
        @return: the variable sets of the negated DRS
        """
        return self.drs._variables()

    def atoms(self):
        """
        @return: the atoms of the negated DRS
        """
        return self.drs.atoms()

    def clean(self):
        """
        @return: a new C{BoxerNot} around the cleaned DRS
        """
        return BoxerNot(self.drs.clean())

    def renumber_sentences(self, f):
        """
        @return: a new C{BoxerNot} around the renumbered DRS
        """
        return BoxerNot(self.drs.renumber_sentences(f))

    def __repr__(self):
        return 'not(%s)' % (self.drs)

    def __eq__(self, other):
        return self.__class__ == other.__class__ and self.drs == other.drs

    def __ne__(self, other):
        # Keep != consistent with ==.
        return not self == other

    # Defining __eq__ must not cost the class its string-based hash.
    __hash__ = AbstractBoxerDrs.__hash__
879
class BoxerEvent(AbstractBoxerDrs):
    """
    Marks a variable as an event.
    """

    def __init__(self, var):
        """
        @param var: the event variable
        """
        AbstractBoxerDrs.__init__(self)
        self.var = var

    def _variables(self):
        """
        @return: three sets, with C{var} in the events set only
        """
        return (set(), set([self.var]), set())

    def __repr__(self):
        return 'event(%s)' % (self.var)

    def __eq__(self, other):
        return self.__class__ == other.__class__ and self.var == other.var

    def __ne__(self, other):
        # Keep != consistent with ==.
        return not self == other

    # Defining __eq__ must not cost the class its string-based hash.
    __hash__ = AbstractBoxerDrs.__hash__
893
class BoxerIndexed(AbstractBoxerDrs):
    """
    Base class of conditions that carry a discourse id, a sentence index and
    word indices.  C{__eq__} and C{__repr__} iterate over C{self} and
    C{__repr__} calls C{self._pred()}; subclasses provide C{__iter__} and
    C{_pred}.
    """

    def __init__(self, discourse_id, sent_index, word_indices):
        """
        @param discourse_id: identifier of the discourse
        @param sent_index: sentence index, may be C{None}
        @param word_indices: C{list} of word indices
        """
        AbstractBoxerDrs.__init__(self)
        self.discourse_id = discourse_id
        self.sent_index = sent_index
        self.word_indices = word_indices

    def atoms(self):
        """
        @return: a C{set} holding only this condition
        """
        return set([self])

    def __eq__(self, other):
        # all() instead of reduce() without an initial value, which raises
        # TypeError when there is nothing to compare.
        return self.__class__ == other.__class__ and \
               self.discourse_id == other.discourse_id and \
               self.sent_index == other.sent_index and \
               self.word_indices == other.word_indices and \
               all(s == o for s, o in zip(self, other))

    def __ne__(self, other):
        # Keep != consistent with ==.
        return not self == other

    # atoms() stores instances in a set, so the class has to stay hashable
    # although it defines __eq__.
    __hash__ = AbstractBoxerDrs.__hash__

    def __repr__(self):
        s = '%s(%s, %s, [%s]' % (self._pred(), self.discourse_id, self.sent_index, ', '.join(map(str, self.word_indices)))
        for v in self:
            s += ', %s' % v
        return s + ')'
916
class BoxerPred(BoxerIndexed):
    """
    Indexed condition over one variable, carrying a name, a part of
    speech and a sense.  Its functor name is C{pred}.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense):
        """
        @param discourse_id: passed through to C{BoxerIndexed}
        @param sent_index: passed through to C{BoxerIndexed}
        @param word_indices: passed through to C{BoxerIndexed}
        @param var: the variable the predicate applies to
        @param name: the predicate name
        @param pos: the part-of-speech tag stored with the predicate
        @param sense: the sense value stored with the predicate
        """
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var, self.name = var, name
        self.pos, self.sense = pos, sense

    def _variables(self):
        # Only the first slot of the triple is populated.
        own = set([self.var])
        return (own, set(), set())

    def change_var(self, var):
        """Return a copy of this predicate applied to C{var} instead."""
        return BoxerPred(self.discourse_id, self.sent_index, self.word_indices,
                         var, self.name, self.pos, self.sense)

    def clean(self):
        """Return a copy whose name has been passed through C{_clean_name}."""
        return BoxerPred(self.discourse_id, self.sent_index, self.word_indices,
                         self.var, self._clean_name(self.name), self.pos, self.sense)

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is C{f(self.sent_index)}."""
        return BoxerPred(self.discourse_id, f(self.sent_index), self.word_indices,
                         self.var, self.name, self.pos, self.sense)

    def __iter__(self):
        fields = (self.var, self.name, self.pos, self.sense)
        return iter(fields)

    def _pred(self):
        return 'pred'
943
class BoxerNamed(BoxerIndexed):
    """
    Indexed condition over one variable, carrying a name, a type and a
    sense.  Its functor name is C{named}.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense):
        """
        @param discourse_id: passed through to C{BoxerIndexed}
        @param sent_index: passed through to C{BoxerIndexed}
        @param word_indices: passed through to C{BoxerIndexed}
        @param var: the variable the name applies to
        @param name: the name itself
        @param type: the type value stored with the name
        @param sense: the sense value stored with the name
        """
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var, self.name = var, name
        self.type, self.sense = type, sense

    def _variables(self):
        # Only the first slot of the triple is populated.
        own = set([self.var])
        return (own, set(), set())

    def change_var(self, var):
        """Return a copy of this condition applied to C{var} instead."""
        return BoxerNamed(self.discourse_id, self.sent_index, self.word_indices,
                          var, self.name, self.type, self.sense)

    def clean(self):
        """Return a copy whose name has been passed through C{_clean_name}."""
        return BoxerNamed(self.discourse_id, self.sent_index, self.word_indices,
                          self.var, self._clean_name(self.name), self.type, self.sense)

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is C{f(self.sent_index)}."""
        return BoxerNamed(self.discourse_id, f(self.sent_index), self.word_indices,
                          self.var, self.name, self.type, self.sense)

    def __iter__(self):
        fields = (self.var, self.name, self.type, self.sense)
        return iter(fields)

    def _pred(self):
        return 'named'
969
class BoxerRel(BoxerIndexed):
    """
    Indexed condition relating two variables, carrying a relation name
    and a sense.  Its functor name is C{rel}.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense):
        """
        @param discourse_id: passed through to C{BoxerIndexed}
        @param sent_index: passed through to C{BoxerIndexed}
        @param word_indices: passed through to C{BoxerIndexed}
        @param var1: first variable of the relation
        @param var2: second variable of the relation
        @param rel: the relation name
        @param sense: the sense value stored with the relation
        """
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var1, self.var2 = var1, var2
        self.rel, self.sense = rel, sense

    def _variables(self):
        # Both variables go in the first slot of the triple.
        related = set([self.var1, self.var2])
        return (related, set(), set())

    def clean(self):
        """Return a copy whose relation name has been passed through C{_clean_name}."""
        return BoxerRel(self.discourse_id, self.sent_index, self.word_indices,
                        self.var1, self.var2, self._clean_name(self.rel), self.sense)

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is C{f(self.sent_index)}."""
        return BoxerRel(self.discourse_id, f(self.sent_index), self.word_indices,
                        self.var1, self.var2, self.rel, self.sense)

    def __iter__(self):
        fields = (self.var1, self.var2, self.rel, self.sense)
        return iter(fields)

    def _pred(self):
        return 'rel'
992
class BoxerProp(BoxerIndexed):
    """
    Indexed condition pairing a variable with a nested DRS expression.
    Its functor name is C{prop}.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var, drs):
        """
        @param discourse_id: passed through to C{BoxerIndexed}
        @param sent_index: passed through to C{BoxerIndexed}
        @param word_indices: passed through to C{BoxerIndexed}
        @param var: the variable associated with the nested expression
        @param drs: the nested expression
        """
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var, self.drs = var, drs

    def _variables(self):
        # This condition's variable goes in the third slot; the triple is
        # then unioned slot by slot with the nested expression's triple.
        own = (set(), set(), set([self.var]))
        nested = self.drs._variables()
        return tuple(map(operator.or_, own, nested))

    def referenced_labels(self):
        """
        @return: a set containing the nested expression
        """
        return set([self.drs])

    def atoms(self):
        nested = self.drs
        return nested.atoms()

    def clean(self):
        """Return a copy wrapping the cleaned nested expression."""
        return BoxerProp(self.discourse_id, self.sent_index, self.word_indices,
                         self.var, self.drs.clean())

    def renumber_sentences(self, f):
        """
        Return a copy whose sentence index is C{f(self.sent_index)} and
        whose nested expression has been renumbered with the same C{f}.
        """
        return BoxerProp(self.discourse_id, f(self.sent_index), self.word_indices,
                         self.var, self.drs.renumber_sentences(f))

    def __iter__(self):
        fields = (self.var, self.drs)
        return iter(fields)

    def _pred(self):
        return 'prop'
1019
class BoxerEq(BoxerIndexed):
    """
    Indexed condition stating that two variables are equal.  Its functor
    name is C{eq}, and it reports no atoms.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var1, var2):
        """
        @param discourse_id: passed through to C{BoxerIndexed}
        @param sent_index: passed through to C{BoxerIndexed}
        @param word_indices: passed through to C{BoxerIndexed}
        @param var1: first variable of the equality
        @param var2: second variable of the equality
        """
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var1, self.var2 = var1, var2

    def _variables(self):
        # Both variables go in the first slot of the triple.
        equated = set([self.var1, self.var2])
        return (equated, set(), set())

    def atoms(self):
        """
        @return: always an empty set
        """
        return set()

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is C{f(self.sent_index)}."""
        return BoxerEq(self.discourse_id, f(self.sent_index), self.word_indices,
                       self.var1, self.var2)

    def __iter__(self):
        fields = (self.var1, self.var2)
        return iter(fields)

    def _pred(self):
        return 'eq'
1040
class BoxerCard(BoxerIndexed):
    """
    Indexed condition attaching a value and a type to one variable.  Its
    functor name is C{card}.
    """

    def __init__(self, discourse_id, sent_index, word_indices, var, value, type):
        """
        @param discourse_id: passed through to C{BoxerIndexed}
        @param sent_index: passed through to C{BoxerIndexed}
        @param word_indices: passed through to C{BoxerIndexed}
        @param var: the variable the condition applies to
        @param value: the value stored with the condition
        @param type: the type stored with the condition
        """
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.var, self.value, self.type = var, value, type

    def _variables(self):
        # Only the first slot of the triple is populated.
        own = set([self.var])
        return (own, set(), set())

    def renumber_sentences(self, f):
        """Return a copy whose sentence index is C{f(self.sent_index)}."""
        return BoxerCard(self.discourse_id, f(self.sent_index), self.word_indices,
                         self.var, self.value, self.type)

    def __iter__(self):
        fields = (self.var, self.value, self.type)
        return iter(fields)

    def _pred(self):
        return 'card'
1059
class BoxerOr(BoxerIndexed):
    """
    Indexed disjunction of two DRS expressions, C{drs1} and C{drs2}.  Its
    functor name is C{or}.
    """

    def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2):
        """
        @param discourse_id: passed through to C{BoxerIndexed}
        @param sent_index: passed through to C{BoxerIndexed}
        @param word_indices: passed through to C{BoxerIndexed}
        @param drs1: first disjunct
        @param drs2: second disjunct
        """
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.drs1 = drs1
        self.drs2 = drs2

    def _variables(self):
        """
        @return: the slot-by-slot union of both disjuncts' variable triples
        """
        return tuple(map(operator.or_, self.drs1._variables(), self.drs2._variables()))

    def atoms(self):
        """
        @return: the union of both disjuncts' atoms
        """
        return self.drs1.atoms() | self.drs2.atoms()

    def clean(self):
        """Return a copy built from the cleaned disjuncts."""
        return BoxerOr(self.discourse_id, self.sent_index, self.word_indices,
                       self.drs1.clean(), self.drs2.clean())

    def renumber_sentences(self, f):
        """
        Return a copy whose sentence index is C{f(self.sent_index)} and
        whose disjuncts have been renumbered with the same C{f}.

        @param f: function mapping an old sentence index to a new one
        """
        # Recurse into both disjuncts, as BoxerNot and BoxerProp do for
        # their nested expressions; otherwise conditions inside the
        # disjunction would keep their old sentence indices.
        return BoxerOr(self.discourse_id, f(self.sent_index), self.word_indices,
                       self.drs1.renumber_sentences(f),
                       self.drs2.renumber_sentences(f))

    def __iter__(self):
        return iter((self.drs1, self.drs2))

    def _pred(self):
        return 'or'
1083
class BoxerWhq(BoxerIndexed):
    """
    Indexed wh-question condition made of a list of answer types, two
    DRS expressions (C{drs1}, C{drs2}) and a variable.  Its functor name
    is C{whq}.
    """

    def __init__(self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2):
        """
        @param discourse_id: passed through to C{BoxerIndexed}
        @param sent_index: passed through to C{BoxerIndexed}
        @param word_indices: passed through to C{BoxerIndexed}
        @param ans_types: iterable of answer-type strings
        @param drs1: first nested expression
        @param variable: the variable stored with the question
        @param drs2: second nested expression
        """
        BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices)
        self.ans_types = ans_types
        self.drs1 = drs1
        self.variable = variable
        self.drs2 = drs2

    def _variables(self):
        """
        @return: the slot-by-slot union of this condition's own triple
            (its variable in the first slot) with the triples of both
            nested expressions
        """
        # operator.or_ is binary, so map(operator.or_, a, b, c) raises
        # TypeError; union the three triples slot by slot explicitly.
        own = (set([self.variable]), set(), set())
        return tuple(a | b | c for a, b, c in
                     zip(own, self.drs1._variables(), self.drs2._variables()))

    def atoms(self):
        """
        @return: the union of both nested expressions' atoms
        """
        return self.drs1.atoms() | self.drs2.atoms()

    def clean(self):
        """Return a copy built from the cleaned nested expressions."""
        return BoxerWhq(self.discourse_id, self.sent_index, self.word_indices,
                        self.ans_types, self.drs1.clean(), self.variable, self.drs2.clean())

    def renumber_sentences(self, f):
        """
        Return a copy whose sentence index is C{f(self.sent_index)} and
        whose nested expressions have been renumbered with the same C{f}.

        @param f: function mapping an old sentence index to a new one
        """
        # Recurse into both nested expressions, as BoxerNot and BoxerProp
        # do; otherwise their conditions would keep old sentence indices.
        return BoxerWhq(self.discourse_id, f(self.sent_index), self.word_indices,
                        self.ans_types,
                        self.drs1.renumber_sentences(f),
                        self.variable,
                        self.drs2.renumber_sentences(f))

    def __iter__(self):
        # The answer types are rendered as one bracketed, comma-joined string.
        return iter(('['+','.join(self.ans_types)+']', self.drs1, self.variable, self.drs2))

    def _pred(self):
        return 'whq'
1109 1110 1111
class PassthroughBoxerDrsInterpreter(object):
    """
    Interpreter that performs no conversion.
    """

    def interpret(self, ex):
        """
        @param ex: the expression to interpret
        @return: C{ex} itself, unchanged
        """
        unchanged = ex
        return unchanged
1115 1116
class NltkDrtBoxerDrsInterpreter(object):
    """
    Interpreter that converts Boxer DRS objects into NLTK DRT expressions.
    """

    def __init__(self, occur_index=False):
        """
        @param occur_index: when true, predicate names are suffixed with
            discourse, sentence and word position information
        """
        self._occur_index = occur_index

    def interpret(self, ex):
        """
        @param ex: C{AbstractBoxerDrs}
        @return: C{AbstractDrs}
        """
        if isinstance(ex, BoxerDrs):
            referents = [Variable('x%d' % ref) for ref in ex.refs]
            conditions = [self.interpret(cond) for cond in ex.conds]
            result = DRS(referents, conditions)
            if ex.label is not None:
                result.label = Variable('x%d' % ex.label)
            if ex.consequent is not None:
                result.consequent = self.interpret(ex.consequent)
            return result

        if isinstance(ex, BoxerNot):
            return DrtNegatedExpression(self.interpret(ex.drs))

        if isinstance(ex, BoxerEvent):
            return self._make_atom('event', 'x%d' % ex.var)

        if isinstance(ex, BoxerPred):
            name = self._add_occur_indexing('%s_%s' % (ex.pos, ex.name), ex)
            return self._make_atom(name, 'x%d' % ex.var)

        if isinstance(ex, BoxerNamed):
            name = self._add_occur_indexing('ne_%s_%s' % (ex.type, ex.name), ex)
            return self._make_atom(name, 'x%d' % ex.var)

        if isinstance(ex, BoxerRel):
            name = self._add_occur_indexing('%s' % ex.rel, ex)
            return self._make_atom(name, 'x%d' % ex.var1, 'x%d' % ex.var2)

        if isinstance(ex, BoxerProp):
            return DrtProposition(Variable('x%d' % ex.var), self.interpret(ex.drs))

        if isinstance(ex, BoxerEq):
            left = DrtVariableExpression(Variable('x%d' % ex.var1))
            right = DrtVariableExpression(Variable('x%d' % ex.var2))
            return DrtEqualityExpression(left, right)

        if isinstance(ex, BoxerCard):
            name = self._add_occur_indexing('card_%s_%s' % (ex.type, ex.value), ex)
            return self._make_atom(name, 'x%d' % ex.var)

        if isinstance(ex, BoxerOr):
            return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2))

        if isinstance(ex, BoxerWhq):
            # Both parts of the question are merged into a single DRS.
            first = self.interpret(ex.drs1)
            second = self.interpret(ex.drs2)
            return DRS(first.refs + second.refs, first.conds + second.conds)

        # No known expression type matched.
        assert False, '%s: %s' % (ex.__class__.__name__, ex)

    def _make_atom(self, pred, *args):
        """
        Build the application of the predicate named C{pred} to each of
        C{args} in turn, left to right.

        @param pred: name of the predicate
        @param args: names of the argument variables
        @return: C{pred} as a variable expression if C{args} is empty,
            otherwise the nested application expression
        """
        expression = DrtVariableExpression(Variable(pred))
        for name in args:
            argument = DrtVariableExpression(Variable(name))
            expression = DrtApplicationExpression(expression, argument)
        return expression

    def _add_occur_indexing(self, base, ex):
        """
        Append position information to C{base} when occurrence indexing is
        enabled and C{ex} has a sentence index.

        @param base: the predicate name to extend
        @param ex: object providing C{discourse_id}, C{sent_index} and
            C{word_indices}
        @return: C{base}, followed by C{_<discourse_id>} (only if the id
            is truthy), C{_s<sent_index>} and C{_w<smallest word index>};
            or C{base} unchanged when indexing does not apply
        """
        if not self._occur_index or ex.sent_index is None:
            return base
        if ex.discourse_id:
            base = base + '_%s' % ex.discourse_id
        first_word = sorted(ex.word_indices)[0]
        return base + '_s%s' % ex.sent_index + '_w%s' % first_word
1175 1176
class UnparseableInputException(Exception):
    """
    Plain C{Exception} subclass that adds no behaviour of its own.

    NOTE(review): presumably raised when the input cannot be parsed;
    confirm against the raise sites.
    """
if __name__ == '__main__':
    # Command-line entry point: takes one TEXT argument, runs it through
    # Boxer and prints the resulting DRS (or its first-order form).
    parser = OptionParser("usage: %prog TEXT [options]")
    parser.add_option("--verbose", "-v", dest="verbose", action="store_true",
                      default=False, help="display verbose logs")
    parser.add_option("--fol", "-f", dest="fol", action="store_true",
                      default=False, help="output FOL")
    parser.add_option("--question", "-q", dest="question", action="store_true",
                      default=False, help="input is a question")
    parser.add_option("--occur", "-o", dest="occur_index", action="store_true",
                      default=False, help="occurrence index")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("incorrect number of arguments")

    interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index)
    boxer = Boxer(interpreter)
    # The raw string splits on the two characters backslash + 'n', not on
    # a real newline.
    sentences = args[0].split(r'\n')
    drs = boxer.interpret_multisentence(sentences,
                                        question=options.question,
                                        verbose=options.verbose)
    if drs is not None:
        drs = drs.simplify().eliminate_equality()
        if options.fol:
            print(drs.fol().normalize())
        else:
            drs.normalize().pprint()
    else:
        print(None)