Package nltk :: Package app :: Module concordance_app
[hide private]
[frames | no frames]

Source Code for Module nltk.app.concordance_app

  1  # Natural Language Toolkit: Concordance Application 
  2  # 
  3  # Copyright (C) 2001-2011 NLTK Project 
  4  # Author: Sumukh Ghodke <sghodke@csse.unimelb.edu.au> 
  5  # URL: <http://www.nltk.org/> 
  6  # For license information, see LICENSE.TXT 
  7  # 
  8  # $Id: concordance.py 6121 2008-07-11 02:10:33Z stevenbird $ 
  9   
 10  import re 
 11  import threading 
 12   
 13  import nltk 
 14  from nltk.util import in_idle 
 15  from nltk.draw.util import * 
 16   
 17  WORD_OR_TAG = '[^/ ]+' 
 18  BOUNDARY = r'\b' 
 19   
 20  CORPUS_LOADED_EVENT = '<<CL_EVENT>>' 
 21  SEARCH_TERMINATED_EVENT = '<<ST_EVENT>>' 
 22  SEARCH_ERROR_EVENT = '<<SE_EVENT>>' 
 23  ERROR_LOADING_CORPUS_EVENT = '<<ELC_EVENT>>' 
 24   
 25  # NB All corpora must be specified in a lambda expression so as not to be 
 26  # loaded when the module is imported. 
 27   
 28  _DEFAULT = 'English: Brown Corpus (Humor, simplified)' 
 29  _CORPORA = { 
 30              'Catalan: CESS-CAT Corpus (simplified)': 
 31                  lambda: nltk.corpus.cess_cat.tagged_sents(simplify_tags=True), 
 32              'English: Brown Corpus': 
 33                  lambda: nltk.corpus.brown.tagged_sents(), 
 34              'English: Brown Corpus (simplified)': 
 35                  lambda: nltk.corpus.brown.tagged_sents(simplify_tags=True), 
 36              'English: Brown Corpus (Press, simplified)': 
 37                  lambda: nltk.corpus.brown.tagged_sents(categories=['news', 'editorial', 'reviews'], simplify_tags=True), 
 38              'English: Brown Corpus (Religion, simplified)': 
 39                  lambda: nltk.corpus.brown.tagged_sents(categories='religion', simplify_tags=True), 
 40              'English: Brown Corpus (Learned, simplified)': 
 41                  lambda: nltk.corpus.brown.tagged_sents(categories='learned', simplify_tags=True), 
 42              'English: Brown Corpus (Science Fiction, simplified)': 
 43                  lambda: nltk.corpus.brown.tagged_sents(categories='science_fiction', simplify_tags=True), 
 44              'English: Brown Corpus (Romance, simplified)': 
 45                  lambda: nltk.corpus.brown.tagged_sents(categories='romance', simplify_tags=True), 
 46              'English: Brown Corpus (Humor, simplified)': 
 47                  lambda: nltk.corpus.brown.tagged_sents(categories='humor', simplify_tags=True), 
 48              'English: NPS Chat Corpus': 
 49                  lambda: nltk.corpus.nps_chat.tagged_posts(), 
 50              'English: NPS Chat Corpus (simplified)': 
 51                  lambda: nltk.corpus.nps_chat.tagged_posts(simplify_tags=True), 
 52              'English: Wall Street Journal Corpus': 
 53                  lambda: nltk.corpus.treebank.tagged_sents(), 
 54              'English: Wall Street Journal Corpus (simplified)': 
 55                  lambda: nltk.corpus.treebank.tagged_sents(simplify_tags=True), 
 56              'Chinese: Sinica Corpus': 
 57                  lambda: nltk.corpus.sinica_treebank.tagged_sents(), 
 58              'Chinese: Sinica Corpus (simplified)': 
 59                  lambda: nltk.corpus.sinica_treebank.tagged_sents(simplify_tags=True), 
 60              'Dutch: Alpino Corpus': 
 61                  lambda: nltk.corpus.alpino.tagged_sents(), 
 62              'Dutch: Alpino Corpus (simplified)': 
 63                  lambda: nltk.corpus.alpino.tagged_sents(simplify_tags=True), 
 64              'Hindi: Indian Languages Corpus': 
 65                  lambda: nltk.corpus.indian.tagged_sents(files='hindi.pos'), 
 66              'Hindi: Indian Languages Corpus (simplified)': 
 67                  lambda: nltk.corpus.indian.tagged_sents(files='hindi.pos', simplify_tags=True), 
 68              'Portuguese: Floresta Corpus (Portugal)': 
 69                  lambda: nltk.corpus.floresta.tagged_sents(), 
 70              'Portuguese: Floresta Corpus (Portugal, simplified)': 
 71                  lambda: nltk.corpus.floresta.tagged_sents(simplify_tags=True), 
 72              'Portuguese: MAC-MORPHO Corpus (Brazil)': 
 73                  lambda: nltk.corpus.mac_morpho.tagged_sents(), 
 74              'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': 
 75                  lambda: nltk.corpus.mac_morpho.tagged_sents(simplify_tags=True), 
 76              'Spanish: CESS-ESP Corpus (simplified)': 
 77                  lambda: nltk.corpus.cess_esp.tagged_sents(simplify_tags=True), 
 78             } 
 79   
class ConcordanceSearchView(object):
    """Tk window for the concordance search.

    The view owns a ``ConcordanceSearchModel``, registers itself as a
    listener on it, and reacts to the model's notifications through Tk
    virtual events (see ``fire_event`` and ``_bind_event_handlers``).
    """

    _BACKGROUND_COLOUR = '#FFF'  # white

    # Colour and text-widget tag used for the matched words
    _HIGHLIGHT_WORD_COLOUR = '#F00'  # red
    _HIGHLIGHT_WORD_TAG = 'HL_WRD_TAG'

    # Colour and text-widget tag used for the labels of the matched words
    _HIGHLIGHT_LABEL_COLOUR = '#C0C0C0'  # grey
    _HIGHLIGHT_LABEL_TAG = 'HL_LBL_TAG'

    # Fraction passed to xview_moveto() after results are written
    _FRACTION_LEFT_TEXT = 0.30

    def __init__(self):
        """Build the window and start loading the default corpus."""
        self.model = ConcordanceSearchModel()
        self.model.add_listener(self)
        self.top = Tk()
        self._init_top(self.top)
        # The menubar is built before the widgets; invoking its default
        # radio entries sets result_count, _char_before and _char_after.
        self._init_menubar()
        self._init_widgets(self.top)
        self._bind_event_handlers()
        self.load_corpus(self.model.DEFAULT_CORPUS)

    def _init_top(self, top):
        """Configure geometry, title, quit shortcut and minimum size."""
        top.geometry('950x680+50+50')
        top.title('NLTK Concordance Search')
        top.bind('<Control-q>', self.destroy)
        top.minsize(950, 680)

    def _init_widgets(self, parent):
        """Create the main frame and all widgets inside it."""
        self.main_frame = Frame(
            parent,
            dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1))
        self._init_corpus_select(self.main_frame)
        self._init_query_box(self.main_frame)
        self._init_results_box(self.main_frame)
        self._init_paging(self.main_frame)
        self._init_status(self.main_frame)
        self.main_frame.pack(fill='both', expand=True)

    def _add_radio_choices(self, menu, variable, command, choices):
        """Add one radiobutton to ``menu`` per ``(label, value)`` pair."""
        for label, value in choices:
            menu.add_radiobutton(label=label, variable=variable,
                                 underline=0, value=value, command=command)

    def _init_menubar(self):
        """Build the File and Edit menus and select the default options."""
        self._result_size = IntVar(self.top)
        self._cntx_bf_len = IntVar(self.top)
        self._cntx_af_len = IntVar(self.top)
        menubar = Menu(self.top)

        filemenu = Menu(menubar, tearoff=0, borderwidth=0)
        filemenu.add_command(label='Exit', underline=1,
                             command=self.destroy, accelerator='Ctrl-q')
        menubar.add_cascade(label='File', underline=0, menu=filemenu)

        editmenu = Menu(menubar, tearoff=0)

        # Edit > Result Count
        rescntmenu = Menu(editmenu, tearoff=0)
        self._add_radio_choices(
            rescntmenu, self._result_size, self.set_result_size,
            [('20', 20), ('50', 50), ('100', 100)])
        rescntmenu.invoke(1)  # select the second entry (50)
        editmenu.add_cascade(label='Result Count', underline=0,
                             menu=rescntmenu)

        # Edit > Context > Before
        cntxmenu = Menu(editmenu, tearoff=0)
        cntxbfmenu = Menu(cntxmenu, tearoff=0)
        self._add_radio_choices(
            cntxbfmenu, self._cntx_bf_len, self.set_cntx_bf_len,
            [('60 characters', 60),
             ('80 characters', 80),
             ('100 characters', 100)])
        cntxbfmenu.invoke(1)  # select the second entry (80)
        cntxmenu.add_cascade(label='Before', underline=0, menu=cntxbfmenu)

        # Edit > Context > After
        cntxafmenu = Menu(cntxmenu, tearoff=0)
        self._add_radio_choices(
            cntxafmenu, self._cntx_af_len, self.set_cntx_af_len,
            [('70 characters', 70),
             ('90 characters', 90),
             ('110 characters', 110)])
        cntxafmenu.invoke(1)  # select the second entry (90)
        cntxmenu.add_cascade(label='After', underline=0, menu=cntxafmenu)

        editmenu.add_cascade(label='Context', underline=0, menu=cntxmenu)
        menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
        self.top.config(menu=menubar)

    def set_result_size(self, **kwargs):
        """Copy the selected result count from the menu into the model."""
        self.model.result_count = self._result_size.get()

    def set_cntx_af_len(self, **kwargs):
        """Store the selected number of context characters after a match."""
        self._char_after = self._cntx_af_len.get()

    def set_cntx_bf_len(self, **kwargs):
        """Store the selected number of context characters before a match."""
        self._char_before = self._cntx_bf_len.get()

    def _init_corpus_select(self, parent):
        """Create the corpus label and drop-down menu."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.var = StringVar(innerframe)
        self.var.set(self.model.DEFAULT_CORPUS)
        Label(innerframe, justify=LEFT, text=' Corpus: ',
              background=self._BACKGROUND_COLOUR,
              padx=2, pady=1, border=0).pack(side='left')

        # The default corpus is listed first, followed by the remaining
        # corpora in the order returned by the model.
        om = OptionMenu(innerframe, self.var, self.model.DEFAULT_CORPUS,
                        command=self.corpus_selected,
                        *self.model.non_default_corpora())
        om['borderwidth'] = 0
        om['highlightthickness'] = 1
        om.pack(side='left')
        innerframe.pack(side='top', fill='x', anchor='n')

    def _init_status(self, parent):
        """Create the status label."""
        self.status = Label(parent, justify=LEFT, relief=SUNKEN,
                            background=self._BACKGROUND_COLOUR,
                            border=0, padx=1, pady=0)
        self.status.pack(side='top', anchor='sw')

    def _init_query_box(self, parent):
        """Create the query entry and the Search button."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
        self.query_box = Entry(another, width=60)
        self.query_box.pack(side='left', fill='x', pady=25, anchor='center')
        self.search_button = Button(another, text='Search',
                                    command=self.search,
                                    borderwidth=1, highlightthickness=1)
        self.search_button.pack(side='left', fill='x', pady=25,
                                anchor='center')
        self.query_box.bind('<KeyPress-Return>',
                            self.search_enter_keypress_handler)
        another.pack()
        innerframe.pack(side='top', fill='x', anchor='n')

    def search_enter_keypress_handler(self, *event):
        """Run a search when Return is pressed in the query entry."""
        self.search()

    def _init_results_box(self, parent):
        """Create the read-only results text widget and its scrollbars."""
        innerframe = Frame(parent)
        i1 = Frame(innerframe)
        i2 = Frame(innerframe)
        vscrollbar = Scrollbar(i1, borderwidth=1)
        hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
        self.results_box = Text(i1,
                                font=tkFont.Font(family='courier', size='16'),
                                state='disabled', borderwidth=1,
                                yscrollcommand=vscrollbar.set,
                                xscrollcommand=hscrollbar.set,
                                wrap='none', width='40', height='20',
                                exportselection=1)
        self.results_box.pack(side='left', fill='both', expand=True)
        self.results_box.tag_config(self._HIGHLIGHT_WORD_TAG,
                                    foreground=self._HIGHLIGHT_WORD_COLOUR)
        self.results_box.tag_config(self._HIGHLIGHT_LABEL_TAG,
                                    foreground=self._HIGHLIGHT_LABEL_COLOUR)
        vscrollbar.pack(side='left', fill='y', anchor='e')
        vscrollbar.config(command=self.results_box.yview)
        hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
        hscrollbar.config(command=self.results_box.xview)
        # Spacer label: keeps the two scrollbars from overlapping when the
        # pack layout manager is used.
        Label(i2, text='   ',
              background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e')
        i1.pack(side='top', fill='both', expand=True, anchor='n')
        i2.pack(side='bottom', fill='x', anchor='s')
        innerframe.pack(side='top', fill='both', expand=True)

    def _init_paging(self, parent):
        """Create the Previous/Next buttons (initially disabled)."""
        innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
        self.prev = prev_button = Button(
            innerframe, text='Previous', command=self.previous, width='10',
            borderwidth=1, highlightthickness=1, state='disabled')
        prev_button.pack(side='left', anchor='center')
        # NOTE: ``command=self.next`` is evaluated before the assignment, so
        # it refers to the ``next`` method.  After this statement the
        # instance attribute ``self.next`` is the Button and hides the
        # method on this instance; freeze_editable() and
        # set_paging_button_states() rely on the attribute being the Button.
        self.next = next_button = Button(
            innerframe, text='Next', command=self.next, width='10',
            borderwidth=1, highlightthickness=1, state='disabled')
        next_button.pack(side='right', anchor='center')
        innerframe.pack(side='top', fill='y')
        self.current_page = 0

    def previous(self):
        """Show the page before the current one."""
        self.clear_results_box()
        self.freeze_editable()
        self.model.prev(self.current_page - 1)

    def next(self):
        """Show the page after the current one."""
        self.clear_results_box()
        self.freeze_editable()
        self.model.next(self.current_page + 1)

    def about(self, *e):
        """Show the About text, falling back to ShowText on failure."""
        ABOUT = ("NLTK Concordance Search Demo\n")
        TITLE = 'About: NLTK Concordance Search Demo'
        try:
            from tkMessageBox import Message
            Message(message=ABOUT, title=TITLE,
                    parent=self.main_frame).show()
        except Exception:
            # Any ordinary failure (missing module, Tk error) falls back to
            # the plain text window; KeyboardInterrupt and SystemExit are
            # no longer swallowed.
            ShowText(self.top, TITLE, ABOUT)

    def _bind_event_handlers(self):
        """Route the model's virtual events to the handle_* methods."""
        # NOTE(review): the body of this method was missing from the
        # reviewed listing.  These four bindings are reconstructed from the
        # module's event constants and the handle_* methods below; confirm
        # against the upstream source.
        self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded)
        self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated)
        self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error)
        self.top.bind(ERROR_LOADING_CORPUS_EVENT,
                      self.handle_error_loading_corpus)

    def handle_error_loading_corpus(self, event):
        """Report a corpus loading failure and leave the input disabled."""
        self.status['text'] = 'Error in loading ' + self.var.get()
        # The widgets are re-enabled around clear_all() and frozen again
        # afterwards, so the input stays disabled after a failed load.
        self.unfreeze_editable()
        self.clear_all()
        self.freeze_editable()

    def handle_corpus_loaded(self, event):
        """Report a loaded corpus, reset the input and focus the query."""
        self.status['text'] = self.var.get() + ' is loaded'
        self.unfreeze_editable()
        self.clear_all()
        self.query_box.focus_set()

    def handle_search_terminated(self, event):
        """Display the results of the page the model last served."""
        #todo: refactor the model such that it is less state sensitive
        results = self.model.get_results()
        self.write_results(results)
        self.status['text'] = ''
        if not results:
            self.status['text'] = 'No results found for ' + self.model.query
        else:
            self.current_page = self.model.last_requested_page
        self.unfreeze_editable()
        self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT)

    def handle_search_error(self, event):
        """Report an invalid query and re-enable the input widgets."""
        self.status['text'] = 'Error in query ' + self.model.query
        self.unfreeze_editable()

    def corpus_selected(self, *args):
        """Load the corpus currently chosen in the drop-down menu."""
        new_selection = self.var.get()
        self.load_corpus(new_selection)

    def load_corpus(self, selection):
        """Ask the model to load ``selection`` unless it is already selected."""
        if self.model.selected_corpus != selection:
            self.status['text'] = 'Loading ' + selection + '...'
            self.freeze_editable()
            self.model.load_corpus(selection)

    def search(self):
        """Start a new search for the text in the query entry.

        The previous results are always discarded; nothing else happens
        when the query is empty or only whitespace.
        """
        self.current_page = 0
        self.clear_results_box()
        self.model.reset_results()
        query = self.query_box.get()
        if not query.strip():
            return
        self.status['text'] = 'Searching for ' + query
        self.freeze_editable()
        self.model.search(query, self.current_page + 1)

    def write_results(self, results):
        """Write one line per result and highlight the matched tokens.

        ``results`` is a sequence of ``(sentence, start, end)`` tuples where
        ``start``/``end`` delimit the match inside ``sentence``.
        """
        self.results_box['state'] = 'normal'
        row = 1
        for each in results:
            sent, pos1, pos2 = each[0].strip(), each[1], each[2]
            if sent:
                # pad() is a no-op when the match already has at least
                # _char_before characters in front of it, so after this call
                # the match always starts at column _char_before.
                sent, pos1, pos2 = self.pad(sent, pos1, pos2)
                sentence = sent[pos1 - self._char_before:
                                pos1 + self._char_after]
                if row != len(results):
                    sentence += '\n'
                self.results_box.insert('%d.0' % row, sentence)
                word_markers, label_markers = self.words_and_labels(
                    sent, pos1, pos2)
                for start, end in word_markers:
                    self.results_box.tag_add(self._HIGHLIGHT_WORD_TAG,
                                             '%d.%d' % (row, start),
                                             '%d.%d' % (row, end))
                for start, end in label_markers:
                    self.results_box.tag_add(self._HIGHLIGHT_LABEL_TAG,
                                             '%d.%d' % (row, start),
                                             '%d.%d' % (row, end))
                row += 1
        self.results_box['state'] = 'disabled'

    def words_and_labels(self, sentence, pos1, pos2):
        """Return the column spans of the matched words and their labels.

        ``sentence[pos1:pos2]`` is split on single spaces into ``word/label``
        tokens.  Two lists of ``(start, end)`` column pairs are returned,
        one for words and one for labels, with columns counted from
        ``self._char_before`` (the column at which the match is displayed).
        """
        search_exp = sentence[pos1:pos2]
        words, labels = [], []
        index = 0
        for token in search_exp.split(' '):
            if token:
                # Split on the LAST slash: a word may itself contain '/'
                # (e.g. "and/or/CC"); a plain split('/') fails to unpack.
                word, label = token.rsplit('/', 1)
                start = self._char_before + index
                words.append((start, start + len(word)))
                index += len(word) + 1  # skip the word and the '/'
                start = self._char_before + index
                labels.append((start, start + len(label)))
                index += len(label)
            # One separator per token; an empty token (two adjacent spaces)
            # therefore advances the offset by exactly one column.
            index += 1
        return words, labels

    def pad(self, sent, hstart, hend):
        """Left-pad ``sent`` so the match starts at ``self._char_before``.

        Returns ``(sent, hstart, hend)`` unchanged when the match already
        starts at or after that column; otherwise returns the padded
        sentence with both offsets shifted by the number of spaces added.
        """
        if hstart >= self._char_before:
            return sent, hstart, hend
        d = self._char_before - hstart
        return ' ' * d + sent, hstart + d, hend + d

    def destroy(self, *e):
        """Destroy the top-level window; later calls are ignored."""
        if self.top is None:
            return
        self.top.destroy()
        self.top = None

    def clear_all(self):
        """Clear the query entry, the model's query and the results."""
        self.query_box.delete(0, END)
        self.model.reset_query()
        self.clear_results_box()

    def clear_results_box(self):
        """Delete all text from the results widget."""
        # The widget is kept disabled; enable it only for the edit.
        self.results_box['state'] = 'normal'
        self.results_box.delete("1.0", END)
        self.results_box['state'] = 'disabled'

    def freeze_editable(self):
        """Disable the query entry, Search button and paging buttons."""
        self.query_box['state'] = 'disabled'
        self.search_button['state'] = 'disabled'
        self.prev['state'] = 'disabled'
        self.next['state'] = 'disabled'

    def unfreeze_editable(self):
        """Re-enable the query widgets and refresh the paging buttons."""
        self.query_box['state'] = 'normal'
        self.search_button['state'] = 'normal'
        self.set_paging_button_states()

    def set_paging_button_states(self):
        """Enable Previous/Next according to the current page."""
        on_first_page = self.current_page in (0, 1)
        self.prev['state'] = 'disabled' if on_first_page else 'normal'
        if self.model.has_more_pages(self.current_page):
            self.next['state'] = 'normal'
        else:
            self.next['state'] = 'disabled'

    def fire_event(self, event):
        """Queue the virtual event ``event`` on the top-level window."""
        #Firing an event so that rendering of widgets happen in the mainloop thread
        self.top.event_generate(event, when='tail')

    def mainloop(self, *args, **kwargs):
        """Enter the Tk main loop unless ``in_idle()`` is true."""
        if in_idle():
            return
        self.top.mainloop(*args, **kwargs)
417
class ConcordanceSearchModel(object):
    """Corpus loading, searching and result paging for the concordance app.

    Loading and searching run in worker threads (``LoadCorpus`` and
    ``SearchCorpus``); completion or failure is reported by calling
    ``fire_event(event)`` on every registered listener.
    """

    def __init__(self):
        self.listeners = []
        self.CORPORA = _CORPORA
        self.DEFAULT_CORPUS = _DEFAULT
        self.selected_corpus = None
        self.reset_query()
        # Also initialises last_sent_searched, results and last_page.
        self.reset_results()
        self.result_count = None

    def non_default_corpora(self):
        """Return the sorted corpus names, excluding the default corpus."""
        names = sorted(self.CORPORA)
        names.remove(self.DEFAULT_CORPUS)
        return names

    def load_corpus(self, name):
        """Select corpus ``name`` and load it in a background thread."""
        self.selected_corpus = name
        self.tagged_sents = []
        runner_thread = self.LoadCorpus(name, self)
        runner_thread.start()

    def search(self, query, page):
        """Search for ``query`` in a background thread to fill ``page``."""
        self.query = query
        self.last_requested_page = page
        self.SearchCorpus(self, page, self.result_count).start()

    def next(self, page):
        """Serve ``page``, searching further only if it is not cached."""
        self.last_requested_page = page
        if len(self.results) < page:
            self.search(self.query, page)
        else:
            self.notify_listeners(SEARCH_TERMINATED_EVENT)

    def prev(self, page):
        """Serve the already cached ``page``."""
        self.last_requested_page = page
        self.notify_listeners(SEARCH_TERMINATED_EVENT)

    def add_listener(self, listener):
        """Register an object whose ``fire_event`` receives notifications."""
        self.listeners.append(listener)

    def notify_listeners(self, event):
        """Call ``fire_event(event)`` on every registered listener."""
        for each in self.listeners:
            each.fire_event(event)

    def reset_results(self):
        """Forget all cached pages and restart scanning from sentence 0."""
        self.last_sent_searched = 0
        self.results = []
        self.last_page = None

    def reset_query(self):
        """Clear the current query."""
        self.query = None

    def set_results(self, page, resultset):
        """Store ``resultset`` as the results of the 1-based ``page``."""
        self.results.insert(page - 1, resultset)

    def get_results(self):
        """Return the results of the most recently requested page."""
        return self.results[self.last_requested_page - 1]

    def has_more_pages(self, page):
        """Return whether a page after ``page`` may exist.

        False when there are no results at all; True while the last page
        has not been reached yet; otherwise ``page < last_page``.
        """
        if self.results == [] or self.results[0] == []:
            return False
        if self.last_page is None:
            return True
        return page < self.last_page

    class LoadCorpus(threading.Thread):
        """Worker thread that loads one corpus into the model."""

        def __init__(self, name, model):
            threading.Thread.__init__(self)
            # NOTE: assigning ``name`` also sets the thread's own name.
            self.model, self.name = model, name

        def run(self):
            """Load the corpus as "word/tag ..." strings and notify."""
            try:
                ts = self.model.CORPORA[self.name]()
                self.model.tagged_sents = [
                    ' '.join(w + '/' + t for (w, t) in sent) for sent in ts]
                self.model.notify_listeners(CORPUS_LOADED_EVENT)
            except Exception as e:
                # Deliberately broad: any failure while reading the corpus
                # is reported to the listeners instead of killing the
                # thread silently.
                print(e)
                self.model.notify_listeners(ERROR_LOADING_CORPUS_EVENT)

    class SearchCorpus(threading.Thread):
        """Worker thread that collects one page of search results."""

        def __init__(self, model, page, count):
            self.model, self.count, self.page = model, count, page
            threading.Thread.__init__(self)

        def run(self):
            """Scan forward from the last position and store one page.

            Up to ``count + 1`` matches are collected: the extra match only
            signals that another page exists and is not stored.  An invalid
            query resets the results and raises SEARCH_ERROR_EVENT;
            otherwise SEARCH_TERMINATED_EVENT is raised.
            """
            model = self.model
            try:
                # Compile once instead of handing the pattern string to
                # re.search() for every sentence.
                pattern = re.compile(self.processed_query())
            except re.error:
                model.reset_results()
                model.notify_listeners(SEARCH_ERROR_EVENT)
                return

            start = model.last_sent_searched
            sent_pos = []
            sent_count = 0  # sentences fully consumed by this page
            for sent in model.tagged_sents[start:]:
                m = pattern.search(sent)
                if m:
                    sent_pos.append((sent, m.start(), m.end()))
                    if len(sent_pos) > self.count:
                        # Overflow match: leave it unconsumed so that the
                        # next page starts exactly on this sentence.
                        break
                sent_count += 1

            # Resume from the first sentence not shown on this page.  The
            # previous "sent_count - 1" resumed one sentence too early, so
            # the last hit of a page could be repeated on the next page.
            model.last_sent_searched = start + sent_count
            if self.count >= len(sent_pos):
                model.last_page = self.page
                model.set_results(self.page, sent_pos)
            else:
                model.set_results(self.page, sent_pos[:-1])
            model.notify_listeners(SEARCH_TERMINATED_EVENT)

        def processed_query(self):
            """Translate the model's query into a regular expression.

            Each whitespace-separated term has every ``.`` replaced by
            ``[^/ ]``.  An all-uppercase term is then matched as a tag, a
            term containing ``/`` is used as written, and any other term is
            matched as a word followed by any tag.
            """
            new = []
            for term in self.model.query.split():
                term = re.sub(r'\.', r'[^/ ]', term)
                if re.match('[A-Z]+$', term):
                    new.append(BOUNDARY + WORD_OR_TAG + '/' + term + BOUNDARY)
                elif '/' in term:
                    new.append(BOUNDARY + term + BOUNDARY)
                else:
                    new.append(BOUNDARY + term + '/' + WORD_OR_TAG + BOUNDARY)
            return ' '.join(new)
541
def app():
    """Open the concordance search window and run its main loop."""
    ConcordanceSearchView().mainloop()


if __name__ == '__main__':
    app()

__all__ = ['app']