1
2
3
4
5
6
7
8
9
10 import re
11 import threading
12
13 import nltk
14 from nltk.util import in_idle
15 from nltk.draw.util import *
16
# Regex fragment: one or more characters that are neither '/' nor a space.
WORD_OR_TAG = "[^/ ]+"
# Regex word-boundary assertion.
BOUNDARY = r"\b"

# Event-name strings.  The '<<...>>' form is presumably a Tk virtual-event
# name -- NOTE(review): confirm against the code that binds/generates them.
CORPUS_LOADED_EVENT = "<<CL_EVENT>>"
SEARCH_TERMINATED_EVENT = "<<ST_EVENT>>"
SEARCH_ERROR_EVENT = "<<SE_EVENT>>"
ERROR_LOADING_CORPUS_EVENT = "<<ELC_EVENT>>"
24
25
26
27
# Display name of the corpus that is selected by default; it must be one of
# the keys of _CORPORA below.
_DEFAULT = 'English: Brown Corpus (Humor, simplified)'

# Maps a human-readable corpus name to a zero-argument loader.  Every loader
# is a lambda, so the corresponding nltk.corpus reader is only touched when
# the loader is actually called, not when this module is imported.
_CORPORA = {
    # --- Catalan ---
    'Catalan: CESS-CAT Corpus (simplified)':
        lambda: nltk.corpus.cess_cat.tagged_sents(simplify_tags=True),

    # --- English: Brown ---
    'English: Brown Corpus':
        lambda: nltk.corpus.brown.tagged_sents(),
    'English: Brown Corpus (simplified)':
        lambda: nltk.corpus.brown.tagged_sents(simplify_tags=True),
    'English: Brown Corpus (Press, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(
            categories=['news', 'editorial', 'reviews'], simplify_tags=True),
    'English: Brown Corpus (Religion, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(
            categories='religion', simplify_tags=True),
    'English: Brown Corpus (Learned, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(
            categories='learned', simplify_tags=True),
    'English: Brown Corpus (Science Fiction, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(
            categories='science_fiction', simplify_tags=True),
    'English: Brown Corpus (Romance, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(
            categories='romance', simplify_tags=True),
    'English: Brown Corpus (Humor, simplified)':
        lambda: nltk.corpus.brown.tagged_sents(
            categories='humor', simplify_tags=True),

    # --- English: NPS Chat ---
    'English: NPS Chat Corpus':
        lambda: nltk.corpus.nps_chat.tagged_posts(),
    'English: NPS Chat Corpus (simplified)':
        lambda: nltk.corpus.nps_chat.tagged_posts(simplify_tags=True),

    # --- English: Wall Street Journal (treebank reader) ---
    'English: Wall Street Journal Corpus':
        lambda: nltk.corpus.treebank.tagged_sents(),
    'English: Wall Street Journal Corpus (simplified)':
        lambda: nltk.corpus.treebank.tagged_sents(simplify_tags=True),

    # --- Chinese ---
    'Chinese: Sinica Corpus':
        lambda: nltk.corpus.sinica_treebank.tagged_sents(),
    'Chinese: Sinica Corpus (simplified)':
        lambda: nltk.corpus.sinica_treebank.tagged_sents(simplify_tags=True),

    # --- Dutch ---
    'Dutch: Alpino Corpus':
        lambda: nltk.corpus.alpino.tagged_sents(),
    'Dutch: Alpino Corpus (simplified)':
        lambda: nltk.corpus.alpino.tagged_sents(simplify_tags=True),

    # --- Hindi ---
    'Hindi: Indian Languages Corpus':
        lambda: nltk.corpus.indian.tagged_sents(files='hindi.pos'),
    'Hindi: Indian Languages Corpus (simplified)':
        lambda: nltk.corpus.indian.tagged_sents(
            files='hindi.pos', simplify_tags=True),

    # --- Portuguese ---
    'Portuguese: Floresta Corpus (Portugal)':
        lambda: nltk.corpus.floresta.tagged_sents(),
    'Portuguese: Floresta Corpus (Portugal, simplified)':
        lambda: nltk.corpus.floresta.tagged_sents(simplify_tags=True),
    'Portuguese: MAC-MORPHO Corpus (Brazil)':
        lambda: nltk.corpus.mac_morpho.tagged_sents(),
    'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)':
        lambda: nltk.corpus.mac_morpho.tagged_sents(simplify_tags=True),

    # --- Spanish ---
    'Spanish: CESS-ESP Corpus (simplified)':
        lambda: nltk.corpus.cess_esp.tagged_sents(simplify_tags=True),
}
79
81 _BACKGROUND_COLOUR='#FFF'
82
83
84 _HIGHLIGHT_WORD_COLOUR='#F00'
85 _HIGHLIGHT_WORD_TAG='HL_WRD_TAG'
86
87 _HIGHLIGHT_LABEL_COLOUR='#C0C0C0'
88 _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG'
89
90
91
92 _FRACTION_LEFT_TEXT=0.30
93
103
105 top.geometry('950x680+50+50')
106 top.title('NLTK Concordance Search')
107 top.bind('<Control-q>', self.destroy)
108 top.minsize(950,680)
109
118
120 self._result_size = IntVar(self.top)
121 self._cntx_bf_len = IntVar(self.top)
122 self._cntx_af_len = IntVar(self.top)
123 menubar = Menu(self.top)
124
125 filemenu = Menu(menubar, tearoff=0, borderwidth=0)
126 filemenu.add_command(label='Exit', underline=1,
127 command=self.destroy, accelerator='Ctrl-q')
128 menubar.add_cascade(label='File', underline=0, menu=filemenu)
129
130 editmenu = Menu(menubar, tearoff=0)
131 rescntmenu = Menu(editmenu, tearoff=0)
132 rescntmenu.add_radiobutton(label='20', variable=self._result_size,
133 underline=0, value=20,
134 command=self.set_result_size)
135 rescntmenu.add_radiobutton(label='50', variable=self._result_size,
136 underline=0, value=50,
137 command=self.set_result_size)
138 rescntmenu.add_radiobutton(label='100', variable=self._result_size,
139 underline=0, value=100,
140 command=self.set_result_size)
141 rescntmenu.invoke(1)
142 editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu)
143
144 cntxmenu = Menu(editmenu, tearoff=0)
145 cntxbfmenu = Menu(cntxmenu, tearoff=0)
146 cntxbfmenu.add_radiobutton(label='60 characters',
147 variable=self._cntx_bf_len,
148 underline=0, value=60,
149 command=self.set_cntx_bf_len)
150 cntxbfmenu.add_radiobutton(label='80 characters',
151 variable=self._cntx_bf_len,
152 underline=0, value=80,
153 command=self.set_cntx_bf_len)
154 cntxbfmenu.add_radiobutton(label='100 characters',
155 variable=self._cntx_bf_len,
156 underline=0, value=100,
157 command=self.set_cntx_bf_len)
158 cntxbfmenu.invoke(1)
159 cntxmenu.add_cascade(label='Before', underline=0, menu=cntxbfmenu)
160
161 cntxafmenu = Menu(cntxmenu, tearoff=0)
162 cntxafmenu.add_radiobutton(label='70 characters',
163 variable=self._cntx_af_len,
164 underline=0, value=70,
165 command=self.set_cntx_af_len)
166 cntxafmenu.add_radiobutton(label='90 characters',
167 variable=self._cntx_af_len,
168 underline=0, value=90,
169 command=self.set_cntx_af_len)
170 cntxafmenu.add_radiobutton(label='110 characters',
171 variable=self._cntx_af_len,
172 underline=0, value=110,
173 command=self.set_cntx_af_len)
174 cntxafmenu.invoke(1)
175 cntxmenu.add_cascade(label='After', underline=0, menu=cntxafmenu)
176
177 editmenu.add_cascade(label='Context', underline=0, menu=cntxmenu)
178
179 menubar.add_cascade(label='Edit', underline=0, menu=editmenu)
180
181 self.top.config(menu=menubar)
182
184 self.model.result_count = self._result_size.get()
185
187 self._char_after = self._cntx_af_len.get()
188
190 self._char_before = self._cntx_bf_len.get()
191
# Corpus chooser row: a ' Corpus: ' label followed by a drop-down whose
# current value is tracked in self.var and starts on the model's default.
innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
self.var = StringVar(innerframe)
self.var.set(self.model.DEFAULT_CORPUS)
Label(innerframe, justify=LEFT, text=' Corpus: ',
      background=self._BACKGROUND_COLOUR, padx=2, pady=1, border=0).pack(side='left')

# The non-default menu entries come straight from the model.  (No local list
# is built with CORPORA.keys().remove(...): list.remove() returns None, and a
# Python 3 dict view has no remove() at all.)
om = OptionMenu(innerframe, self.var, self.model.DEFAULT_CORPUS,
                command=self.corpus_selected,
                *self.model.non_default_corpora())
om['borderwidth'] = 0
om['highlightthickness'] = 1
om.pack(side='left')
innerframe.pack(side='top', fill='x', anchor='n')
205
209
211 innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
212 another = Frame(innerframe, background=self._BACKGROUND_COLOUR)
213 self.query_box = Entry(another, width=60)
214 self.query_box.pack(side='left', fill='x', pady=25, anchor='center')
215 self.search_button = Button(another, text='Search', command=self.search, borderwidth=1, highlightthickness=1)
216 self.search_button.pack(side='left', fill='x', pady=25, anchor='center')
217 self.query_box.bind('<KeyPress-Return>', self.search_enter_keypress_handler)
218 another.pack()
219 innerframe.pack(side='top', fill='x', anchor='n')
220
223
225 innerframe = Frame(parent)
226 i1 = Frame(innerframe)
227 i2 = Frame(innerframe)
228 vscrollbar = Scrollbar(i1, borderwidth=1)
229 hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz')
230 self.results_box = Text(i1,
231 font=tkFont.Font(family='courier', size='16'),
232 state='disabled', borderwidth=1,
233 yscrollcommand=vscrollbar.set,
234 xscrollcommand=hscrollbar.set, wrap='none', width='40', height = '20', exportselection=1)
235 self.results_box.pack(side='left', fill='both', expand=True)
236 self.results_box.tag_config(self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR)
237 self.results_box.tag_config(self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR)
238 vscrollbar.pack(side='left', fill='y', anchor='e')
239 vscrollbar.config(command=self.results_box.yview)
240 hscrollbar.pack(side='left', fill='x', expand=True, anchor='w')
241 hscrollbar.config(command=self.results_box.xview)
242
243 Label(i2, text=' ', background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e')
244 i1.pack(side='top', fill='both', expand=True, anchor='n')
245 i2.pack(side='bottom', fill='x', anchor='s')
246 innerframe.pack(side='top', fill='both', expand=True)
247
249 innerframe = Frame(parent, background=self._BACKGROUND_COLOUR)
250 self.prev = prev = Button(innerframe, text='Previous', command=self.previous, width='10', borderwidth=1, highlightthickness=1, state='disabled')
251 prev.pack(side='left', anchor='center')
252 self.next = next = Button(innerframe, text='Next', command=self.next, width='10', borderwidth=1, highlightthickness=1, state='disabled')
253 next.pack(side='right', anchor='center')
254 innerframe.pack(side='top', fill='y')
255 self.current_page = 0
256
261
266
268 ABOUT = ("NLTK Concordance Search Demo\n")
269 TITLE = 'About: NLTK Concordance Search Demo'
270 try:
271 from tkMessageBox import Message
272 Message(message=ABOUT, title=TITLE, parent=self.main_frame).show()
273 except:
274 ShowText(self.top, TITLE, ABOUT)
275
281
287
293
305
306
310
312 new_selection = self.var.get()
313 self.load_corpus(new_selection)
314
320
330
331
333 self.results_box['state'] = 'normal'
334 row = 1
335 for each in results:
336 sent, pos1, pos2 = each[0].strip(), each[1], each[2]
337 if len(sent) != 0:
338 if (pos1 < self._char_before):
339 sent, pos1, pos2 = self.pad(sent, pos1, pos2)
340 sentence = sent[pos1-self._char_before:pos1+self._char_after]
341 if not row == len(results):
342 sentence += '\n'
343 self.results_box.insert(str(row) + '.0', sentence)
344 word_markers, label_markers = self.words_and_labels(sent, pos1, pos2)
345 for marker in word_markers: self.results_box.tag_add(self._HIGHLIGHT_WORD_TAG, str(row) + '.' + str(marker[0]), str(row) + '.' + str(marker[1]))
346 for marker in label_markers: self.results_box.tag_add(self._HIGHLIGHT_LABEL_TAG, str(row) + '.' + str(marker[0]), str(row) + '.' + str(marker[1]))
347 row += 1
348 self.results_box['state'] = 'disabled'
349
366
def pad(self, sent, hstart, hend):
    """Left-pad ``sent`` so that the highlight starts at ``self._char_before``.

    :param sent: the sentence text.
    :param hstart: start offset of the highlighted span within ``sent``.
    :param hend: end offset of the highlighted span within ``sent``.
    :return: a ``(sent, hstart, hend)`` tuple.  When ``hstart`` is already
        at least ``self._char_before`` the arguments are returned unchanged;
        otherwise ``sent`` is prefixed with exactly enough spaces to move
        ``hstart`` to ``self._char_before`` and both offsets are shifted by
        the same amount.
    """
    shortfall = self._char_before - hstart
    if shortfall <= 0:
        # Enough text already precedes the highlight; nothing to do.
        return sent, hstart, hend
    return ' ' * shortfall + sent, hstart + shortfall, hend + shortfall
373
375 if self.top is None: return
376 self.top.destroy()
377 self.top = None
378
383
385 self.results_box['state'] = 'normal'
386 self.results_box.delete("1.0", END)
387 self.results_box['state'] = 'disabled'
388
390 self.query_box['state'] = 'disabled'
391 self.search_button['state'] = 'disabled'
392 self.prev['state'] = 'disabled'
393 self.next['state'] = 'disabled'
394
399
409
411
412 self.top.event_generate(event, when='tail')
413
def mainloop(self, *args, **kwargs):
    """Enter the main loop of ``self.top`` unless ``in_idle()`` is true.

    All positional and keyword arguments are forwarded unchanged to
    ``self.top.mainloop``.  When ``in_idle()`` reports true the call is a
    no-op.  Always returns ``None``.
    """
    if not in_idle():
        self.top.mainloop(*args, **kwargs)
417
420 self.listeners = []
421 self.CORPORA = _CORPORA
422 self.DEFAULT_CORPUS = _DEFAULT
423 self.selected_corpus = None
424 self.reset_query()
425 self.reset_results()
426 self.result_count = None
427 self.last_sent_searched = 0
428
435
441
def search(self, query, page):
    """Record the request and start a ``SearchCorpus`` worker for it.

    :param query: the query; stored on ``self.query``.
    :param page: the requested page number; stored on
        ``self.last_requested_page`` and passed to the worker.

    The worker is built as ``self.SearchCorpus(self, page,
    self.result_count)`` and its ``start()`` method is called; nothing is
    returned.  (Presumably the worker is a thread -- confirm against the
    ``SearchCorpus`` definition.)
    """
    self.query, self.last_requested_page = query, page
    worker = self.SearchCorpus(self, page, self.result_count)
    worker.start()
446
447 - def next(self, page):
453
454 - def prev(self, page):
457
459 self.listeners.append(listener)
460
462 for each in self.listeners:
463 each.fire_event(event)
464
466 self.last_sent_searched = 0
467 self.results = []
468 self.last_page = None
469
472
474 self.results.insert(page - 1, resultset)
475
477 return self.results[self.last_requested_page - 1]
478
def has_more_pages(self, page):
    """Tell whether a page after ``page`` may exist.

    :param page: the page number to compare against ``self.last_page``.
    :return: ``False`` when there are no stored results or the first stored
        page is empty; ``True`` when ``self.last_page`` is still ``None``
        (the last page has not been recorded); otherwise
        ``page < self.last_page``.
    """
    if not self.results or not self.results[0]:
        return False
    if self.last_page is None:
        return True
    return page < self.last_page
485
499
501 - def __init__(self, model, page, count):
504
529
541
545
# Start the application when this file is executed as a script.
if __name__ == "__main__":
    app()

# Names exported by ``from <module> import *``.
__all__ = ["app"]
550