1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42 """Porter Stemming Algorithm
43
44 This is the Porter stemming algorithm, ported to Python from the
45 version coded up in ANSI C by the author. It follows the algorithm
46 presented in
47
48 Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137.
49
50 only differing from it at the points maked --DEPARTURE-- and --NEW--
51 below.
52
53 For a more faithful version of the Porter algorithm, see
54
55 http://www.tartarus.org/~martin/PorterStemmer/
56
57 Later additions:
58
59 June 2000
60
61 The 'l' of the 'logi' -> 'log' rule is put with the stem, so that
62 short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc.
63
64 This follows a suggestion of Barry Wilkins, reasearch student at
65 Birmingham.
66
67
68 February 2000
69
70 the cvc test for not dropping final -e now looks after vc at the
71 beginning of a word, so are, eve, ice, ore, use keep final -e. In this
72 test c is any consonant, including w, x and y. This extension was
73 suggested by Chris Emerson.
74
75 -fully -> -ful treated like -fulness -> -ful, and
76 -tionally -> -tion treated like -tional -> -tion
77
78 both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi.
79
80 Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh.
81
82 Additional modifications were made to incorperate this module into
83 nltk. All such modifications are marked with \"--NLTK--\". The nltk
84 version of this module is maintained by the NLTK developers, and is
85 available from <http://nltk.sourceforge.net>
86 """
87
88
89
90 __docformat__ = 'plaintext'
91
92 import sys
93 import re
94
95
96
97 from api import *
98
100
101
102
103 """
104 A word stemmer based on the Porter stemming algorithm.
105
106 Porter, M. \"An algorithm for suffix stripping.\"
107 Program 14.3 (1980): 130-137.
108
109 A few minor modifications have been made to Porter's basic
110 algorithm. See the source code of this module for more
111 information.
112
113 The Porter Stemmer requires that all tokens have string types.
114 """
115
117 """The main part of the stemming algorithm starts here.
118 b is a buffer holding a word to be stemmed. The letters are in b[k0],
119 b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
120 readjusted downwards as the stemming progresses. Zero termination is
121 not in fact used in the algorithm.
122
123 Note that only lower case sequences are stemmed. Forcing to lower case
124 should be done before stem(...) is called.
125 """
126
127 self.b = ""
128 self.k = 0
129 self.k0 = 0
130 self.j = 0
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150 irregular_forms = {
151 "sky" : ["sky", "skies"],
152 "die" : ["dying"],
153 "lie" : ["lying"],
154 "tie" : ["tying"],
155 "news" : ["news"],
156 "inning" : ["innings", "inning"],
157 "outing" : ["outings", "outing"],
158 "canning" : ["cannings", "canning"],
159 "howe" : ["howe"],
160
161
162 "proceed" : ["proceed"],
163 "exceed" : ["exceed"],
164 "succeed" : ["succeed"],
165 }
166
167 self.pool = {}
168 for key in irregular_forms.keys():
169 for val in irregular_forms[key]:
170 self.pool[val] = key
171
173 """cons(i) is TRUE <=> b[i] is a consonant."""
174 if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
175 return 0
176 if self.b[i] == 'y':
177 if i == self.k0:
178 return 1
179 else:
180 return (not self.cons(i - 1))
181 return 1
182
184 """m() measures the number of consonant sequences between k0 and j.
185 if c is a consonant sequence and v a vowel sequence, and <..>
186 indicates arbitrary presence,
187
188 <c><v> gives 0
189 <c>vc<v> gives 1
190 <c>vcvc<v> gives 2
191 <c>vcvcvc<v> gives 3
192 ....
193 """
194 n = 0
195 i = self.k0
196 while 1:
197 if i > self.j:
198 return n
199 if not self.cons(i):
200 break
201 i = i + 1
202 i = i + 1
203 while 1:
204 while 1:
205 if i > self.j:
206 return n
207 if self.cons(i):
208 break
209 i = i + 1
210 i = i + 1
211 n = n + 1
212 while 1:
213 if i > self.j:
214 return n
215 if not self.cons(i):
216 break
217 i = i + 1
218 i = i + 1
219
221 """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
222 for i in range(self.k0, self.j + 1):
223 if not self.cons(i):
224 return 1
225 return 0
226
228 """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
229 if j < (self.k0 + 1):
230 return 0
231 if (self.b[j] != self.b[j-1]):
232 return 0
233 return self.cons(j)
234
236 """cvc(i) is TRUE <=>
237
238 a) ( --NEW--) i == 1, and p[0] p[1] is vowel consonant, or
239
240 b) p[i - 2], p[i - 1], p[i] has the form consonant -
241 vowel - consonant and also if the second c is not w, x or y. this
242 is used when trying to restore an e at the end of a short word.
243 e.g.
244
245 cav(e), lov(e), hop(e), crim(e), but
246 snow, box, tray.
247 """
248 if i == 0: return 0
249 if i == 1: return (not self.cons(0) and self.cons(1))
250 if not self.cons(i) or self.cons(i-1) or not self.cons(i-2): return 0
251
252 ch = self.b[i]
253 if ch == 'w' or ch == 'x' or ch == 'y':
254 return 0
255
256 return 1
257
259 """ends(s) is TRUE <=> k0,...k ends with the string s."""
260 length = len(s)
261 if s[length - 1] != self.b[self.k]:
262 return 0
263 if length > (self.k - self.k0 + 1):
264 return 0
265 if self.b[self.k-length+1:self.k+1] != s:
266 return 0
267 self.j = self.k - length
268 return 1
269
271 """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
272 length = len(s)
273 self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
274 self.k = self.j + length
275
277 """r(s) is used further down."""
278 if self.m() > 0:
279 self.setto(s)
280
282 """step1ab() gets rid of plurals and -ed or -ing. e.g.
283
284 caresses -> caress
285 ponies -> poni
286 sties -> sti
287 tie -> tie (--NEW--: see below)
288 caress -> caress
289 cats -> cat
290
291 feed -> feed
292 agreed -> agree
293 disabled -> disable
294
295 matting -> mat
296 mating -> mate
297 meeting -> meet
298 milling -> mill
299 messing -> mess
300
301 meetings -> meet
302 """
303 if self.b[self.k] == 's':
304 if self.ends("sses"):
305 self.k = self.k - 2
306 elif self.ends("ies"):
307 if self.j == 0:
308 self.k = self.k - 1
309
310
311 else:
312 self.k = self.k - 2
313 elif self.b[self.k - 1] != 's':
314 self.k = self.k - 1
315
316 if self.ends("ied"):
317 if self.j == 0:
318 self.k = self.k - 1
319 else:
320 self.k = self.k - 2
321
322
323
324 elif self.ends("eed"):
325 if self.m() > 0:
326 self.k = self.k - 1
327 elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
328 self.k = self.j
329 if self.ends("at"): self.setto("ate")
330 elif self.ends("bl"): self.setto("ble")
331 elif self.ends("iz"): self.setto("ize")
332 elif self.doublec(self.k):
333 self.k = self.k - 1
334 ch = self.b[self.k]
335 if ch == 'l' or ch == 's' or ch == 'z':
336 self.k = self.k + 1
337 elif (self.m() == 1 and self.cvc(self.k)):
338 self.setto("e")
339
341 """step1c() turns terminal y to i when there is another vowel in the stem.
342 --NEW--: This has been modified from the original Porter algorithm so that y->i
343 is only done when y is preceded by a consonant, but not if the stem
344 is only a single consonant, i.e.
345
346 (*c and not c) Y -> I
347
348 So 'happy' -> 'happi', but
349 'enjoy' -> 'enjoy' etc
350
351 This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'->
352 'enjoy'. Step 1c is perhaps done too soon; but with this modification that
353 no longer really matters.
354
355 Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly',
356 'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried',
357 'flies' ...
358 """
359 if self.ends("y") and self.j > 0 and self.cons(self.k - 1):
360 self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
361
363 """step2() maps double suffices to single ones.
364 so -ization ( = -ize plus -ation) maps to -ize etc. note that the
365 string before the suffix must give m() > 0.
366 """
367 if self.b[self.k - 1] == 'a':
368 if self.ends("ational"): self.r("ate")
369 elif self.ends("tional"): self.r("tion")
370 elif self.b[self.k - 1] == 'c':
371 if self.ends("enci"): self.r("ence")
372 elif self.ends("anci"): self.r("ance")
373 elif self.b[self.k - 1] == 'e':
374 if self.ends("izer"): self.r("ize")
375 elif self.b[self.k - 1] == 'l':
376 if self.ends("bli"): self.r("ble")
377
378
379 elif self.ends("alli"):
380 if self.m() > 0:
381 self.setto("al")
382 self.step2()
383 elif self.ends("fulli"): self.r("ful")
384 elif self.ends("entli"): self.r("ent")
385 elif self.ends("eli"): self.r("e")
386 elif self.ends("ousli"): self.r("ous")
387 elif self.b[self.k - 1] == 'o':
388 if self.ends("ization"): self.r("ize")
389 elif self.ends("ation"): self.r("ate")
390 elif self.ends("ator"): self.r("ate")
391 elif self.b[self.k - 1] == 's':
392 if self.ends("alism"): self.r("al")
393 elif self.ends("iveness"): self.r("ive")
394 elif self.ends("fulness"): self.r("ful")
395 elif self.ends("ousness"): self.r("ous")
396 elif self.b[self.k - 1] == 't':
397 if self.ends("aliti"): self.r("al")
398 elif self.ends("iviti"): self.r("ive")
399 elif self.ends("biliti"): self.r("ble")
400 elif self.b[self.k - 1] == 'g':
401 if self.ends("logi"):
402 self.j = self.j + 1
403 self.r("og")
404
405
407 """step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
408 if self.b[self.k] == 'e':
409 if self.ends("icate"): self.r("ic")
410 elif self.ends("ative"): self.r("")
411 elif self.ends("alize"): self.r("al")
412 elif self.b[self.k] == 'i':
413 if self.ends("iciti"): self.r("ic")
414 elif self.b[self.k] == 'l':
415 if self.ends("ical"): self.r("ic")
416 elif self.ends("ful"): self.r("")
417 elif self.b[self.k] == 's':
418 if self.ends("ness"): self.r("")
419
421 """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
422 if self.b[self.k - 1] == 'a':
423 if self.ends("al"): pass
424 else: return
425 elif self.b[self.k - 1] == 'c':
426 if self.ends("ance"): pass
427 elif self.ends("ence"): pass
428 else: return
429 elif self.b[self.k - 1] == 'e':
430 if self.ends("er"): pass
431 else: return
432 elif self.b[self.k - 1] == 'i':
433 if self.ends("ic"): pass
434 else: return
435 elif self.b[self.k - 1] == 'l':
436 if self.ends("able"): pass
437 elif self.ends("ible"): pass
438 else: return
439 elif self.b[self.k - 1] == 'n':
440 if self.ends("ant"): pass
441 elif self.ends("ement"): pass
442 elif self.ends("ment"): pass
443 elif self.ends("ent"): pass
444 else: return
445 elif self.b[self.k - 1] == 'o':
446 if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
447 elif self.ends("ou"): pass
448
449 else: return
450 elif self.b[self.k - 1] == 's':
451 if self.ends("ism"): pass
452 else: return
453 elif self.b[self.k - 1] == 't':
454 if self.ends("ate"): pass
455 elif self.ends("iti"): pass
456 else: return
457 elif self.b[self.k - 1] == 'u':
458 if self.ends("ous"): pass
459 else: return
460 elif self.b[self.k - 1] == 'v':
461 if self.ends("ive"): pass
462 else: return
463 elif self.b[self.k - 1] == 'z':
464 if self.ends("ize"): pass
465 else: return
466 else:
467 return
468 if self.m() > 1:
469 self.k = self.j
470
472 """step5() removes a final -e if m() > 1, and changes -ll to -l if
473 m() > 1.
474 """
475 self.j = self.k
476 if self.b[self.k] == 'e':
477 a = self.m()
478 if a > 1 or (a == 1 and not self.cvc(self.k-1)):
479 self.k = self.k - 1
480 if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
481 self.k = self.k -1
482
484 """In stem(p,i,j), p is a char pointer, and the string to be stemmed
485 is from p[i] to p[j] inclusive. Typically i is zero and j is the
486 offset to the last character of a string, (p[j+1] == '\0'). The
487 stemmer adjusts the characters p[i] ... p[j] and returns the new
488 end-point of the string, k. Stemming never increases word length, so
489 i <= k <= j. To turn the stemmer into a module, declare 'stem' as
490 extern, and delete the remainder of this file.
491 """
492
493
494
495 if j == None:
496 j = len(p) - 1
497
498
499 self.b = p
500 self.k = j
501 self.k0 = i
502
503 if self.pool.has_key(self.b[self.k0:self.k+1]):
504 return self.pool[self.b[self.k0:self.k+1]]
505
506 if self.k <= self.k0 + 1:
507 return self.b
508
509
510
511
512
513
514 self.step1ab()
515 self.step1c()
516 self.step2()
517 self.step3()
518 self.step4()
519 self.step5()
520 return self.b[self.k0:self.k+1]
521
523 lower = word.lower()
524
525 ret = ""
526 for x in xrange(len(stem)):
527 if lower[x] == stem[x]:
528 ret += word[x]
529 else:
530 ret += stem[x]
531
532 return ret
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555 - def stem(self, word):
558
559
560
562 return '<PorterStemmer>'
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
582 """
583 A demonstration of the porter stemmer on a sample from
584 the Penn Treebank corpus.
585 """
586
587 from nltk.corpus import treebank
588 from nltk import stem
589
590 stemmer = stem.PorterStemmer()
591
592 orig = []
593 stemmed = []
594 for item in treebank.files()[:3]:
595 for (word, tag) in treebank.tagged_words(item):
596 orig.append(word)
597 stemmed.append(stemmer.stem(word))
598
599
600 results = ' '.join(stemmed)
601 results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip()
602
603
604 original = ' '.join(orig)
605 original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip()
606
607
608 print '-Original-'.center(70).replace(' ', '*').replace('-', ' ')
609 print original
610 print '-Results-'.center(70).replace(' ', '*').replace('-', ' ')
611 print results
612 print '*'*70
613
614
615
616 if __name__ == '__main__': demo()
617