  // en_stemmer.js
  // Porter stemmer in JavaScript. Few comments, but it's easy to follow against the rules in the original
  // paper:
  //
  // Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  // no. 3, pp 130-137,
  //
  // see also http://www.tartarus.org/~martin/PorterStemmer
  // Release 1
  // Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009
  /**
   * Porter stemmer for English, plus a small set of hand-written corrections.
   *
   * `stemmer(word)` applies Porter steps 1a-5 to `word` and then:
   *   - returns a fixed stem for a few irregular words (e.g. "skies" -> "sky"),
   *   - returns a few words unchanged (e.g. "news", "herring"),
   *   - re-appends a distinguishing suffix to words that Porter collapses to
   *     "gener" or "commun" (e.g. "general" -> "general", "generate" -> "generat").
   *
   * All regular expressions are built once, when this closure is created.
   * None of them use the `g`/`y` flags, so sharing them across calls is safe.
   *
   * @param {string} w - A single word. No case folding is performed here; the
   *   suffix rules and exception tables are all written in lower case.
   * @returns {string} The stem. Words shorter than three characters are
   *   returned unchanged.
   */
  var stemmer = (function () {
    // Step 2: suffix -> replacement, applied when the remaining stem has m > 0.
    var step2list = {
      "ational": "ate",
      "tional": "tion",
      "enci": "ence",
      "anci": "ance",
      "izer": "ize",
      "bli": "ble",
      "alli": "al",
      "entli": "ent",
      "eli": "e",
      "ousli": "ous",
      "ization": "ize",
      "ation": "ate",
      "ator": "ate",
      "alism": "al",
      "iveness": "ive",
      "fulness": "ful",
      "ousness": "ous",
      "aliti": "al",
      "iviti": "ive",
      "biliti": "ble",
      "logi": "log"
    };

    // Step 3: suffix -> replacement, applied when the remaining stem has m > 0.
    var step3list = {
      "icate": "ic",
      "ative": "",
      "alize": "al",
      "iciti": "ic",
      "ical": "ic",
      "ful": "",
      "ness": ""
    };

    // Building blocks for the "measure" (m) patterns.
    var c = "[^aeiou]";          // consonant
    var v = "[aeiouy]";          // vowel
    var C = c + "[^aeiouy]*";    // consonant sequence
    var V = v + "[aeiou]*";      // vowel sequence

    var mgr0 = new RegExp("^(" + C + ")?" + V + C);                    // [C]VC... is m>0
    var meq1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$");  // [C]VC[V] is m=1
    var mgr1 = new RegExp("^(" + C + ")?" + V + C + V + C);            // [C]VCVC... is m>1
    var hasVowel = new RegExp("^(" + C + ")?" + v);                    // vowel in stem
    var cvcEnding = new RegExp("^" + C + v + "[^aeiouwxy]$");          // consonant-vowel-consonant stem

    // Per-step suffix matchers. The lazy (.+?) makes the captured stem as
    // short as possible, i.e. the longest listed suffix wins.
    var step1aSsesIes = /^(.+?)(ss|i)es$/;
    var step1aPlural = /^(.+?)([^s])s$/;
    var step1bEed = /^(.+?)eed$/;
    var step1bEdIng = /^(.+?)(ed|ing)$/;
    var step1bRestoreE = /(at|bl|iz)$/;
    var step1bDoubleConsonant = /([^aeiouylsz])\1$/;
    var step1cY = new RegExp("^(.+" + c + ")y$");
    var step2Suffix = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
    var step3Suffix = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
    var step4Suffix = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
    var step4Ion = /^(.+?)(s|t)(ion)$/;
    var step5FinalE = /^(.+?)e$/;
    var step5DoubleL = /ll$/;

    // See http://snowball.tartarus.org/algorithms/english/stemmer.html
    // "Exceptional forms in general": word -> fixed stem.
    var specialWords = {
      "skis": "ski",
      "skies": "sky",
      "dying": "die",
      "lying": "lie",
      "tying": "tie",
      "idly": "idl",
      "gently": "gentl",
      "ugly": "ugli",
      "early": "earli",
      "only": "onli",
      "singly": "singl"
    };

    // Words that are returned exactly as given. Stored as keys so that only
    // whole words match; a substring search over a joined string would also
    // hit fragments such as "owe" (in "howe") or "erring" (in "herring").
    var invariantWords = {
      "sky": true,
      "news": true,
      "howe": true,
      "atlas": true,
      "cosmos": true,
      "bias": true,
      "andes": true,
      "inning": true,
      "outing": true,
      "canning": true,
      "herring": true,
      "earring": true,
      "proceed": true,
      "exceed": true,
      "succeed": true
    };

    // Words Porter over-stems to "gener"/"commun": when the ORIGINAL word ends
    // with `pattern`, `suffix` is appended to the computed stem. Every pattern
    // is anchored at the end of the word so that longer, unrelated words
    // (e.g. "communitarian") are not touched.
    var overstemFixes = [
      { pattern: /generate?s?d?(ing)?$/, suffix: "at" },
      { pattern: /general(ly)?$/, suffix: "al" },
      { pattern: /generic(ally)?$/, suffix: "ic" },
      { pattern: /generous(ly)?$/, suffix: "ous" },
      { pattern: /communit(y|ies)$/, suffix: "iti" }
    ];

    // Own-property test, so inherited keys such as "constructor" or
    // "__proto__" are never mistaken for table entries.
    function hasOwn(table, key) {
      return Object.prototype.hasOwnProperty.call(table, key);
    }

    return function (w) {
      var origword = w;
      var startsWithY;
      var match;
      var stem;
      var i;

      if (w.length < 3) { return w; }

      // Exception tables override the algorithm entirely. None of their
      // entries match an over-stemming pattern, so returning here is safe.
      if (hasOwn(specialWords, origword)) { return specialWords[origword]; }
      if (hasOwn(invariantWords, origword)) { return origword; }

      // Protect an initial "y" from the vowel class by upper-casing it; it is
      // restored after step 5.
      startsWithY = w.charAt(0) === "y";
      if (startsWithY) {
        w = "Y" + w.slice(1);
      }

      // Step 1a
      if (step1aSsesIes.test(w)) {
        w = w.replace(step1aSsesIes, "$1$2");
      } else if (step1aPlural.test(w)) {
        w = w.replace(step1aPlural, "$1$2");
      }

      // Step 1b
      match = step1bEed.exec(w);
      if (match) {
        if (mgr0.test(match[1])) {
          w = w.slice(0, -1);         // "eed" -> "ee"
        }
      } else {
        match = step1bEdIng.exec(w);
        if (match && hasVowel.test(match[1])) {
          w = match[1];
          if (step1bRestoreE.test(w)) {
            w = w + "e";
          } else if (step1bDoubleConsonant.test(w)) {
            w = w.slice(0, -1);       // undouble the final consonant
          } else if (cvcEnding.test(w)) {
            w = w + "e";
          }
        }
      }

      // Step 1c
      match = step1cY.exec(w);
      if (match) {
        w = match[1] + "i";
      }

      // Step 2
      match = step2Suffix.exec(w);
      if (match && mgr0.test(match[1])) {
        w = match[1] + step2list[match[2]];
      }

      // Step 3
      match = step3Suffix.exec(w);
      if (match && mgr0.test(match[1])) {
        w = match[1] + step3list[match[2]];
      }

      // Step 4
      match = step4Suffix.exec(w);
      if (match) {
        if (mgr1.test(match[1])) {
          w = match[1];
        }
      } else {
        match = step4Ion.exec(w);
        if (match) {
          stem = match[1] + match[2];  // keep the "s"/"t" that precedes "ion"
          if (mgr1.test(stem)) {
            w = stem;
          }
        }
      }

      // Step 5
      match = step5FinalE.exec(w);
      if (match) {
        stem = match[1];
        if (mgr1.test(stem) || (meq1.test(stem) && !cvcEnding.test(stem))) {
          w = stem;
        }
      }
      if (step5DoubleL.test(w) && mgr1.test(w)) {
        w = w.slice(0, -1);
      }

      // and turn initial Y back to y
      if (startsWithY) {
        w = "y" + w.slice(1);
      }

      // Address words overstemmed as gener- / commun-
      for (i = 0; i < overstemFixes.length; i++) {
        if (overstemFixes[i].pattern.test(origword)) {
          w = w + overstemFixes[i].suffix;
        }
      }

      return w;
    };
  })();