// Porter stemmer in JavaScript. Few comments, but it's easy to follow against the rules in the original
// paper, in
//
// Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
// no. 3, pp 130-137,
//
// see also http://www.tartarus.org/~martin/PorterStemmer
// Release 1
// Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009
/**
 * Porter (1980) suffix-stripping stemmer with a small set of post-processing
 * exceptions layered on top.
 *
 * The lookup tables and regular expressions are built once inside the closure;
 * the returned function only reads them.
 *
 * @param {string} w - Word to stem. The rules only recognise lowercase
 *     letters, so callers are expected to lowercase the word first.
 * @returns {string} The stem. Words shorter than three characters are
 *     returned unchanged.
 */
var stemmer = (function () {
  // Step 2 suffix -> replacement (applied when the remaining stem has m > 0).
  var step2list = {
    "ational": "ate",
    "tional": "tion",
    "enci": "ence",
    "anci": "ance",
    "izer": "ize",
    "bli": "ble",
    "alli": "al",
    "entli": "ent",
    "eli": "e",
    "ousli": "ous",
    "ization": "ize",
    "ation": "ate",
    "ator": "ate",
    "alism": "al",
    "iveness": "ive",
    "fulness": "ful",
    "ousness": "ous",
    "aliti": "al",
    "iviti": "ive",
    "biliti": "ble",
    "logi": "log"
  };

  // Step 3 suffix -> replacement (applied when the remaining stem has m > 0).
  var step3list = {
    "icate": "ic",
    "ative": "",
    "alize": "al",
    "iciti": "ic",
    "ical": "ic",
    "ful": "",
    "ness": ""
  };

  // See http://snowball.tartarus.org/algorithms/english/stemmer.html
  // "Exceptional forms in general": whole words mapped to a fixed stem.
  var specialWords = {
    "skis": "ski",
    "skies": "sky",
    "dying": "die",
    "lying": "lie",
    "tying": "tie",
    "idly": "idl",
    "gently": "gentl",
    "ugly": "ugli",
    "early": "earli",
    "only": "onli",
    "singly": "singl"
  };

  // Whole words that are returned exactly as given.
  var unchangedWords = {
    "sky": true, "news": true, "howe": true, "atlas": true, "cosmos": true,
    "bias": true, "andes": true, "inning": true, "outing": true,
    "canning": true, "herring": true, "earring": true, "proceed": true,
    "exceed": true, "succeed": true
  };

  // Own-property test, so inherited keys such as "constructor" never count
  // as table entries.
  var hasOwn = Object.prototype.hasOwnProperty;

  var c = "[^aeiou]";          // consonant
  var v = "[aeiouy]";          // vowel
  var C = c + "[^aeiouy]*";    // consonant sequence
  var V = v + "[aeiou]*";      // vowel sequence

  var reMgr0 = new RegExp("^(" + C + ")?" + V + C);                   // [C]VC... is m>0
  var reMeq1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$"); // [C]VC[V] is m=1
  var reMgr1 = new RegExp("^(" + C + ")?" + V + C + V + C);           // [C]VCVC... is m>1
  var reHasVowel = new RegExp("^(" + C + ")?" + v);                   // vowel in stem
  var reCvc = new RegExp("^" + C + v + "[^aeiouwxy]$");               // short C-v-c stem
  var reDoubleCons = /([^aeiouylsz])\1$/;
  var reLastChar = /.$/;

  var reStep1aEs = /^(.+?)(ss|i)es$/;
  var reStep1aS = /^(.+?)([^s])s$/;
  var reStep1bEed = /^(.+?)eed$/;
  var reStep1bEdIng = /^(.+?)(ed|ing)$/;
  var reStep1bAddE = /(at|bl|iz)$/;
  var reStep1c = new RegExp("^(.+" + c + ")y$");
  var reStep2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
  var reStep3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
  var reStep4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
  var reStep4Ion = /^(.+?)(s|t)(ion)$/;
  var reStep5E = /^(.+?)e$/;
  var reStep5Ll = /ll$/;

  // Word families the plain algorithm collapses to "gener" / "commun".
  // All patterns are anchored at the end of the word so that only the listed
  // inflections get their distinguishing ending restored.
  var reGenerate = /generate?s?d?(ing)?$/;
  var reGeneral = /general(ly)?$/;
  var reGeneric = /generic(ally)?$/;
  var reGenerous = /generous(ly)?$/;
  var reCommunity = /communit(?:y|ies)$/;

  return function (w) {
    var origword = w;
    var firstch;
    var fp;
    var stem;

    if (w.length < 3) { return w; }

    // A leading "y" acts as a consonant; upper-casing hides it from the
    // vowel class until it is restored after step 5.
    firstch = w.charAt(0);
    if (firstch === "y") {
      w = "Y" + w.slice(1);
    }

    // Step 1a
    if (reStep1aEs.test(w)) {
      w = w.replace(reStep1aEs, "$1$2");
    } else if (reStep1aS.test(w)) {
      w = w.replace(reStep1aS, "$1$2");
    }

    // Step 1b
    fp = reStep1bEed.exec(w);
    if (fp) {
      if (reMgr0.test(fp[1])) {
        w = w.replace(reLastChar, "");
      }
    } else {
      fp = reStep1bEdIng.exec(w);
      if (fp && reHasVowel.test(fp[1])) {
        w = fp[1];
        if (reStep1bAddE.test(w)) {
          w = w + "e";
        } else if (reDoubleCons.test(w)) {
          w = w.replace(reLastChar, "");
        } else if (reCvc.test(w)) {
          w = w + "e";
        }
      }
    }

    // Step 1c
    fp = reStep1c.exec(w);
    if (fp) {
      w = fp[1] + "i";
    }

    // Step 2
    fp = reStep2.exec(w);
    if (fp && reMgr0.test(fp[1])) {
      w = fp[1] + step2list[fp[2]];
    }

    // Step 3
    fp = reStep3.exec(w);
    if (fp && reMgr0.test(fp[1])) {
      w = fp[1] + step3list[fp[2]];
    }

    // Step 4: the "(s|t)ion" rule is only tried when no plain suffix matched.
    fp = reStep4.exec(w);
    if (fp) {
      if (reMgr1.test(fp[1])) {
        w = fp[1];
      }
    } else {
      fp = reStep4Ion.exec(w);
      if (fp) {
        stem = fp[1] + fp[2];
        if (reMgr1.test(stem)) {
          w = stem;
        }
      }
    }

    // Step 5
    fp = reStep5E.exec(w);
    if (fp) {
      stem = fp[1];
      if (reMgr1.test(stem) || (reMeq1.test(stem) && !reCvc.test(stem))) {
        w = stem;
      }
    }
    if (reStep5Ll.test(w) && reMgr1.test(w)) {
      w = w.replace(reLastChar, "");
    }

    // and turn initial Y back to y
    if (firstch === "y") {
      w = "y" + w.slice(1);
    }

    // Exceptional forms, matched against the whole original word.
    if (hasOwn.call(specialWords, origword)) {
      w = specialWords[origword];
    }
    if (hasOwn.call(unchangedWords, origword)) {
      w = origword;
    }

    // Address words overstemmed as gener-
    if (reGenerate.test(origword)) { w = w + "at"; }
    if (reGeneral.test(origword)) { w = w + "al"; }
    if (reGeneric.test(origword)) { w = w + "ic"; }
    if (reGenerous.test(origword)) { w = w + "ous"; }

    // Address words overstemmed as commun-
    if (reCommunity.test(origword)) { w = w + "iti"; }

    return w;
  };
})();