// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original
// paper, in
//
//  Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
//  no. 3, pp 130-137,
//
// see also http://www.tartarus.org/~martin/PorterStemmer

// Release 1
// Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009

/**
 * Stem a single word with the Porter algorithm, then apply a handful of
 * exceptional-form corrections layered on top of it.
 *
 * All tables and regular expressions are built once, when this closure is
 * created, and shared by every call. None of the patterns use the `g` or `y`
 * flag, so sharing them between calls is stateless.
 *
 * @param {string} w - The word to stem. Every pattern is written in lowercase,
 *     so uppercase letters are never matched as suffixes or vowels.
 * @returns {string} The stem. Words shorter than three characters are returned
 *     unchanged.
 */
var stemmer = (function () {
	var step2list = {
			"ational" : "ate",
			"tional" : "tion",
			"enci" : "ence",
			"anci" : "ance",
			"izer" : "ize",
			"bli" : "ble",
			"alli" : "al",
			"entli" : "ent",
			"eli" : "e",
			"ousli" : "ous",
			"ization" : "ize",
			"ation" : "ate",
			"ator" : "ate",
			"alism" : "al",
			"iveness" : "ive",
			"fulness" : "ful",
			"ousness" : "ous",
			"aliti" : "al",
			"iviti" : "ive",
			"biliti" : "ble",
			"logi" : "log"
		},

		step3list = {
			"icate" : "ic",
			"ative" : "",
			"alize" : "al",
			"iciti" : "ic",
			"ical" : "ic",
			"ful" : "",
			"ness" : ""
		},

		// See http://snowball.tartarus.org/algorithms/english/stemmer.html
		// "Exceptional forms in general": words mapped straight to a fixed stem.
		specialWords = {
			"skis" : "ski",
			"skies" : "sky",
			"dying" : "die",
			"lying" : "lie",
			"tying" : "tie",
			"idly" : "idl",
			"gently" : "gentl",
			"ugly" : "ugli",
			"early" : "earli",
			"only" : "onli",
			"singly" : "singl"
		},

		// Words returned exactly as given. Kept as a keyed table so that only
		// whole words match, never fragments of a listed word.
		invariantWords = {
			"sky" : true,
			"news" : true,
			"howe" : true,
			"atlas" : true,
			"cosmos" : true,
			"bias" : true,
			"andes" : true,
			"inning" : true,
			"outing" : true,
			"canning" : true,
			"herring" : true,
			"earring" : true,
			"proceed" : true,
			"exceed" : true,
			"succeed" : true
		},

		c = "[^aeiou]",          // consonant
		v = "[aeiouy]",          // vowel
		C = c + "[^aeiouy]*",    // consonant sequence
		V = v + "[aeiou]*",      // vowel sequence

		reMgr0 = new RegExp("^(" + C + ")?" + V + C),                    // [C]VC... is m>0
		reMeq1 = new RegExp("^(" + C + ")?" + V + C + "(" + V + ")?$"),  // [C]VC[V] is m=1
		reMgr1 = new RegExp("^(" + C + ")?" + V + C + V + C),            // [C]VCVC... is m>1
		reVowelInStem = new RegExp("^(" + C + ")?" + v),                 // vowel in stem
		reShortCvc = new RegExp("^" + C + v + "[^aeiouwxy]$"),           // C-v-c, final c not w, x or y

		// Step 1a
		reSsesIes = /^(.+?)(ss|i)es$/,
		rePluralS = /^(.+?)([^s])s$/,
		// Step 1b
		reEed = /^(.+?)eed$/,
		reEdIng = /^(.+?)(ed|ing)$/,
		reAtBlIz = /(at|bl|iz)$/,
		reDoubleConsonant = /([^aeiouylsz])\1$/,
		// Step 1c: a final y preceded by a non-vowel becomes i
		reFinalY = new RegExp("^(.+" + c + ")y$"),
		// Steps 2-5
		reStep2 = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/,
		reStep3 = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/,
		reStep4 = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/,
		reStep4Ion = /^(.+?)(s|t)(ion)$/,
		reFinalE = /^(.+?)e$/,
		reFinalLl = /ll$/,

		// Corrections for words overstemmed to "gener" / "commun"
		reGenerate = /.*generate?s?d?(ing)?$/,
		reGeneral = /.*general(ly)?$/,
		reGeneric = /.*generic(ally)?$/,
		reGenerous = /.*generous(ly)?$/,
		reCommunity = /.*communit(ies)?y?$/;

	// Own-property test, so inherited keys such as "constructor" never match.
	function hasOwn(table, key) {
		return Object.prototype.hasOwnProperty.call(table, key);
	}

	return function (w) {
		var stem,
			suffix,
			firstch,
			fp,
			origword = w;

		if (w.length < 3) { return w; }

		// An initial y is upper-cased so the vowel class does not match it;
		// it is lower-cased again after step 5.
		firstch = w.charAt(0);
		if (firstch === "y") {
			w = firstch.toUpperCase() + w.slice(1);
		}

		// Step 1a
		if (reSsesIes.test(w)) {
			w = w.replace(reSsesIes, "$1$2");
		} else if (rePluralS.test(w)) {
			w = w.replace(rePluralS, "$1$2");
		}

		// Step 1b
		if (reEed.test(w)) {
			fp = reEed.exec(w);
			if (reMgr0.test(fp[1])) {
				w = w.slice(0, -1);            // eed -> ee
			}
		} else if (reEdIng.test(w)) {
			fp = reEdIng.exec(w);
			stem = fp[1];
			if (reVowelInStem.test(stem)) {
				w = stem;
				if (reAtBlIz.test(w)) {
					w = w + "e";
				} else if (reDoubleConsonant.test(w)) {
					w = w.slice(0, -1);        // undouble the final consonant
				} else if (reShortCvc.test(w)) {
					w = w + "e";
				}
			}
		}

		// Step 1c
		if (reFinalY.test(w)) {
			fp = reFinalY.exec(w);
			w = fp[1] + "i";
		}

		// Step 2
		if (reStep2.test(w)) {
			fp = reStep2.exec(w);
			stem = fp[1];
			suffix = fp[2];
			if (reMgr0.test(stem)) {
				w = stem + step2list[suffix];
			}
		}

		// Step 3
		if (reStep3.test(w)) {
			fp = reStep3.exec(w);
			stem = fp[1];
			suffix = fp[2];
			if (reMgr0.test(stem)) {
				w = stem + step3list[suffix];
			}
		}

		// Step 4 (the -ion rule is only tried when no other suffix matched)
		if (reStep4.test(w)) {
			fp = reStep4.exec(w);
			stem = fp[1];
			if (reMgr1.test(stem)) {
				w = stem;
			}
		} else if (reStep4Ion.test(w)) {
			fp = reStep4Ion.exec(w);
			stem = fp[1] + fp[2];
			if (reMgr1.test(stem)) {
				w = stem;
			}
		}

		// Step 5
		if (reFinalE.test(w)) {
			fp = reFinalE.exec(w);
			stem = fp[1];
			if (reMgr1.test(stem) || (reMeq1.test(stem) && !reShortCvc.test(stem))) {
				w = stem;
			}
		}

		if (reFinalLl.test(w) && reMgr1.test(w)) {
			w = w.slice(0, -1);
		}

		// and turn initial Y back to y
		if (firstch === "y") {
			w = firstch.toLowerCase() + w.slice(1);
		}

		// Exceptional forms override whatever the rules above produced.
		if (hasOwn(specialWords, origword)) {
			w = specialWords[origword];
		}

		if (hasOwn(invariantWords, origword)) {
			w = origword;
		}

		// Address words overstemmed as gener-
		if (reGenerate.test(origword)) {
			w = w + 'at';
		}
		if (reGeneral.test(origword)) {
			w = w + 'al';
		}
		if (reGeneric.test(origword)) {
			w = w + 'ic';
		}
		if (reGenerous.test(origword)) {
			w = w + 'ous';
		}

		// Address words overstemmed as commun-
		if (reCommunity.test(origword)) {
			w = w + 'iti';
		}

		return w;
	};
})();