// Porter stemmer in Javascript. Few comments, but it's easy to follow against the rules in the original // paper, in // // Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, // no. 3, pp 130-137, // // see also http://www.tartarus.org/~martin/PorterStemmer // Release 1 // Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009 var stemmer = (function(){ var step2list = { "ational" : "ate", "tional" : "tion", "enci" : "ence", "anci" : "ance", "izer" : "ize", "bli" : "ble", "alli" : "al", "entli" : "ent", "eli" : "e", "ousli" : "ous", "ization" : "ize", "ation" : "ate", "ator" : "ate", "alism" : "al", "iveness" : "ive", "fulness" : "ful", "ousness" : "ous", "aliti" : "al", "iviti" : "ive", "biliti" : "ble", "logi" : "log" }, step3list = { "icate" : "ic", "ative" : "", "alize" : "al", "iciti" : "ic", "ical" : "ic", "ful" : "", "ness" : "" }, c = "[^aeiou]", // consonant v = "[aeiouy]", // vowel C = c + "[^aeiouy]*", // consonant sequence V = v + "[aeiou]*", // vowel sequence mgr0 = "^(" + C + ")?" + V + C, // [C]VC... is m>0 meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$", // [C]VC[V] is m=1 mgr1 = "^(" + C + ")?" + V + C + V + C, // [C]VCVC... is m>1 s_v = "^(" + C + ")?" + v; // vowel in stem return function (w) { var stem, suffix, firstch, re, re2, re3, re4, origword = w; if (w.length < 3) { return w; } firstch = w.substr(0,1); if (firstch == "y") { w = firstch.toUpperCase() + w.substr(1); } // Step 1a re = /^(.+?)(ss|i)es$/; re2 = /^(.+?)([^s])s$/; if (re.test(w)) { w = w.replace(re,"$1$2"); } else if (re2.test(w)) { w = w.replace(re2,"$1$2"); } // Step 1b re = /^(.+?)eed$/; re2 = /^(.+?)(ed|ing)$/; if (re.test(w)) { var fp = re.exec(w); re = new RegExp(mgr0); if (re.test(fp[1])) { re = /.$/; w = w.replace(re,""); } } else if (re2.test(w)) { var fp = re2.exec(w); stem = fp[1]; re2 = new RegExp(s_v); if (re2.test(stem)) { w = stem; re2 = /(at|bl|iz)$/; re3 = new RegExp("([^aeiouylsz])\\1$"); re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); if (re2.test(w)) { w = w + "e"; } else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); } else if (re4.test(w)) { w = w + "e"; } } } // Step 1c re = new RegExp("^(.+" + c + ")y$"); if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; w = stem + "i"; } // Step 2 re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; suffix = fp[2]; re = new RegExp(mgr0); if (re.test(stem)) { w = stem + step2list[suffix]; } } // Step 3 re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; suffix = fp[2]; re = new RegExp(mgr0); if (re.test(stem)) { w = stem + step3list[suffix]; } } // Step 4 re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/; re2 = /^(.+?)(s|t)(ion)$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; re = new RegExp(mgr1); if (re.test(stem)) { w = stem; } } else if (re2.test(w)) { var fp = re2.exec(w); stem = fp[1] + fp[2]; re2 = new RegExp(mgr1); if (re2.test(stem)) { w = stem; } } // Step 5 re = /^(.+?)e$/; if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; re = new RegExp(mgr1); re2 = new RegExp(meq1); re3 = new RegExp("^" + C + v + "[^aeiouwxy]$"); if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) { w = stem; } } re = /ll$/; re2 = new RegExp(mgr1); if (re.test(w) && re2.test(w)) { re = /.$/; w = w.replace(re,""); } // and turn initial Y back to y if (firstch == "y") { w = firstch.toLowerCase() + w.substr(1); } // See http://snowball.tartarus.org/algorithms/english/stemmer.html // "Exceptional forms in general" var specialWords = { "skis" : "ski", "skies" : "sky", "dying" : "die", "lying" : "lie", "tying" : "tie", "idly" : "idl", "gently" : "gentl", "ugly" : "ugli", "early": "earli", "only": "onli", "singly": "singl" }; if(specialWords[origword]){ w = specialWords[origword]; } if( "sky news howe atlas cosmos bias \ andes inning outing canning herring \ earring proceed exceed succeed".indexOf(origword) !== -1 ){ w = origword; } // Address words overstemmed as gener- re = /.*generate?s?d?(ing)?$/; if( re.test(origword) ){ w = w + 'at'; } re = /.*general(ly)?$/; if( re.test(origword) ){ w = w + 'al'; } re = /.*generic(ally)?$/; if( re.test(origword) ){ w = w + 'ic'; } re = /.*generous(ly)?$/; if( re.test(origword) ){ w = w + 'ous'; } // Address words overstemmed as commun- re = /.*communit(ies)?y?/; if( re.test(origword) ){ w = w + 'iti'; } return w; } })();
# | Change | User | Description | Committed | |
---|---|---|---|---|---|
#1 | 13895 | Paul Allen | Copying using p4convert-docbook | ||
//guest/perforce_software/doc_build/main/docbook-xsl-ns-1.78.1/webhelp/template/search/stemmers/en_stemmer.js | |||||
#1 | 12728 | eedwards |
Upgrade ANT doc build infrastructure to assemble PDFs: - remove non-namespaced DocBook source and add namespaced DocBook source. - add Apache FOP 1.1 - copy fonts, images, XSL into _build, establishing new asset structure. The original structure remains until all guides using it can be upgraded, and several other issues can be resolved. - updated build.xml to allow for per-target build properties. - upgraded the P4SAG to use the new infrastructure. - tweaked admonition presentation in PDFs to remove admonition graphics, and resemble closely the presentation used in the new HTML layout, including the same colors. With these changes, building PDFs involves using a shell, navigating into the guide's directory (just P4SAG for now), and executing "ant pdf". Issues still to be resolved: - PDF generation encounters several warnings about missing fonts (bold versions of Symbol and ZapfDingbats), and a couple of locations where the page content exceeds the defined content area. - Due to issues within Apache FOP, PDF generation emits a substantial amount of output that is not easily suppressed without losing important warning information. - Apache FOP's interface to ANT does not expose a way to set the font base directory. The current configuration does work under Mac OSX, but further testing on Windows will need to be done to determine if the relative paths defined continue to work. The workaround is for Windows users to customize the fop-config.xml to provide absolute system paths to the required fonts. - HTML generation needs further browser testing, and exhibits broken navigation on iOS browsers within the TOC sidebar. - A number of PDF and HTML presentation tweaks still need to be made, for example: sidebars, gui* DocBook tags, whitespace, section separation, etc. |