/*
 * Decompiled with CFR 0.152.
 */
package com.nexwave.nquindexer;

import com.nexwave.nquindexer.SaxDocFileParser;
import com.nexwave.nquindexer.WordAndScoring;
import com.nexwave.nsidita.DocFileInfo;
import com.nexwave.stemmer.snowball.SnowballStemmer;
import com.nexwave.stemmer.snowball.ext.EnglishStemmer;
import com.nexwave.stemmer.snowball.ext.FrenchStemmer;
import com.nexwave.stemmer.snowball.ext.GermanStemmer;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
/**
 * Extracts the indexable words of one document at a time and accumulates
 * them into a shared stem dictionary.
 *
 * <p>For every call to {@link #runExtractData(File, String, boolean)} the
 * document is parsed, its text is cleaned (stop words / punctuation), split
 * into words, optionally stemmed, scored, and finally merged into the map
 * supplied through {@link #init(Map)}. Each dictionary value is a
 * comma-separated list of {@code <fileIndex>*<score>} entries, where
 * {@code fileIndex} is the zero-based count of documents processed so far by
 * this instance.
 *
 * <p>Words may carry a trailing marker of the form
 * {@code word@@@elem_<name>@@@}; the element name selects the score weight
 * (headings, emphasis, meta keywords, ...). The markers are expected to be
 * present in the parsed text buffer -- presumably written by the parent
 * parser; verify against {@code SaxDocFileParser}.
 */
public class SaxHTMLIndex extends SaxDocFileParser {
    /** Delimiter that opens and closes an element marker attached to a word. */
    private static final String MARKER = "@@@";
    /** Opening part of an element marker; the element name follows it. */
    private static final String ELEMENT_MARKER = "@@@elem_";

    private static final int SCORING_FOR_H1 = 50;
    private static final int SCORING_FOR_H2 = 45;
    private static final int SCORING_FOR_H3 = 40;
    private static final int SCORING_FOR_H4 = 35;
    private static final int SCORING_FOR_H5 = 30;
    private static final int SCORING_FOR_H6 = 25;
    private static final int SCORING_FOR_BOLD = 5;
    private static final int SCORING_FOR_ITALIC = 3;
    private static final int SCORING_FOR_NORMAL_TEXT = 1;
    private static final int SCORING_FOR_KEYWORD = 100;
    private static final int SCORING_FOR_INDEXTERM = 75;

    /** Shared dictionary: stem -> "fileIndex*score[,fileIndex*score...]". */
    private Map<String, String> tempDico;
    /** Zero-based index of the document currently being processed. */
    private int fileIndex = 0;
    /** Caller-supplied stop words; {@code null} or empty selects the built-in list. */
    private ArrayList<String> cleanUpList = null;
    /** Caller-supplied punctuation patterns; may be {@code null}. */
    private ArrayList<String> cleanUpPunctuation = null;
    /** Words (with stem and summed score) found in the last processed document. */
    private List<WordAndScoring> wsList = null;

    /**
     * Returns the words collected by the most recent
     * {@link #runExtractData(File, String, boolean)} call.
     *
     * @return the internal list (not a copy), or {@code null} if no document
     *         has been processed yet
     */
    public List<WordAndScoring> getWsList() {
        return this.wsList;
    }

    /** Creates an indexer that uses the built-in stop-word list. */
    public SaxHTMLIndex() {
    }

    /**
     * Creates an indexer with a custom stop-word list.
     *
     * @param cleanUpStrings words to strip from the text; {@code null} or
     *                       empty selects the built-in list
     */
    public SaxHTMLIndex(ArrayList<String> cleanUpStrings) {
        this.cleanUpList = cleanUpStrings;
    }

    /**
     * Creates an indexer with custom stop words and punctuation.
     *
     * @param cleanUpStrings words to strip from the text; {@code null} or
     *                       empty selects the built-in list
     * @param cleanUpChars   punctuation patterns to strip; may be {@code null}
     */
    public SaxHTMLIndex(ArrayList<String> cleanUpStrings, ArrayList<String> cleanUpChars) {
        this.cleanUpList = cleanUpStrings;
        this.cleanUpPunctuation = cleanUpChars;
    }

    /**
     * Sets the dictionary that subsequent extractions merge their words into.
     *
     * @param tempMap stem dictionary to fill; it is mutated in place
     * @return always {@code 0}
     */
    public int init(Map<String, String> tempMap) {
        this.tempDico = tempMap;
        return 0;
    }

    /**
     * Parses one document, collects its scored words and merges them into
     * the dictionary given to {@link #init(Map)}.
     *
     * <p>For {@code ja}, {@code zh} and {@code ko} the text is tokenized with
     * Lucene's {@code CJKAnalyzer} and every token scores as normal text.
     * For any other language the text is split on spaces; stemming is only
     * applied for {@code en}, {@code de} and {@code fr}, and only when
     * {@code stem} is {@code true}.
     *
     * @param file            document to parse
     * @param indexerLanguage language code; {@code null} or an unknown code
     *                        means space tokenization without stemming
     * @param stem            whether to stem words when a stemmer is available
     * @return the file description populated while parsing
     * @throws NullPointerException if {@link #init(Map)} was not called and
     *                              the document yields at least one word
     */
    public DocFileInfo runExtractData(File file, String indexerLanguage, boolean stem) {
        this.fileDesc = new DocFileInfo(file);
        this.strbf = new StringBuffer("");
        this.parseDocument(file);
        String text = this.cleanBuffer(this.strbf).replaceAll("\\s+", " ");

        this.wsList = new ArrayList<WordAndScoring>();
        // Stem -> entry already in wsList, so duplicates are found without rescanning the list.
        Map<String, WordAndScoring> byStem = new HashMap<String, WordAndScoring>();
        if (isCjkLanguage(indexerLanguage)) {
            this.indexCjkText(text, byStem);
        } else {
            this.indexSpaceSeparatedText(text, createStemmer(indexerLanguage), stem, byStem);
        }

        this.mergeIntoDictionary();
        ++this.fileIndex;
        return this.fileDesc;
    }

    /** Tells whether the language code requires the CJK analyzer; {@code null} is not CJK. */
    private static boolean isCjkLanguage(String indexerLanguage) {
        return "ja".equalsIgnoreCase(indexerLanguage)
                || "zh".equalsIgnoreCase(indexerLanguage)
                || "ko".equalsIgnoreCase(indexerLanguage);
    }

    /** Returns the stemmer for the language code, or {@code null} when none is available. */
    private static SnowballStemmer createStemmer(String indexerLanguage) {
        if ("en".equalsIgnoreCase(indexerLanguage)) {
            return new EnglishStemmer();
        }
        if ("de".equalsIgnoreCase(indexerLanguage)) {
            return new GermanStemmer();
        }
        if ("fr".equalsIgnoreCase(indexerLanguage)) {
            return new FrenchStemmer();
        }
        return null;
    }

    /** Tokenizes CJK text with Lucene and records every term with the normal-text score. */
    private void indexCjkText(String text, Map<String, WordAndScoring> byStem) {
        try {
            // Element markers are not used for CJK scoring, so drop them before tokenizing.
            String unmarked = text.replaceAll("@@@([^\\s]*)@@@", "");
            CJKAnalyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
            TokenStream stream = analyzer.tokenStream("", new StringReader(unmarked));
            try {
                TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
                while (stream.incrementToken()) {
                    String term = termAtt.term();
                    this.addOrMerge(new WordAndScoring(term, term, SCORING_FOR_NORMAL_TEXT), byStem);
                }
            } finally {
                // Release the tokenizer and its reader even when tokenizing fails.
                stream.close();
            }
        } catch (IOException ex) {
            // Best effort: keep whatever was tokenized so far and carry on.
            System.out.println("Error tokenizing content using CJK Analyzer. IOException");
            ex.printStackTrace();
        }
    }

    /** Splits text on spaces and records every word, stemmed and scored by its element marker. */
    private void indexSpaceSeparatedText(String text, SnowballStemmer stemmer, boolean stem,
            Map<String, WordAndScoring> byStem) {
        StringTokenizer st = new StringTokenizer(text, " ");
        while (st.hasMoreTokens()) {
            WordAndScoring ws = this.getWordAndScoring(st.nextToken(), stemmer, stem);
            if (ws != null) {
                this.addOrMerge(ws, byStem);
            }
        }
    }

    /** Appends the word to {@code wsList}, or adds its score to the entry with the same stem. */
    private void addOrMerge(WordAndScoring ws, Map<String, WordAndScoring> byStem) {
        WordAndScoring existing = byStem.get(ws.getStem());
        if (existing != null) {
            existing.setScoring(existing.getScoring() + ws.getScoring());
            return;
        }
        byStem.put(ws.getStem(), ws);
        this.wsList.add(ws);
    }

    /** Appends a "fileIndex*score" entry for every collected stem to the shared dictionary. */
    private void mergeIntoDictionary() {
        String filePrefix = Integer.toString(this.fileIndex) + "*";
        for (WordAndScoring ws : this.wsList) {
            String entry = filePrefix + Integer.toString(ws.getScoring());
            String previous = this.tempDico.get(ws.getStem());
            this.tempDico.put(ws.getStem(), previous == null ? entry : previous + "," + entry);
        }
    }

    /**
     * Converts one space-delimited token into a scored word.
     *
     * <p>A token containing two or more {@code @@@} delimiters is treated as
     * {@code word@@@elem_<name>@@@}: the text before the first delimiter is
     * the word and the element name selects the score. Any other token is
     * indexed as is with the normal-text score.
     *
     * @param token       raw token, possibly with an element marker
     * @param stemmer     stemmer to apply, or {@code null} for none
     * @param doStemming  whether stemming is requested
     * @return the scored word, or {@code null} when a marked token has no
     *         word in front of its marker
     */
    private WordAndScoring getWordAndScoring(String token, SnowballStemmer stemmer, boolean doStemming) {
        int firstMarker = token.indexOf(MARKER);
        int lastMarker = token.lastIndexOf(MARKER);

        String word = token;
        int scoring = SCORING_FOR_NORMAL_TEXT;
        if (firstMarker != -1 && firstMarker != lastMarker) {
            word = token.substring(0, firstMarker);
            if (word.length() == 0) {
                return null;
            }
            scoring = scoringForElement(extractElementName(token, lastMarker));
        }

        String stemWord = word;
        if (stemmer != null && doStemming) {
            stemWord = stemmer.doStem(word);
        }
        return new WordAndScoring(word, stemWord, scoring);
    }

    /**
     * Returns the element name between {@code @@@elem_} and the closing
     * delimiter, or {@code null} when the marker is malformed (no
     * {@code @@@elem_} prefix, or the prefix is itself the closing
     * delimiter). Previously such tokens raised
     * {@code StringIndexOutOfBoundsException}.
     */
    private static String extractElementName(String token, int lastMarker) {
        int elementMarker = token.indexOf(ELEMENT_MARKER);
        if (elementMarker == -1) {
            return null;
        }
        int nameStart = elementMarker + ELEMENT_MARKER.length();
        if (nameStart > lastMarker) {
            return null;
        }
        return token.substring(nameStart, lastMarker);
    }

    /** Maps an element name to its score weight; unknown or {@code null} names score as normal text. */
    private static int scoringForElement(String elementName) {
        if ("h1".equalsIgnoreCase(elementName)) {
            return SCORING_FOR_H1;
        }
        if ("h2".equalsIgnoreCase(elementName)) {
            return SCORING_FOR_H2;
        }
        if ("h3".equalsIgnoreCase(elementName)) {
            return SCORING_FOR_H3;
        }
        if ("h4".equalsIgnoreCase(elementName)) {
            return SCORING_FOR_H4;
        }
        if ("h5".equalsIgnoreCase(elementName)) {
            return SCORING_FOR_H5;
        }
        if ("h6".equalsIgnoreCase(elementName)) {
            return SCORING_FOR_H6;
        }
        if ("em".equalsIgnoreCase(elementName)) {
            return SCORING_FOR_ITALIC;
        }
        if ("strong".equalsIgnoreCase(elementName)) {
            return SCORING_FOR_BOLD;
        }
        if ("meta_keywords".equalsIgnoreCase(elementName)) {
            return SCORING_FOR_KEYWORD;
        }
        if ("meta_indexterms".equalsIgnoreCase(elementName)) {
            return SCORING_FOR_INDEXTERM;
        }
        return SCORING_FOR_NORMAL_TEXT;
    }

    /**
     * Lower-cases the parsed text and strips stop words and punctuation.
     *
     * <p>Builds two regular-expression alternations -- one for stop words,
     * one for punctuation -- and hands them to the inherited
     * {@code minimalClean}. Caller-supplied entries are inserted into the
     * expressions verbatim, so they are interpreted as regex fragments.
     *
     * @param strbf raw text collected while parsing
     * @return the cleaned, lower-cased text
     */
    private String cleanBuffer(StringBuffer strbf) {
        String str = strbf.toString().toLowerCase();
        StringBuffer stopWordPattern = new StringBuffer("");
        StringBuffer punctuationPattern = new StringBuffer("");
        if (this.cleanUpList == null || this.cleanUpList.isEmpty()) {
            stopWordPattern.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");
            stopWordPattern.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b");
            stopWordPattern.append("|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b");
            stopWordPattern.append("|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b");
            stopWordPattern.append("|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");
            stopWordPattern.append("|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");
            stopWordPattern.append("|\\bI\\b|\\bme\\b|\\bmy\\b");
            // The text is already lower-cased, so the notice must be matched
            // case-insensitively; \S+ accepts the copyright sign however it
            // was encoded, and the final dot is matched literally.
            str = str.replaceFirst("(?i)copyright \\S+ 1998-2007 nexwave solutions\\.", " ");
        } else {
            // Seed with one word so every appended entry can start with '|'.
            stopWordPattern.append("\\ba\\b");
            for (String stopWord : this.cleanUpList) {
                stopWordPattern.append("|\\b").append(stopWord).append("\\b");
            }
        }
        if (this.cleanUpPunctuation != null && !this.cleanUpPunctuation.isEmpty()) {
            // Seed with the ideographic full stop so every appended entry can start with '|'.
            punctuationPattern.append("\\u3002");
            for (String punctuation : this.cleanUpPunctuation) {
                punctuationPattern.append("|").append(punctuation);
            }
        }
        return this.minimalClean(str, stopWordPattern, punctuationPattern);
    }
}

