package com.openkm.kea.filter;

import com.openkm.kea.stemmers.SremovalStemmer;
import com.openkm.kea.stemmers.Stemmer;
import com.openkm.kea.stopwords.Stopwords;
import com.openkm.kea.stopwords.StopwordsEnglish;
import com.openkm.kea.util.Counter;
import com.openkm.kea.vocab.Vocabulary;
import com.openkm.util.WebUtils;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import weka.classifiers.Classifier;
import weka.classifiers.bayes.NaiveBayesSimple;
import weka.classifiers.meta.FilteredClassifier;
import weka.classifiers.meta.RegressionByDiscretization;
import weka.core.Attribute;
import weka.core.Capabilities;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.supervised.attribute.Discretize;

/* loaded from: input_file:com/openkm/kea/filter/KEAFilter.class */
public class KEAFilter extends Filter implements OptionHandler {
    // Shared SLF4J logger for this filter.
    private static Logger log = LoggerFactory.getLogger(KEAFilter.class);
    private static final long serialVersionUID = 1;
    // Stopword list; supplied through the constructor or setStopwords().
    private Stopwords m_Stopwords;
    // Vocabulary object created by loadThesaurus() and dropped by clearVocabulary().
    public Vocabulary m_Vocabulary;
    // Zero-based index of the string attribute holding the document text.
    private int m_DocumentAtt = 0;
    // Zero-based index of the string attribute holding the keyphrases.
    private int m_KeyphrasesAtt = 1;
    // Upper/lower bound on candidate phrase length, counted in space-separated tokens.
    private int m_MaxPhraseLength = 5;
    private int m_MinPhraseLength = 1;
    // Only read/written through getNumPhrases()/setNumPhrases() in the visible part of the class.
    private int m_numPhrases = 10;
    // Divisor applied to a phrase's keyphrase count to form the class value in featVals();
    // buildClassifier() uses m_Indexers + 1 bins.
    private int m_Indexers = 1;
    // When true, loadThesaurus() calls Vocabulary.buildUSE().
    private boolean m_DESCRreplace = true;
    // Enables the "Relations_number" feature; also makes loadThesaurus() call Vocabulary.buildREL().
    public boolean m_NODEfeature = true;
    // Enables the "Phrase_length" feature.
    private boolean m_LENGTHfeature = true;
    // Enables the "Standard_deviation" feature.
    private boolean m_STDEVfeature = false;
    // Enables the "Keyphrase_frequency" feature.
    private boolean m_KFused = false;
    // Turns on progress logging; combined with m_KFused it also adds the keyphrase-frequency output column.
    private boolean m_Debug = false;
    // Forwarded to the KEAPhraseFilter created in setInputFormat().
    private boolean m_DisallowInternalPeriods = false;
    // Only read/written through its accessors and the -O option in the visible part of the class.
    private int m_MinNumOccur = 2;
    // Number of feature columns produced by featVals(); also used as the class index of the
    // classifier data. Starts at 2 and is increased by setKFused(true) and setNumFeature().
    private int m_NumFeatures = 2;
    // Positions of the individual features inside the array built by featVals().
    // NOTE(review): m_NodeIndex and m_KeyFreqIndex share the value 3 — confirm they are never enabled together.
    private int m_TfidfIndex = 0;
    private int m_FirstOccurIndex = 1;
    private int m_LengthIndex = 2;
    private int m_NodeIndex = 3;
    private int m_STDEVIndex = 4;
    private int m_KeyFreqIndex = 3;
    // Pre-processing filters created in setInputFormat(); the numbers filter only when the vocabulary is "none".
    private KEAPhraseFilter m_PunctFilter = null;
    private NumbersFilter m_NumbersFilter = null;
    // Model trained in buildClassifier().
    private Classifier m_Classifier = null;
    // Phrase -> counter, updated once per document that contains the phrase; null until
    // buildGlobalDictionaries() has run.
    public HashMap<String, Counter> m_Dictionary = null;
    // Same idea for given keyphrases; only populated when m_KFused is set.
    private HashMap<String, Counter> m_KeyphraseDictionary = null;
    // Number of buffered input instances at the time the dictionaries were built.
    private int m_NumDocs = 0;
    // Classifier training data; reduced to an empty header copy once the classifier is built.
    private Instances m_ClassifierData = null;
    // Stemmer exposed through getStemmer()/setStemmer(); defaults to s-removal stemming.
    private Stemmer m_Stemmer = new SremovalStemmer();
    // The three values below are passed to the Vocabulary constructor in loadThesaurus().
    private String m_documentLanguage = "en";
    private String m_vocabulary = "agrovoc";
    private String m_vocabularyFormat = "skos";
    // Only read/written through its accessors in the visible part of the class.
    private boolean m_CheckForProperNouns = true;

    /**
     * Creates a filter that uses the given stopword list.
     *
     * @param stopwords stopword list stored for later phrase extraction
     */
    public KEAFilter(Stopwords stopwords) {
        m_Stopwords = stopwords;
    }

    /** Drops the reference to the currently loaded vocabulary object. */
    public void clearVocabulary() {
        m_Vocabulary = null;
    }

    /**
     * @return the vocabulary identifier string; the value {@code "none"} makes this
     *         class bypass the vocabulary object
     */
    public String getVocabulary() {
        return m_vocabulary;
    }

    /**
     * Sets the vocabulary identifier string.
     *
     * @param str new vocabulary identifier
     */
    public void setVocabulary(String str) {
        m_vocabulary = str;
    }

    /** @return the vocabulary format string handed to the {@code Vocabulary} constructor */
    public String getVocabularyFormat() {
        return m_vocabularyFormat;
    }

    /**
     * Sets the vocabulary format string.
     *
     * @param str new vocabulary format
     */
    public void setVocabularyFormat(String str) {
        m_vocabularyFormat = str;
    }

    /** @return the document language string handed to the {@code Vocabulary} constructor */
    public String getDocumentLanguage() {
        return m_documentLanguage;
    }

    /**
     * Sets the document language string.
     *
     * @param str new document language
     */
    public void setDocumentLanguage(String str) {
        m_documentLanguage = str;
    }

    /** @return the current value of the proper-noun check flag */
    public boolean getCheckForProperNouns() {
        return m_CheckForProperNouns;
    }

    /**
     * Sets the proper-noun check flag.
     *
     * @param z new flag value
     */
    public void setCheckForProperNouns(boolean z) {
        m_CheckForProperNouns = z;
    }

    /** @return the stopword list currently in use */
    public Stopwords getStopwords() {
        return m_Stopwords;
    }

    /**
     * Replaces the stopword list.
     *
     * @param stopwords new stopword list
     */
    public void setStopwords(Stopwords stopwords) {
        m_Stopwords = stopwords;
    }

    /** @return the stemmer currently configured on this filter */
    public Stemmer getStemmer() {
        return m_Stemmer;
    }

    /**
     * Replaces the stemmer.
     *
     * @param stemmer new stemmer
     */
    public void setStemmer(Stemmer stemmer) {
        m_Stemmer = stemmer;
    }

    /** @return the configured minimum number of occurrences (option {@code -O}) */
    public int getMinNumOccur() {
        return m_MinNumOccur;
    }

    /**
     * Sets the minimum number of occurrences (option {@code -O}).
     *
     * @param i new minimum
     */
    public void setMinNumOccur(int i) {
        m_MinNumOccur = i;
    }

    /** @return the maximum candidate phrase length, in tokens */
    public int getMaxPhraseLength() {
        return m_MaxPhraseLength;
    }

    /**
     * Sets the maximum candidate phrase length.
     *
     * @param i new maximum, in tokens
     */
    public void setMaxPhraseLength(int i) {
        m_MaxPhraseLength = i;
    }

    /** @return the minimum candidate phrase length, in tokens */
    public int getMinPhraseLength() {
        return m_MinPhraseLength;
    }

    /**
     * Sets the minimum candidate phrase length.
     *
     * @param i new minimum, in tokens
     */
    public void setMinPhraseLength(int i) {
        m_MinPhraseLength = i;
    }

    /** @return the configured number of phrases */
    public int getNumPhrases() {
        return m_numPhrases;
    }

    /**
     * Sets the number of phrases.
     *
     * @param i new number of phrases
     */
    public void setNumPhrases(int i) {
        m_numPhrases = i;
    }

    /**
     * @return output column of the phrase key ("N-gram"), which takes the place of the
     *         document attribute
     */
    public int getStemmedPhraseIndex() {
        return m_DocumentAtt;
    }

    /**
     * @return output column of the original phrase form ("N-gram-original"), one past
     *         the document attribute index
     */
    public int getUnstemmedPhraseIndex() {
        return 1 + m_DocumentAtt;
    }

    /**
     * Computes the output column of the "Probability" attribute.
     *
     * <p>The column sits four positions after the document attribute, shifted by one
     * for every optional feature column that is emitted before it.
     *
     * @return index of the probability column in the output format
     */
    public int getProbabilityIndex() {
        int index = m_DocumentAtt + 4;
        index += (m_Debug && m_KFused) ? 1 : 0;
        index += m_STDEVfeature ? 1 : 0;
        index += m_NODEfeature ? 1 : 0;
        index += m_LENGTHfeature ? 1 : 0;
        return index;
    }

    /** @return output column of the "Rank" attribute, directly after the probability column */
    public int getRankIndex() {
        return 1 + getProbabilityIndex();
    }

    /** @return zero-based index of the document attribute */
    public int getDocumentAtt() {
        return m_DocumentAtt;
    }

    /**
     * Sets the index of the document attribute.
     *
     * @param i zero-based attribute index
     */
    public void setDocumentAtt(int i) {
        m_DocumentAtt = i;
    }

    /** @return zero-based index of the keyphrase attribute */
    public int getKeyphrasesAtt() {
        return m_KeyphrasesAtt;
    }

    /**
     * Sets the index of the keyphrase attribute.
     *
     * @param i zero-based attribute index
     */
    public void setKeyphrasesAtt(int i) {
        m_KeyphrasesAtt = i;
    }

    /** @return whether debug logging is switched on */
    public boolean getDebug() {
        return m_Debug;
    }

    /**
     * Switches debug logging on or off.
     *
     * @param z new debug flag
     */
    public void setDebug(boolean z) {
        m_Debug = z;
    }

    /**
     * Enables or disables the keyphrase-frequency feature and keeps the feature count
     * in step with it.
     *
     * <p>The feature count is only adjusted when the flag actually changes, so calling
     * this method repeatedly with the same value (for example through repeated
     * {@code setOptions} calls) no longer inflates {@code m_NumFeatures}, and switching
     * the feature off again removes its column from the count.
     *
     * @param z {@code true} to use the keyphrase-frequency feature
     */
    public void setKFused(boolean z) {
        if (z != this.m_KFused) {
            this.m_NumFeatures += z ? 1 : -1;
        }
        this.m_KFused = z;
    }

    /**
     * Adds one to the feature count for each of the standard-deviation, node and
     * length features that is currently enabled.
     *
     * <p>The count is incremented relative to its current value, so each call adds
     * the enabled features again.
     */
    public void setNumFeature() {
        int enabled = 0;
        enabled += m_STDEVfeature ? 1 : 0;
        enabled += m_NODEfeature ? 1 : 0;
        enabled += m_LENGTHfeature ? 1 : 0;
        m_NumFeatures += enabled;
    }

    /** @return whether the keyphrase-frequency feature is enabled */
    public boolean getKFused() {
        return m_KFused;
    }

    /** @return the flag forwarded to the phrase filter's internal-period setting */
    public boolean getDisallowInternalPeriods() {
        return m_DisallowInternalPeriods;
    }

    /**
     * Sets the flag forwarded to the phrase filter's internal-period setting.
     *
     * @param z new flag value
     */
    public void setDisallowInternalPeriods(boolean z) {
        m_DisallowInternalPeriods = z;
    }

    /**
     * Creates and initializes the vocabulary from the configured vocabulary name,
     * format and document language, then builds its optional indexes.
     *
     * <p>Index building stays best-effort: a failure is logged with its stack trace
     * through the class logger (instead of being printed to stderr) and the method
     * returns normally.
     *
     * @param stemmer   stemmer handed to the vocabulary
     * @param stopwords stopword list handed to the vocabulary
     */
    public void loadThesaurus(Stemmer stemmer, Stopwords stopwords) {
        this.m_Vocabulary = new Vocabulary(this.m_vocabulary, this.m_vocabularyFormat, this.m_documentLanguage);
        this.m_Vocabulary.setStemmer(stemmer);
        this.m_Vocabulary.setStopwords(stopwords);
        this.m_Vocabulary.initialize();
        try {
            if (this.m_DESCRreplace) {
                this.m_Vocabulary.buildUSE();
            }
            if (this.m_NODEfeature) {
                this.m_Vocabulary.buildREL();
            }
        } catch (Exception e) {
            log.error("Could not build vocabulary indexes for vocabulary '" + this.m_vocabulary + "'", e);
        }
    }

    /**
     * Parses the filter's command-line style options.
     *
     * <ul>
     *   <li>{@code -K} use the keyphrase-frequency feature</li>
     *   <li>{@code -D} debug mode</li>
     *   <li>{@code -I <n>} one-based document attribute index (stored zero-based; default first attribute)</li>
     *   <li>{@code -J <n>} one-based keyphrase attribute index (stored zero-based; default second attribute)</li>
     *   <li>{@code -M <n>} maximum phrase length (default 3)</li>
     *   <li>{@code -L <n>} minimum phrase length (default 1)</li>
     *   <li>{@code -O <n>} minimum number of occurrences (default 2)</li>
     *   <li>{@code -P} disallow internal periods</li>
     * </ul>
     *
     * <p>The minimum phrase length is read from {@code -L}, matching what
     * {@code getOptions()} and {@code listOptions()} advertise. Reading it from
     * {@code -M} a second time always yielded the default, because the first lookup
     * had already consumed that option.
     *
     * @param strArr option array; recognised entries are consumed by the Weka option helpers
     * @throws Exception if an option is malformed or a numeric value cannot be parsed
     */
    public void setOptions(String[] strArr) throws Exception {
        setKFused(Utils.getFlag('K', strArr));
        setDebug(Utils.getFlag('D', strArr));

        String documentAtt = Utils.getOption('I', strArr);
        setDocumentAtt(documentAtt.length() > 0 ? Integer.parseInt(documentAtt) - 1 : 0);

        String keyphrasesAtt = Utils.getOption('J', strArr);
        setKeyphrasesAtt(keyphrasesAtt.length() > 0 ? Integer.parseInt(keyphrasesAtt) - 1 : 1);

        String maxLength = Utils.getOption('M', strArr);
        setMaxPhraseLength(maxLength.length() > 0 ? Integer.parseInt(maxLength) : 3);

        String minLength = Utils.getOption('L', strArr);
        setMinPhraseLength(minLength.length() > 0 ? Integer.parseInt(minLength) : 1);

        String minOccur = Utils.getOption('O', strArr);
        setMinNumOccur(minOccur.length() > 0 ? Integer.parseInt(minOccur) : 2);

        setDisallowInternalPeriods(Utils.getFlag('P', strArr));
    }

    /**
     * Returns the current settings as an option array that {@code setOptions} accepts.
     *
     * <p>The array always has 13 entries (two flags, five option/value pairs and the
     * {@code -P} flag); unused trailing entries are filled with empty strings. Every
     * entry is written at the next free position, so {@code -P} is no longer
     * overwritten by the padding and no slot is left {@code null}.
     *
     * @return option array of fixed length 13
     */
    public String[] getOptions() {
        String[] options = new String[13];
        int current = 0;

        if (getKFused()) {
            options[current++] = "-K";
        }
        if (getDebug()) {
            options[current++] = "-D";
        }
        // Attribute indexes are reported one-based, mirroring the "- 1" in setOptions().
        options[current++] = "-I";
        options[current++] = WebUtils.EMPTY_STRING + (getDocumentAtt() + 1);
        options[current++] = "-J";
        options[current++] = WebUtils.EMPTY_STRING + (getKeyphrasesAtt() + 1);
        options[current++] = "-M";
        options[current++] = WebUtils.EMPTY_STRING + getMaxPhraseLength();
        options[current++] = "-L";
        options[current++] = WebUtils.EMPTY_STRING + getMinPhraseLength();
        options[current++] = "-O";
        options[current++] = WebUtils.EMPTY_STRING + getMinNumOccur();
        if (getDisallowInternalPeriods()) {
            options[current++] = "-P";
        }

        while (current < options.length) {
            options[current++] = WebUtils.EMPTY_STRING;
        }
        return options;
    }

    /**
     * Describes the options understood by {@code setOptions}.
     *
     * @return enumeration over one {@code Option} descriptor per supported option
     */
    public Enumeration<Option> listOptions() {
        Vector<Option> descriptors = new Vector<Option>(7);
        descriptors.addElement(
                new Option("\tSpecifies whether keyphrase frequency statistic is used.", "K", 0, "-K"));
        descriptors.addElement(
                new Option("\tSets the maximum phrase length (default: 3).", "M", 1, "-M <length>"));
        descriptors.addElement(
                new Option("\tSets the minimum phrase length (default: 1).", "L", 1, "-L <length>"));
        descriptors.addElement(
                new Option("\tTurns debugging mode on.", "D", 0, "-D"));
        descriptors.addElement(
                new Option("\tSets the index of the document attribute (default: 0).", "I", 1, "-I"));
        descriptors.addElement(
                new Option("\tSets the index of the keyphrase attribute (default: 1).", "J", 1, "-J"));
        descriptors.addElement(
                new Option("\tDisallow internal periods.", "P", 0, "-P"));
        descriptors.addElement(
                new Option("\tSet the minimum number of occurences (default: 2).", "O", 1, "-O"));
        return descriptors.elements();
    }

    /** @return a one-sentence description of what this filter does */
    public String globalInfo() {
        final String description =
                "Converts incoming data into data appropriate for keyphrase classification.";
        return description;
    }

    /**
     * Validates the incoming format and sets up the pre-processing filter chain.
     *
     * <p>A phrase filter is always applied to the document attribute; when the
     * vocabulary is {@code "none"} a numbers filter is chained behind it. The output
     * format of the last filter in the chain becomes this filter's input format.
     *
     * @param instances header describing the incoming data
     * @return always {@code false}
     * @throws Exception if a class index is set or the document/keyphrase attributes
     *                   are not string attributes
     */
    public boolean setInputFormat(Instances instances) throws Exception {
        if (instances.classIndex() >= 0) {
            throw new Exception("Don't know what do to if class index set!");
        }
        boolean bothStrings = instances.attribute(m_KeyphrasesAtt).isString()
                && instances.attribute(m_DocumentAtt).isString();
        if (!bothStrings) {
            throw new Exception("Keyphrase attribute and document attribute need to be string attributes.");
        }

        m_PunctFilter = new KEAPhraseFilter();
        m_PunctFilter.setAttributeIndicesArray(new int[]{m_DocumentAtt});
        m_PunctFilter.setInputFormat(instances);
        m_PunctFilter.setDisallowInternalPeriods(getDisallowInternalPeriods());

        boolean withoutVocabulary = m_vocabulary.equals("none");
        Instances chainOutput = m_PunctFilter.getOutputFormat();
        if (withoutVocabulary) {
            m_NumbersFilter = new NumbersFilter();
            m_NumbersFilter.setInputFormat(chainOutput);
            chainOutput = m_NumbersFilter.getOutputFormat();
        }
        super.setInputFormat(chainOutput);
        return false;
    }

    /**
     * Starts from the superclass capabilities and additionally enables all
     * attribute types, missing values, nominal class, no class and all class types.
     *
     * @return the capabilities of this filter
     */
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();

        // attributes
        result.enableAllAttributes();
        result.enable(Capabilities.Capability.MISSING_VALUES);

        // class
        result.enable(Capabilities.Capability.NOMINAL_CLASS);
        result.enable(Capabilities.Capability.NO_CLASS);
        result.enableAllClasses();

        return result;
    }

    /**
     * Feeds one instance through the pre-processing filters and then either buffers
     * it (no dictionary built yet) or converts it right away and queues the results.
     *
     * @param instance incoming document instance
     * @return {@code true} if the instance was converted and queued for output,
     *         {@code false} if it was only buffered
     * @throws Exception if no input format has been defined or a filter fails
     */
    public boolean input(Instance instance) throws Exception {
        if (getInputFormat() == null) {
            throw new Exception("No input instance format defined");
        }
        if (m_NewBatch) {
            resetQueue();
            m_NewBatch = false;
        }
        if (m_Debug) {
            log.info("-- Reading instance");
        }

        m_PunctFilter.input(instance);
        m_PunctFilter.batchFinished();
        Instance preprocessed = m_PunctFilter.output();

        if (m_vocabulary.equals("none")) {
            m_NumbersFilter.input(preprocessed);
            m_NumbersFilter.batchFinished();
            preprocessed = m_NumbersFilter.output();
        }

        if (m_Dictionary != null) {
            FastVector converted = convertInstance(preprocessed, false);
            for (int k = 0; k < converted.size(); k++) {
                push((Instance) converted.elementAt(k));
            }
            return true;
        }

        bufferInput(preprocessed);
        return false;
    }

    /**
     * Signals the end of a batch. On the first batch (no dictionary yet) this builds
     * the global dictionaries, trains the classifier and converts all buffered
     * instances; afterwards the input buffer is flushed.
     *
     * @return {@code true} if converted instances are waiting to be collected
     * @throws Exception if no input format has been defined or training/conversion fails
     */
    public boolean batchFinished() throws Exception {
        if (getInputFormat() == null) {
            throw new Exception("No input instance format defined");
        }

        boolean needsTraining = (m_Dictionary == null);
        if (needsTraining) {
            buildGlobalDictionaries();
            buildClassifier();
            convertPendingInstances();
        }

        flushInput();
        m_NewBatch = true;
        return numPendingOutput() != 0;
    }

    /**
     * Builds the corpus-wide dictionaries from the buffered input instances.
     *
     * <p>{@code m_Dictionary} receives one entry per phrase found in any document; the
     * entry's counter is created on first sight and incremented for every further
     * document containing the phrase. When the keyphrase-frequency feature is enabled
     * the same is done for the given keyphrases into {@code m_KeyphraseDictionary};
     * otherwise that dictionary is reset to {@code null}. Finally the number of
     * documents is recorded.
     *
     * @throws Exception if phrase or keyphrase extraction fails
     */
    public void buildGlobalDictionaries() throws Exception {
        if (m_Debug) {
            log.info("--- Building global dictionaries");
        }

        m_Dictionary = new HashMap<>();
        for (int doc = 0; doc < getInputFormat().numInstances(); doc++) {
            String text = getInputFormat().instance(doc).stringValue(m_DocumentAtt);
            HashMap<String, Counter> documentPhrases = getPhrasesForDictionary(text);
            for (String phrase : documentPhrases.keySet()) {
                Counter seen = m_Dictionary.get(phrase);
                if (seen != null) {
                    seen.increment();
                    continue;
                }
                m_Dictionary.put(phrase, new Counter());
            }
        }

        if (!m_KFused) {
            m_KeyphraseDictionary = null;
        } else {
            if (m_Debug) {
                log.info("KF_used feature");
            }
            m_KeyphraseDictionary = new HashMap<>();
            for (int doc = 0; doc < getInputFormat().numInstances(); doc++) {
                String keyphrases = getInputFormat().instance(doc).stringValue(m_KeyphrasesAtt);
                HashMap<String, Counter> given = getGivenKeyphrases(keyphrases, false);
                if (given == null) {
                    continue;
                }
                for (String keyphrase : given.keySet()) {
                    Counter seen = m_KeyphraseDictionary.get(keyphrase);
                    if (seen != null) {
                        seen.increment();
                        continue;
                    }
                    m_KeyphraseDictionary.put(keyphrase, new Counter());
                }
            }
        }

        m_NumDocs = getInputFormat().numInstances();
    }

    /**
     * Builds the training data from the buffered documents and trains the classifier.
     *
     * <p>The training header consists of the enabled feature columns (added where the
     * document attribute sits) and a numeric "Keyphrase?" class (added where the
     * keyphrase attribute sits); all other input attributes are left out. Each
     * candidate phrase of each document becomes one training instance via
     * {@code featVals}. The model is a {@code RegressionByDiscretization} wrapping a
     * discretizing {@code FilteredClassifier} around {@code NaiveBayesSimple}, with
     * {@code m_Indexers + 1} bins. After training only an empty header copy of the
     * training data is kept.
     *
     * <p>The previously constructed but never used "False"/"True" value list has been
     * dropped: the class attribute is created as numeric.
     *
     * @throws Exception if phrase extraction or classifier training fails
     */
    private void buildClassifier() throws Exception {
        FastVector attributes = new FastVector();
        for (int i = 0; i < getInputFormat().numAttributes(); i++) {
            if (i == this.m_DocumentAtt) {
                attributes.addElement(new Attribute("TFxIDF"));
                attributes.addElement(new Attribute("First_occurrence"));
                if (this.m_KFused) {
                    attributes.addElement(new Attribute("Keyphrase_frequency"));
                }
                if (this.m_STDEVfeature) {
                    attributes.addElement(new Attribute("Standard_deviation"));
                }
                // NOTE(review): column names are added node-before-length, while featVals() writes
                // by m_LengthIndex/m_NodeIndex — confirm the names line up with the values.
                if (this.m_NODEfeature) {
                    attributes.addElement(new Attribute("Relations_number"));
                }
                if (this.m_LENGTHfeature) {
                    attributes.addElement(new Attribute("Phrase_length"));
                }
            } else if (i == this.m_KeyphrasesAtt) {
                // Numeric class: featVals() stores a count ratio here, not a nominal label.
                attributes.addElement(new Attribute("Keyphrase?"));
            }
        }
        this.m_ClassifierData = new Instances("ClassifierData", attributes, 0);
        this.m_ClassifierData.setClassIndex(this.m_NumFeatures);

        if (this.m_Debug) {
            log.info("--- Converting instances for classifier");
        }
        for (int doc = 0; doc < getInputFormat().numInstances(); doc++) {
            Instance current = getInputFormat().instance(doc);
            String keyphrases = current.stringValue(this.m_KeyphrasesAtt);
            HashMap<String, Counter> givenKeys = getGivenKeyphrases(keyphrases, false);
            HashMap<String, Counter> givenKeysEval = getGivenKeyphrases(keyphrases, true);

            HashMap<String, FastVector> phraseInfo = new HashMap<>();
            int length = getPhrases(phraseInfo, current.stringValue(this.m_DocumentAtt));
            for (String phrase : phraseInfo.keySet()) {
                double[] values = featVals(phrase, phraseInfo.get(phrase), true, givenKeysEval, givenKeys, length, phraseInfo);
                this.m_ClassifierData.add(new Instance(current.weight(), values));
            }
        }

        if (this.m_Debug) {
            log.info("--- Building classifier");
        }
        FilteredClassifier discretizedBayes = new FilteredClassifier();
        discretizedBayes.setClassifier(new NaiveBayesSimple());
        discretizedBayes.setFilter(new Discretize());

        RegressionByDiscretization regression = new RegressionByDiscretization();
        regression.setClassifier(discretizedBayes);
        regression.setNumBins(this.m_Indexers + 1);

        this.m_Classifier = regression;
        this.m_Classifier.buildClassifier(this.m_ClassifierData);
        if (this.m_Debug) {
            log.info(WebUtils.EMPTY_STRING + this.m_Classifier);
        }

        // Keep only the header; the training rows are no longer needed.
        this.m_ClassifierData = new Instances(this.m_ClassifierData, 0);
    }

    /**
     * Computes the feature vector (plus class value in the last slot) for one phrase.
     *
     * <p>Changes compared with the previous body:
     * <ul>
     *   <li>the undefined decompiler temporaries ({@code r0}, {@code r27}) are replaced
     *       by proper locals, so the method compiles;</li>
     *   <li>counter values are cast to {@code double} before dividing, so the ratios are
     *       computed in floating point regardless of the counter's numeric type;</li>
     *   <li>a phrase missing from the keyphrase dictionary yields a keyphrase frequency
     *       of 0 instead of a {@code NullPointerException} in training mode.</li>
     * </ul>
     *
     * @param str        phrase key (looked up in the dictionaries and the vocabulary)
     * @param fastVector per-phrase info: element 0 = first-occurrence counter, element 1 =
     *                   local-frequency counter, element 3 = vector of occurrence counters
     *                   (read only when the standard-deviation feature is on)
     * @param z          {@code true} in training mode; the current document's own
     *                   contribution is then subtracted from the global counts
     * @param hashMap    keyphrases used for the class value, or {@code null} to emit a
     *                   missing class value
     * @param hashMap2   keyphrases used for the keyphrase-frequency correction; may be {@code null}
     * @param i          document length used to normalise frequencies and positions
     * @param hashMap3   all candidate phrases of the document (used for the node feature)
     * @return array of {@code m_NumFeatures + 1} values; the last one is the class value
     */
    private double[] featVals(String str, FastVector fastVector, boolean z, HashMap<String, Counter> hashMap, HashMap<String, Counter> hashMap2, int i, HashMap<String, FastVector> hashMap3) {
        double[] features = new double[this.m_NumFeatures + 1];

        // TFxIDF
        Counter localCounter = (Counter) fastVector.elementAt(1);
        double localFrequency = localCounter.value();
        double globalFrequency = 0.0d;
        Counter globalCounter = this.m_Dictionary.get(str);
        if (globalCounter != null) {
            globalFrequency = globalCounter.value();
            if (z) {
                // Do not count the document the phrase is being evaluated for.
                globalFrequency -= 1.0d;
            }
        }
        features[this.m_TfidfIndex] =
                (localFrequency / i) * (-Math.log((globalFrequency + 1.0d) / (this.m_NumDocs + 1.0d)));

        // Relative position of the first occurrence
        features[this.m_FirstOccurIndex] = (double) ((Counter) fastVector.elementAt(0)).value() / i;

        // Keyphrase frequency
        if (this.m_KFused) {
            Counter keyphraseCounter = this.m_KeyphraseDictionary.get(str);
            if (keyphraseCounter == null) {
                features[this.m_KeyFreqIndex] = 0.0d;
            } else if (z && hashMap2 != null && hashMap2.containsKey(str)) {
                features[this.m_KeyFreqIndex] = (double) keyphraseCounter.value() - 1.0d;
            } else {
                features[this.m_KeyFreqIndex] = keyphraseCounter.value();
            }
        }

        // Standard deviation of the (normalised) occurrence positions
        if (this.m_STDEVfeature) {
            FastVector occurrences = (FastVector) fastVector.elementAt(3);
            double[] positions = new double[occurrences.size()];
            for (int k = 0; k < positions.length; k++) {
                positions[k] = (double) ((Counter) occurrences.elementAt(k)).value() / i;
            }
            double mean = Utils.mean(positions);
            double squaredDeviation = 0.0d;
            for (double position : positions) {
                squaredDeviation += (position - mean) * (position - mean);
            }
            features[this.m_STDEVIndex] = Math.sqrt(squaredDeviation / occurrences.size());
        }

        // Number of related vocabulary terms that also occur in the document
        if (this.m_NODEfeature) {
            Vector<String> related = this.m_Vocabulary.getRelated(str);
            int relatedInDocument = 0;
            if (related != null) {
                for (int k = 0; k < related.size(); k++) {
                    if (hashMap3.get(related.elementAt(k)) != null) {
                        relatedInDocument++;
                    }
                }
            }
            features[this.m_NodeIndex] = relatedInDocument;
        }

        // Phrase length in space-separated words
        if (this.m_LENGTHfeature) {
            String original = this.m_vocabulary.equals("none") ? str : this.m_Vocabulary.getOrig(str);
            if (original == null) {
                log.info("problem with id " + str);
                features[this.m_LengthIndex] = 1.0d;
            } else {
                features[this.m_LengthIndex] = split(original, " ").length;
            }
        }

        // Class value
        if (hashMap == null) {
            features[this.m_NumFeatures] = Instance.missingValue();
        } else if (hashMap.containsKey(str)) {
            features[this.m_NumFeatures] = (double) hashMap.get(str).value() / this.m_Indexers;
        } else {
            features[this.m_NumFeatures] = 0.0d;
        }
        return features;
    }

    /**
     * Defines the output format and converts every buffered input instance.
     *
     * <p>In the output header the document attribute is replaced by the phrase columns
     * ("N-gram", "N-gram-original"), the feature columns, "Probability" and "Rank"; the
     * keyphrase attribute is replaced by a numeric "Keyphrase?" column; all other
     * attributes are copied unchanged. The keyphrase-frequency column is only emitted
     * when both debug mode and the keyphrase-frequency feature are on.
     *
     * <p>The previously constructed but never used "False"/"True" value list has been
     * dropped.
     *
     * @throws Exception if converting an instance fails
     */
    private void convertPendingInstances() throws Exception {
        if (this.m_Debug) {
            log.info("--- Converting pending instances");
        }

        FastVector attributes = new FastVector();
        for (int i = 0; i < getInputFormat().numAttributes(); i++) {
            if (i == this.m_DocumentAtt) {
                // A null value list creates a string attribute.
                attributes.addElement(new Attribute("N-gram", (FastVector) null));
                attributes.addElement(new Attribute("N-gram-original", (FastVector) null));
                attributes.addElement(new Attribute("TFxIDF"));
                attributes.addElement(new Attribute("First_occurrence"));
                if (this.m_Debug && this.m_KFused) {
                    attributes.addElement(new Attribute("Keyphrase_frequency"));
                }
                if (this.m_STDEVfeature) {
                    attributes.addElement(new Attribute("Standard_deviation"));
                }
                if (this.m_NODEfeature) {
                    attributes.addElement(new Attribute("Relations_number"));
                }
                if (this.m_LENGTHfeature) {
                    attributes.addElement(new Attribute("Phrase_length"));
                }
                attributes.addElement(new Attribute("Probability"));
                attributes.addElement(new Attribute("Rank"));
            } else if (i == this.m_KeyphrasesAtt) {
                attributes.addElement(new Attribute("Keyphrase?"));
            } else {
                attributes.addElement(getInputFormat().attribute(i));
            }
        }
        setOutputFormat(new Instances("KEAdata", attributes, 0));

        for (int doc = 0; doc < getInputFormat().numInstances(); doc++) {
            FastVector converted = convertInstance(getInputFormat().instance(doc), true);
            Enumeration elements = converted.elements();
            while (elements.hasMoreElements()) {
                push((Instance) elements.nextElement());
            }
        }
    }

    /**
     * Converts one pre-processed document instance into per-phrase output instances,
     * scores them with the trained classifier and assigns ranks.
     *
     * <p>For every candidate phrase of the document one output row is built: the
     * document attribute is expanded into phrase key, original phrase, the feature
     * values, the classifier score and a (initially missing) rank; the keyphrase
     * attribute receives the class value from {@code featVals}; other attributes are
     * copied. If the instance carries keyphrases, placeholder rows are added for them
     * as well. Rows are then stably sorted by first occurrence, by descending TFxIDF
     * and finally by descending probability, and ranked in that order. Rows whose
     * probability is not above zero get rank {@code Integer.MAX_VALUE}.
     *
     * <p>Fixes compared with the previous body:
     * <ul>
     *   <li>the keyphrase-frequency value (debug + KF mode) is written at the next free
     *       position instead of skipping a slot, which shifted all following columns
     *       and overran the row array;</li>
     *   <li>the "very unlikely" test in the ranking loop now looks at the probability
     *       of the instance being ranked; it used to read a value computed before the
     *       final sort, i.e. belonging to a different instance;</li>
     *   <li>a scan loop in the ranking step that only read values and had no effect
     *       was removed.</li>
     * </ul>
     *
     * @param instance pre-processed document instance
     * @param z        {@code true} in training mode (passed through to {@code featVals})
     * @return vector of ranked output instances
     * @throws Exception if phrase extraction or classification fails
     */
    private FastVector convertInstance(Instance instance, boolean z) throws Exception {
        FastVector vector = new FastVector();
        if (this.m_Debug) {
            log.info("-- Converting instance");
        }

        // Keyphrases attached to the document, if any.
        HashMap<String, Counter> givenKeys = null;
        HashMap<String, Counter> givenKeysEval = null;
        if (!instance.isMissing(this.m_KeyphrasesAtt)) {
            String keyphrases = instance.stringValue(this.m_KeyphrasesAtt);
            givenKeys = getGivenKeyphrases(keyphrases, false);
            givenKeysEval = getGivenKeyphrases(keyphrases, true);
        }

        // Candidate phrases of the document.
        HashMap<String, FastVector> phraseInfo = new HashMap<>();
        int length = getPhrases(phraseInfo, instance.stringValue(this.m_DocumentAtt));

        // Net number of columns gained by expanding the document attribute.
        boolean outputKeyFreq = this.m_Debug && this.m_KFused;
        int numExtra = 5;
        if (outputKeyFreq) {
            numExtra++;
        }
        if (this.m_STDEVfeature) {
            numExtra++;
        }
        if (this.m_NODEfeature) {
            numExtra++;
        }
        if (this.m_LENGTHfeature) {
            numExtra++;
        }

        int tfidfAttIndex = this.m_DocumentAtt + 2;
        int distAttIndex = this.m_DocumentAtt + 3;
        int probsAttIndex = (this.m_DocumentAtt + numExtra) - 1;

        // One output row per candidate phrase.
        for (String phrase : phraseInfo.keySet()) {
            FastVector info = phraseInfo.get(phrase);
            Instance featureInstance = new Instance(instance.weight(),
                    featVals(phrase, info, z, givenKeysEval, givenKeys, length, phraseInfo));
            featureInstance.setDataset(this.m_ClassifierData);
            double probability = this.m_Classifier.distributionForInstance(featureInstance)[0];

            double[] values = new double[instance.numAttributes() + numExtra];
            int pos = 0;
            for (int att = 0; att < instance.numAttributes(); att++) {
                if (att == this.m_DocumentAtt) {
                    int keyIndex = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    values[pos++] = keyIndex;

                    // Fall back to the phrase key when no original form was recorded.
                    String original = (String) info.elementAt(2);
                    int originalIndex = outputFormatPeek().attribute(pos)
                            .addStringValue(original != null ? original : phrase);
                    values[pos++] = originalIndex;

                    values[pos++] = featureInstance.value(this.m_TfidfIndex);
                    values[pos++] = featureInstance.value(this.m_FirstOccurIndex);
                    if (outputKeyFreq) {
                        values[pos++] = featureInstance.value(this.m_KeyFreqIndex);
                    }
                    if (this.m_STDEVfeature) {
                        values[pos++] = featureInstance.value(this.m_STDEVIndex);
                    }
                    if (this.m_NODEfeature) {
                        values[pos++] = featureInstance.value(this.m_NodeIndex);
                    }
                    if (this.m_LENGTHfeature) {
                        values[pos++] = featureInstance.value(this.m_LengthIndex);
                    }

                    probsAttIndex = pos;
                    values[pos++] = probability;
                    values[pos++] = Instance.missingValue(); // rank, filled in below
                } else if (att == this.m_KeyphrasesAtt) {
                    values[pos++] = featureInstance.classValue();
                } else {
                    values[pos++] = instance.value(att);
                }
            }
            Instance row = new Instance(instance.weight(), values);
            row.setDataset(outputFormatPeek());
            vector.addElement(row);
        }

        // Placeholder rows for the given keyphrases.
        if (givenKeysEval != null) {
            for (String keyphrase : givenKeysEval.keySet()) {
                double[] values = new double[instance.numAttributes() + numExtra];
                int pos = 0;
                for (int att = 0; att < instance.numAttributes(); att++) {
                    if (att == this.m_DocumentAtt) {
                        int keyIndex = outputFormatPeek().attribute(pos).addStringValue(keyphrase);
                        values[pos++] = keyIndex;
                        int originalIndex = outputFormatPeek().attribute(pos).addStringValue(keyphrase);
                        values[pos++] = originalIndex;

                        values[pos++] = Instance.missingValue(); // TFxIDF
                        values[pos++] = Instance.missingValue(); // first occurrence
                        if (outputKeyFreq) {
                            values[pos++] = Instance.missingValue();
                        }
                        if (this.m_STDEVfeature) {
                            values[pos++] = Instance.missingValue();
                        }
                        if (this.m_NODEfeature) {
                            values[pos++] = Instance.missingValue();
                        }
                        if (this.m_LENGTHfeature) {
                            values[pos++] = Instance.missingValue();
                        }
                        // NOTE(review): unlike the loop above, no rank slot is reserved here, so the
                        // following attributes land one column early — behavior kept as-is, confirm intent.
                        values[pos++] = -Double.MAX_VALUE; // probability
                    } else if (att == this.m_KeyphrasesAtt) {
                        values[pos++] = 1.0d;
                    } else {
                        values[pos++] = instance.value(att);
                    }
                    // NOTE(review): a row is appended once per attribute (inside the attribute loop);
                    // behavior kept as-is — confirm whether one row per keyphrase was intended.
                    Instance row = new Instance(instance.weight(), values);
                    row.setDataset(outputFormatPeek());
                    vector.addElement(row);
                }
            }
        }

        // Stable sort by first occurrence (ascending).
        double[] sortKeys = new double[vector.size()];
        for (int k = 0; k < sortKeys.length; k++) {
            sortKeys[k] = ((Instance) vector.elementAt(k)).value(distAttIndex);
        }
        FastVector byFirstOccurrence = new FastVector(vector.size());
        int[] order = Utils.stableSort(sortKeys);
        for (int k = 0; k < sortKeys.length; k++) {
            byFirstOccurrence.addElement(vector.elementAt(order[k]));
        }

        // Stable sort by TFxIDF (descending).
        for (int k = 0; k < sortKeys.length; k++) {
            sortKeys[k] = -((Instance) byFirstOccurrence.elementAt(k)).value(tfidfAttIndex);
        }
        FastVector byTfidf = new FastVector(byFirstOccurrence.size());
        order = Utils.stableSort(sortKeys);
        for (int k = 0; k < sortKeys.length; k++) {
            byTfidf.addElement(byFirstOccurrence.elementAt(order[k]));
        }

        // Stable sort by probability (descending, via ascending 1 - p).
        for (int k = 0; k < sortKeys.length; k++) {
            sortKeys[k] = 1.0d - ((Instance) byTfidf.elementAt(k)).value(probsAttIndex);
        }
        FastVector ranked = new FastVector(byTfidf.size());
        order = Utils.stableSort(sortKeys);
        for (int k = 0; k < sortKeys.length; k++) {
            ranked.addElement(byTfidf.elementAt(order[k]));
        }

        // Assign ranks in sorted order; rows that are very unlikely get the worst rank.
        int rank = 1;
        for (int k = 0; k < ranked.size(); k++) {
            Instance current = (Instance) ranked.elementAt(k);
            double oneMinusProbability = 1.0d - current.value(probsAttIndex);
            if (Utils.grOrEq(oneMinusProbability, 1.0d)) {
                current.setValue(probsAttIndex + 1, (double) Integer.MAX_VALUE);
            } else {
                current.setValue(probsAttIndex + 1, rank++);
            }
        }
        return ranked;
    }

    /**
     * Collects the candidate phrases of a text and counts how often each one is seen.
     *
     * <p>The text is read line by line (separator {@code "\n"}) and each line word by word
     * (separator {@code " "}). A sliding window holds the last {@code m_MaxPhraseLength} words of
     * the current line. Whenever the newest word is not a stopword, every phrase ending at that
     * word is built by growing leftwards one word at a time. A phrase is kept when it has at least
     * {@code m_MinPhraseLength} words and, if it is longer than one word, does not start with a
     * stopword.
     *
     * <p>A kept phrase is turned into a key: {@link #pseudoPhrase(String)} when
     * {@code m_vocabulary} equals {@code "none"}, otherwise {@code m_Vocabulary.getID(phrase)}.
     * Phrases whose key is {@code null} are ignored.
     *
     * @param str text to scan
     * @return map from phrase key to its {@link Counter}; the first sighting of a key stores
     *         {@code new Counter()}, each later sighting calls {@code increment()} on it
     */
    public HashMap<String, Counter> getPhrasesForDictionary(String str) {
        final int windowSize = this.m_MaxPhraseLength;
        final String[] window = new String[windowSize];
        final HashMap<String, Counter> counts = new HashMap<>();

        final StringTokenizer lines = new StringTokenizer(str, "\n");
        while (lines.hasMoreTokens()) {
            final StringTokenizer words = new StringTokenizer(lines.nextToken(), " ");
            // Number of valid words in the window for this line; stale words left over from
            // the previous line are never read because this restarts at zero.
            int buffered = 0;
            while (words.hasMoreTokens()) {
                final String word = words.nextToken();

                // Slide the window one slot to the left and append the new word at the end.
                System.arraycopy(window, 1, window, 0, windowSize - 1);
                window[windowSize - 1] = word;
                buffered = Math.min(buffered + 1, windowSize);

                // Phrases never end with a stopword.
                if (this.m_Stopwords.isStopword(word)) {
                    continue;
                }

                final StringBuilder phrase = new StringBuilder();
                for (int len = 1; len <= buffered; len++) {
                    final String leftmost = window[windowSize - len];
                    if (len > 1) {
                        phrase.insert(0, ' ');
                    }
                    phrase.insert(0, leftmost);

                    // Multi-word phrases never start with a stopword.
                    if (len > 1 && this.m_Stopwords.isStopword(leftmost)) {
                        continue;
                    }
                    if (len < this.m_MinPhraseLength) {
                        continue;
                    }

                    final String text = phrase.toString();
                    final String key;
                    if (this.m_vocabulary.equals("none")) {
                        key = pseudoPhrase(text);
                    } else {
                        key = this.m_Vocabulary.getID(text);
                    }
                    if (key == null) {
                        continue;
                    }

                    final Counter seen = counts.get(key);
                    if (seen != null) {
                        seen.increment();
                    } else {
                        counts.put(key, new Counter());
                    }
                }
            }
        }
        return counts;
    }

    /**
     * Extracts the candidate phrases of a document into {@code hashMap} and prunes rare ones.
     *
     * <p>The text is read line by line (separator {@code "\n"}) and each line word by word
     * (separator {@code " "}). A running word position starts at 1 and advances by one for every
     * word, across line boundaries. A sliding window holds the last {@code m_MaxPhraseLength}
     * words of the current line; whenever the newest word is not a stopword, every phrase ending
     * at that word is examined. A phrase is kept when it has at least {@code m_MinPhraseLength}
     * words and, if longer than one word, does not start with a stopword. Its key is
     * {@link #pseudoPhrase(String)} when {@code m_vocabulary} equals {@code "none"}, otherwise
     * {@code m_Vocabulary.getID(phrase)}; a {@code null} key drops the phrase.
     *
     * <p>Each map entry is a {@link FastVector} laid out as:
     * <ol start="0">
     *   <li>{@code Counter} holding the start position of the first occurrence,</li>
     *   <li>{@code Counter} for the number of occurrences ({@code new Counter()} on first
     *       sighting, {@code increment()} afterwards),</li>
     *   <li>the phrase text: the surface form of the first occurrence, or
     *       {@code m_Vocabulary.getOrig(key)} when a vocabulary is in use,</li>
     *   <li>only when {@code m_STDEVfeature} is set: a {@code FastVector} of {@code Counter}s,
     *       one per occurrence, holding each start position.</li>
     * </ol>
     *
     * <p>After scanning, entries whose occurrence counter is below {@code m_MinNumOccur} are
     * removed from {@code hashMap}.
     *
     * @param hashMap map that receives the phrase entries; modified in place
     * @param str document text
     * @return the word position counter after the last word, i.e. one plus the number of words
     *         read
     */
    private int getPhrases(HashMap<String, FastVector> hashMap, String str) {
        final int windowSize = this.m_MaxPhraseLength;
        final String[] window = new String[windowSize];
        final StringTokenizer lines = new StringTokenizer(str, "\n");
        int position = 1;

        while (lines.hasMoreTokens()) {
            final StringTokenizer words = new StringTokenizer(lines.nextToken(), " ");
            // Valid words in the window for this line; restarting at zero keeps phrases from
            // spanning line boundaries.
            int buffered = 0;
            while (words.hasMoreTokens()) {
                final String word = words.nextToken();

                // Slide the window one slot to the left and append the new word at the end.
                System.arraycopy(window, 1, window, 0, windowSize - 1);
                window[windowSize - 1] = word;
                buffered = Math.min(buffered + 1, windowSize);

                // Phrases never end with a stopword.
                if (!this.m_Stopwords.isStopword(word)) {
                    final StringBuilder phrase = new StringBuilder();
                    for (int len = 1; len <= buffered; len++) {
                        final String leftmost = window[windowSize - len];
                        if (len > 1) {
                            phrase.insert(0, ' ');
                        }
                        phrase.insert(0, leftmost);

                        // Multi-word phrases never start with a stopword.
                        if (len > 1 && this.m_Stopwords.isStopword(leftmost)) {
                            continue;
                        }
                        if (len < this.m_MinPhraseLength) {
                            continue;
                        }

                        String text = phrase.toString();
                        final boolean freeIndexing = this.m_vocabulary.equals("none");
                        final String key = freeIndexing ? pseudoPhrase(text) : this.m_Vocabulary.getID(text);
                        if (key == null) {
                            continue;
                        }
                        if (!freeIndexing) {
                            text = this.m_Vocabulary.getOrig(key);
                        }

                        // Position of the first word of a phrase of `len` words ending here.
                        final int start = (position + 1) - len;
                        final FastVector entry = hashMap.get(key);
                        if (entry == null) {
                            // Capacity matches the number of elements stored below.
                            final FastVector created = new FastVector(this.m_STDEVfeature ? 4 : 3);
                            created.addElement(new Counter(start));
                            created.addElement(new Counter());
                            created.addElement(text);
                            if (this.m_STDEVfeature) {
                                final FastVector positions = new FastVector();
                                positions.addElement(new Counter(start));
                                created.addElement(positions);
                            }
                            hashMap.put(key, created);
                        } else {
                            ((Counter) entry.elementAt(1)).increment();
                            if (this.m_STDEVfeature) {
                                // The positions list already lives at index 3, so it is only
                                // appended to. Re-adding it to the entry would make the entry
                                // grow by one duplicate reference per occurrence.
                                ((FastVector) entry.elementAt(3)).addElement(new Counter(start));
                            }
                        }
                    }
                }
                position++;
            }
        }

        // Drop phrases that occur too rarely.
        final Iterator<FastVector> entries = hashMap.values().iterator();
        while (entries.hasNext()) {
            final Counter occurrences = (Counter) entries.next().elementAt(1);
            if (occurrences.value() < this.m_MinNumOccur) {
                entries.remove();
            }
        }
        return position;
    }

    /**
     * Splits {@code str} at every single character that equals {@code str2}, ignoring case.
     *
     * <p>Each character of {@code str} is compared, as a one-character string, against
     * {@code str2}; a separator longer than one character (or {@code null}) therefore never
     * matches and the whole input comes back as a single element. Empty pieces between two
     * adjacent separators and before a leading separator are kept; an empty piece after a
     * trailing separator is dropped.
     *
     * @param str text to split
     * @param str2 one-character separator, matched case-insensitively
     * @return the pieces in order; an empty array for empty input
     */
    private static String[] split(String str, String str2) {
        final ArrayList<String> pieces = new ArrayList<>();
        final StringBuilder current = new StringBuilder();
        for (int i = 0; i < str.length(); i++) {
            if (str.substring(i, i + 1).equalsIgnoreCase(str2)) {
                pieces.add(current.toString());
                current.setLength(0);
            } else {
                current.append(str.charAt(i));
            }
        }
        // Test emptiness by length rather than by comparing String references.
        if (current.length() > 0) {
            pieces.add(current.toString());
        }
        return pieces.toArray(new String[pieces.size()]);
    }

    /**
     * Builds the set of given keyphrases from a newline-separated list.
     *
     * <p>Each line is trimmed; if it matches {@code .+?/.+?} only the part before the first
     * {@code "/"} is used. The line is then reduced with {@link #pseudoPhrase(String)}; lines
     * whose pseudo phrase is empty are skipped. The map key is the pseudo phrase itself when
     * {@code m_vocabulary} equals {@code "none"}, otherwise {@code m_Vocabulary.getID(pseudo)};
     * a {@code null} key skips the line. The first sighting of a key stores
     * {@code new Counter()}; every repeat calls {@code increment()} on the stored counter.
     *
     * @param str newline-separated keyphrases
     * @param z when {@code true} and {@code m_Debug} is set, each repeated key is logged
     * @return map from key to its counter, or {@code null} when no key was produced
     */
    private HashMap<String, Counter> getGivenKeyphrases(String str, boolean z) {
        final HashMap<String, Counter> given = new HashMap<>();

        for (StringTokenizer lines = new StringTokenizer(str, "\n"); lines.hasMoreTokens();) {
            String line = lines.nextToken().trim();
            if (line.matches(".+?/.+?")) {
                line = line.split("/")[0];
            }

            final String pseudo = pseudoPhrase(line);
            if (pseudo.length() <= 0) {
                continue;
            }

            final String key;
            if (this.m_vocabulary.equals("none")) {
                key = pseudo;
            } else {
                key = this.m_Vocabulary.getID(pseudo);
            }
            if (key == null) {
                continue;
            }

            final Counter existing = given.get(key);
            if (existing == null) {
                given.put(key, new Counter());
                continue;
            }

            // Repeated key: bump the counter that is already stored in the map.
            existing.increment();
            if (z && this.m_Debug) {
                log.info("Skipping the phrase " + pseudo + ", which appears twice in the author-assigned keyphrase set.");
            }
        }

        return given.isEmpty() ? null : given;
    }

    /**
     * Reduces a phrase to its normalised "pseudo phrase" form.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>lower-case the input;</li>
     *   <li>if it matches {@code .+?/.+?}, keep the part before the first {@code "/"};</li>
     *   <li>if it matches {@code .+?\(.+?}, keep the part before the first {@code "("};</li>
     *   <li>if it matches {@code .+?\'.+?}, keep the second piece of a split on {@code "'"};</li>
     *   <li>replace {@code '-'} and {@code '&'} by spaces, remove {@code "*"} and {@code ":"},
     *       replace {@code ", "} and {@code ". "} by a single space, trim, split on spaces;</li>
     *   <li>drop the words that {@code m_Stopwords} reports as stopwords and re-join the rest
     *       with single spaces;</li>
     *   <li>pass the result through {@code m_Stemmer.stemString}, split it on spaces, order the
     *       pieces with {@link #sort(String[])} and join them with single spaces.</li>
     * </ol>
     *
     * @param str phrase to normalise
     * @return the normalised phrase; may be empty
     */
    public String pseudoPhrase(String str) {
        String text = str.toLowerCase();

        if (text.matches(".+?/.+?")) {
            text = text.split("/")[0];
        }
        if (text.matches(".+?\\(.+?")) {
            text = text.split("\\(")[0];
        }
        if (text.matches(".+?\\'.+?")) {
            text = text.split("\\'")[1];
        }

        // Punctuation clean-up, one substitution per step.
        text = text.replace('-', ' ');
        text = text.replace('&', ' ');
        text = text.replaceAll("\\*", WebUtils.EMPTY_STRING);
        text = text.replaceAll("\\, ", " ");
        text = text.replaceAll("\\. ", " ");
        text = text.replaceAll("\\:", WebUtils.EMPTY_STRING);
        final String[] words = text.trim().split(" ");

        // Keep only the non-stopwords, separated by single spaces.
        String kept = WebUtils.EMPTY_STRING;
        for (String word : words) {
            if (this.m_Stopwords.isStopword(word)) {
                continue;
            }
            if (kept.equals(WebUtils.EMPTY_STRING)) {
                kept = word;
            } else {
                kept = kept + " " + word;
            }
        }

        final String stemmed = this.m_Stemmer.stemString(kept);
        final String[] ordered = sort(stemmed.split(" "));
        return join(ordered);
    }

    /**
     * Concatenates the elements of {@code strArr}, separated by single spaces.
     *
     * <p>While the accumulated text still equals {@code WebUtils.EMPTY_STRING}, the next
     * element replaces it instead of being appended, so no separator is emitted until the
     * accumulated text differs from that constant.
     *
     * @param strArr elements to join
     * @return the joined text; {@code WebUtils.EMPTY_STRING} for an empty array
     */
    private static String join(String[] strArr) {
        String joined = WebUtils.EMPTY_STRING;
        for (String piece : strArr) {
            if (joined.equals(WebUtils.EMPTY_STRING)) {
                joined = piece;
            } else {
                joined = joined + " " + piece;
            }
        }
        return joined;
    }

    /**
     * Exchanges two elements of an array in place.
     *
     * @param i index of the first element
     * @param i2 index of the second element
     * @param strArr array whose elements are exchanged
     */
    public static void swap(int i, int i2, String[] strArr) {
        // Read both slots before writing either one.
        final String first = strArr[i];
        final String second = strArr[i2];
        strArr[i] = second;
        strArr[i2] = first;
    }

    /**
     * Sorts an array of strings in place.
     *
     * <p>Elements are ordered by their upper-cased form ({@link String#toUpperCase()},
     * compared with {@link String#compareTo(String)}); elements whose upper-cased forms are
     * equal are ordered by plain {@code compareTo}, which for example places {@code "A"} before
     * {@code "a"}.
     *
     * @param strArr array to sort; modified in place
     * @return the same array instance, now sorted
     */
    public static String[] sort(String[] strArr) {
        // Library sort in place of a hand-written selection sort. The ordering is total, so
        // the resulting sequence of values is the same.
        java.util.Arrays.sort(strArr, new java.util.Comparator<String>() {
            @Override
            public int compare(String left, String right) {
                final int byUpperCase = left.toUpperCase().compareTo(right.toUpperCase());
                return byUpperCase != 0 ? byUpperCase : left.compareTo(right);
            }
        });
        return strArr;
    }

    /**
     * Command-line entry point: runs a {@code KEAFilter} built with {@link StopwordsEnglish}
     * over the files named by the arguments.
     *
     * <p>When {@code Utils.getFlag('b', strArr)} reports the {@code b} flag,
     * {@code Filter.batchFilterFile} is used; otherwise {@code Filter.filterFile}. Any exception
     * is logged, with its stack trace, and not rethrown.
     *
     * @param strArr command-line options
     */
    public static void main(String[] strArr) {
        try {
            // The flag is read before the filter is built, as in the original flow.
            final boolean batchMode = Utils.getFlag('b', strArr);
            final KEAFilter filter = new KEAFilter(new StopwordsEnglish());
            if (batchMode) {
                Filter.batchFilterFile(filter, strArr);
            } else {
                Filter.filterFile(filter, strArr);
            }
        } catch (Exception e) {
            // Pass the throwable itself so the stack trace is not lost.
            log.error("KEAFilter run failed: {}", e.getMessage(), e);
        }
    }
}
