package com.rapidminer.extension.hanminer.operator.featureExtraction.vectorizer;

import com.hankcs.hanlp.mining.word.TfIdfCounter;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.extension.hanminer.document.DocumentSet;
import com.rapidminer.extension.hanminer.document.SimpleDocumentSet;
import com.rapidminer.extension.hanminer.operator.data.ReadDocument;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/* loaded from: input_file:com/rapidminer/extension/hanminer/operator/featureExtraction/vectorizer/TfIdfVectorizer.class */
/**
 * RapidMiner operator that converts a document set into a TF-IDF feature table.
 *
 * <p>Every input document becomes one example (row). Each of the top {@code max_features}
 * terms, in the order returned by {@link TfIdfCounter#sortedAllTf()}, becomes one real-valued
 * attribute named {@code Feature_<index>}. A cell holds the weight that
 * {@link TfIdfCounter#compute()} reports for that term in that document, or {@code 0.0} when
 * the document has no weight for the term.
 */
public class TfIdfVectorizer extends Operator {
    /** Key of the integer parameter limiting the number of generated feature columns. */
    private static final String PARAMETER_MAX_FEATURES = "max_features";

    /** Port receiving the documents to vectorize. */
    private final InputPort documentSetInput;

    /** Port on which the resulting example set is delivered. */
    private final OutputPort exampleSetOutput;

    /**
     * Creates the operator and registers its single input and single output port.
     *
     * @param operatorDescription description handed to the {@link Operator} base class
     */
    public TfIdfVectorizer(OperatorDescription operatorDescription) {
        super(operatorDescription);
        this.documentSetInput = getInputPorts().createPort(ReadDocument.PARAMETER_TEXT);
        this.exampleSetOutput = getOutputPorts().createPort("example set");
    }

    /**
     * Extends the inherited parameter list with {@code max_features}
     * (integer, range 1..500, default 100).
     *
     * @return the parameter list of the superclass with {@code max_features} appended
     */
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> parameterTypes = super.getParameterTypes();
        parameterTypes.add(new ParameterTypeInt(PARAMETER_MAX_FEATURES, "This parameter specifies the max number of features in the result. The vocabulary will be built by top max_features ordered by term frequency across the corpus.", 1, 500, 100, false));
        return parameterTypes;
    }

    /**
     * Feeds every document of the set into a fresh {@link TfIdfCounter}.
     *
     * <p>Each document is registered under its position in the set (as an {@link Integer}),
     * so results can later be looked up by document index.
     *
     * @param documentSet documents to count
     * @return a counter containing all documents; {@code compute()} has not been called on it
     */
    public static TfIdfCounter computeTfIDF(DocumentSet documentSet) {
        TfIdfCounter tfIdfCounter = new TfIdfCounter();
        int documentCount = documentSet.size();
        for (int index = 0; index < documentCount; index++) {
            tfIdfCounter.add(Integer.valueOf(index), documentSet.getDocument(index));
        }
        return tfIdfCounter;
    }

    /**
     * Assigns a column index to each of the first {@code maxFeatures} terms of
     * {@link TfIdfCounter#sortedAllTf()}.
     *
     * <p>If fewer terms exist than requested, all of them are used instead of failing with an
     * {@link IndexOutOfBoundsException}.
     *
     * @param tfIdfCounter counter that already contains the corpus
     * @param maxFeatures upper bound on the number of terms to map; must not be negative
     * @return map from term to its zero-based column index, following the counter's ranking
     * @throws IllegalArgumentException if {@code maxFeatures} is negative
     */
    public HashMap<String, Integer> getWordFeatureMap(TfIdfCounter tfIdfCounter, int maxFeatures) {
        if (maxFeatures < 0) {
            throw new IllegalArgumentException("max features must not be negative: " + maxFeatures);
        }
        List<Map.Entry<String, Double>> rankedTerms = tfIdfCounter.sortedAllTf();
        int limit = Math.min(maxFeatures, rankedTerms.size());

        HashMap<String, Integer> columnOfTerm = new HashMap<>();
        int nextColumn = 0;
        for (Map.Entry<String, Double> term : rankedTerms.subList(0, limit)) {
            columnOfTerm.put(term.getKey(), Integer.valueOf(nextColumn));
            nextColumn++;
        }
        return columnOfTerm;
    }

    /**
     * Reads the document set from the input port, builds the TF-IDF table and delivers it
     * as an example set on the output port.
     *
     * <p>Rows are emitted in document order (row {@code n} belongs to document {@code n}).
     *
     * @throws OperatorException if the input data or the parameter cannot be obtained
     */
    public void doWork() throws OperatorException {
        DocumentSet documents = this.documentSetInput.getData(SimpleDocumentSet.class);
        int maxFeatures = getParameterAsInt(PARAMETER_MAX_FEATURES);

        TfIdfCounter counter = computeTfIDF(documents);
        Map<Object, Map<String, Double>> weightsByDocument = counter.compute();

        // The map is already clamped to the vocabulary size, so its size is the column count.
        HashMap<String, Integer> columnOfTerm = getWordFeatureMap(counter, maxFeatures);
        int featureCount = columnOfTerm.size();

        // Raw list kept from the original: the element type (RapidMiner's Attribute class)
        // is not imported in this file.
        LinkedList attributes = new LinkedList();
        for (int column = 0; column < featureCount; column++) {
            // Ontology.REAL replaces the bare literal 4 the decompiler had inlined here.
            attributes.add(AttributeFactory.createAttribute("Feature_" + column, Ontology.REAL));
        }
        MemoryExampleTable table = new MemoryExampleTable(attributes);

        // Look documents up by the Integer id assigned in computeTfIDF instead of iterating
        // the result map, whose iteration order is not guaranteed to follow document order.
        int documentCount = documents.size();
        for (int document = 0; document < documentCount; document++) {
            double[] row = new double[featureCount]; // new arrays are zero-filled
            Map<String, Double> weights = weightsByDocument.get(Integer.valueOf(document));
            if (weights != null) {
                for (Map.Entry<String, Double> weight : weights.entrySet()) {
                    Integer column = columnOfTerm.get(weight.getKey());
                    if (column != null) {
                        row[column.intValue()] = weight.getValue().doubleValue();
                    }
                }
            }
            table.addDataRow(new DoubleArrayDataRow(row));
        }
        this.exampleSetOutput.deliver(table.createExampleSet());
    }
}
