package com.rapidminer.extension.hanminer.operator.featureExtraction.vectorizer;

import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.extension.hanminer.document.DocumentSet;
import com.rapidminer.extension.hanminer.document.SimpleDocumentSet;
import com.rapidminer.extension.hanminer.operator.featureExtraction.WordCount;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/* loaded from: input_file:com/rapidminer/extension/hanminer/operator/featureExtraction/vectorizer/CountVectorizer.class */
public class CountVectorizer extends Operator {
    private static final String PARAMETER_MAX_FEATURES = "max_features";
    private InputPort documentSetInput;
    private OutputPort exampleSetOutput;

    public CountVectorizer(OperatorDescription operatorDescription) {
        super(operatorDescription);
        this.documentSetInput = getInputPorts().createPort("document set");
        this.exampleSetOutput = getOutputPorts().createPort("example set");
    }

    public List<ParameterType> getParameterTypes() {
        List<ParameterType> parameterTypes = super.getParameterTypes();
        parameterTypes.add(new ParameterTypeInt(PARAMETER_MAX_FEATURES, "This parameter specifies the max number of features in the result. The vocabulary will be built on top max_features terms ordered by their frequency across the corpus.", 1, 500, 100, false));
        return parameterTypes;
    }

    public void doWork() throws OperatorException {
        DocumentSet data = this.documentSetInput.getData(SimpleDocumentSet.class);
        int parameterAsInt = getParameterAsInt(PARAMETER_MAX_FEATURES);
        Map<String, Integer> wordCount = WordCount.wordCount(data);
        int min = Math.min(wordCount.size(), parameterAsInt);
        PriorityQueue priorityQueue = new PriorityQueue((entry, entry2) -> {
            return ((Integer) entry.getValue()).intValue() - ((Integer) entry2.getValue()).intValue();
        });
        Iterator<Map.Entry<String, Integer>> it = wordCount.entrySet().iterator();
        while (it.hasNext()) {
            priorityQueue.add(it.next());
            if (priorityQueue.size() > min) {
                priorityQueue.poll();
            }
        }
        HashMap hashMap = new HashMap();
        int i = 0;
        Iterator it2 = priorityQueue.iterator();
        while (it2.hasNext()) {
            int i2 = i;
            i++;
            hashMap.put(((Map.Entry) it2.next()).getKey(), Integer.valueOf(i2));
        }
        LinkedList linkedList = new LinkedList();
        for (int i3 = 0; i3 < min; i3++) {
            Ontology ontology = Ontology.ATTRIBUTE_VALUE_TYPE;
            linkedList.add(AttributeFactory.createAttribute("Feature_" + i3, 4));
        }
        MemoryExampleTable memoryExampleTable = new MemoryExampleTable(linkedList);
        for (String str : data.getDocuments()) {
            double[] dArr = new double[linkedList.size()];
            Arrays.fill(dArr, 0.0d);
            for (String str2 : str.split("\\s+")) {
                if (hashMap.containsKey(str2)) {
                    dArr[((Integer) hashMap.get(str2)).intValue()] = wordCount.get(str2).intValue();
                }
            }
            memoryExampleTable.addDataRow(new DoubleArrayDataRow(dArr));
        }
        this.exampleSetOutput.deliver(memoryExampleTable.createExampleSet());
    }
}
