package com.rapidminer.operator.text.io;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.SimpleExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.GrowingExampleTable;
import com.rapidminer.example.utils.ExampleSets;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.InputPortExtender;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.GenerateNewMDRule;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.operator.ports.metadata.PassThroughOrGenerateRule;
import com.rapidminer.operator.ports.metadata.SimplePrecondition;
import com.rapidminer.operator.ports.metadata.SubprocessTransformRule;
import com.rapidminer.operator.text.Document;
import com.rapidminer.operator.text.Token;
import com.rapidminer.operator.text.WordList;
import com.rapidminer.operator.text.WordListEntry;
import com.rapidminer.operator.text.io.vectorcreation.BinaryOccurrences;
import com.rapidminer.operator.text.io.vectorcreation.TFIDF;
import com.rapidminer.operator.text.io.vectorcreation.TermFrequency;
import com.rapidminer.operator.text.io.vectorcreation.TermOccurrences;
import com.rapidminer.operator.text.io.vectorcreation.VectorCreator;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.parameter.conditions.EqualTypeCondition;
import com.rapidminer.parameter.conditions.ParameterCondition;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.ParameterService;
import com.rapidminer.tools.container.Pair;
import com.rapidminer.tools.parameter.internal.DataManagementParameterHelper;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

/* loaded from: input_file:com/rapidminer/operator/text/io/AbstractDocumentInputOperator.class */
public abstract class AbstractDocumentInputOperator extends OperatorChain {
    /** Attribute name "text"; doWork() creates an attribute with this name when {@link #PARAMETER_KEEP_TEXT} is enabled. */
    public static final String ATTRIBUTE_TEXT_NAME = "text";

    // ---- Parameter keys (registered in getParameterTypes()) ----
    public static final String PARAMETER_ADD_META_INFORMATION = "add_meta_information";
    public static final String PARAMETER_CREATE_WORD_VECTOR = "create_word_vector";
    public static final String PARAMETER_VECTOR_CREATION = "vector_creation";
    public static final String PARAMETER_KEEP_TEXT = "keep_text";
    public static final String PARAMETER_PRUNE_METHOD = "prune_method";
    public static final String PARAMETER_PRUNE_ABOVE_RELATIVE = "prune_above_percent";
    public static final String PARAMETER_PRUNE_BELOW_RELATIVE = "prune_below_percent";
    public static final String PARAMETER_PRUNE_BELOW_RANK = "prune_below_rank";
    public static final String PARAMETER_PRUNE_ABOVE_RANK = "prune_above_rank";
    public static final String PARAMETER_PRUNE_BELOW_ABSOLUTE = "prune_below_absolute";
    public static final String PARAMETER_PRUNE_ABOVE_ABSOLUTE = "prune_above_absolute";
    public static final String PARAMETER_DATAMANAGEMENT = "datamanagement";

    /** Display names offered for {@link #PARAMETER_VECTOR_CREATION}; index-aligned with {@link #VECTOR_CREATOR_CLASSES}. */
    public static final String[] VECTOR_CREATOR_NAMES = {"TF-IDF", "Term Frequency", "Term Occurrences", "Binary Term Occurrences"};
    /** Implementation classes instantiated by createVectorCreator(), indexed by the {@link #PARAMETER_VECTOR_CREATION} value. */
    public static final Class<?>[] VECTOR_CREATOR_CLASSES = {TFIDF.class, TermFrequency.class, TermOccurrences.class, BinaryOccurrences.class};

    /** Display names offered for {@link #PARAMETER_PRUNE_METHOD}; index 0 ("none") has no dedicated constant. */
    public static final String[] PRUNING_METHODS = {"none", "percentual", "absolute", "by ranking"};
    /** Index of "percentual" in {@link #PRUNING_METHODS}. */
    public static final int PRUNING_METHOD_PERCENTUAL = 1;
    /** Index of "absolute" in {@link #PRUNING_METHODS}. */
    public static final int PRUNING_METHOD_ABSOLUTE = 2;
    /** Index of "by ranking" in {@link #PRUNING_METHODS}. */
    public static final int PRUNING_METHOD_RANKING = 3;

    // ---- Ports (all created in the constructor) ----
    /** Optional input: an existing word list; when absent doWork() may build one itself. */
    private final InputPort wordListInput;
    /** Inner source that hands each document to subprocess 0. */
    private final OutputPort innerTextObjectSource;
    /** Extendable inner sink collecting the documents produced by subprocess 0. */
    private final InputPortExtender innerTextObjectSink;
    /** Output: the example set assembled in doWork(). */
    private final OutputPort exampleSetOutput;
    /** Output: the word list that was received or generated (delivered as-is, possibly null). */
    private final OutputPort wordListOutput;

    /**
     * Creates the operator with one subprocess named "Vector Creation", creates all ports and
     * registers the meta data transformation rules.
     *
     * <p>Ports and rules are registered in a fixed order; keep that order when editing.</p>
     *
     * @param operatorDescription description passed through to the super constructor
     */
    public AbstractDocumentInputOperator(OperatorDescription operatorDescription) {
        super(operatorDescription, new String[]{"Vector Creation"});

        // Port creation.
        wordListInput = getInputPorts().createPort("word list");
        innerTextObjectSource = getSubprocess(0).getInnerSources().createPort("document");
        innerTextObjectSink = new InputPortExtender("document", getSubprocess(0).getInnerSinks(), new MetaData(Document.class), true);
        exampleSetOutput = getOutputPorts().createPort("example set");
        wordListOutput = getOutputPorts().createPort("word list");

        // The word list input is checked against WordList meta data.
        wordListInput.addPrecondition(new SimplePrecondition(wordListInput, new MetaData(WordList.class), false));
        innerTextObjectSink.start();

        // Meta data rules: document into the subprocess, subprocess itself, word list pass-through,
        // and finally the generated example set meta data.
        getTransformer().addGenerationRule(innerTextObjectSource, Document.class);
        getTransformer().addRule(new SubprocessTransformRule(getSubprocess(0)));
        getTransformer().addRule(new PassThroughOrGenerateRule(wordListInput, wordListOutput, new MetaData(WordList.class)));
        getTransformer().addRule(new GenerateNewMDRule(exampleSetOutput, new ExampleSetMetaData()) {
            public MetaData modifyMetaData(MetaData metaData) {
                final AbstractDocumentInputOperator operator = AbstractDocumentInputOperator.this;
                ExampleSetMetaData result = (ExampleSetMetaData) metaData;
                if (operator.getParameterAsBoolean(PARAMETER_KEEP_TEXT)) {
                    // Value type 5 is presumably Ontology.STRING -- TODO confirm.
                    result.addAttribute(new AttributeMetaData(ATTRIBUTE_TEXT_NAME, 5, "text"));
                }
                if (operator.getParameterAsBoolean(PARAMETER_CREATE_WORD_VECTOR)) {
                    // Word attributes are not known in advance, so the listed attributes are only a subset.
                    result.attributesAreSubset();
                }
                return operator.getParameterAsBoolean(PARAMETER_ADD_META_INFORMATION)
                        ? operator.addMetaDataAttributes(result)
                        : result;
            }
        });
    }

    /**
     * Runs every document from {@link #getTextObjects()} through subprocess 0 and turns the
     * resulting documents into rows of a new example set.
     *
     * <p>Steps:</p>
     * <ol>
     *   <li>Take the word list from the input port; if none is given and one is needed (word vector
     *       requested or word list output connected) it is built by {@code generateWordList()},
     *       which itself iterates all documents through the subprocess once.</li>
     *   <li>Create the target table and, if {@code keep_text} is set, the text attribute.</li>
     *   <li>On the first non-empty subprocess result, create the meta data attributes (taken from
     *       the first resulting document) and the word attributes.</li>
     *   <li>For every resulting document add one data row holding word vector, meta data and text
     *       as configured.</li>
     *   <li>Deliver the post-processed example set and the word list (which may be {@code null}).</li>
     * </ol>
     *
     * @throws OperatorException if the word list cannot be created or any helper step fails
     */
    public void doWork() throws OperatorException {
        final boolean createWordVector = getParameterAsBoolean(PARAMETER_CREATE_WORD_VECTOR);
        final boolean wordListRequired = createWordVector || this.wordListOutput.isConnected();

        WordList wordList = this.wordListInput.getDataOrNull(WordList.class);
        if (wordList == null && wordListRequired) {
            wordList = generateWordList();
            if (wordList == null) {
                throw new OperatorException("Could not create word list");
            }
        }

        GrowingExampleTable table = getTargetExampleTable();
        // Names already taken in the table; used to keep generated attribute names unique.
        Set<String> usedAttributeNames = new HashSet<>();

        // Non-null exactly when keep_text is enabled; read once so all rows are treated alike.
        Attribute textAttribute = null;
        if (getParameterAsBoolean(PARAMETER_KEEP_TEXT)) {
            // Value type 5 is presumably Ontology.STRING -- TODO confirm.
            textAttribute = createAttribute(usedAttributeNames, ATTRIBUTE_TEXT_NAME, 5);
            table.addAttribute(textAttribute);
        }

        List<Pair<Attribute, String>> metaDataAttributes = new LinkedList<>();
        List<Attribute> wordAttributes = null;

        // The legacy "datamanagement" parameter is only honoured when legacy data management is switched on.
        boolean legacyDataManagement = Boolean.parseBoolean(ParameterService.getParameterValue("rapidminer.system.legacy_data_mgmt"));
        int dataRowType = legacyDataManagement ? getParameterAsInt(PARAMETER_DATAMANAGEMENT) : 0;
        DataRowFactory dataRowFactory = new DataRowFactory(dataRowType, '.');

        VectorCreator vectorCreator = createVectorCreator();
        Iterator<Document> documents = getTextObjects();
        boolean attributesCreated = false;
        while (documents.hasNext()) {
            Document inputDocument = documents.next();
            if (inputDocument == null) {
                continue;
            }
            this.innerTextObjectSource.deliver(inputDocument);
            getSubprocess(0).execute();
            List<Document> processedDocuments = this.innerTextObjectSink.getData(Document.class, true);

            // Attributes are created lazily because the meta data keys come from the first processed document.
            if (!attributesCreated && !processedDocuments.isEmpty()) {
                if (getParameterAsBoolean(PARAMETER_ADD_META_INFORMATION)) {
                    metaDataAttributes = createMetaDataAttributes(table, usedAttributeNames, processedDocuments.iterator().next());
                }
                if (createWordVector) {
                    wordAttributes = createWordAttributes(wordList, usedAttributeNames);
                    table.addAttributes(wordAttributes);
                }
                attributesCreated = true;
            }

            for (Document document : processedDocuments) {
                DataRow row = getDataRow(table, dataRowFactory);
                if (createWordVector) {
                    updateWordList(wordList, document);
                    createAndStoreWordVector(wordAttributes, wordList, row, vectorCreator);
                    wordList.closeDocument(document);
                }
                setMetaData(row, metaDataAttributes, document);
                if (textAttribute != null) {
                    row.set(textAttribute, textAttribute.getMapping().mapString(document.getTokenText()));
                }
                table.addDataRow(row);
            }
        }

        // Special roles: the text attribute plus every meta data attribute the subclass marks as special.
        HashMap<Attribute, String> specialAttributes = new HashMap<>();
        if (textAttribute != null) {
            specialAttributes.put(textAttribute, "text");
        }
        for (Pair<Attribute, String> attributeAndKey : metaDataAttributes) {
            if (isMetaDataSpecial(attributeAndKey.getSecond())) {
                specialAttributes.put(attributeAndKey.getFirst(), attributeAndKey.getSecond());
            }
        }

        this.exampleSetOutput.deliver(postProcessExampleSet(new SimpleExampleSet(table, (List<Attribute>) null, specialAttributes)));
        this.wordListOutput.deliver(wordList);
    }

    /**
     * Builds a new word list by sending every document from {@link #getTextObjects()} through
     * subprocess 0 and counting the tokens of the resulting documents.
     *
     * <p>While collecting, the list appends unknown words and does not restrict updates to the
     * current document; afterwards it is pruned, sorted and switched to the opposite settings
     * before being returned.</p>
     *
     * @return the populated, pruned and sorted word list (never {@code null})
     * @throws OperatorException if the subprocess fails, including an {@code OperatorException}
     *         that reached this method wrapped as the cause of a {@code RuntimeException}
     */
    private WordList generateWordList() throws OperatorException {
        WordList wordList = new WordList(getLabelValues());
        wordList.setUpdateOnlyCurrent(false);
        wordList.setAppendUnknownWords(true);

        Iterator<Document> textObjects = getTextObjects();
        while (textObjects.hasNext()) {
            try {
                Document next = textObjects.next();
                if (next != null) {
                    this.innerTextObjectSource.deliver(next);
                    getSubprocess(0).execute();
                    for (Document document : this.innerTextObjectSink.getData(Document.class, true)) {
                        for (Token token : document.getTokenSequence()) {
                            wordList.addWordOccurance(token.getToken(), token.getWeight());
                        }
                        wordList.closeDocument(document);
                    }
                }
            } catch (RuntimeException e) {
                // Unwrap operator failures so callers see the checked exception they expect.
                // getCause() is typed Throwable, so the cast is required for the throws clause.
                Throwable cause = e.getCause();
                if (cause instanceof OperatorException) {
                    throw (OperatorException) cause;
                }
                throw e;
            }
        }

        pruneWordList(wordList);
        wordList.sort();
        wordList.setAppendUnknownWords(false);
        wordList.setUpdateOnlyCurrent(true);
        return wordList;
    }

    /**
     * Copies the document's meta data values into the given data row.
     *
     * <p>For each (attribute, meta data key) pair the value is looked up on the document and stored
     * according to the attribute kind: nominal values via the attribute's mapping, numerical values
     * as the {@code Double} itself, and date values as {@code Date.getTime()}. A missing
     * ({@code null}) value is stored as {@code Double.NaN}. Attributes of any other kind are left
     * untouched.</p>
     *
     * @param dataRow  row to write into
     * @param list     pairs of attribute and the meta data key it was created for
     * @param document document providing the meta data values
     */
    private void setMetaData(DataRow dataRow, List<Pair<Attribute, String>> list, Document document) {
        for (Pair<Attribute, String> pair : list) {
            Attribute attribute = pair.getFirst();
            String key = pair.getSecond();
            if (attribute.isNominal()) {
                String text = (String) document.getMetaDataValue(key);
                if (text != null) {
                    dataRow.set(attribute, attribute.getMapping().mapString(text));
                } else {
                    dataRow.set(attribute, Double.NaN);
                }
            } else if (attribute.isNumerical()) {
                Double number = (Double) document.getMetaDataValue(key);
                if (number != null) {
                    dataRow.set(attribute, number.doubleValue());
                } else {
                    dataRow.set(attribute, Double.NaN);
                }
            } else if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), 9)) {
                // Value type 9 is presumably Ontology.DATE_TIME -- TODO confirm.
                Date date = (Date) document.getMetaDataValue(key);
                if (date != null) {
                    dataRow.set(attribute, date.getTime());
                } else {
                    dataRow.set(attribute, Double.NaN);
                }
            }
        }
    }

    /**
     * Creates one attribute (value type 4) per entry of the word list, in entry order.
     *
     * <p>Entries without an attribute name get a fresh unique name derived from the word, which is
     * also written back to the entry. Entries that already carry an attribute name reuse it, but
     * fail if that name is already taken.</p>
     *
     * @param wordList word list whose entries are turned into attributes
     * @param set      attribute names already in use; extended with every freshly generated name
     * @return the created attributes
     * @throws UserError (code 152) if a predefined attribute name is already in use
     */
    private List<Attribute> createWordAttributes(WordList wordList, Set<String> set) throws UserError {
        List<Attribute> wordAttributes = new ArrayList<>();
        for (WordListEntry entry : wordList.getEntries()) {
            Attribute wordAttribute;
            if (entry.hasWordAttributeName()) {
                String presetName = entry.getWordAttributeName();
                if (set.contains(presetName)) {
                    throw new UserError(this, 152, new Object[]{presetName});
                }
                // NOTE(review): the preset name is not recorded in 'set' -- confirm this is intended.
                wordAttribute = AttributeFactory.createAttribute(presetName, 4);
            } else {
                wordAttribute = createAttribute(set, entry.getWord(), 4);
                entry.setWordAttributeName(wordAttribute.getName());
            }
            wordAttributes.add(wordAttribute);
        }
        return wordAttributes;
    }

    /**
     * Creates one attribute per meta data key of the document, adds all of them to the example
     * table and reports which attribute belongs to which key.
     *
     * @param exampleTable table that receives the new attributes
     * @param set          attribute names already in use; extended with the generated names
     * @param document     document whose meta data keys and types define the attributes
     * @return pairs of (created attribute, meta data key) in key iteration order
     */
    public static List<Pair<Attribute, String>> createMetaDataAttributes(ExampleTable exampleTable, Set<String> set, Document document) {
        List<Attribute> createdAttributes = new LinkedList<>();
        List<Pair<Attribute, String>> attributesWithKeys = new LinkedList<>();
        for (String metaDataKey : document.getMetaDataKeys()) {
            Attribute metaDataAttribute = createAttribute(set, metaDataKey, document.getMetaDataType(metaDataKey));
            attributesWithKeys.add(new Pair<>(metaDataAttribute, metaDataKey));
            createdAttributes.add(metaDataAttribute);
        }
        exampleTable.addAttributes(createdAttributes);
        return attributesWithKeys;
    }

    /**
     * Registers every token of the document, with its weight, as a word occurrence in the word list.
     *
     * @param wordList word list to update; must not be {@code null}
     * @param document document whose token sequence is counted
     * @throws OperatorException if {@code wordList} is {@code null}
     */
    private void updateWordList(WordList wordList, Document document) throws OperatorException {
        if (wordList != null) {
            for (Token occurrence : document.getTokenSequence()) {
                wordList.addWordOccurance(occurrence.getToken(), occurrence.getWeight());
            }
            return;
        }
        throw new OperatorException("Could not update wordList");
    }

    /**
     * Computes the word vector for the word list's current document and writes it into the data
     * row, assigning the i-th vector component to the i-th attribute.
     *
     * @param list          word attributes, in the same order as the vector components
     * @param wordList      word list providing the current document frequencies
     * @param dataRow       row receiving the vector values
     * @param vectorCreator strategy that turns the frequencies into the vector
     * @throws OperatorException if {@code list} or {@code wordList} is {@code null}
     */
    private void createAndStoreWordVector(List<Attribute> list, WordList wordList, DataRow dataRow, VectorCreator vectorCreator) throws OperatorException {
        boolean inputsPresent = list != null && wordList != null;
        if (!inputsPresent) {
            throw new OperatorException("Could not create word attributes");
        }
        double[] wordVector = vectorCreator.createVector(wordList.getCurrentDocumentFrequencies(), wordList);
        int position = 0;
        for (Attribute wordAttribute : list) {
            dataRow.set(wordAttribute, wordVector[position]);
            position++;
        }
    }

    /**
     * Creates an attribute whose name does not clash with any name in {@code set} and records the
     * chosen name in {@code set}.
     *
     * @param set names already in use; the new attribute's name is added to it
     * @param str desired attribute name
     * @param i   value type passed on to {@code AttributeFactory.createAttribute}
     * @return the newly created attribute
     */
    protected static Attribute createAttribute(Set<String> set, String str, int i) {
        final String freeName = getUniqueAttributeName(set, str);
        set.add(freeName);
        Attribute created = AttributeFactory.createAttribute(freeName, i);
        return created;
    }

    /**
     * Returns {@code str} if it is not contained in {@code set}; otherwise the first of
     * {@code str_0}, {@code str_1}, ... that is not contained. The set is not modified.
     *
     * @param set names already in use
     * @param str desired name
     * @return a name that is not an element of {@code set}
     */
    private static String getUniqueAttributeName(Set<String> set, String str) {
        String candidate = str;
        // The suffix is only appended once the plain name turned out to be taken.
        for (int suffix = 0; set.contains(candidate); suffix++) {
            candidate = str + "_" + suffix;
        }
        return candidate;
    }

    /**
     * Supplies the values handed to the {@code WordList} constructor when a word list is generated.
     * Presumably the possible label (class) values of the documents -- confirm against subclasses.
     */
    protected abstract List<String> getLabelValues() throws OperatorException;

    /**
     * Called by the example set meta data rule when {@code add_meta_information} is enabled.
     *
     * @param exampleSetMetaData meta data built so far
     * @return the meta data to deliver, expected to include the meta information attributes
     */
    protected abstract ExampleSetMetaData addMetaDataAttributes(ExampleSetMetaData exampleSetMetaData);

    /** Not called within this class; presumably the value type of the label a subclass provides -- confirm. */
    protected abstract int getProvidedLabelType() throws UserError;

    /** Not called within this class; presumably whether a subclass provides a label -- confirm. */
    protected abstract boolean providesLabel() throws UserError;

    /** Not called within this class; presumably the meta data keys a subclass attaches to documents -- confirm. */
    protected abstract List<String> getMetaDataKeys() throws UserError;

    /**
     * Decides whether the attribute created for the given meta data key gets a special role in the
     * resulting example set; the key itself is used as the role name.
     *
     * @param str meta data key
     */
    protected abstract boolean isMetaDataSpecial(String str);

    /**
     * Supplies the documents to process. Each call must return a fresh iterator: the documents are
     * iterated once by doWork() and, if a word list has to be generated, once more before that.
     * {@code null} elements are skipped by both callers.
     */
    protected abstract Iterator<Document> getTextObjects() throws OperatorException;

    /**
     * Instantiates the vector creator selected by the {@code vector_creation} parameter, using the
     * parameter value as index into {@link #VECTOR_CREATOR_CLASSES}.
     *
     * <p>The instance is created through the class's no-argument constructor. Any reflective
     * failure -- missing or inaccessible constructor, abstract class, or an exception thrown by the
     * constructor itself -- is reported as a user error instead of escaping unchecked.</p>
     *
     * @return a new vector creator
     * @throws OperatorException a {@code UserError} with code 904 if the class cannot be instantiated
     */
    private VectorCreator createVectorCreator() throws OperatorException {
        Class<?> cls = VECTOR_CREATOR_CLASSES[getParameterAsInt(PARAMETER_VECTOR_CREATION)];
        try {
            return (VectorCreator) cls.getDeclaredConstructor().newInstance();
        } catch (ReflectiveOperationException e) {
            // For a failing constructor the useful message is on the wrapped cause.
            Throwable reason = e.getCause() != null ? e.getCause() : e;
            throw new UserError(this, 904, new Object[]{cls.getName(), reason.getMessage()});
        }
    }

    /**
     * Creates the table that doWork() fills. It starts without attributes and uses the data
     * management selected on this operator.
     *
     * @return a new, empty growing example table
     * @throws UserError if the data management selection cannot be determined
     */
    protected GrowingExampleTable getTargetExampleTable() throws UserError {
        List<Attribute> noInitialAttributes = Collections.emptyList();
        return ExampleSets.createTableFrom(noInitialAttributes, DataManagementParameterHelper.getSelectedDataManagement(this));
    }

    /**
     * Creates a new data row sized to the table's current attribute count.
     *
     * @param exampleTable   table the row is meant for
     * @param dataRowFactory factory used to create the row
     * @return the new data row
     */
    protected DataRow getDataRow(ExampleTable exampleTable, DataRowFactory dataRowFactory) {
        final int attributeCount = exampleTable.getAttributeCount();
        return dataRowFactory.create(attributeCount);
    }

    /**
     * Hook applied by doWork() to the finished example set right before it is delivered.
     * This implementation returns the example set unchanged.
     *
     * @param exampleSet the example set assembled by doWork()
     * @return the example set to deliver
     * @throws OperatorException declared for overriding implementations; never thrown here
     */
    protected ExampleSet postProcessExampleSet(ExampleSet exampleSet) throws OperatorException {
        return exampleSet;
    }

    /**
     * Prunes the word list according to the {@code prune_method} parameter. Each method reads its
     * own pair of below/above thresholds; any other value (including "none") leaves the list as is.
     *
     * @param wordList word list to prune in place
     * @throws UndefinedParameterError if a required parameter is not defined
     */
    private void pruneWordList(WordList wordList) throws UndefinedParameterError {
        final int pruneMethod = getParameterAsInt(PARAMETER_PRUNE_METHOD);
        if (pruneMethod == PRUNING_METHOD_PERCENTUAL) {
            wordList.pruneByRelativeFrequency(getParameterAsDouble(PARAMETER_PRUNE_BELOW_RELATIVE), getParameterAsDouble(PARAMETER_PRUNE_ABOVE_RELATIVE));
        } else if (pruneMethod == PRUNING_METHOD_ABSOLUTE) {
            wordList.pruneByAbsoluteFrequency(getParameterAsInt(PARAMETER_PRUNE_BELOW_ABSOLUTE), getParameterAsInt(PARAMETER_PRUNE_ABOVE_ABSOLUTE));
        } else if (pruneMethod == PRUNING_METHOD_RANKING) {
            wordList.pruneByRelativeRank(getParameterAsDouble(PARAMETER_PRUNE_BELOW_RANK), getParameterAsDouble(PARAMETER_PRUNE_ABOVE_RANK));
        }
    }

    /**
     * Returns the inherited parameter types followed by this operator's own parameters: word vector
     * creation, meta information, keep text, the pruning method with its method-specific
     * thresholds, and the two alternative data management parameters.
     *
     * <p>Each pruning threshold is only shown for its pruning method. Of the two data management
     * parameters, the legacy one is shown when the {@code rapidminer.system.legacy_data_mgmt}
     * setting is true and the new one otherwise.</p>
     *
     * @return the list obtained from the super class, extended in place
     */
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();

        // --- word vector ---
        types.add(new ParameterTypeBoolean(PARAMETER_CREATE_WORD_VECTOR, "If checked, the tokens of a document will be used to generate a vector numerically representing the document.", true));

        ParameterTypeCategory vectorCreation = new ParameterTypeCategory(PARAMETER_VECTOR_CREATION, "Select the schema for creating the word vector.", VECTOR_CREATOR_NAMES, 0);
        vectorCreation.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_CREATE_WORD_VECTOR, true, true));
        vectorCreation.setExpert(false);
        types.add(vectorCreation);

        // --- meta information and text ---
        ParameterTypeBoolean addMetaInformation = new ParameterTypeBoolean(PARAMETER_ADD_META_INFORMATION, "If checked, available meta information of the text like filename, date is added as attribute.", true);
        addMetaInformation.setExpert(false);
        types.add(addMetaInformation);

        ParameterTypeBoolean keepText = new ParameterTypeBoolean(PARAMETER_KEEP_TEXT, "If checked, the input text will be stored as a special String attribute with the role text.", false);
        keepText.setExpert(false);
        types.add(keepText);

        // --- pruning ---
        types.add(new ParameterTypeCategory(PARAMETER_PRUNE_METHOD, "Specifies if to frequent or to infrequent words should be ignored for word list building and how the frequencies are specified.", PRUNING_METHODS, 0, false));

        ParameterTypeDouble pruneBelowPercent = new ParameterTypeDouble(PARAMETER_PRUNE_BELOW_RELATIVE, "Ignore words that appear in less than this percentage of all documents.", 0.0d, 100.0d, 3.0d);
        pruneBelowPercent.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_PRUNE_METHOD, PRUNING_METHODS, true, new int[]{PRUNING_METHOD_PERCENTUAL}));
        pruneBelowPercent.setExpert(false);
        types.add(pruneBelowPercent);

        ParameterTypeDouble pruneAbovePercent = new ParameterTypeDouble(PARAMETER_PRUNE_ABOVE_RELATIVE, "Ignore words that appear in more than this percentage of all documents.", 0.0d, 100.0d, 30.0d);
        pruneAbovePercent.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_PRUNE_METHOD, PRUNING_METHODS, true, new int[]{PRUNING_METHOD_PERCENTUAL}));
        pruneAbovePercent.setExpert(false);
        types.add(pruneAbovePercent);

        ParameterTypeInt pruneBelowAbsolute = new ParameterTypeInt(PARAMETER_PRUNE_BELOW_ABSOLUTE, "Ignore words that appear in less than that many documents.", 0, Integer.MAX_VALUE, true);
        pruneBelowAbsolute.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_PRUNE_METHOD, PRUNING_METHODS, true, new int[]{PRUNING_METHOD_ABSOLUTE}));
        pruneBelowAbsolute.setExpert(false);
        types.add(pruneBelowAbsolute);

        ParameterTypeInt pruneAboveAbsolute = new ParameterTypeInt(PARAMETER_PRUNE_ABOVE_ABSOLUTE, "Ignore words that appear in more than that many documents.", 0, Integer.MAX_VALUE, true);
        pruneAboveAbsolute.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_PRUNE_METHOD, PRUNING_METHODS, true, new int[]{PRUNING_METHOD_ABSOLUTE}));
        pruneAboveAbsolute.setExpert(false);
        types.add(pruneAboveAbsolute);

        ParameterTypeDouble pruneBelowRank = new ParameterTypeDouble(PARAMETER_PRUNE_BELOW_RANK, "Words are ordered by frequency and words with a frequency equal or less than the frequency of the rank given by this percentage will be pruned.", 0.0d, 1.0d, 0.05d);
        pruneBelowRank.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_PRUNE_METHOD, PRUNING_METHODS, true, new int[]{PRUNING_METHOD_RANKING}));
        pruneBelowRank.setExpert(false);
        types.add(pruneBelowRank);

        ParameterTypeDouble pruneAboveRank = new ParameterTypeDouble(PARAMETER_PRUNE_ABOVE_RANK, "Words are ordered by frequency and words with a frequency equal or higher than the frequency of the rank given by this percentage will be pruned.", 0.0d, 1.0d, 0.95d);
        pruneAboveRank.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_PRUNE_METHOD, PRUNING_METHODS, true, new int[]{PRUNING_METHOD_RANKING}));
        pruneAboveRank.setExpert(false);
        types.add(pruneAboveRank);

        // --- data management: legacy parameter, shown only when legacy data management is active ---
        ParameterTypeCategory legacyDataManagement = new ParameterTypeCategory(PARAMETER_DATAMANAGEMENT, "Determines, how the data is represented internally.", DataRowFactory.TYPE_NAMES, 7, true);
        legacyDataManagement.registerDependencyCondition(new ParameterCondition(this, false) {
            public boolean isConditionFullfilled() {
                String legacySetting = ParameterService.getParameterValue("rapidminer.system.legacy_data_mgmt");
                return Boolean.parseBoolean(legacySetting);
            }
        });
        types.add(legacyDataManagement);

        // --- data management: new parameter, shown only when legacy data management is not active ---
        ParameterTypeCategory newDataManagement = new ParameterTypeCategory("data_management", "The data management optimization to use. Determines, how the data is represented internally. The auto option (default) only compresses data if it is very sparse and otherwise optimizes for speed. Choose speed-optimized if you have enough memory and want to speed up your process. Choose memory-optimized if you have a lot of sparse data that has trouble fitting into memory with auto mode.", DataManagementParameterHelper.NEW_DATA_MANAGMENT_OPTIONS, 0, true);
        newDataManagement.registerDependencyCondition(new ParameterCondition(this, false) {
            public boolean isConditionFullfilled() {
                String legacySetting = ParameterService.getParameterValue("rapidminer.system.legacy_data_mgmt");
                return !Boolean.parseBoolean(legacySetting);
            }
        });
        types.add(newDataManagement);

        return types;
    }
}
