package com.rapidminer.operator.preprocessing.ie.features.struct;

import cern.colt.matrix.impl.AbstractFormatter;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.Partition;
import com.rapidminer.example.set.SplittedExampleSet;
import com.rapidminer.example.table.AttributeObjectFactory;
import com.rapidminer.example.table.ObjectAttribute;
import com.rapidminer.example.table.ObjectMapping;
import com.rapidminer.example.table.struct.AbstractStructureCreation;
import com.rapidminer.example.table.struct.Structures;
import com.rapidminer.example.table.struct.tree.KTreeNode;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.preprocessing.ie.features.tools.PreprocessOperatorImpl;
import com.rapidminer.operator.struct.tree.html.HTMLTreeNode;
import com.rapidminer.operator.struct.tree.html.HTMLTreeParser;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.tools.LogService;
import com.rapidminer.tools.OperatorService;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;

/* loaded from: input_file:com/rapidminer/operator/preprocessing/ie/features/struct/HTMLTreeProcessing.class */
public class HTMLTreeProcessing extends AbstractStructureCreation {
    private final String PARAMETER_NEED_PARSING = "needParsing";
    HTMLTreeParser parser;
    HTMLTreeParser htmlParser;
    public static final String PARAMETER_SOURCE_ATTRIBUTE = "valueAttribute";
    public static final String PARAMETER_CREATE_TREE_STRING = "createTreeString";
    Attribute sourceAttribute;
    HashSet<String> wordVector;
    HashMap<String, Integer> wordVectorCapacity;

    public HTMLTreeProcessing(OperatorDescription operatorDescription) throws OperatorException {
        super(operatorDescription);
        this.PARAMETER_NEED_PARSING = "needParsing";
        this.parser = new HTMLTreeParser();
        this.htmlParser = new HTMLTreeParser();
    }

    public void doWork() throws OperatorException {
        ExampleSet<Example> exampleSet = (ExampleSet) this.exampleSetInput.getData();
        this.wordVector = new HashSet<>();
        this.wordVectorCapacity = new HashMap<>();
        Attribute createIdAttribute = createIdAttribute(exampleSet);
        if (createIdAttribute == null) {
            throw new OperatorException("Unable to create an ID attribute!");
        }
        Attribute attribute = exampleSet.getAttributes().get(Structures.ID_ATTRIBUTE);
        if (attribute == null) {
            LogService.getGlobal().log("Creating structID attribute...", 8);
            attribute = AttributeObjectFactory.createAttribute(Structures.ID_ATTRIBUTE, 12);
            exampleSet.getExampleTable().addAttribute(attribute);
            exampleSet.getAttributes().addRegular(attribute);
        }
        try {
            PrintStream printStream = new PrintStream(new FileOutputStream(File.createTempFile("parse-errors-", ".log")));
            this.sourceAttribute = exampleSet.getAttributes().get(getParameterAsString("valueAttribute"));
            LinkedList linkedList = new LinkedList();
            int[] iArr = new int[exampleSet.size()];
            int i = 0;
            int i2 = 0;
            int i3 = 0;
            for (Example example : exampleSet) {
                try {
                    String valueAsString = example.getValueAsString(this.sourceAttribute);
                    int index = this.sourceAttribute.getMapping().getIndex(valueAsString);
                    Double valueOf = Double.valueOf(example.getId());
                    if (valueAsString != null) {
                        try {
                        } catch (Exception e) {
                            e.printStackTrace();
                            printStream.println("Parsing of example " + valueOf + " failed.");
                            printStream.println("Error parsing: " + valueAsString);
                            e.printStackTrace(printStream);
                            printStream.println("---------------------------------------------------");
                            i2++;
                            example.setValue(attribute, Double.NaN);
                            iArr[i] = 1;
                        }
                        if (valueAsString.length() != 0) {
                            this.wordVector = buildWordVector(this.parser.parseHTML(valueAsString), this.wordVector);
                            linkedList.add(Integer.valueOf(index));
                            example.setValue(attribute, ((ObjectAttribute) attribute).m48getMapping().mapString(r0));
                            iArr[i] = 0;
                            example.setValue(createIdAttribute, i3 + ": " + valueAsString);
                            i++;
                            i3++;
                        }
                    }
                    valueAsString = "";
                    example.setValue(attribute, Double.NaN);
                    iArr[i] = 1;
                    example.setValue(createIdAttribute, i3 + ": " + valueAsString);
                    i++;
                    i3++;
                } catch (Exception e2) {
                    e2.printStackTrace();
                }
            }
            int size = this.wordVector.size();
            System.out.println("N" + size);
            Object[] array = this.wordVector.toArray();
            for (int i4 = size - 1; i4 >= 0; i4--) {
                String str = (String) array[i4];
                if (this.wordVectorCapacity.get(str).intValue() < 3) {
                    this.wordVector.remove(str);
                    this.wordVectorCapacity.remove(str);
                }
            }
            ArrayList arrayList = new ArrayList(this.wordVector);
            Collections.sort(arrayList);
            this.wordVector.clear();
            Iterator it = arrayList.iterator();
            while (it.hasNext()) {
                this.wordVector.add((String) it.next());
            }
            Attribute attribute2 = exampleSet.getAttributes().get("structIDnew");
            if (attribute2 == null) {
                LogService.getGlobal().log("Creating structID attribute...", 8);
                attribute2 = AttributeObjectFactory.createAttribute("structIDnew", 12);
                exampleSet.getExampleTable().addAttribute(attribute2);
                exampleSet.getAttributes().addRegular(attribute2);
            }
            Iterator it2 = exampleSet.iterator();
            while (it2.hasNext()) {
                try {
                    ((Example) it2.next()).setValue(attribute2, ((ObjectAttribute) attribute2).m48getMapping().mapString(replaceText((HTMLTreeNode) ((ObjectMapping) attribute.getMapping()).mapObjectIndex(new Double(r0.getValue(attribute)).intValue()))));
                } catch (Exception e3) {
                    e3.printStackTrace();
                }
            }
            new SplittedExampleSet(exampleSet, new Partition(iArr, 2)).selectSingleSubset(0);
            this.exampleSetOutput.deliver(exampleSet);
        } catch (IOException e4) {
            throw new OperatorException("Cannot create the parse-error log-file!");
        }
    }

    private HTMLTreeNode replaceText(HTMLTreeNode hTMLTreeNode) {
        String text = hTMLTreeNode.getText();
        if (text != null) {
            HashSet hashSet = new HashSet();
            for (String str : text.split(AbstractFormatter.DEFAULT_COLUMN_SEPARATOR)) {
                hashSet.add(str.trim());
            }
            StringBuilder sb = new StringBuilder("");
            Iterator<String> it = this.wordVector.iterator();
            int i = 0;
            while (it.hasNext()) {
                if (hashSet.contains(it.next())) {
                    if (sb.length() > 0) {
                        sb.append("," + i);
                    } else {
                        sb.append(i);
                    }
                }
                i++;
            }
            hTMLTreeNode.setWvCapacity(i);
            if (sb.length() > 0) {
                String[] split = sb.toString().split(",");
                int[] iArr = new int[split.length];
                for (int i2 = 0; i2 < split.length; i2++) {
                    iArr[i2] = new Integer(split[i2]).intValue();
                }
                hTMLTreeNode.setWordVector(iArr);
            }
        }
        Iterator<KTreeNode> it2 = hTMLTreeNode.children().iterator();
        while (it2.hasNext()) {
            replaceText((HTMLTreeNode) it2.next());
        }
        return hTMLTreeNode;
    }

    private HashSet<String> buildWordVector(HTMLTreeNode hTMLTreeNode, HashSet<String> hashSet) {
        String text = hTMLTreeNode.getText();
        if (text != null && text.length() != 0) {
            for (String str : text.split(AbstractFormatter.DEFAULT_COLUMN_SEPARATOR)) {
                String trim = str.trim();
                if (this.wordVectorCapacity.get(trim) == null) {
                    this.wordVectorCapacity.put(trim, 1);
                } else {
                    this.wordVectorCapacity.put(trim, Integer.valueOf(this.wordVectorCapacity.get(trim).intValue() + 1));
                }
                hashSet.add(trim);
            }
        }
        ArrayList<KTreeNode> children = hTMLTreeNode.children();
        for (int i = 0; i < children.size(); i++) {
            hashSet = buildWordVector((HTMLTreeNode) children.get(i), hashSet);
        }
        return hashSet;
    }

    public List<ParameterType> getParameterTypes() {
        List<ParameterType> parameterTypes = super.getParameterTypes();
        parameterTypes.add(new ParameterTypeString("valueAttribute", "The name of attribute from which the parser should create the parse tree.", "query"));
        parameterTypes.add(new ParameterTypeBoolean("needParsing", "Need not to be selected, if sentences are already parsed.", false));
        return parameterTypes;
    }

    public PreprocessOperatorImpl create() throws Exception {
        return OperatorService.createOperator("ParseTreePreprocessing");
    }

    public Class<?>[] getInputClasses() {
        return new Class[]{ExampleSet.class};
    }

    public Class<?>[] getOutputClasses() {
        return new Class[]{ExampleSet.class};
    }
}
