package com.rapidminer.extension.webtableextraction.operator;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.extension.webtableextraction.microdataparser.BaseNodeVisitor;
import com.rapidminer.extension.webtableextraction.microdataparser.StructuredDataExtractor;
import com.rapidminer.operator.IOObjectCollection;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.operator.preprocessing.GuessValueTypes;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeAttribute;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeFile;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.ParameterTypeStringCategory;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.parameter.conditions.EqualTypeCondition;
import com.rapidminer.tools.LogService;
import com.rapidminer.tools.ParameterService;
import com.rapidminer.tools.io.Encoding;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.validator.routines.UrlValidator;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import weka.core.xml.XMLInstances;
import weka.gui.beans.xml.XMLBeans;

/* loaded from: input_file:com/rapidminer/extension/webtableextraction/operator/StructuredDataExtractionOperator.class */
/**
 * RapidMiner operator that loads HTML documents and hands them to a
 * {@link StructuredDataExtractor}, delivering the extracted example sets as
 * one {@link IOObjectCollection}.
 *
 * <p>The HTML source is selected by the {@link #PARAMETER_SOURCE_TYPE}
 * parameter: a single file, a single URL, or an input example set whose
 * chosen attribute holds one URL or file path per row.
 *
 * <p>NOTE(review): the operator keeps per-run state in instance fields
 * ({@code specialCase}, {@code targetAttributeValue}) and in the shared
 * extractor; it is presumably not meant to be executed concurrently.
 */
public class StructuredDataExtractionOperator extends Operator {
    /** Parameter key: HTML file to open when the source type is "file". */
    public static final String PARAMETER_FILENAME = "file name";
    /** Parameter key: URL to open when the source type is "url". */
    public static final String PARAMETER_URL = "url";
    /** Parameter key: attribute of the input example set holding URLs or file paths. */
    public static final String PARAMETER_EXAMPLESET = "attribute";
    /** Parameter key: category selecting one of {@link #SOURCE_TYPES}. */
    public static final String PARAMETER_SOURCE_TYPE = "resource_type";
    /** Index into {@link #SOURCE_TYPES}: read a single file. */
    public static final int SOURCE_TYPE_FILE = 0;
    /** Index into {@link #SOURCE_TYPES}: read a single URL. */
    public static final int SOURCE_TYPE_URL = 1;
    /** Index into {@link #SOURCE_TYPES}: read every row of the input example set. */
    public static final int SOURCE_TYPE_EXAMPLESET = 2;
    /** Hint prepended to the offending URL in user errors raised for URL failures. */
    private static final String ERROR_PREFIX = "Please check that the URL is not malformed i.e. the protocol is specified in the URL and the address is correct.";
    /** Parameter key: category selecting one of {@link #SCHEMA_ITEMS}. */
    public static final String PARAMETER_SCHEMA_ITEM = "schema item";
    /** Parameter key: attribute name identifying the encapsulating node (manual schema item). */
    public static final String ATTRIBUTE_KEY_OF_ENCAPSULATING_NODE = "encapsulator node's attribute";
    /** Parameter key: attribute value identifying the encapsulating node (manual schema item). */
    public static final String ATTRIBUTE_VALUE_OF_ENCAPSULATING_NODE = "attribute value";
    /** Index into {@link #SCHEMA_ITEMS}: the predefined product item type. */
    public static final int SCHEMA_ITEM_PRODUCT = 0;
    /** Index into {@link #SCHEMA_ITEMS}: key and value are entered manually. */
    public static final int SCHEMA_ITEM_OTHER = 1;
    /** Parameter key: user agent string passed to the extractor. */
    public static final String PARAMETER_USER_AGENT = "user agent";
    /** User agent used when the parameter is missing or blank. */
    public static final String DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";
    /** Parameter key: timeout in milliseconds passed to the extractor. */
    public static final String PARAMETER_CONNECTION_TIMEOUT = "connection timeout (ms)";
    /** Timeout (ms) applied at construction and offered as the parameter default. */
    private static final int DEFAULT_CONNECTION_TIMEOUT_MS = 300000;
    /** Default value of the connection timeout parameter, in milliseconds. */
    public int CONNECTION_TIMEOUT;
    /** Parameter key: boolean enabling extraction via a chosen attribute of the value node. */
    public static final String PARAMETER_SPECIFY_ATTRIBUTE_OF_VALUE_NODE = "specify value node's attribute";
    /** Parameter key: category selecting one of {@link #VALUE_NODE_ATTRIBUTES}. */
    public static final String PARAMETER_VALUE_NODE_ATTRIBUTE = "attribute name";
    public static final int VALUE_NODE_ATTRIBUTE_CLASS = 0;
    public static final int VALUE_NODE_ATTRIBUTE_ID = 1;
    public static final int VALUE_NODE_ATTRIBUTE_LABEL = 2;
    public static final int VALUE_NODE_ATTRIBUTE_TITLE = 3;
    public static final int VALUE_NODE_ATTRIBUTE_HREF = 4;
    public static final int VALUE_NODE_ATTRIBUTE_SRC = 5;
    /** Index into {@link #VALUE_NODE_ATTRIBUTES}: the attribute name is entered manually. */
    public static final int VALUE_NODE_ATTRIBUTE_OTHER = 6;
    /** Parameter key: manually entered attribute name of the value node. */
    public static final String ATTRIBUTE_KEY_OF_VALUE_NODE = "value node's attribute";
    /** Parameter key: charset name applied to the loaded document's output settings. */
    public static final String ENCODING = "encoding";
    /** Whether the current run extracts values via {@link #targetAttributeValue}. */
    boolean specialCase;
    /** Attribute name handed to the extractor; {@code null} unless {@link #specialCase} is set. */
    private String targetAttributeValue;
    private InputPort exampleSetInputPort;
    private OutputPort exampleSetCollectionOutput;
    private StructuredDataExtractor extractor;
    /** Display values of the source type category, indexed by the {@code SOURCE_TYPE_*} constants. */
    public static final String[] SOURCE_TYPES = {"file", "url", "example set"};
    /** Display values of the schema item category, indexed by the {@code SCHEMA_ITEM_*} constants. */
    public static final String[] SCHEMA_ITEMS = {StructuredDataExtractor.PRODUCT_ITEM_TYPE, "enter manually"};
    /** Display values of the value node attribute category, indexed by the {@code VALUE_NODE_ATTRIBUTE_*} constants. */
    public static final String[] VALUE_NODE_ATTRIBUTES = {"class", XMLBeans.VAL_ID, XMLInstances.TAG_LABEL, BaseNodeVisitor.VALUE_EXTRACTION_ATTRIBUTE_TITLE, BaseNodeVisitor.VALUE_EXTRACTION_ATTRIBUTE_HREF, BaseNodeVisitor.VALUE_EXTRACTION_ATTRIBUTE_SRC, "enter manually"};
    private static final Logger LOGGER = LogService.getRoot();

    /**
     * Creates the operator, its input and output ports, and the extractor it
     * delegates to.
     *
     * @param operatorDescription description passed through to {@link Operator}
     */
    public StructuredDataExtractionOperator(OperatorDescription operatorDescription) {
        super(operatorDescription);
        this.CONNECTION_TIMEOUT = DEFAULT_CONNECTION_TIMEOUT_MS;
        this.specialCase = false;
        this.targetAttributeValue = null;
        this.exampleSetInputPort = getInputPorts().createPort("input example set");
        this.exampleSetCollectionOutput = getOutputPorts().createPort("collection of product data as example sets");
        this.extractor = new StructuredDataExtractor();
        this.extractor.setjSoupConnectionReadTimeout(DEFAULT_CONNECTION_TIMEOUT_MS);
        getTransformer().addGenerationRule(this.exampleSetCollectionOutput, IOObjectCollection.class);
    }

    /**
     * Reads the parameters, runs the extraction for the selected source type
     * and delivers the resulting collection on the output port.
     *
     * @throws OperatorException if a required parameter is undefined, the
     *         selected file is missing or unreadable, or the extraction
     *         itself raises a user error
     */
    @Override
    public void doWork() throws OperatorException {
        long startNanos = System.nanoTime();

        this.specialCase = getParameterAsBoolean(PARAMETER_SPECIFY_ATTRIBUTE_OF_VALUE_NODE);
        if (this.specialCase) {
            boolean manualName = getParameterAsInt(PARAMETER_VALUE_NODE_ATTRIBUTE) == VALUE_NODE_ATTRIBUTE_OTHER;
            this.targetAttributeValue = getParameterAsString(manualName ? ATTRIBUTE_KEY_OF_VALUE_NODE : PARAMETER_VALUE_NODE_ATTRIBUTE);
            LOGGER.log(Level.INFO, "** Special Case: Value Node Attribute Name = " + this.targetAttributeValue);
        } else {
            // Do not carry an attribute name over from a previous execution.
            this.targetAttributeValue = null;
        }

        IOObjectCollection<ExampleSet> result;
        switch (getParameterAsInt(PARAMETER_SOURCE_TYPE)) {
            case SOURCE_TYPE_FILE: {
                String fileName = requireParameter(PARAMETER_FILENAME);
                LOGGER.log(Level.INFO, "Got File Name = " + fileName);
                File file = new File(fileName);
                if (!file.exists()) {
                    throw new UserError(this, "301", new Object[]{file});
                }
                if (!file.canRead()) {
                    throw new UserError(this, "302", new Object[]{file, ""});
                }
                result = createCollectionFromFileOrUrl(fileName, file);
                break;
            }
            case SOURCE_TYPE_URL:
                result = createCollectionFromFileOrUrl(requireParameter(PARAMETER_URL), null);
                break;
            case SOURCE_TYPE_EXAMPLESET:
                result = createCollectionFromExampleSet(requireParameter(PARAMETER_EXAMPLESET));
                break;
            default:
                // Unknown category index: nothing to extract.
                result = null;
                break;
        }

        LOGGER.log(Level.INFO, "Total processing time (sec): " + (((float) (System.nanoTime() - startNanos)) / 1.0E9f));
        this.exampleSetCollectionOutput.deliver(result);
    }

    /**
     * Extracts data from a single document, read from {@code file} when it is
     * non-null and otherwise fetched from the URL {@code str}.
     *
     * @param str  URL to fetch, or the path of {@code file}; also passed to the
     *             extractor as its URL string
     * @param file file to parse, or {@code null} to load from {@code str}
     * @return a collection holding the extracted example set; empty when the
     *         file could not be read or nothing was extracted
     * @throws UserError if a parameter is undefined or the URL cannot be loaded
     */
    public IOObjectCollection<ExampleSet> createCollectionFromFileOrUrl(String str, File file) throws UserError {
        IOObjectCollection<ExampleSet> collection = new IOObjectCollection<>();
        configureExtractor();
        this.extractor.setUrlString(str);
        String[] selector = resolveEncapsulatorSelector();

        long startMillis = System.currentTimeMillis();
        if (file == null) {
            try {
                LOGGER.log(Level.INFO, "++ 1) Must get data from URL = " + str);
                Document document = this.extractor.createConnection(str);
                applyEncoding(document);
                LOGGER.log(Level.INFO, "++ 2) Document loaded from URL");
                extractInto(collection, document, selector);
            } catch (IOException e) {
                // Also covers MalformedURLException, which is an IOException.
                LOGGER.log(Level.WARNING, "Error accessing the given url. Please check internet connection.", e);
                throw new UserError(this, e, "313", new Object[]{ERROR_PREFIX + " " + str});
            }
        } else {
            LOGGER.log(Level.INFO, "++ 1) Must get data from File = " + file.getAbsolutePath());
            try {
                Document document = Jsoup.parse(file, (String) null, "");
                applyEncoding(document);
                LOGGER.log(Level.INFO, "++ 2) Document loaded from File. Document Title = " + document.title());
                extractInto(collection, document, selector);
            } catch (IOException e) {
                // Best effort, as before: an unreadable file yields an empty collection.
                LOGGER.log(Level.WARNING, "Error reading the given file " + file.getAbsolutePath(), e);
            }
        }
        LOGGER.log(Level.INFO, "+Time for loading, parsing and extracting data = " + ((System.currentTimeMillis() - startMillis) / 1000) + " seconds");
        return collection;
    }

    /**
     * Extracts data for every row of the input example set, treating the value
     * of attribute {@code str} as a URL when it validates as one and as a file
     * path otherwise. Rows that cannot be loaded are logged and skipped.
     *
     * @param str name of the attribute holding the URL or file path
     * @return the example sets extracted from all rows that could be loaded
     * @throws UserError if a parameter is undefined, the input example set is
     *         missing or empty, or it has no attribute named {@code str}
     */
    private IOObjectCollection<ExampleSet> createCollectionFromExampleSet(String str) throws UserError {
        LOGGER.log(Level.INFO, "++ Must get data from ExampleSet ");
        IOObjectCollection<ExampleSet> collection = new IOObjectCollection<>();
        configureExtractor();
        String[] selector = resolveEncapsulatorSelector();

        ExampleSet input = null;
        try {
            input = this.exampleSetInputPort.getDataOrNull(ExampleSet.class);
        } catch (UserError e) {
            // Reported below as an invalid input table.
            LOGGER.log(Level.WARNING, "Error : " + e.getMessage(), e);
        }
        if (input == null || input.size() == 0) {
            LOGGER.log(Level.WARNING, "Provided ExampleSet was null ");
            throw new UserError(this, 313, new Object[]{"The provided table is invalid or empty."});
        }
        Attribute attribute = input.getAttributes().get(str);
        if (attribute == null) {
            // Without this check the first row would fail with a NullPointerException.
            // NOTE(review): 111 is assumed to be RapidMiner's "no such attribute" user error; confirm.
            throw new UserError(this, 111, new Object[]{str});
        }

        UrlValidator urlValidator = new UrlValidator();
        long startMillis = System.currentTimeMillis();
        for (Example example : input) {
            String location = example.getValueAsString(attribute).trim();
            this.extractor.setUrlString(location);
            if (urlValidator.isValid(location)) {
                extractRowFromUrl(collection, location, selector);
            } else {
                extractRowFromFile(collection, location, selector);
            }
        }
        LOGGER.log(Level.INFO, "+Time for loading, parsing and extracting data = " + ((System.currentTimeMillis() - startMillis) / 1000) + " seconds");
        return collection;
    }

    /**
     * Builds the parameter list shown for this operator: the source selection,
     * the schema item selection, connection settings, the optional value node
     * attribute and the encoding.
     *
     * @return the inherited parameter types followed by this operator's own
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();

        types.add(new ParameterTypeCategory(PARAMETER_SOURCE_TYPE, "Open html document from a file, a URL or pass in an ExampleSet contining file or URL paths", SOURCE_TYPES, SOURCE_TYPE_URL, false));

        ParameterTypeFile fileType = new ParameterTypeFile(PARAMETER_FILENAME, "File to open", (String) null, true, false);
        fileType.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SOURCE_TYPE, SOURCE_TYPES, true, new int[]{SOURCE_TYPE_FILE}));
        types.add(fileType);

        ParameterTypeString urlType = new ParameterTypeString(PARAMETER_URL, "URL to open", true, false);
        urlType.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SOURCE_TYPE, SOURCE_TYPES, true, new int[]{SOURCE_TYPE_URL}));
        types.add(urlType);

        ParameterTypeAttribute attributeType = new ParameterTypeAttribute(PARAMETER_EXAMPLESET, "The attribute having the path of html document. It could be a file path or a URL.", this.exampleSetInputPort, true, false);
        attributeType.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SOURCE_TYPE, SOURCE_TYPES, true, new int[]{SOURCE_TYPE_EXAMPLESET}));
        types.add(attributeType);

        types.add(new ParameterTypeCategory(PARAMETER_SCHEMA_ITEM, "Select the schema.org item to extract its microdata.", SCHEMA_ITEMS, SCHEMA_ITEM_PRODUCT, false));

        ParameterTypeString encapsulatorKeyType = new ParameterTypeString(ATTRIBUTE_KEY_OF_ENCAPSULATING_NODE, "Attribute key of the encapsulating element", true, false);
        encapsulatorKeyType.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SCHEMA_ITEM, SCHEMA_ITEMS, true, new int[]{SCHEMA_ITEM_OTHER}));
        types.add(encapsulatorKeyType);

        ParameterTypeString encapsulatorValueType = new ParameterTypeString(ATTRIBUTE_VALUE_OF_ENCAPSULATING_NODE, "Attribtue value of the encapsulating element ", true, false);
        encapsulatorValueType.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SCHEMA_ITEM, SCHEMA_ITEMS, true, new int[]{SCHEMA_ITEM_OTHER}));
        types.add(encapsulatorValueType);

        types.add(new ParameterTypeString(PARAMETER_USER_AGENT, PARAMETER_USER_AGENT, DEFAULT_USER_AGENT, true));
        types.add(new ParameterTypeInt(PARAMETER_CONNECTION_TIMEOUT, "Connection timeout in milliseconds", 1, Integer.MAX_VALUE, this.CONNECTION_TIMEOUT));
        types.add(new ParameterTypeBoolean(PARAMETER_SPECIFY_ATTRIBUTE_OF_VALUE_NODE, "Custom attribute based extraction (default = class)", false, false));

        ParameterTypeCategory valueAttributeType = new ParameterTypeCategory(PARAMETER_VALUE_NODE_ATTRIBUTE, "Name of attribute in the value-containing node. The data will be extracted from this node.", VALUE_NODE_ATTRIBUTES, VALUE_NODE_ATTRIBUTE_CLASS, false);
        valueAttributeType.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_SPECIFY_ATTRIBUTE_OF_VALUE_NODE, true, true));
        types.add(valueAttributeType);

        ParameterTypeString manualValueAttributeType = new ParameterTypeString(ATTRIBUTE_KEY_OF_VALUE_NODE, "Name of attribute in the value-containing node. The data will be extracted from its node.", true, false);
        manualValueAttributeType.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_VALUE_NODE_ATTRIBUTE, VALUE_NODE_ATTRIBUTES, true, new int[]{VALUE_NODE_ATTRIBUTE_OTHER}));
        types.add(manualValueAttributeType);

        // NOTE(review): the default is picked by a fixed index into Encoding.CHARSETS;
        // which charset that is cannot be told from this file - confirm it is the intended one.
        ParameterTypeStringCategory encodingType = new ParameterTypeStringCategory(ENCODING, "The encoding used for reading response webpage or html file.", Encoding.CHARSETS, Encoding.CHARSETS[81], false);
        encodingType.setExpert(true);
        types.add(encodingType);
        return types;
    }

    /** Returns the string parameter {@code key}, failing with a user error when it is unset or blank. */
    private String requireParameter(String key) throws UndefinedParameterError {
        String value = getParameterAsString(key);
        if (value == null || value.trim().isEmpty()) {
            throw new UndefinedParameterError(key, this);
        }
        return value;
    }

    /** Pushes user agent, timeout and target attribute name from the parameters into the extractor. */
    private void configureExtractor() throws UserError {
        String userAgent = getParameter(PARAMETER_USER_AGENT);
        boolean blank = userAgent == null || userAgent.trim().isEmpty();
        this.extractor.setUserAgent(blank ? DEFAULT_USER_AGENT : userAgent);
        this.extractor.setjSoupConnectionReadTimeout(getParameterAsInt(PARAMETER_CONNECTION_TIMEOUT));
        this.extractor.setTargetAttributeValue(this.targetAttributeValue);
    }

    /**
     * Resolves the attribute key/value pair identifying the encapsulating node
     * from the schema item parameters.
     *
     * @return a two-element array {@code {key, value}}; both entries are
     *         {@code null} when the schema item index is not a known one
     */
    private String[] resolveEncapsulatorSelector() throws UserError {
        String key = null;
        String value = null;
        int schemaItem = getParameterAsInt(PARAMETER_SCHEMA_ITEM);
        if (schemaItem == SCHEMA_ITEM_PRODUCT) {
            key = BaseNodeVisitor.ITEM_TYPE_ATTRIBUTE;
            value = StructuredDataExtractor.PRODUCT_ITEM_TYPE;
            LOGGER.log(Level.INFO, "WILL EXTRACT " + key);
        } else if (schemaItem == SCHEMA_ITEM_OTHER) {
            key = getParameterAsString(ATTRIBUTE_KEY_OF_ENCAPSULATING_NODE);
            value = getParameterAsString(ATTRIBUTE_VALUE_OF_ENCAPSULATING_NODE);
            LOGGER.log(Level.INFO, "WILL EXTRACT " + key);
        }
        return new String[]{key, value};
    }

    /**
     * Applies the encoding parameter to the document's output settings.
     * Only the output charset is set here; how the input was decoded is
     * decided when the document is loaded.
     */
    private void applyEncoding(Document document) throws UserError {
        String encoding = getParameterAsString(ENCODING);
        LOGGER.log(Level.INFO, "** Encoding = " + encoding);
        document.outputSettings().charset(encoding);
    }

    /**
     * Runs the extractor on {@code document} and appends the result to
     * {@code target}. Non-empty results are passed through
     * {@link GuessValueTypes}; if that fails, the untyped result is kept.
     * A {@code null} result is logged and not added.
     */
    private void extractInto(IOObjectCollection<ExampleSet> target, Document document, String[] selector) throws IOException, UserError {
        ExampleSet extracted = this.extractor.getExampleSetFromSchemaOfInterest(document, this.specialCase, selector[0], selector[1]);
        if (extracted == null) {
            LOGGER.log(Level.INFO, "No data extracted from document");
            return;
        }
        if (extracted.size() > 0) {
            try {
                extracted = new GuessValueTypes(getOperatorDescription()).apply(extracted);
            } catch (OperatorException e) {
                LOGGER.log(Level.WARNING, "Error : " + e.getMessage(), e);
            }
        }
        target.add(extracted);
    }

    /** Loads one example-set row from a URL; I/O failures are logged and the row is skipped. */
    private void extractRowFromUrl(IOObjectCollection<ExampleSet> target, String url, String[] selector) throws UserError {
        LOGGER.log(Level.INFO, "URL is valid, connecting and extracting.");
        try {
            LOGGER.log(Level.INFO, "++ 1) Must get data from URL = " + url);
            Document document = this.extractor.createConnection(url);
            applyEncoding(document);
            extractInto(target, document, selector);
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Error accessing the given url. Please check internet connection.", e);
        }
    }

    /** Loads one example-set row from a file path; missing or unreadable files are logged and skipped. */
    private void extractRowFromFile(IOObjectCollection<ExampleSet> target, String path, String[] selector) throws UserError {
        LOGGER.log(Level.INFO, "++ Treating path as a File path");
        File file = new File(path);
        if (!file.exists()) {
            LOGGER.log(Level.INFO, "File does not exist");
            return;
        }
        if (!file.canRead()) {
            LOGGER.log(Level.INFO, "File does not readable");
            return;
        }
        try {
            Document document = Jsoup.parse(file, (String) null, "");
            applyEncoding(document);
            LOGGER.log(Level.INFO, "++ 2) Document loaded from File. Document Title = " + document.title());
            extractInto(target, document, selector);
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "Error reading the given file " + file.getAbsolutePath(), e);
        }
    }
}
