package com.rapidminer.operator.web.crawler.deprecated;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ports.InputPortExtender;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.operator.ports.metadata.SubprocessTransformRule;
import com.rapidminer.operator.text.Document;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeList;
import com.rapidminer.parameter.ParameterTypeRegexp;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.container.Pair;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Level;
import org.apache.xerces.dom3.as.ASDataType;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.Page;

/**
 * Operator chain that crawls web pages starting at the configured {@code url} and runs its
 * single inner subprocess ("Process Webpage") once for every page the crawler stores.
 *
 * <p>For each stored page a {@link Document} holding the decoded page content is delivered to
 * the inner {@code document} source, the subprocess is executed, and every {@link Document}
 * collected at the inner {@code document} sinks becomes one row of the example set delivered at
 * the {@code example set} output. The attributes of that example set are derived from the
 * metadata keys of the first document returned by the subprocess.
 *
 * @deprecated this class is marked deprecated; no replacement is visible from this file.
 */
@Deprecated
public class ProcessingCrawlerOperator extends OperatorChain {
    /** Metadata key under which the raw page content is attached to a document. */
    public static final String METADATA_PAGE_CONTENT = "page_content";
    /** Metadata key under which the page URL is attached to a document. */
    public static final String METADATA_PAGE_URL = "page_url";

    /**
     * Error code passed to {@link UserError} when the root URL is malformed.
     * NOTE(review): the decompiled source spells this code through an unrelated Xerces constant;
     * the value is preserved as-is here - replace it with the literal error code once confirmed.
     */
    private static final int MALFORMED_URL_ERROR_CODE = ASDataType.UNSIGNEDLONG_DATATYPE;

    /** Charset assumed when the content type carries no {@code charset} parameter. */
    private static final String DEFAULT_CHARSET = "UTF-8";
    /** Prefix of the charset parameter inside a content type value. */
    private static final String CHARSET_PREFIX = "charset=";

    private final OutputPort innerTextObjectSource;
    private final InputPortExtender innerTextObjectSink;
    private final OutputPort exampleSetOutput;

    /**
     * Creates the operator with one subprocess and wires up its ports and metadata rules.
     *
     * @param operatorDescription description handed to the {@link OperatorChain} constructor
     */
    public ProcessingCrawlerOperator(OperatorDescription operatorDescription) {
        super(operatorDescription, new String[]{"Process Webpage"});
        this.innerTextObjectSource = getSubprocess(0).getInnerSources().createPort("document");
        this.innerTextObjectSink = new InputPortExtender("document", getSubprocess(0).getInnerSinks(), new MetaData(Document.class), true);
        this.exampleSetOutput = getOutputPorts().createPort("example set");
        this.innerTextObjectSink.start();
        getTransformer().addGenerationRule(this.innerTextObjectSource, Document.class);
        getTransformer().addRule(new SubprocessTransformRule(getSubprocess(0)));
        getTransformer().addGenerationRule(this.exampleSetOutput, ExampleSet.class);
    }

    /**
     * Configures a {@link WebSphinxCrawler} from the operator parameters, runs it synchronously
     * from the root URL and delivers the collected example set.
     *
     * @throws OperatorException if the crawler reports an error, or (as a {@link UserError}) if
     *         the {@code url} parameter is not a well-formed URL
     */
    @Override
    public void doWork() throws OperatorException {
        Pair<WebCrawlerSupervisor, ExampleTable> supervisorAndTable = createSupervisor();
        WebCrawlerSupervisor supervisor = supervisorAndTable.getFirst();
        ExampleTable exampleTable = supervisorAndTable.getSecond();

        HashMap<String, List<String>> crawlingRules = collectCrawlingRules();
        // -1 is handed to the crawler when no page limit has been configured.
        int maxPages = isParameterSet("max_pages") ? getParameterAsInt("max_pages") : -1;
        WebSphinxCrawler crawler = new WebSphinxCrawler(supervisor, getLogger(), crawlingRules, maxPages,
                getParameterAsInt("delay"), getParameterAsString("user_agent"));

        // Robot exclusion is only switched off when the user disabled "obey" AND confirmed "really ignore".
        crawler.setDownloadParameters(crawler.getDownloadParameters()
                .changeMaxThreads(getParameterAsInt(CrawlerOperator.PARAMETER_MAX_THREADS))
                .changeObeyRobotExclusion(getParameterAsBoolean(CrawlerOperator.PARAMETER_OBEY_ROBOT_EXCLUSION)
                        || !getParameterAsBoolean(CrawlerOperator.PARAMETER_REALLY_IGNORE_ROBOT_EXCLUSION))
                .changeUserAgent(getParameterAsString("user_agent"))
                .changeMaxPageSize(getParameterAsInt("max_page_size")));

        // The category index of the "domain" parameter selects the crawl scope;
        // any other index leaves the crawler's own setting untouched.
        switch (getParameterAsInt("domain")) {
            case 0:
                crawler.setDomain(Crawler.WEB);
                break;
            case 1:
                crawler.setDomain(Crawler.SERVER);
                break;
            case 2:
                crawler.setDomain(Crawler.SUBTREE);
                break;
        }
        crawler.setMaxDepth(getParameterAsInt(CrawlerOperator.PARAMETER_MAX_DEPTH));

        String rootUrl = getParameterAsString("url");
        try {
            crawler.setSynchronous(true);
            crawler.setRoot(new Link(rootUrl));
            crawler.run();
            if (crawler.isErrorOccurred()) {
                throw crawler.getError();
            }
            this.exampleSetOutput.deliver(exampleTable.createExampleSet());
        } catch (MalformedURLException e) {
            throw new UserError(this, MALFORMED_URL_ERROR_CODE, new Object[]{rootUrl, e});
        }
    }

    /**
     * Groups the entries of the {@code crawling_rules} parameter list by rule name.
     *
     * @return map from rule name (first column) to all values (second column) given for it,
     *         in the order they appear in the parameter list
     * @throws UndefinedParameterError if the parameter list cannot be read
     */
    private HashMap<String, List<String>> collectCrawlingRules() throws UndefinedParameterError {
        HashMap<String, List<String>> rules = new HashMap<String, List<String>>();
        for (String[] entry : getParameterList("crawling_rules")) {
            String ruleName = entry[0];
            List<String> values = rules.get(ruleName);
            if (values == null) {
                values = new LinkedList<String>();
                rules.put(ruleName, values);
            }
            values.add(entry[1]);
        }
        return rules;
    }

    /**
     * Decodes raw page bytes using the charset named in the content type, defaulting to UTF-8.
     *
     * @param contentBytes raw bytes of the page
     * @param contentType content type value of the page; may be {@code null}
     * @return the decoded text; if the named charset is not supported the platform default
     *         charset is used instead
     */
    private static String decodePageContent(byte[] contentBytes, String contentType) {
        try {
            return new String(contentBytes, extractCharsetName(contentType));
        } catch (UnsupportedEncodingException e) {
            // Unknown charset label: fall back to the platform default rather than dropping the page.
            return new String(contentBytes);
        }
    }

    /**
     * Extracts the value of the first {@code charset=} parameter from a content type value.
     * The parameter name is matched case-insensitively and surrounding double quotes are removed.
     *
     * @param contentType content type value such as {@code text/html; charset=ISO-8859-1};
     *        may be {@code null}
     * @return the charset name, or {@link #DEFAULT_CHARSET} if none is given
     */
    private static String extractCharsetName(String contentType) {
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        for (String part : contentType.split(";")) {
            String parameter = part.trim();
            if (parameter.regionMatches(true, 0, CHARSET_PREFIX, 0, CHARSET_PREFIX.length())) {
                String charsetName = parameter.substring(CHARSET_PREFIX.length()).trim();
                if (charsetName.length() >= 2 && charsetName.startsWith("\"") && charsetName.endsWith("\"")) {
                    charsetName = charsetName.substring(1, charsetName.length() - 1).trim();
                }
                return charsetName;
            }
        }
        return DEFAULT_CHARSET;
    }

    /**
     * Writes the metadata value stored under the attribute's name into the given row.
     * Nominal values are mapped through the attribute's mapping, numerical values are copied,
     * date values are stored as their epoch milliseconds; a missing value becomes {@code NaN}.
     * Attributes of any other type are left untouched.
     */
    private static void copyMetaDataValue(Document document, Attribute attribute, DataRow row) {
        if (attribute.isNominal()) {
            String text = (String) document.getMetaDataValue(attribute.getName());
            if (text != null) {
                row.set(attribute, attribute.getMapping().mapString(text));
            } else {
                row.set(attribute, Double.NaN);
            }
        } else if (attribute.isNumerical()) {
            Double number = (Double) document.getMetaDataValue(attribute.getName());
            if (number != null) {
                row.set(attribute, number.doubleValue());
            } else {
                row.set(attribute, Double.NaN);
            }
        } else if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), 9)) {
            // NOTE(review): 9 is presumably the date-time value type of Ontology - confirm.
            Date date = (Date) document.getMetaDataValue(attribute.getName());
            if (date != null) {
                row.set(attribute, date.getTime());
            } else {
                row.set(attribute, Double.NaN);
            }
        }
    }

    /**
     * Builds the supervisor that processes every stored page, together with the table it fills.
     *
     * @return pair of the supervisor handed to the crawler and the example table receiving one
     *         row per document returned by the subprocess
     * @throws UndefinedParameterError if a required parameter cannot be read
     */
    private Pair<WebCrawlerSupervisor, ExampleTable> createSupervisor() throws UndefinedParameterError {
        final boolean addContentAttribute = getParameterAsBoolean(CrawlerOperator.PARAMETER_ADD_CONTENT_ATTRIBUTE);
        final MemoryExampleTable table = new MemoryExampleTable(new Attribute[0]);
        final LinkedList<Attribute> attributes = new LinkedList<Attribute>();
        final DataRowFactory rowFactory = new DataRowFactory(getParameterAsInt("datamanagement"), '.');
        return new Pair<>(new WebCrawlerSupervisor() {
            /** Set once the table attributes have been created from the first returned document. */
            private boolean schemaInitialized = false;

            @Override
            public void storePage(Page page) throws OperatorException {
                try {
                    String content = ProcessingCrawlerOperator.decodePageContent(page.getContentBytes(), page.getContentType());
                    Document document = new Document(content);
                    // NOTE(review): 5 and 1 are presumably the string and nominal value types of Ontology - confirm.
                    if (addContentAttribute) {
                        document.addMetaData(ProcessingCrawlerOperator.METADATA_PAGE_CONTENT, content, 5);
                    }
                    document.addMetaData(ProcessingCrawlerOperator.METADATA_PAGE_URL, page.getURL().toString(), 1);
                    ProcessingCrawlerOperator.this.innerTextObjectSource.deliver(document);
                    checkForStop();
                    ProcessingCrawlerOperator.this.getSubprocess(0).execute();
                    checkForStop();
                    for (Document processed : ProcessingCrawlerOperator.this.innerTextObjectSink.getData(Document.class, true)) {
                        if (!this.schemaInitialized) {
                            // The first returned document fixes the attribute set for all later rows.
                            for (String key : processed.getMetaDataKeys()) {
                                attributes.add(AttributeFactory.createAttribute(key, processed.getMetaDataType(key)));
                            }
                            table.addAttributes(attributes);
                            this.schemaInitialized = true;
                        }
                        DataRow row = rowFactory.create(table.getAttributeCount());
                        for (Attribute attribute : attributes) {
                            ProcessingCrawlerOperator.copyMetaDataValue(processed, attribute, row);
                        }
                        table.addDataRow(row);
                    }
                } catch (ProcessStoppedException e) {
                    // A stop request must not be swallowed as a per-page failure.
                    throw e;
                } catch (Exception e) {
                    // Any other failure (including operator errors of the subprocess) only skips this page.
                    ProcessingCrawlerOperator.this.getLogger().log(Level.WARNING, "Error during processing page \"" + page.getURL().toString() + "\": ", e);
                }
            }

            @Override
            public void checkForStop() throws ProcessStoppedException {
                ProcessingCrawlerOperator.this.checkForStop();
            }
        }, table);
    }

    /**
     * Declares the operator parameters: root URL, crawling rules, content attribute switch,
     * page/depth/domain limits, delay, thread count, page size, user agent, robot exclusion
     * handling and the internal data representation.
     *
     * @return the parameter types of the superclass followed by the ones declared here
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();

        ParameterTypeString url = new ParameterTypeString("url", "Specifies the url at which the crawler should start", false);
        url.setExpert(false);
        types.add(url);

        ParameterTypeList crawlingRules = new ParameterTypeList("crawling_rules", "Specifies a set of rules that determine, which links to follow and which pages to process.", new ParameterTypeCategory("rule_application", "Specifies the behavior property", CrawlerOperator.RULES, 0), new ParameterTypeRegexp("rule_value", "Specifies the value of the rule"));
        crawlingRules.setExpert(false);
        types.add(crawlingRules);

        types.add(new ParameterTypeBoolean(CrawlerOperator.PARAMETER_ADD_CONTENT_ATTRIBUTE, "Specifies, whether the pages' content should be added as a String Attribute.", false, false));

        ParameterTypeInt maxPages = new ParameterTypeInt("max_pages", "The maximal number of pages to store.", 1, Integer.MAX_VALUE, true);
        maxPages.setExpert(false);
        types.add(maxPages);

        ParameterTypeInt maxDepth = new ParameterTypeInt(CrawlerOperator.PARAMETER_MAX_DEPTH, "Specifies the maximal depth of the crawling process", 0, Integer.MAX_VALUE, 2);
        maxDepth.setExpert(false);
        types.add(maxDepth);

        ParameterTypeCategory domain = new ParameterTypeCategory("domain", "Specifies whether links should be followed into the whole web, on the same server or to descendents of the root url.", CrawlerOperator.DOMAINS, 0);
        domain.setExpert(false);
        types.add(domain);

        ParameterTypeInt delay = new ParameterTypeInt("delay", "Specifies the delay when vistiting a page in milleseconds", 0, Integer.MAX_VALUE, 1000);
        delay.setExpert(false);
        types.add(delay);

        ParameterTypeInt maxThreads = new ParameterTypeInt(CrawlerOperator.PARAMETER_MAX_THREADS, "Specifies the number of crawling threads working in parallel", 1, Integer.MAX_VALUE, 1);
        maxThreads.setExpert(true);
        types.add(maxThreads);

        ParameterTypeInt maxPageSize = new ParameterTypeInt("max_page_size", "Specifies the maximum page size (in KB): pages larger than this limit are not downloaded", 1, Integer.MAX_VALUE, 100);
        maxPageSize.setExpert(false);
        types.add(maxPageSize);

        ParameterTypeString userAgent = new ParameterTypeString("user_agent", "The identity the crawler uses while accessing a server", "rapid-miner-crawler");
        userAgent.setExpert(true);
        types.add(userAgent);

        ParameterTypeBoolean obeyRobotExclusion = new ParameterTypeBoolean(CrawlerOperator.PARAMETER_OBEY_ROBOT_EXCLUSION, "Specifies whether the crawler obeys the rules, which pages on site might be visited by a robot. Disable only if you know what you are doing and if you a sure not to violate any existing laws by doing so", true);
        obeyRobotExclusion.setExpert(true);
        types.add(obeyRobotExclusion);

        // The confirmation switch is only offered once "obey robot exclusion" has been turned off.
        ParameterTypeBoolean reallyIgnore = new ParameterTypeBoolean(CrawlerOperator.PARAMETER_REALLY_IGNORE_ROBOT_EXCLUSION, "Do you really want to ignore the robot exclusion? This might be illegal.", false);
        reallyIgnore.registerDependencyCondition(new BooleanParameterCondition(this, CrawlerOperator.PARAMETER_OBEY_ROBOT_EXCLUSION, false, false));
        reallyIgnore.setExpert(true);
        types.add(reallyIgnore);

        types.add(new ParameterTypeCategory("datamanagement", "Determines, how the data is represented internally.", DataRowFactory.TYPE_NAMES, 7));
        return types;
    }
}
