package com.rapidminer.operator.web.crawler;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.ExampleTable;
import com.rapidminer.example.table.GrowingExampleTable;
import com.rapidminer.example.utils.ExampleSets;
import com.rapidminer.extension.professional.tools.ThreadParameterProvider;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ports.InputPortExtender;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.operator.ports.metadata.SubprocessTransformRule;
import com.rapidminer.operator.text.Document;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeList;
import com.rapidminer.parameter.ParameterTypePassword;
import com.rapidminer.parameter.ParameterTypeRegexp;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.container.Pair;
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.FileAttribute;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.regex.PatternSyntaxException;
import javax.xml.bind.DatatypeConverter;
import org.apache.http.HttpStatus;
import org.apache.http.message.BasicHeader;

/* loaded from: input_file:com/rapidminer/operator/web/crawler/ProcessingCrawlerOperator.class */
/**
 * Operator chain that crawls the web from a seed URL and runs its single inner subprocess
 * ("Process Webpage") on every page handed over by the crawler.
 *
 * <p>Each page is wrapped into a {@link Document}, delivered to the subprocess, and the meta data
 * of every document returned by the subprocess becomes one row of the example set delivered at
 * the "example set" output port.
 */
public class ProcessingCrawlerOperator extends OperatorChain {
    /** Meta data key holding the page content (only added when the content attribute is requested). */
    private static final String METADATA_PAGE_CONTENT = "page_content";
    /** Meta data key holding the URL of the crawled page. */
    private static final String METADATA_PAGE_URL = "page_url";
    /** Name prefix of the temporary directory used as crawl storage folder. */
    private static final String TEMP_DIRECTORY_PREFIX = "rm-web-mining-crawler";
    /** Default value of the "delay" parameter in milliseconds. */
    private static final int DEFAULT_DELAY_MILLIS = 200;
    /** Factor used to convert the "max_page_size" parameter (KB) into bytes. */
    private static final int BYTES_PER_KILOBYTE = 1024;

    private final OutputPort innerTextObjectSource;
    private final InputPortExtender innerTextObjectSink;
    private final OutputPort exampleSetOutput;

    /**
     * Creates the operator with one subprocess, an inner "document" source, an extensible inner
     * "document" sink and an outer "example set" output, and registers the meta data rules.
     *
     * @param operatorDescription the description passed on to {@link OperatorChain}
     */
    public ProcessingCrawlerOperator(OperatorDescription operatorDescription) {
        super(operatorDescription, new String[]{"Process Webpage"});
        this.innerTextObjectSource = getSubprocess(0).getInnerSources().createPort("document");
        this.innerTextObjectSink = new InputPortExtender("document", getSubprocess(0).getInnerSinks(), new MetaData(Document.class), true);
        this.exampleSetOutput = getOutputPorts().createPort("example set");
        this.innerTextObjectSink.start();
        getTransformer().addGenerationRule(this.innerTextObjectSource, Document.class);
        getTransformer().addRule(new SubprocessTransformRule(getSubprocess(0)));
        getTransformer().addGenerationRule(this.exampleSetOutput, ExampleSet.class);
    }

    /**
     * Reads the crawling rules, validates them, configures crawler4j, runs the crawl and delivers
     * the collected example set.
     *
     * @throws OperatorException if a rule is not a valid regular expression
     *         ("web.crawling.rules"), the temporary storage cannot be set up
     *         ("web.crawling.temp_dir"), the seed URL is malformed (error 212), or crawling fails
     *         ("web.crawling.generic"); operator exceptions raised while crawling are passed
     *         through unchanged
     */
    @Override
    public void doWork() throws OperatorException {
        final HashMap<String, List<String>> rules = readCrawlingRules();
        try {
            for (Map.Entry<String, List<String>> rule : rules.entrySet()) {
                // Every rule type except the content-matching one is checked up front.
                if (!"store_with_matching_content".equals(rule.getKey())) {
                    Crawler4JCrawler.doesCompile(rule.getValue());
                }
            }
            Pair<WebCrawlerSupervisor, ExampleTable> supervisorAndTable = createSupervisor();
            WebCrawlerSupervisor supervisor = supervisorAndTable.getFirst();
            ExampleTable table = supervisorAndTable.getSecond();
            CrawlConfig config = createCrawlConfig();
            crawl(config, rules, supervisor, table);
        } catch (PatternSyntaxException e) {
            throw new UserError(this, e, "web.crawling.rules", new Object[]{e.getMessage()});
        }
    }

    /**
     * Groups the entries of the "crawling_rules" parameter list by rule type.
     *
     * @return map from rule type (first list column) to all values (second column) given for it
     * @throws UndefinedParameterError if the parameter list cannot be read
     */
    private HashMap<String, List<String>> readCrawlingRules() throws UndefinedParameterError {
        HashMap<String, List<String>> rules = new HashMap<>();
        for (String[] entry : getParameterList("crawling_rules")) {
            String ruleType = entry[0];
            List<String> values = rules.get(ruleType);
            if (values == null) {
                values = new LinkedList<>();
                rules.put(ruleType, values);
            }
            values.add(entry[1]);
        }
        return rules;
    }

    /**
     * Builds the crawler4j configuration from the operator parameters.
     *
     * @return the configuration; the crawl storage folder is not set here
     * @throws UndefinedParameterError if a required parameter is missing
     */
    private CrawlConfig createCrawlConfig() throws UndefinedParameterError {
        // -1 is passed on when no page limit is configured.
        int maxPages = isParameterSet("max_pages") ? getParameterAsInt("max_pages") : -1;
        CrawlConfig config = new CrawlConfig();
        config.setPolitenessDelay(getParameterAsInt("delay"));
        config.setMaxTotalConnections(getParameterAsInt(CrawlerOperator.PARAMETER_MAX_TOTAL_CONNECTIONS));
        config.setMaxConnectionsPerHost(getParameterAsInt(CrawlerOperator.PARAMETER_MAX_CONNECTIONS_PER_HOST));
        config.setMaxDownloadSize(kilobytesToBytes(getParameterAsInt("max_page_size")));
        config.setUserAgentString(getParameterAsString("user_agent"));
        config.setMaxDepthOfCrawling(getParameterAsInt(CrawlerOperator.PARAMETER_MAX_DEPTH));
        config.setMaxPagesToFetch(maxPages);
        if (getParameterAsBoolean(CrawlerOperator.PARAMETER_BASIC_AUTH_ENABLE)
                && isParameterSet(CrawlerOperator.PARAMETER_BASIC_AUTH_USERNAME)
                && isParameterSet("password")) {
            String username = getParameterAsString(CrawlerOperator.PARAMETER_BASIC_AUTH_USERNAME);
            String password = getParameterAsString("password");
            byte[] credentials = (username + ":" + password).getBytes(StandardCharsets.UTF_8);
            List<BasicHeader> headers = new LinkedList<>();
            headers.add(new BasicHeader("Authorization", "Basic " + DatatypeConverter.printBase64Binary(credentials)));
            config.setDefaultHeaders(headers);
        }
        return config;
    }

    /**
     * Converts a size in kilobytes to bytes, saturating at {@link Integer#MAX_VALUE} instead of
     * overflowing (the "max_page_size" parameter allows values up to {@code Integer.MAX_VALUE}).
     *
     * @param kilobytes size in KB
     * @return size in bytes, capped at {@code Integer.MAX_VALUE}
     */
    private static int kilobytesToBytes(int kilobytes) {
        return (int) Math.min((long) kilobytes * BYTES_PER_KILOBYTE, Integer.MAX_VALUE);
    }

    /**
     * Runs the crawl inside a fresh temporary storage directory and delivers the resulting
     * example set. The directory is removed afterwards, whether or not the crawl succeeded.
     *
     * @param config crawler configuration; its storage folder is set by this method
     * @param rules crawling rules handed to every crawler instance
     * @param supervisor receives the pages the crawlers decide to store
     * @param table table filled by the supervisor
     * @throws OperatorException see {@link #doWork()}
     */
    private void crawl(CrawlConfig config, final HashMap<String, List<String>> rules,
            final WebCrawlerSupervisor supervisor, ExampleTable table) throws OperatorException {
        try {
            final Path storageDirectory = Files.createTempDirectory(TEMP_DIRECTORY_PREFIX);
            try {
                config.setCrawlStorageFolder(storageDirectory.toString());
                PageFetcher pageFetcher = new PageFetcher(config);
                RobotstxtConfig robotsConfig = new RobotstxtConfig();
                robotsConfig.setEnabled(!getParameterAsBoolean(CrawlerOperator.PARAMETER_IGNORE_ROBOT_EXCLUSION));
                robotsConfig.setUserAgentName(getParameterAsString("user_agent"));
                RobotstxtServer robotsServer = new RobotstxtServer(robotsConfig, pageFetcher);
                final List<Crawler4JCrawler> crawlers = Collections.synchronizedList(new LinkedList<Crawler4JCrawler>());
                final AtomicInteger nextCrawlerId = new AtomicInteger(0);
                final String seedUrl = getParameterAsString("url");
                try {
                    // Validate the seed before the controller opens its storage in the temp directory.
                    new URL(seedUrl);
                    final CrawlController controller = new CrawlController(config, pageFetcher, robotsServer);
                    controller.addSeed(seedUrl);
                    CrawlController.WebCrawlerFactory<Crawler4JCrawler> factory = new CrawlController.WebCrawlerFactory<Crawler4JCrawler>() {
                        @Override
                        public Crawler4JCrawler newInstance() throws Exception {
                            Crawler4JCrawler crawler = new Crawler4JCrawler(supervisor, ProcessingCrawlerOperator.this.getLogger(), rules, false);
                            crawler.init(nextCrawlerId.getAndIncrement(), controller);
                            crawlers.add(crawler);
                            return crawler;
                        }
                    };
                    int numberOfThreads = ThreadParameterProvider.getNumberOfThreads();
                    getLogger().log(Level.INFO, "Starting " + numberOfThreads + " crawlers.");
                    controller.start(factory, numberOfThreads);
                    // Collections.synchronizedList requires manual locking while iterating.
                    synchronized (crawlers) {
                        for (Crawler4JCrawler crawler : crawlers) {
                            if (crawler.isErrorOccurred()) {
                                throw crawler.getError();
                            }
                        }
                    }
                    this.exampleSetOutput.deliver(table.createExampleSet());
                } catch (MalformedURLException e) {
                    throw new UserError(this, 212, new Object[]{seedUrl, e});
                } catch (OperatorException e) {
                    // Keep stop requests and specific operator errors intact instead of
                    // hiding them behind the generic crawling error.
                    throw e;
                } catch (Exception e) {
                    throw new UserError(this, e, "web.crawling.generic", new Object[]{e.getMessage()});
                }
            } finally {
                deleteRecursively(storageDirectory);
            }
        } catch (IOException e) {
            throw new UserError(this, e, "web.crawling.temp_dir", new Object[]{e.getMessage()});
        }
    }

    /**
     * Deletes {@code root} and everything below it. Failures are logged as a warning and
     * otherwise ignored, because cleanup problems must not fail the operator.
     *
     * @param root directory to remove
     */
    private void deleteRecursively(Path root) {
        try {
            Files.walkFileTree(root, new SimpleFileVisitor<Path>() {
                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attributes) throws IOException {
                    Files.delete(file);
                    return FileVisitResult.CONTINUE;
                }

                @Override
                public FileVisitResult postVisitDirectory(Path directory, IOException failure) throws IOException {
                    Files.delete(directory);
                    return FileVisitResult.CONTINUE;
                }
            });
        } catch (IOException e) {
            getLogger().log(Level.WARNING, "Failed to clean up temporary crawling directory: " + e.getMessage());
        }
    }

    /**
     * Declares the operator parameters: seed URL, crawling rules, crawl depth, HTML/text switch,
     * basic authentication, content attribute switch, page and size limits, politeness delay,
     * connection limits, user agent and the robot exclusion switch.
     *
     * @return the inherited parameter types followed by the ones declared here
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();

        ParameterTypeString url = new ParameterTypeString("url", "Specifies the url at which the crawler should start", false);
        url.setExpert(false);
        types.add(url);

        ParameterTypeList crawlingRules = new ParameterTypeList("crawling_rules", "Specifies a set of rules that determine, which links to follow and which pages to process.", new ParameterTypeCategory("rule_application", "Specifies the behavior property", CrawlerOperator.RULES, 0), new ParameterTypeRegexp("rule_value", "Specifies the value of the rule"));
        crawlingRules.setExpert(false);
        types.add(crawlingRules);

        ParameterTypeInt maxDepth = new ParameterTypeInt(CrawlerOperator.PARAMETER_MAX_DEPTH, "Specifies the maximal depth of the crawling process. A depth of 1 means 'only crawl direct links on the initial page'.", 0, 32767, 1);
        maxDepth.setExpert(false);
        types.add(maxDepth);

        ParameterTypeBoolean storeAsHtml = new ParameterTypeBoolean(CrawlerOperator.PARAMETER_STORE_TEXT_AS_HTML, "If selected, the actual HTML is returned instead of the textual representation.", false);
        storeAsHtml.setExpert(true);
        types.add(storeAsHtml);

        ParameterTypeBoolean basicAuth = new ParameterTypeBoolean(CrawlerOperator.PARAMETER_BASIC_AUTH_ENABLE, "If selected, all requests will send basic auth information in their header. Use only when crawling HTTPS pages!", false, false);
        basicAuth.setExpert(false);
        types.add(basicAuth);

        ParameterTypeString username = new ParameterTypeString(CrawlerOperator.PARAMETER_BASIC_AUTH_USERNAME, "Username for basic authentication.");
        username.registerDependencyCondition(new BooleanParameterCondition(this, CrawlerOperator.PARAMETER_BASIC_AUTH_ENABLE, true, true));
        username.setExpert(false);
        types.add(username);

        ParameterTypePassword password = new ParameterTypePassword("password", "Password for basic authentication.");
        password.registerDependencyCondition(new BooleanParameterCondition(this, CrawlerOperator.PARAMETER_BASIC_AUTH_ENABLE, true, true));
        password.setExpert(false);
        types.add(password);

        types.add(new ParameterTypeBoolean(CrawlerOperator.PARAMETER_ADD_CONTENT_ATTRIBUTE, "Specifies, whether the pages' content should be added as a text attribute.", false, false));

        ParameterTypeInt maxPages = new ParameterTypeInt("max_pages", "The maximal number of pages to store.", 1, Integer.MAX_VALUE, true);
        maxPages.setExpert(true);
        types.add(maxPages);

        ParameterTypeInt maxPageSize = new ParameterTypeInt("max_page_size", "Specifies the maximum page size (in KB): pages larger than this limit are not downloaded", 1, Integer.MAX_VALUE, 1000);
        maxPageSize.setExpert(true);
        types.add(maxPageSize);

        ParameterTypeInt delay = new ParameterTypeInt("delay", "Specifies the courtesy delay when vistiting a page from the same host in milliseconds.", 0, Integer.MAX_VALUE, DEFAULT_DELAY_MILLIS);
        delay.setExpert(true);
        types.add(delay);

        ParameterTypeInt maxTotalConnections = new ParameterTypeInt(CrawlerOperator.PARAMETER_MAX_TOTAL_CONNECTIONS, "Maximum amount of HTTP connections used at the same time.", 1, Integer.MAX_VALUE, 100);
        maxTotalConnections.setExpert(true);
        types.add(maxTotalConnections);

        ParameterTypeInt maxConnectionsPerHost = new ParameterTypeInt(CrawlerOperator.PARAMETER_MAX_CONNECTIONS_PER_HOST, "Maximum amount of simultaneous HTTP connections used to connect to a single host.", 1, Integer.MAX_VALUE, 100);
        maxConnectionsPerHost.setExpert(true);
        types.add(maxConnectionsPerHost);

        ParameterTypeString userAgent = new ParameterTypeString("user_agent", "The identity the crawler uses while accessing a server", "rapidminer-web-mining-extension-crawler");
        userAgent.setExpert(true);
        types.add(userAgent);

        ParameterTypeBoolean ignoreRobots = new ParameterTypeBoolean(CrawlerOperator.PARAMETER_IGNORE_ROBOT_EXCLUSION, "Specifies whether the crawler should ignore the robot exclusion rules set by the crawled page. Enable only if you know what you are doing and if you a sure not to violate any existing laws by doing so!", false);
        ignoreRobots.setExpert(true);
        types.add(ignoreRobots);

        return types;
    }

    /**
     * Creates the supervisor that processes crawled pages together with the table it fills.
     *
     * <p>The table starts without attributes. Its columns are created from the meta data keys of
     * the first document returned by the subprocess; meta data keys that only appear on later
     * documents are not added.
     *
     * @return the supervisor and the table it writes to
     * @throws UndefinedParameterError declared for the parameter look-ups
     */
    private Pair<WebCrawlerSupervisor, ExampleTable> createSupervisor() throws UndefinedParameterError {
        final boolean addContentAttribute = getParameterAsBoolean(CrawlerOperator.PARAMETER_ADD_CONTENT_ATTRIBUTE);
        final boolean storeAsHtml = getParameterAsBoolean(CrawlerOperator.PARAMETER_STORE_TEXT_AS_HTML);
        final GrowingExampleTable table = ExampleSets.createTableFrom(Collections.<Attribute>emptyList());
        final List<Attribute> attributes = new LinkedList<>();
        // NOTE(review): 0 is presumably DataRowFactory.TYPE_DOUBLE_ARRAY - confirm.
        final DataRowFactory rowFactory = new DataRowFactory(0, '.');

        // NOTE(review): doWork() hands this one supervisor to every crawler instance. storePage
        // mutates shared state without locking, so it relies on the callers not invoking it
        // concurrently - confirm against Crawler4JCrawler.
        WebCrawlerSupervisor supervisor = new WebCrawlerSupervisor() {
            /** Set once the table columns have been derived from the first returned document. */
            private boolean gotTexts = false;

            /**
             * Runs the subprocess on one page and appends a row per returned document.
             * Failures are logged and the page is skipped; only a stop request is propagated.
             */
            @Override
            public void storePage(ParseData parseData, Page page) throws OperatorException {
                try {
                    String content = (storeAsHtml && parseData instanceof HtmlParseData)
                            ? ((HtmlParseData) parseData).getHtml()
                            : parseData.toString();
                    Document document = new Document(content);
                    if (addContentAttribute) {
                        // NOTE(review): value type 5 is presumably Ontology.STRING - confirm.
                        document.addMetaData(METADATA_PAGE_CONTENT, content, 5);
                    }
                    // NOTE(review): value type 1 is presumably Ontology.NOMINAL - confirm.
                    document.addMetaData(METADATA_PAGE_URL, page.getWebURL().getURL(), 1);
                    ProcessingCrawlerOperator.this.innerTextObjectSource.deliver(document);
                    checkForStop();
                    ProcessingCrawlerOperator.this.getSubprocess(0).execute();
                    checkForStop();
                    for (Document result : ProcessingCrawlerOperator.this.innerTextObjectSink.getData(Document.class, true)) {
                        if (!this.gotTexts) {
                            for (String key : result.getMetaDataKeys()) {
                                attributes.add(AttributeFactory.createAttribute(key, result.getMetaDataType(key)));
                            }
                            table.addAttributes(attributes);
                            this.gotTexts = true;
                        }
                        table.addDataRow(toDataRow(result));
                    }
                } catch (ProcessStoppedException e) {
                    // A stop request must reach the caller; otherwise the checkForStop() calls
                    // above would have no effect.
                    throw e;
                } catch (Exception e) {
                    ProcessingCrawlerOperator.this.getLogger().log(Level.WARNING, "Error during processing page \"" + page.getWebURL().getURL() + "\": ", e);
                }
            }

            /** Converts the meta data of {@code result} into a row; missing values become NaN. */
            private DataRow toDataRow(Document result) {
                DataRow row = rowFactory.create(table.getAttributeCount());
                for (Attribute attribute : attributes) {
                    Object value = result.getMetaDataValue(attribute.getName());
                    if (attribute.isNominal()) {
                        String text = (String) value;
                        if (text != null) {
                            row.set(attribute, attribute.getMapping().mapString(text));
                        } else {
                            row.set(attribute, Double.NaN);
                        }
                    } else if (attribute.isNumerical()) {
                        Double number = (Double) value;
                        if (number != null) {
                            row.set(attribute, number.doubleValue());
                        } else {
                            row.set(attribute, Double.NaN);
                        }
                    } else if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), 9)) {
                        // NOTE(review): value type 9 is presumably Ontology.DATE_TIME - confirm.
                        Date date = (Date) value;
                        if (date != null) {
                            row.set(attribute, date.getTime());
                        } else {
                            row.set(attribute, Double.NaN);
                        }
                    }
                }
                return row;
            }

            /** Delegates the stop check to the enclosing operator. */
            @Override
            public void checkForStop() throws ProcessStoppedException {
                ProcessingCrawlerOperator.this.checkForStop();
            }
        };
        return new Pair<WebCrawlerSupervisor, ExampleTable>(supervisor, table);
    }
}
