package com.rapidminer.operator.web.crawler;

import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ProcessStoppedException;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.BinaryParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.url.WebURL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/* loaded from: input_file:com/rapidminer/operator/web/crawler/Crawler4JCrawler.class */
/**
 * crawler4j {@link WebCrawler} that decides which links to follow and which pages to hand to a
 * {@link WebCrawlerSupervisor} for storage, based on user-supplied rules.
 *
 * <p>Rules are passed to the constructor as a map from rule type to a list of rule values:
 * <ul>
 *   <li>{@link #RULE_FOLLOW_URL} - regular expressions; a link is followed if its lower-cased URL
 *       fully matches any of them.</li>
 *   <li>{@link #RULE_STORE_URL} - regular expressions; a page is a candidate for storing if its
 *       lower-cased URL fully matches any of them.</li>
 *   <li>{@link #RULE_STORE_PAGE} - plain strings; a non-binary page is stored if the string form
 *       of its parse data contains any of them (case-sensitive substring test).</li>
 * </ul>
 * A rule type without any rules accepts everything.
 *
 * <p>The first {@link OperatorException} raised while storing a page is remembered; afterwards
 * {@link #visit(Page)} does nothing and {@link #shouldVisit(Page, WebURL)} shuts the controller
 * down.
 */
public class Crawler4JCrawler extends WebCrawler {
    public static final String RULE_STORE_URL = "store_with_matching_url";
    public static final String RULE_STORE_PAGE = "store_with_matching_content";
    public static final String RULE_FOLLOW_URL = "follow_link_with_matching_url";

    /** Flags used for every rule pattern (numerically 34, as in the previous hard-coded value). */
    private static final int RULE_PATTERN_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    private final WebCrawlerSupervisor supervisor;
    private final Logger logger;
    private final boolean acceptBinaryData;
    // NOTE(review): written in visit() and read via isErrorOccurred()/getError(); if those are
    // called from a different thread than the crawler thread this field may need to be volatile.
    private OperatorException errorState = null;
    private final Map<String, List<Pattern>> regexRules = new HashMap<String, List<Pattern>>();
    private final Map<String, List<String>> containRules = new HashMap<String, List<String>>();

    /**
     * Creates a crawler and pre-compiles all regular-expression rules.
     *
     * @param webCrawlerSupervisor receives pages to store and is asked whether the process was stopped
     * @param logger destination for progress and error messages
     * @param rules rule type to rule values; values under {@link #RULE_STORE_PAGE} are kept as
     *        plain strings, values under every other key are compiled as regular expressions
     * @param acceptBinaryData whether pages with binary parse data are stored
     * @throws PatternSyntaxException if a regular-expression rule is malformed
     */
    public Crawler4JCrawler(WebCrawlerSupervisor webCrawlerSupervisor, Logger logger, Map<String, List<String>> rules, boolean acceptBinaryData) {
        this.logger = logger;
        this.supervisor = webCrawlerSupervisor;
        this.acceptBinaryData = acceptBinaryData;
        for (Map.Entry<String, List<String>> entry : rules.entrySet()) {
            String ruleType = entry.getKey();
            List<String> ruleValues = entry.getValue();
            if (RULE_STORE_PAGE.equals(ruleType)) {
                // Defensive copy so later changes to the caller's list do not affect the crawler.
                this.containRules.put(ruleType, new LinkedList<String>(ruleValues));
            } else {
                List<Pattern> compiled = new ArrayList<Pattern>(ruleValues.size());
                for (String regex : ruleValues) {
                    compiled.add(Pattern.compile(regex, RULE_PATTERN_FLAGS));
                }
                this.regexRules.put(ruleType, compiled);
            }
        }
    }

    /**
     * Decides whether a discovered link is followed.
     *
     * <p>Shuts the controller down and returns {@code false} if an error was recorded earlier or
     * the supervisor reports that the process was stopped.
     *
     * @param page the page on which the link was found (not inspected here)
     * @param webURL the link under consideration
     * @return {@code true} if the lower-cased URL satisfies the {@link #RULE_FOLLOW_URL} rules
     */
    @Override // edu.uci.ics.crawler4j.crawler.WebCrawler
    public boolean shouldVisit(Page page, WebURL webURL) {
        if (isErrorOccurred()) {
            this.logger.log(Level.INFO, "Error occurred during crawling, shutting down crawler!");
            getMyController().shutdown();
            return false;
        }
        try {
            this.supervisor.checkForStop();
            String url = webURL.getURL().toLowerCase();
            if (isAnyRuleMatching(RULE_FOLLOW_URL, url)) {
                this.logger.log(Level.FINE, "Following link: " + url);
                return true;
            }
            this.logger.log(Level.FINE, "Discarded link due to URL follow rules: " + url);
            return false;
        } catch (ProcessStoppedException e) {
            this.logger.log(Level.INFO, "Process was stopped, shutting down crawler. This will take a few seconds.");
            getMyController().shutdown();
            return false;
        }
    }

    /**
     * Stores a fetched page via the supervisor if it passes the store rules.
     *
     * <p>Does nothing once an error has been recorded. Binary pages are stored only when binary
     * data is accepted and are not subject to the content rules; all other pages must also satisfy
     * the {@link #RULE_STORE_PAGE} rules.
     *
     * @param page the fetched page
     */
    @Override // edu.uci.ics.crawler4j.crawler.WebCrawler
    public void visit(Page page) {
        if (isErrorOccurred()) {
            return;
        }
        if (!isAnyRuleMatching(RULE_STORE_URL, page.getWebURL().getURL().toLowerCase())) {
            return;
        }
        ParseData parseData = page.getParseData();
        if (parseData instanceof BinaryParseData) {
            if (this.acceptBinaryData) {
                tryStorePage(parseData, page);
            }
            return;
        }
        if (!isAnyRuleContained(RULE_STORE_PAGE, parseData.toString())) {
            this.logger.log(Level.FINE, "Discarded page due to matching content store rules: " + page.getWebURL().getURL());
            return;
        }
        tryStorePage(parseData, page);
    }

    /**
     * @return {@code true} if storing a page has failed at least once
     */
    public boolean isErrorOccurred() {
        return this.errorState != null;
    }

    /**
     * @return the most recently recorded storage error, or {@code null} if none occurred
     */
    public OperatorException getError() {
        return this.errorState;
    }

    /** Hands the page to the supervisor; on failure records the error and logs a warning. */
    private void tryStorePage(ParseData parseData, Page page) {
        try {
            this.supervisor.storePage(parseData, page);
        } catch (OperatorException e) {
            this.errorState = e;
            this.logger.log(Level.WARNING, "Could not save page \"" + page.getWebURL().getURL() + "\" because of error: " + e.getMessage());
        }
    }

    /**
     * Checks {@code str2} against all regular-expression rules of type {@code str}.
     *
     * @return {@code true} if there are no rules of that type, or if any rule matches the whole
     *         input; {@code false} otherwise
     */
    private boolean isAnyRuleMatching(String str, String str2) {
        List<Pattern> patterns = this.regexRules.get(str);
        if (patterns == null || patterns.isEmpty()) {
            return true;
        }
        // Every rule is consulted, not just the first one.
        for (Pattern pattern : patterns) {
            if (pattern.matcher(str2).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks {@code str2} against all substring rules of type {@code str}.
     *
     * @return {@code true} if there are no rules of that type, or if {@code str2} contains any of
     *         the rule strings; {@code false} otherwise
     */
    private boolean isAnyRuleContained(String str, String str2) {
        List<String> needles = this.containRules.get(str);
        if (needles == null || needles.isEmpty()) {
            return true;
        }
        // Every rule is consulted, not just the first one.
        for (String needle : needles) {
            if (str2.contains(needle)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Validates that every given expression compiles with the flags the crawler uses.
     *
     * @param list regular expressions to check
     * @throws PatternSyntaxException if any expression is malformed
     */
    public static void doesCompile(List<String> list) throws PatternSyntaxException {
        for (String regex : list) {
            Pattern.compile(regex, RULE_PATTERN_FLAGS);
        }
    }
}
