package com.rapidminer.operator.web.crawler.deprecated;

import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ProcessStoppedException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import websphinx.Crawler;
import websphinx.Link;
import websphinx.Page;

/**
 * Crawler built on the WebSPHINX {@link Crawler} base class that decides which links to
 * follow and which pages to keep by matching them against regular-expression rules.
 *
 * <p>Rules are grouped by rule name (see the {@code RULE_*} constants). Stop requests and
 * page storage are delegated to a {@link WebCrawlerSupervisor}. The first
 * {@link OperatorException} raised while storing a page is remembered and, from then on,
 * no further links are followed and no further pages are stored.
 *
 * <p>NOTE(review): {@code currentIndex} and {@code errorState} are plain fields without
 * synchronization; whether the base class calls {@link #visit(Page)} and
 * {@link #shouldVisit(Link)} from several threads is not visible here - confirm.
 */
@Deprecated
/* loaded from: input_file:com/rapidminer/operator/web/crawler/deprecated/WebSphinxCrawler.class */
public class WebSphinxCrawler extends Crawler {
    private static final long serialVersionUID = -614344545462107732L;

    /** Rule name: a page is stored only if its URL matches. */
    public static final String RULE_STORE_URL = "store_with_matching_url";
    /** Rule name: a page is stored only if its content matches. */
    public static final String RULE_STORE_PAGE = "store_with_matching_content";
    /** Rule name: a link is followed only if its URL matches. */
    public static final String RULE_FOLLOW_URL = "follow_link_with_matching_url";
    /** Rule name: a link is followed only if its text matches. */
    public static final String RULE_FOLLOW_LINK = "follow_link_with_matching_text";

    /** Value of {@code maxPages} that disables the page limit. */
    private static final int UNLIMITED_PAGES = -1;
    /** Flags used for every rule pattern (numerically 34, as in the original code). */
    private static final int RULE_PATTERN_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    private final int maxPages;
    private final WebCrawlerSupervisor supervisor;
    private final String userAgent;
    private final int delay;
    private final Logger logger;
    /** Number of pages for which a store attempt has been made so far. */
    private int currentIndex = 0;
    /** First storage error encountered, or {@code null} if none occurred. */
    private OperatorException errorState = null;
    /** Compiled rule patterns, keyed by rule name. */
    private final Map<String, List<Pattern>> rules = new HashMap<>();

    /**
     * Creates a crawler and compiles all rule expressions up front.
     *
     * @param webCrawlerSupervisor receives stop checks and the pages to store
     * @param logger               destination for progress and warning messages
     * @param map                  rule name to list of regular expressions; every expression
     *                             is compiled case-insensitively and in DOTALL mode
     * @param maxPages             maximum number of pages to process, or {@code -1} for no limit
     * @param delay                exclusive upper bound, in milliseconds, of the random pause
     *                             before each page visit; values {@code <= 0} disable the pause
     * @param userAgent            string returned by {@link #toString()}
     * @throws java.util.regex.PatternSyntaxException if an expression is not a valid regex
     */
    public WebSphinxCrawler(WebCrawlerSupervisor webCrawlerSupervisor, Logger logger, Map<String, List<String>> map, int maxPages, int delay, String userAgent) {
        this.logger = logger;
        this.supervisor = webCrawlerSupervisor;
        this.maxPages = maxPages;
        this.delay = delay;
        this.userAgent = userAgent;
        for (Map.Entry<String, List<String>> entry : map.entrySet()) {
            List<String> expressions = entry.getValue();
            List<Pattern> compiled = new ArrayList<>(expressions.size());
            for (String expression : expressions) {
                compiled.add(Pattern.compile(expression, RULE_PATTERN_FLAGS));
            }
            this.rules.put(entry.getKey(), compiled);
        }
    }

    /**
     * Decides whether a link should be followed.
     *
     * <p>A link is rejected once the page limit is reached or a storage error occurred.
     * Otherwise it is followed when both its URL ({@link #RULE_FOLLOW_URL}) and its text
     * ({@link #RULE_FOLLOW_LINK}) satisfy the configured rules. If the supervisor signals
     * that the process was stopped, the crawler is stopped and the link is rejected.
     *
     * @param link the candidate link
     * @return {@code true} if the link should be followed
     */
    @Override // websphinx.Crawler
    public boolean shouldVisit(Link link) {
        if (pageLimitReached() || isErrorOccurred()) {
            return false;
        }
        try {
            this.supervisor.checkForStop();
            String externalForm = link.getURL().toExternalForm();
            String text = link.toText();
            if (!rulesApply(RULE_FOLLOW_URL, externalForm) || !rulesApply(RULE_FOLLOW_LINK, text)) {
                return false;
            }
            // NOTE(review): the purpose of the extra log parameter 2 is not evident from this
            // file; it is passed through unchanged.
            this.logger.log(Level.INFO, "Following link " + externalForm, (Object) 2);
            return true;
        } catch (ProcessStoppedException e) {
            stop();
            return false;
        }
    }

    /**
     * Processes a downloaded page: optionally pauses, stores the page if its URL and
     * content satisfy the store rules, and finally discards the page content.
     *
     * <p>The page counter is advanced after every store attempt, including a failed one;
     * when it equals the page limit the crawler is stopped.
     *
     * @param page the downloaded page
     */
    @Override // websphinx.Crawler
    public void visit(Page page) {
        pauseBeforeVisit();
        if (!isErrorOccurred() && !pageLimitReached()) {
            String pageUrl = page.getURL().toExternalForm();
            boolean urlMatches = rulesApply(RULE_STORE_URL, pageUrl);
            if (urlMatches && rulesApply(RULE_STORE_PAGE, page.getContent())) {
                try {
                    this.supervisor.storePage(page);
                } catch (OperatorException e) {
                    // Remember the failure so that no further pages are processed.
                    this.errorState = e;
                    this.logger.log(Level.WARNING, "Could not save page \"" + page.getURL().toString() + "\" because of error: " + e.getMessage());
                }
                this.currentIndex++;
                if (this.currentIndex == this.maxPages) {
                    stop();
                }
            } else if (urlMatches) {
                this.logger.log(Level.INFO, "Discarded page \"" + page.getURL().toString() + "\" because content does not match filter rules.");
            } else {
                this.logger.log(Level.INFO, "Discarded page \"" + page.getURL().toString() + "\" because url does not match filter rules.");
            }
        }
        page.discardContent();
    }

    /**
     * Returns the user agent string given at construction time.
     *
     * <p>NOTE(review): presumably the base class uses this as the crawler's name or
     * user agent - confirm against the WebSPHINX sources.
     */
    @Override // websphinx.Crawler
    public String toString() {
        return this.userAgent;
    }

    /**
     * Sleeps for a random time in {@code [0, delay)} milliseconds when a positive delay is
     * configured. An interrupt ends the pause early; the interrupt status is restored so
     * that code further up the call stack can still observe it.
     */
    private void pauseBeforeVisit() {
        if (this.delay <= 0) {
            return;
        }
        try {
            Thread.sleep(new Random().nextInt(this.delay));
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Returns whether a page limit is set and the page counter has reached it. */
    private boolean pageLimitReached() {
        return this.maxPages != UNLIMITED_PAGES && this.currentIndex >= this.maxPages;
    }

    /**
     * Checks a value against the rules registered under the given rule name.
     *
     * @param ruleName one of the {@code RULE_*} constants
     * @param value    the text to test; it must match the pattern completely
     * @return {@code true} if no rule is registered under that name, otherwise the result
     *         of matching {@code value} against the first registered pattern
     */
    private boolean rulesApply(String ruleName, String value) {
        List<Pattern> patterns = this.rules.get(ruleName);
        if (patterns == null || patterns.isEmpty()) {
            return true;
        }
        // NOTE(review): only the first pattern of a rule is ever evaluated; additional
        // patterns under the same rule name are ignored. Behavior is kept as found -
        // confirm whether "any pattern matches" or "all patterns match" was intended.
        return patterns.get(0).matcher(value).matches();
    }

    /** Returns whether storing a page has failed at least once. */
    public boolean isErrorOccurred() {
        return this.errorState != null;
    }

    /** Returns the remembered storage error, or {@code null} if none occurred. */
    public OperatorException getError() {
        return this.errorState;
    }
}
