package com.rapidminer.extension.webtableextraction.crawl;

import com.rapidminer.tools.LogService;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:com/rapidminer/extension/webtableextraction/crawl/BasicBreadthFirstSearchCrawler.class */
/**
 * Minimal breadth-first web crawler.
 *
 * <p>Starting from a seed URL, each page is downloaded, scanned for absolute
 * {@code http}/{@code https} links with a regular expression, and every link
 * not seen before is recorded and queued for a later visit. The crawl stops
 * when the queue is exhausted, when more than
 * {@link #MAX_CONNECTED_LINKS_TO_CRAWL} distinct links have been recorded, or
 * when the crawling thread is interrupted.
 *
 * <p>Instances keep mutable crawl state and perform no synchronization.
 */
public class BasicBreadthFirstSearchCrawler {
    /** Crawling stops once the number of recorded links exceeds this value. */
    public static final int MAX_CONNECTED_LINKS_TO_CRAWL = 100;
    /** Pause, in milliseconds, inserted between two consecutive page requests. */
    public static final int POLITENESS_TIME_MILLISECONDS = 200;
    private static final Logger LOGGER = LogService.getRoot();
    /** Matches absolute http(s) URLs; compiled once because it never changes. */
    private static final Pattern LINK_PATTERN =
            Pattern.compile("\\b(http[s]*)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]");

    /** URLs discovered but not yet visited, in discovery (FIFO) order. */
    private final Queue<String> queueOfSitesToCrawl = new LinkedList<>();
    /** Every distinct URL found on any visited page. */
    private final Set<String> connectedLinks = new HashSet<>();

    /**
     * Returns the number of distinct links recorded so far.
     *
     * @return size of the set of discovered links
     */
    public int sizeOfConnectedLinks() {
        return this.connectedLinks.size();
    }

    /**
     * Crawls breadth-first starting at the given URL.
     *
     * <p>A URL that is malformed or cannot be read is logged and skipped; the
     * crawl then continues with the next queued URL. If the calling thread is
     * interrupted during the politeness pause, the interrupt flag is restored
     * and the crawl ends early.
     *
     * @param str the seed URL; a {@code null} value is logged and ignored
     */
    public void breadthFirstSearch(String str) {
        if (str == null) {
            LOGGER.log(Level.WARNING, "MalformedURL: " + str);
            return;
        }
        this.queueOfSitesToCrawl.add(str);
        while (!this.queueOfSitesToCrawl.isEmpty()) {
            String url = this.queueOfSitesToCrawl.poll();
            if (this.connectedLinks.size() > MAX_CONNECTED_LINKS_TO_CRAWL) {
                LOGGER.log(Level.INFO, "Connected URL links exceeds max limit. Quitting Crawl ");
                return;
            }
            // Each URL gets its own download and its own buffer, so links are
            // always extracted from the page that was actually just fetched.
            String page = fetchPage(url);
            if (page != null) {
                LOGGER.log(Level.INFO, "Page = \n" + page);
                collectLinks(page);
            }
            // Only wait when another request is going to follow.
            if (!this.queueOfSitesToCrawl.isEmpty() && !pauseForPoliteness()) {
                return;
            }
        }
    }

    /**
     * Downloads the content of one URL.
     *
     * @param url the address to read
     * @return the page text with lines separated by {@code '\n'}, or
     *         {@code null} if the URL is malformed or reading failed
     */
    private String fetchPage(String url) {
        LOGGER.log(Level.INFO, "Opening URL: " + url);
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the stream on every exit path.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new URL(url).openStream()))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Keep a separator so a URL ending one line cannot fuse with
                // the text that starts the next line.
                content.append(line).append('\n');
            }
            return content.toString();
        } catch (MalformedURLException e) {
            LOGGER.log(Level.WARNING, "MalformedURL: " + url, e);
            return null;
        } catch (IOException e) {
            LOGGER.log(Level.WARNING, "IOException: " + e.toString(), e);
            return null;
        }
    }

    /**
     * Records every not-yet-seen link found in the page and queues it for a visit.
     *
     * @param page the text to scan for links
     */
    private void collectLinks(String page) {
        Matcher matcher = LINK_PATTERN.matcher(page);
        while (matcher.find()) {
            String link = matcher.group();
            LOGGER.log(Level.INFO, "*** Matched a URL: " + link);
            // Set.add returns true only for an element that was not present.
            if (this.connectedLinks.add(link)) {
                LOGGER.log(Level.INFO, "+++ Added link to site: " + link);
                this.queueOfSitesToCrawl.add(link);
            }
        }
    }

    /**
     * Sleeps for {@link #POLITENESS_TIME_MILLISECONDS}.
     *
     * @return {@code true} if the pause completed, {@code false} if the thread
     *         was interrupted (the interrupt flag is set again in that case)
     */
    private boolean pauseForPoliteness() {
        try {
            Thread.sleep(POLITENESS_TIME_MILLISECONDS);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            LOGGER.log(Level.WARNING, "Crawl interrupted", e);
            return false;
        }
    }

    /**
     * Runs a sample crawl from a fixed seed URL and logs how many links were found.
     *
     * @param strArr command-line arguments (unused)
     */
    public static void main(String[] strArr) {
        BasicBreadthFirstSearchCrawler crawler = new BasicBreadthFirstSearchCrawler();
        LOGGER.log(Level.INFO, "Starting a Breadth First Search Crawl");
        crawler.breadthFirstSearch("https://www.baur.de/s/STUHL/#els=true");
        LOGGER.log(Level.INFO, "\nTotal URL links discovered on this page = " + crawler.sizeOfConnectedLinks());
    }
}
