package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.exceptions.ParseException;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import java.io.ByteArrayInputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:edu/uci/ics/crawler4j/parser/TikaHtmlParser.class */
/**
 * {@link HtmlParser} implementation that delegates the actual HTML parsing to Apache Tika's
 * {@code HtmlParser} and collects the result (text, title, meta tags, outgoing links and raw
 * HTML) into an {@link HtmlParseData}.
 */
public class TikaHtmlParser implements HtmlParser {
    protected static final Logger logger = LoggerFactory.getLogger(TikaHtmlParser.class);

    private final CrawlConfig config;
    private final org.apache.tika.parser.html.HtmlParser htmlParser = new org.apache.tika.parser.html.HtmlParser();
    private final ParseContext parseContext = new ParseContext();

    /**
     * Creates a parser and registers an {@code AllTagMapper} instance as the Tika
     * {@link HtmlMapper} in the parse context.
     *
     * @param crawlConfig configuration; read for the outgoing-link limit while parsing
     * @throws InstantiationException if the {@code AllTagMapper} cannot be instantiated
     * @throws IllegalAccessException if the {@code AllTagMapper} constructor is not accessible
     */
    public TikaHtmlParser(CrawlConfig crawlConfig) throws InstantiationException, IllegalAccessException {
        this.config = crawlConfig;
        this.parseContext.set(HtmlMapper.class, AllTagMapper.class.newInstance());
    }

    /**
     * Parses the content of {@code page} as HTML.
     *
     * @param page the fetched page whose content bytes are parsed
     * @param contextURL URL used to resolve relative links when the document declares no base URL
     * @return the populated parse result
     * @throws ParseException if anything goes wrong while parsing or decoding the page; the
     *         underlying failure is logged (with its stack trace) before this is thrown
     */
    @Override // edu.uci.ics.crawler4j.parser.HtmlParser
    public HtmlParseData parse(Page page, String contextURL) throws ParseException {
        HtmlParseData parsedData = new HtmlParseData();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        Metadata metadata = new Metadata();
        try {
            try (ByteArrayInputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
                this.htmlParser.parse(inputStream, contentHandler, metadata, this.parseContext);
            }

            String contentCharset = chooseEncoding(page, metadata);
            parsedData.setContentCharset(contentCharset);
            parsedData.setText(contentHandler.getBodyText().trim());
            parsedData.setTitle(metadata.get(DublinCore.TITLE));
            parsedData.setMetaTags(contentHandler.getMetaTags());
            parsedData.setOutgoingUrls(getOutgoingUrls(contextURL, contentHandler, contentCharset));
            parsedData.setHtml(decodeHtml(page));
            return parsedData;
        } catch (UnsupportedEncodingException e) {
            // Handled separately from the generic case so the failure is logged exactly once.
            logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
            throw new ParseException();
        } catch (Exception e) {
            // Trailing throwable argument makes SLF4J log the stack trace as well.
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL(), e);
            throw new ParseException();
        }
    }

    /**
     * Decodes the page bytes into a string using the page's declared charset. When the page
     * declares no charset (null or empty) the platform default charset is used.
     *
     * @throws UnsupportedEncodingException if the declared charset is not supported
     */
    private static String decodeHtml(Page page) throws UnsupportedEncodingException {
        String pageCharset = page.getContentCharset();
        if (pageCharset == null || pageCharset.isEmpty()) {
            return new String(page.getContentData());
        }
        return new String(page.getContentData(), pageCharset);
    }

    /**
     * Converts the links collected by the content handler into canonical {@link WebURL}s.
     * Links containing {@code javascript:}, {@code mailto:} or {@code @} are skipped, as are
     * links that cannot be canonicalized. At most
     * {@code config.getMaxOutgoingLinksToFollow()} links are accepted.
     *
     * @param contextURL fallback base for relative links when the handler reports no base URL
     * @param contentHandler handler holding the links extracted during parsing
     * @param contentCharset name of the charset passed to the canonicalizer; UTF-8 is used when
     *        it is null, empty or not a usable charset name
     */
    private Set<WebURL> getOutgoingUrls(String contextURL, HtmlContentHandler contentHandler, String contentCharset) {
        Set<WebURL> outgoingUrls = new HashSet<>();

        String declaredBase = contentHandler.getBaseUrl();
        String resolveAgainst = (declaredBase != null) ? declaredBase : contextURL;
        // Loop-invariant: resolve the charset once instead of once per link.
        Charset urlCharset = resolveCharset(contentCharset);

        int acceptedCount = 0;
        for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
            // Checked before accepting a link so the configured maximum is never exceeded.
            if (acceptedCount >= this.config.getMaxOutgoingLinksToFollow()) {
                break;
            }
            String href = urlAnchorPair.getHref();
            if (href == null || href.trim().isEmpty()) {
                continue;
            }
            // Locale.ROOT keeps the scheme filter independent of the default locale.
            String normalizedHref = href.trim().toLowerCase(Locale.ROOT);
            if (normalizedHref.contains("javascript:") || normalizedHref.contains("mailto:")
                    || normalizedHref.contains("@")) {
                continue;
            }
            String canonicalURL = URLCanonicalizer.getCanonicalURL(href, resolveAgainst, urlCharset);
            if (canonicalURL == null) {
                continue;
            }
            WebURL webURL = new WebURL();
            webURL.setURL(canonicalURL);
            webURL.setTag(urlAnchorPair.getTag());
            webURL.setAnchor(urlAnchorPair.getAnchor());
            webURL.setAttributes(urlAnchorPair.getAttributes());
            outgoingUrls.add(webURL);
            acceptedCount++;
        }
        return outgoingUrls;
    }

    /**
     * Maps a charset name to a {@link Charset}, falling back to UTF-8 when the name is null,
     * empty, syntactically illegal or not supported by this JVM.
     */
    private static Charset resolveCharset(String charsetName) {
        if (charsetName == null || charsetName.isEmpty()) {
            return StandardCharsets.UTF_8;
        }
        try {
            return Charset.forName(charsetName);
        } catch (IllegalArgumentException e) {
            // Covers both IllegalCharsetNameException and UnsupportedCharsetException.
            logger.warn("Unsupported charset '{}', falling back to UTF-8", charsetName);
            return StandardCharsets.UTF_8;
        }
    }

    /**
     * Picks the charset name to report for the page: the page's own content charset when it is
     * non-empty, otherwise the value stored under the {@code Content-Encoding} metadata key
     * (which may be null).
     */
    private String chooseEncoding(Page page, Metadata metadata) {
        String pageCharset = page.getContentCharset();
        if (pageCharset == null || pageCharset.isEmpty()) {
            return metadata.get("Content-Encoding");
        }
        return pageCharset;
    }
}
