package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.url.WebURL;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.HashSet;
import java.util.Set;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXTransformerFactory;
import javax.xml.transform.sax.TransformerHandler;
import javax.xml.transform.stream.StreamResult;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.ccil.cowan.tagsoup.XMLWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:edu/uci/ics/crawler4j/parser/BinaryParseData.class */
/**
 * {@link ParseData} implementation for binary documents.
 *
 * <p>The raw bytes are handed to Tika's {@link AutoDetectParser}; the SAX events it emits are
 * serialized to an HTML string, which is then available through {@link #getHtml()}.
 */
public class BinaryParseData implements ParseData {
    private static final Logger logger = LoggerFactory.getLogger(BinaryParseData.class);

    /** Character encoding used both for serializing the SAX output and for decoding it back. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Output method requested from the transformer. */
    private static final String DEFAULT_OUTPUT_FORMAT = "html";

    private static final org.apache.tika.parser.Parser AUTO_DETECT_PARSER = new AutoDetectParser();

    // NOTE(review): this factory is shared by every instance; the JAXP documentation does not
    // guarantee TransformerFactory implementations to be thread safe - confirm for the
    // implementation in use if setBinaryContent is called from several threads.
    private static final SAXTransformerFactory SAX_TRANSFORMER_FACTORY =
            (SAXTransformerFactory) TransformerFactory.newInstance();

    private final ParseContext context = new ParseContext();
    private Set<WebURL> outgoingUrls = new HashSet<WebURL>();
    private String html = null;

    /**
     * Creates an empty instance and registers the shared auto-detect parser in the parse
     * context.
     */
    public BinaryParseData() {
        this.context.set(org.apache.tika.parser.Parser.class, AUTO_DETECT_PARSER);
    }

    /**
     * Parses the given binary content and stores its HTML rendering.
     *
     * <p>Any exception raised while parsing or serializing is logged and not rethrown; in that
     * case the previously stored HTML (if any) is left untouched.
     *
     * @param bArr raw bytes of the document to parse; must not be {@code null}
     */
    public void setBinaryContent(byte[] bArr) {
        ByteArrayInputStream input = new ByteArrayInputStream(bArr);
        ByteArrayOutputStream output = new ByteArrayOutputStream();
        try {
            TransformerHandler handler =
                    getTransformerHandler(output, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING);
            AUTO_DETECT_PARSER.parse(input, handler, new Metadata(), this.context);

            // Strip the XHTML namespace URI that ends up in the serialized markup.
            this.html = output.toString(DEFAULT_ENCODING).replace("http://www.w3.org/1999/xhtml", "");
        } catch (Exception e) {
            logger.error("Error parsing file", e);
        }
    }

    /**
     * Builds a SAX handler that serializes the events it receives to {@code outputStream}.
     *
     * @param outputStream destination of the serialized document
     * @param method output method passed to the transformer (e.g. {@code "html"})
     * @param encoding output encoding, or {@code null} to keep the transformer's default
     * @return a handler whose result is bound to {@code outputStream}
     * @throws TransformerConfigurationException if the factory cannot create the handler
     */
    private static TransformerHandler getTransformerHandler(
            OutputStream outputStream, String method, String encoding)
            throws TransformerConfigurationException {
        TransformerHandler handler = SAX_TRANSFORMER_FACTORY.newTransformerHandler();
        Transformer transformer = handler.getTransformer();
        transformer.setOutputProperty(XMLWriter.METHOD, method);
        transformer.setOutputProperty(XMLWriter.INDENT, "yes");
        if (encoding != null) {
            transformer.setOutputProperty("encoding", encoding);
        }
        // Write straight to the target stream: an intermediate PrintStream would silently
        // swallow I/O errors.
        handler.setResult(new StreamResult(outputStream));
        return handler;
    }

    /**
     * Returns the stored HTML.
     *
     * @return the HTML set by {@link #setBinaryContent(byte[])} or {@link #setHtml(String)},
     *     or {@code null} if none has been stored
     */
    public String getHtml() {
        return this.html;
    }

    /**
     * Replaces the stored HTML.
     *
     * @param str the new HTML value; may be {@code null}
     */
    public void setHtml(String str) {
        this.html = str;
    }

    /**
     * Returns the outgoing URLs associated with this document.
     *
     * @return the set held by this instance (not a copy)
     */
    @Override // edu.uci.ics.crawler4j.parser.ParseData
    public Set<WebURL> getOutgoingUrls() {
        return this.outgoingUrls;
    }

    /**
     * Replaces the outgoing URLs associated with this document.
     *
     * @param set the new set; stored by reference
     */
    @Override // edu.uci.ics.crawler4j.parser.ParseData
    public void setOutgoingUrls(Set<WebURL> set) {
        this.outgoingUrls = set;
    }

    /**
     * Returns the stored HTML, or a placeholder message when nothing has been stored.
     *
     * @return the HTML, or {@code "No data parsed yet"} if it is {@code null} or empty
     */
    @Override // edu.uci.ics.crawler4j.parser.ParseData
    public String toString() {
        if (this.html == null || this.html.isEmpty()) {
            return "No data parsed yet";
        }
        return this.html;
    }
}
