package com.rapidminer.operator.web.html;

import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.text.Document;
import com.rapidminer.operator.text.Token;
import com.rapidminer.operator.text.io.AbstractTokenProcessor;
import com.rapidminer.operator.web.features.construction.WebserviceBasedAttributeConstruction;
import com.rapidminer.operator.web.io.URLConnectionProvider;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.tools.io.Encoding;
import java.io.UnsupportedEncodingException;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.util.featuregen.WindowFeatureGenerator;
import org.apache.commons.compress.compressors.CompressorStreamFactory;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.cxf.jaxrs.utils.JAXRSUtils;
import org.opengis.metadata.Identifier;
import org.polliwog.Constants;
import org.polliwog.handlers.GraphFormatter;
import org.polliwog.replacements.AbstractReplacement;
import org.springframework.beans.factory.xml.BeanDefinitionParserDelegate;
import ucar.nc2.constants.ACDD;

/* loaded from: input_file:com/rapidminer/operator/web/html/HTMLTextExtractionOperator.class */
/**
 * Token processor that reads HTML from a {@link Document}, records a few pieces of page meta data
 * (title, language, description, keywords, robots) on it and, if requested, splits the visible
 * text into blocks separated by markup.
 *
 * <p>Processing is regex based; the markup is never parsed into a tree. A "text block" is a
 * maximal run of whitespace separated words between two remaining tags, and only blocks with at
 * least {@link #PARAMETER_MIN_LENGTH} words are emitted as tokens.
 */
public class HTMLTextExtractionOperator extends AbstractTokenProcessor {
    public static final String PARAMETER_EXTRACT_CONTENT = "extract_content";
    public static final String PARAMETER_MIN_LENGTH = "minimum_text_block_length";
    public static final String PARAMETER_OVERRIDE_CONTENT_TYPE_INFORMATION = "override_content_type_information";
    public static final String PARAMETER_NEGLEGT_SPAN_TAGS = "neglegt_span_tags";
    public static final String PARAMETER_NEGLECT_P_TAGS = "neglect_p_tags";
    public static final String PARAMETER_NEGLECT_B_TAGS = "neglect_b_tags";
    public static final String PARAMETER_NEGLECT_I_TAGS = "neglect_i_tags";
    public static final String PARAMETER_NEGLECT_BR_TAGS = "neglect_br_tags";
    public static final String PARAMETER_IGNORE_NON_HTML_TAGS = "ignore_non_html_tags";

    private static final String META_DATA_CONTENT_TYPE = "Content-Type";
    private static final String META_DATA_HTML_TITLE = "Title";
    private static final String META_DATA_HTML_LANGUAGE = "Language";
    private static final String META_DATA_HTML_DESCRIPTION = "Description";
    private static final String META_DATA_HTML_KEYWORDS = "Keywords";
    private static final String META_DATA_HTML_ROBOTS = "Robots";

    /** Placeholder every remaining tag is collapsed to before the text is split into blocks. */
    private static final String TAG_PLACEHOLDER = "<tag>";

    /**
     * Word separator used while normalising and re-joining text.
     * NOTE(review): the constant is defined outside this file; the splitting logic below only
     * works if it is a single whitespace character (presumably " ") - confirm.
     */
    private static final String SEPARATOR = Constants.DEFAULT_KEY_VALUE_SEPARATOR;

    /** Matches lower-cased {@code <meta http-equiv=... content=...>}; group 1 = http-equiv, group 2 = content. */
    private static final Pattern HTTP_EQUIV_META_PATTERN = Pattern.compile("<meta[\\s]*http-equiv=[\"]*([^>]*?)[\"]*[\\s]*content=[\"]*([^>]*?)[\"]*[\\s]*?[/]*?>");

    /** Matches {@code <meta name|http-equiv=... content=...>}; group 3 = name, group 5 = content. */
    private static final Pattern META_PATTERN = Pattern.compile("<(meta|META)[\\s]*(HTTP-EQUIV|http-equiv|NAME|name)=[\"]*([^>]*?)[\"]*[\\s]*(content|CONTENT)=[\"]*([^>]*?)[\"]*[\\s]*?[/]*?>");

    private static final Pattern TITLE_OPEN_PATTERN = Pattern.compile("<title>", Pattern.CASE_INSENSITIVE);
    private static final Pattern TITLE_CLOSE_PATTERN = Pattern.compile("</title>", Pattern.CASE_INSENSITIVE);

    /** Matches any tag; group 1 is the (possibly empty) alphabetic tag name. */
    private static final Pattern ANY_TAG_PATTERN = Pattern.compile("</?([A-Za-z]*)?([^>]*?)>");

    /** Lower-case names of the tags kept when {@link #PARAMETER_IGNORE_NON_HTML_TAGS} is enabled. */
    private static final Set<String> KNOWN_TAGS = buildKnownTags();

    public HTMLTextExtractionOperator(OperatorDescription operatorDescription) {
        super(operatorDescription);
    }

    /**
     * Annotates the document with HTML meta data and optionally replaces its content by the
     * extracted text blocks.
     *
     * @param document the document whose token text is treated as HTML
     * @return the input document itself when no content is extracted and the text was not
     *         re-decoded; otherwise a new document derived from it
     * @throws OperatorException if a parameter cannot be read
     */
    protected Document doWork(Document document) throws OperatorException {
        String html = document.getTokenText();

        // Tracked per call: keeping this in a field would leak the result of one document into
        // the handling of all documents processed afterwards.
        boolean encodingChanged = false;
        if (getParameterAsBoolean(PARAMETER_OVERRIDE_CONTENT_TYPE_INFORMATION)) {
            String recoded = recodeFromMetaTag(document, html);
            if (recoded != null) {
                html = recoded;
                encodingChanged = true;
            }
        }

        document.addMetaData(META_DATA_HTML_TITLE, extractTitle(html), 1);
        addMetaTagData(document, html);

        if (!getParameterAsBoolean(PARAMETER_EXTRACT_CONTENT)) {
            if (!encodingChanged) {
                return document;
            }
            // The text was re-decoded, so hand out a document that carries the new text.
            Document recodedDocument = new Document(Collections.singletonList(new Token(html, 1.0f)), new Document(Collections.singletonList(new Token(html, 1.0f)), document));
            // Result intentionally unused; kept from the original code (possibly warms a cache
            // inside Document - not visible from this file).
            recodedDocument.getTokenText();
            return recodedDocument;
        }

        String normalized = normalizeMarkup(html);
        List<Token> blocks = collectTextBlocks(normalized, getParameterAsInt(PARAMETER_MIN_LENGTH));
        return new Document(blocks, new Document(Collections.singletonList(new Token(normalized, 1.0f)), document));
    }

    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = new LinkedList<ParameterType>();
        types.add(new ParameterTypeBoolean(PARAMETER_EXTRACT_CONTENT, "Specifies whether content is extracted or not", true, false));

        ParameterTypeInt minLength = new ParameterTypeInt(PARAMETER_MIN_LENGTH, "The minimum length (in words/tokens) of text blocks.", 1, Integer.MAX_VALUE, 5, false);
        // The block length is only meaningful while content extraction is switched on.
        minLength.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_EXTRACT_CONTENT, false, true));
        types.add(minLength);

        types.add(new ParameterTypeBoolean(PARAMETER_OVERRIDE_CONTENT_TYPE_INFORMATION, "Specifies whether potentially existing content type information and used encoding information should be overriden using the HTML meta http-equiv tag.", true, false));
        types.add(new ParameterTypeBoolean(PARAMETER_NEGLEGT_SPAN_TAGS, "Specifies whether <span> tags should be neglected or used as text block divider.", true, false));
        types.add(new ParameterTypeBoolean(PARAMETER_NEGLECT_P_TAGS, "Specifies whether <p> tags should be neglected or used as text block divider.", true, false));
        types.add(new ParameterTypeBoolean(PARAMETER_NEGLECT_B_TAGS, "Specifies whether <b> tags should be neglected or used as text block divider.", true, false));
        types.add(new ParameterTypeBoolean(PARAMETER_NEGLECT_I_TAGS, "Specifies whether <i> tags should be neglected or used as text block divider.", true, false));
        types.add(new ParameterTypeBoolean(PARAMETER_NEGLECT_BR_TAGS, "Specifies whether <br> tags should be neglected or used as text block divider.", true, false));
        types.add(new ParameterTypeBoolean(PARAMETER_IGNORE_NON_HTML_TAGS, "Specifies whether tags that are not common HTML should be ignored.", true, false));
        return types;
    }

    /**
     * Looks for the first {@code <meta http-equiv="content-type" ...>} tag, stores its content as
     * the document's Content-Type meta data and re-decodes the text with the charset it names.
     *
     * @return the re-decoded text, or {@code null} if no such tag exists or its charset is
     *         illegal or unsupported
     */
    private static String recodeFromMetaTag(Document document, String html) {
        String sourceEncoding = URLConnectionProvider.DEFAULT_ENCODING;
        if (document.getMetaDataKeys().contains(META_DATA_CONTENT_TYPE)) {
            sourceEncoding = URLConnectionProvider.parseEncoding((String) document.getMetaDataValue(META_DATA_CONTENT_TYPE));
        }

        // Only the matched groups are used, never offsets, so matching on a lower-cased copy is
        // safe. Locale.ROOT keeps the result independent of the JVM default locale.
        Matcher metaMatcher = HTTP_EQUIV_META_PATTERN.matcher(html.toLowerCase(Locale.ROOT));
        while (metaMatcher.find()) {
            if (!"content-type".equals(metaMatcher.group(1).trim())) {
                continue;
            }
            String content = metaMatcher.group(2).trim();
            String declaredEncoding = URLConnectionProvider.parseEncoding(content.replace("\"", ""));
            String recoded = null;
            try {
                byte[] rawBytes;
                try {
                    rawBytes = html.getBytes(sourceEncoding);
                } catch (UnsupportedEncodingException e) {
                    // Unknown source encoding: fall back to the platform default charset.
                    rawBytes = html.getBytes();
                }
                recoded = new String(rawBytes, Encoding.getEncoding(declaredEncoding));
            } catch (IllegalCharsetNameException ignored) {
                // Best effort: the page declares an unusable charset name, keep the text as is.
            } catch (UnsupportedCharsetException ignored) {
                // Best effort: the declared charset is not available, keep the text as is.
            }
            document.addMetaData(META_DATA_CONTENT_TYPE, content, 1);
            // Only the first content-type declaration is honoured.
            return recoded;
        }
        return null;
    }

    /**
     * Returns the unescaped, trimmed text between the first {@code <title>} and the first
     * {@code </title>} (case-insensitive), or {@code null} if either tag is missing, they are out
     * of order, or the title is empty.
     */
    private static String extractTitle(String html) {
        Matcher open = TITLE_OPEN_PATTERN.matcher(html);
        Matcher close = TITLE_CLOSE_PATTERN.matcher(html);
        if (!open.find() || !close.find() || open.end() >= close.start()) {
            return null;
        }
        String title = html.substring(open.end(), close.start()).trim();
        return title.isEmpty() ? null : StringEscapeUtils.unescapeHtml(title);
    }

    /**
     * Resets the language, description, keywords and robots meta data to {@code null} and then
     * fills them from the matching {@code <meta>} tags; a later tag overwrites an earlier one.
     */
    private static void addMetaTagData(Document document, String html) {
        document.addMetaData(META_DATA_HTML_LANGUAGE, (String) null, 1);
        document.addMetaData(META_DATA_HTML_DESCRIPTION, (String) null, 1);
        document.addMetaData(META_DATA_HTML_KEYWORDS, (String) null, 1);
        document.addMetaData(META_DATA_HTML_ROBOTS, (String) null, 1);

        Matcher metaMatcher = META_PATTERN.matcher(html);
        while (metaMatcher.find()) {
            String name = metaMatcher.group(3).trim();
            String content = metaMatcher.group(5).trim();
            if ("language".equalsIgnoreCase(name)) {
                document.addMetaData(META_DATA_HTML_LANGUAGE, content, 1);
            } else if ("description".equalsIgnoreCase(name)) {
                document.addMetaData(META_DATA_HTML_DESCRIPTION, StringEscapeUtils.unescapeHtml(content), 1);
            } else if (ACDD.keywords.equalsIgnoreCase(name)) {
                // NOTE(review): ACDD.keywords is presumably the literal "keywords" - confirm.
                document.addMetaData(META_DATA_HTML_KEYWORDS, StringEscapeUtils.unescapeHtml(content), 1);
            } else if ("robots".equalsIgnoreCase(name)) {
                document.addMetaData(META_DATA_HTML_ROBOTS, content, 1);
            }
        }
    }

    /**
     * Removes comments, styles, scripts, images and single-line anchors, drops the tags the
     * user chose to neglect, and collapses every remaining tag to {@link #TAG_PLACEHOLDER} with
     * all whitespace runs reduced to {@link #SEPARATOR}.
     */
    private String normalizeMarkup(String html) throws OperatorException {
        String text = html
                .replaceAll("<!--[\\s\\S]*?-->", "")
                .replaceAll("<style.*?>[\\s\\S]*?</style>", "")
                .replaceAll("<script.*?>[\\s\\S]*?</script>", "")
                .replaceAll("<img[^>]*?>", "")
                .replaceAll("<a[^>]*?>(.*?)<[\\s]*/a>", " $1 ");

        if (getParameterAsBoolean(PARAMETER_NEGLEGT_SPAN_TAGS)) {
            // Opening and closing span tags are removed one by one, so nested and multi-line
            // spans are handled and no other tag is touched.
            text = text.replaceAll("(?i)<(?:\\s*/)?span\\b[^>]*>", " ");
        }
        // The p/b/i patterns only match tags without attributes, which keeps them from
        // swallowing longer tag names such as <pre>, <br> or <img>.
        if (getParameterAsBoolean(PARAMETER_NEGLECT_P_TAGS)) {
            text = text.replaceAll("<[/]*[pP][^a-zA-Z>]*?>", SEPARATOR);
        }
        if (getParameterAsBoolean(PARAMETER_NEGLECT_B_TAGS)) {
            text = text.replaceAll("<[/]*[bB][^a-zA-Z>]*?>", SEPARATOR);
        }
        if (getParameterAsBoolean(PARAMETER_NEGLECT_I_TAGS)) {
            text = text.replaceAll("<[/]*[iI][^a-zA-Z>]*?>", SEPARATOR);
        }
        if (getParameterAsBoolean(PARAMETER_NEGLECT_BR_TAGS)) {
            text = text.replaceAll("<(br|BR)[^>]*?>", SEPARATOR);
        }
        if (getParameterAsBoolean(PARAMETER_IGNORE_NON_HTML_TAGS)) {
            text = removeUnknownTags(text);
        }

        return text
                .replaceAll("(<[^>]*?>)", " <tag> ")
                .replaceAll("[\r]*[\n]+", SEPARATOR)
                .replaceAll("[\\s]+", SEPARATOR);
    }

    /** Deletes, in a single pass, every tag whose name is not in {@link #KNOWN_TAGS}. */
    private static String removeUnknownTags(String html) {
        Matcher tagMatcher = ANY_TAG_PATTERN.matcher(html);
        // StringBuffer because Matcher.appendReplacement has no StringBuilder overload before Java 9.
        StringBuffer kept = new StringBuffer(html.length());
        while (tagMatcher.find()) {
            String tagName = tagMatcher.group(1);
            boolean known = tagName != null && KNOWN_TAGS.contains(tagName.toLowerCase(Locale.ROOT));
            // quoteReplacement keeps '$' and '\' inside a retained tag literal.
            tagMatcher.appendReplacement(kept, known ? Matcher.quoteReplacement(tagMatcher.group()) : "");
        }
        tagMatcher.appendTail(kept);
        return kept.toString();
    }

    /**
     * Splits the normalised text at {@link #TAG_PLACEHOLDER} and returns one token per run of
     * words that contains at least {@code minLength} words.
     */
    private static List<Token> collectTextBlocks(String normalized, int minLength) {
        List<Token> blocks = new LinkedList<Token>();
        StringBuilder block = new StringBuilder();
        int wordCount = 0;
        for (String word : normalized.split("\\s")) {
            if (word.isEmpty()) {
                // split() yields an empty first element when the text starts with a separator;
                // it is not a word and must not count towards the block length.
                continue;
            }
            if (TAG_PLACEHOLDER.equals(word)) {
                addBlockIfLongEnough(blocks, block, wordCount, minLength);
                block.setLength(0);
                wordCount = 0;
            } else {
                block.append(word).append(SEPARATOR);
                wordCount++;
            }
        }
        addBlockIfLongEnough(blocks, block, wordCount, minLength);
        return blocks;
    }

    /** Appends the collected words as one unescaped token if the block is non-empty and long enough. */
    private static void addBlockIfLongEnough(List<Token> blocks, StringBuilder block, int wordCount, int minLength) {
        if (wordCount > 0 && wordCount >= minLength) {
            blocks.add(new Token(StringEscapeUtils.unescapeHtml(block.toString().trim()), 1.0f));
        }
    }

    /**
     * Builds the read-only set of tag names treated as common HTML.
     * NOTE(review): several entries reference constants of unrelated libraries; these look like
     * decompiler substitutions for plain string literals (e.g. "body", "br", "code", "img",
     * "map", "object", "p", "q") - confirm the values before replacing them with literals.
     */
    private static Set<String> buildKnownTags() {
        Set<String> tags = new HashSet<String>();
        Collections.addAll(tags,
                "abbr", "acronym", "address", "applet", "area",
                "b", "base", "basefont", "bdo", "big", "blockquote",
                WebserviceBasedAttributeConstruction.PARAMETER_HTTP_BODY,
                CompressorStreamFactory.BROTLI,
                "button", "caption", "center", "cite",
                Identifier.CODE_KEY,
                "col", "colgroup", "dd", "del", "dfn", "dir", "div", "dl", "dt", "em",
                "fieldset", "font", "form", "frame", "frameset",
                "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html",
                "i", "iframe",
                GraphFormatter.IMG,
                "input", "ins", "isindex", "kbd", "label", "legend", "li", "link",
                BeanDefinitionParserDelegate.MAP_ELEMENT,
                "menu", "meta", "noframes", "noscript",
                AbstractReplacement.DEFAULT_TYPE,
                "ol", "optgroup", "option",
                WindowFeatureGenerator.PREV_PREFIX,
                "param", "pre",
                JAXRSUtils.MEDIA_TYPE_Q_PARAM,
                "s", "samp", "script", "select", "small", "span", "strike", "strong", "style",
                "sub", "sup", "table", "tbody", "td", "textarea", "tfoot", "th", "thead",
                "title", "tr", "tt", "u", "ul", "var");
        return Collections.unmodifiableSet(tags);
    }
}
