package com.rapidminer.operator.text.io.tokenizer;

import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.text.Document;
import com.rapidminer.operator.text.Token;
import com.rapidminer.operator.text.io.AbstractTokenProcessor;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeRegexp;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.parameter.conditions.EqualTypeCondition;
import com.rapidminer.tools.Tools;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

/* loaded from: input_file:com/rapidminer/operator/text/io/tokenizer/StringTokenizerOperator.class */
public class StringTokenizerOperator extends AbstractTokenProcessor {
    public static final int ENGLISH = 0;
    public static final int GERMAN = 1;
    public static final int GENERIC_ASIAN = 2;
    public static final String PARAMETER_LANGUAGE = "language";
    public static final String PARAMETER_MODE = "mode";
    public static final String PARAMETER_SPLIT_CHARACTERS = "characters";
    public static final String PARAMETER_SPLIT_REGEX = "expression";
    public static final String PARAMETER_ASIAN_TOKEN_SIZE = "max_token_length";
    public static final int MODE_NON_LETTERS = 0;
    public static final int MODE_SPECIFY_CHARACTERS = 1;
    public static final int MODE_REGULAR_EXPRESSION = 2;
    public static final int MODE_LINGUISTIC_SENTENCES = 3;
    public static final int MODE_LINGUISTIC_TOKENS = 4;
    public static final String[] LANGUAGES = {"English", "German", "Generic Asian"};
    public static final String[] MODES = {"non letters", "specify characters", "regular expression", "linguistic sentences", "linguistic tokens"};

    public StringTokenizerOperator(OperatorDescription operatorDescription) {
        super(operatorDescription);
    }

    @Override // com.rapidminer.operator.text.io.AbstractTokenProcessor
    protected Document doWork(Document document) throws UserError {
        int parameterAsInt = getParameterAsInt(PARAMETER_MODE);
        char[] cArr = new char[0];
        if (parameterAsInt == 1) {
            cArr = getParameterAsString(PARAMETER_SPLIT_CHARACTERS).toCharArray();
            Arrays.sort(cArr);
        }
        Pattern pattern = null;
        if (parameterAsInt == 2) {
            try {
                pattern = Pattern.compile(getParameterAsString("expression"), 32);
            } catch (Exception e) {
                throw new UserError(this, 206, new Object[]{getParameterAsString("expression"), e.getMessage()});
            }
        }
        int parameterAsInt2 = getParameterAsInt("language");
        String str = parameterAsInt2 == 0 ? "en" : parameterAsInt2 == 1 ? "de" : "?";
        SentenceDetectorME sentenceDetectorME = null;
        if (parameterAsInt == 3) {
            try {
                sentenceDetectorME = new SentenceDetectorME(new SentenceModel(Tools.getResource("pos/" + str + "-sent.bin").openStream()));
            } catch (Exception e2) {
                e2.printStackTrace();
                return null;
            }
        }
        TokenizerME tokenizerME = null;
        if (parameterAsInt == 4) {
            if (parameterAsInt2 != 2) {
                try {
                    tokenizerME = new TokenizerME(new TokenizerModel(Tools.getResource("pos/" + str + "-token.bin").openStream()));
                } catch (Exception e3) {
                    e3.printStackTrace();
                    return null;
                }
            }
        }
        ArrayList arrayList = new ArrayList();
        for (Token token : document.getTokenSequence()) {
            if (parameterAsInt == 2) {
                if (pattern == null) {
                    throw new IllegalStateException("Regexp must not be null");
                }
                for (String str2 : pattern.split(token.getToken())) {
                    arrayList.add(new Token(str2, token));
                }
            } else if (parameterAsInt == 0 || parameterAsInt == 1) {
                char[] charArray = token.getToken().toCharArray();
                int i = 0;
                for (int i2 = 0; i2 < charArray.length; i2++) {
                    if (isSplitPoint(parameterAsInt, charArray[i2], cArr)) {
                        if (i2 - i > 0) {
                            arrayList.add(new Token(new String(charArray, i, i2 - i), token));
                        }
                        i = i2 + 1;
                    }
                }
                if (charArray.length - i > 0) {
                    arrayList.add(new Token(new String(charArray, i, charArray.length - i), token));
                }
            } else if (parameterAsInt == 3) {
                if (sentenceDetectorME == null) {
                    throw new IllegalStateException("SentenceDetector must not be null");
                }
                for (String str3 : sentenceDetectorME.sentDetect(token.getToken())) {
                    arrayList.add(new Token(str3, token));
                }
            } else if (parameterAsInt == 4) {
                for (String str4 : parameterAsInt2 != 2 ? tokenizerME.tokenize(token.getToken()) : getGenericAsianTokens(token.getToken(), getParameterAsInt(PARAMETER_ASIAN_TOKEN_SIZE))) {
                    arrayList.add(new Token(str4, token));
                }
            }
        }
        document.setTokenSequence(arrayList);
        return document;
    }

    private boolean isSplitPoint(int i, char c, char[] cArr) {
        return i == 0 ? !Character.isLetter(c) : Arrays.binarySearch(cArr, c) >= 0;
    }

    private String[] getGenericAsianTokens(String str, int i) {
        ArrayList arrayList = new ArrayList();
        for (String str2 : str.split("\\s+")) {
            for (int i2 = 0; i2 < str2.length(); i2++) {
                for (int i3 = 0; i3 < i; i3++) {
                    int i4 = i2 - i3;
                    if (i4 >= 0) {
                        StringBuilder sb = new StringBuilder(i3 + 1);
                        while (i4 <= i2) {
                            sb.append(str2.charAt(i4));
                            i4++;
                        }
                        arrayList.add(sb.toString());
                    }
                }
            }
        }
        return (String[]) arrayList.toArray(new String[arrayList.size()]);
    }

    public List<ParameterType> getParameterTypes() {
        List<ParameterType> parameterTypes = super.getParameterTypes();
        parameterTypes.add(new ParameterTypeCategory(PARAMETER_MODE, "This selects the tokenization mode. Depending on the mode, split points are chosen differently.", MODES, 0, false));
        ParameterTypeString parameterTypeString = new ParameterTypeString(PARAMETER_SPLIT_CHARACTERS, "The incoming document will be split into tokens on each of this characters. For example enter a '.' for splitting into sentences.", ".:");
        parameterTypeString.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_MODE, MODES, true, new int[]{1}));
        parameterTypes.add(parameterTypeString);
        ParameterTypeRegexp parameterTypeRegexp = new ParameterTypeRegexp("expression", "This regular expression defines the splitting point.");
        parameterTypeRegexp.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_MODE, MODES, true, new int[]{2}));
        parameterTypes.add(parameterTypeRegexp);
        ParameterTypeCategory parameterTypeCategory = new ParameterTypeCategory("language", "The language for the used part of speech (POS) tagger.", LANGUAGES, 0);
        parameterTypeCategory.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_MODE, MODES, true, new int[]{3, 4}));
        parameterTypes.add(parameterTypeCategory);
        ParameterTypeInt parameterTypeInt = new ParameterTypeInt(PARAMETER_ASIAN_TOKEN_SIZE, "The maximal token length of the tokens", 1, Integer.MAX_VALUE, 3);
        parameterTypeInt.registerDependencyCondition(new EqualTypeCondition(this, "language", LANGUAGES, true, new int[]{2}));
        parameterTypes.add(parameterTypeInt);
        return parameterTypes;
    }
}
