package com.rapidminer.operator.web.services.google;

import com.healthmarketscience.jackcess.PropertyMap;
import com.healthmarketscience.jackcess.impl.JetFormat;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.utils.ExampleSetBuilder;
import com.rapidminer.example.utils.ExampleSets;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.io.AbstractExampleSource;
import com.rapidminer.operator.web.io.GetWebpageOperator;
import com.rapidminer.operator.web.io.UserAgent;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.tools.OperatorService;
import java.text.DateFormat;
import java.text.ParseException;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;

/* loaded from: input_file:com/rapidminer/operator/web/services/google/ReadGoogleArchiveSearch.class */
/**
 * Example source that queries {@code http://news.google.de/archivesearch} month by month
 * for a configurable year/month range, scrapes the returned HTML result pages with
 * regular expressions and emits one example per result with the attributes
 * Date, Link, Source, Title and Abstract.
 *
 * <p>Between two page fetches the operator sleeps for a random time bounded by
 * {@link #PARAMETER_MAX_RANDOM_TIME} and uses a random user agent for every request.
 */
public class ReadGoogleArchiveSearch extends AbstractExampleSource {
    public static final String PARAMETER_QUERY = "query";
    public static final String PARAMETER_LANGUAGE = "language";
    public static final String PARAMETER_MAX_RANDOM_TIME = "max_random_waiting_time";
    public static final String PARAMETER_YEAR_START = "year_start";
    public static final String PARAMETER_YEAR_END = "year_end";
    public static final String PARAMETER_MONTH_START = "month_start";
    public static final String PARAMETER_MONTH_END = "month_end";

    /** Number of results requested per page; matches the {@code num=100} URL argument. */
    private static final int RESULTS_PER_PAGE = 100;

    /** Header of a result page: "Results <from> - <to> of <total>". */
    private static final Pattern RESULT_RANGE_PATTERN = Pattern.compile(
            "<td bgcolor=#efefef align=right nowrap><font size=-1>Results <b>([0-9]*?)</b> - <b>([0-9]*?)</b> of .*?<b>([0-9|,]*?)</b>",
            Pattern.DOTALL);

    /** One result entry (a nested table) of a result page. */
    private static final Pattern RESULT_ROW_PATTERN =
            Pattern.compile("<tr><td><table[^>]*?><tr><td>(.*?)</table></td></tr>");

    /** Captures the date text of a result entry. */
    private static final Pattern DATE_PATTERN = Pattern.compile(
            "<tr><td><table[^>]*?><tr><td><a[^>]*?>.*?</a><br><div[^>]*?><font size=-1 color=666666 class=l>.*?<font size=-1 color=black> - (.*?)</font><br></font>");

    /** Captures the link target of a result entry. */
    private static final Pattern LINK_PATTERN = Pattern.compile("<td><a href=(.*?) onmousedown=");

    /** Captures the source name of a result entry. */
    private static final Pattern SOURCE_PATTERN = Pattern.compile(
            "<tr><td><table[^>]*?><tr><td><a[^>]*?>.*?</a><br><div[^>]*?><font size=-1 color=666666 class=l>(.*?)<font size=-1 color=black>.*?</font><br></font>");

    /** Captures the (HTML-escaped) title of a result entry. */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<tr><td><table[^>]*?><tr><td><a[^>]*?>(.*?)</a>");

    /** Captures the (HTML-escaped) abstract of a result entry. */
    private static final Pattern ABSTRACT_PATTERN = Pattern.compile(
            "<tr><td><table[^>]*?><tr><td><a[^>]*?>.*?</a><br><div[^>]*?><font size=-1 color=666666 class=l>.*?<font size=-1 color=black> - .*?</font><br></font><font size=-1>(.*?)</font></div>");

    private final Random random;

    public ReadGoogleArchiveSearch(OperatorDescription operatorDescription) {
        super(operatorDescription);
        this.random = new Random();
    }

    /**
     * Crawls all result pages for every month in the configured range and collects the
     * parsed results.
     *
     * @return the example set holding one row per scraped result
     * @throws OperatorException if the page-fetching operator cannot be created, a page
     *         cannot be read, or the thread is interrupted while waiting between fetches
     */
    public ExampleSet createExampleSet() throws OperatorException {
        String query = getParameterAsString(PARAMETER_QUERY);
        String language = getParameterAsString(PARAMETER_LANGUAGE);
        int maxWaitMillis = getParameterAsInt(PARAMETER_MAX_RANDOM_TIME);
        int yearStart = getParameterAsInt(PARAMETER_YEAR_START);
        int yearEnd = getParameterAsInt(PARAMETER_YEAR_END);
        int monthStart = getParameterAsInt(PARAMETER_MONTH_START);
        int monthEnd = getParameterAsInt(PARAMETER_MONTH_END);

        GetWebpageOperator pageReader;
        try {
            pageReader = OperatorService.createOperator(GetWebpageOperator.class);
        } catch (OperatorCreationException e) {
            // Previously this printed the stack trace and returned null.
            throw new OperatorException("Could not create the operator used to fetch web pages", e);
        }

        // NOTE(review): the numeric value types (10, 1, 5) are presumably the ontology codes
        // for date, nominal and string -- confirm against the attribute factory.
        Attribute dateAttribute = AttributeFactory.createAttribute("Date", 10);
        Attribute linkAttribute = AttributeFactory.createAttribute("Link", 1);
        Attribute sourceAttribute = AttributeFactory.createAttribute("Source", 1);
        // NOTE(review): PropertyMap.TITLE_PROP looks like a decompiler substitution for a
        // plain string literal -- confirm and replace with the literal if so.
        Attribute titleAttribute = AttributeFactory.createAttribute(PropertyMap.TITLE_PROP, 1);
        Attribute abstractAttribute = AttributeFactory.createAttribute("Abstract", 5);
        ExampleSetBuilder builder = ExampleSets.from(new Attribute[]{
                dateAttribute, linkAttribute, sourceAttribute, titleAttribute, abstractAttribute});

        DateFormat dateFormat = DateFormat.getDateInstance(DateFormat.MEDIUM, Locale.ENGLISH);

        for (int year = yearStart; year <= yearEnd; year++) {
            // Only the first/last year are restricted by the month parameters.
            int firstMonth = year == yearStart ? monthStart : 1;
            int lastMonth = year == yearEnd ? monthEnd : 12;
            for (int month = firstMonth; month <= lastMonth; month++) {
                // The paging offset restarts for every month; carrying it over from the
                // previous month would skip the first results of the new month.
                int offset = 0;
                boolean lastPageReached;
                do {
                    pageReader.setParameter("url", buildUrl(query, language, year, month, offset));
                    pageReader.setParameter("user_agent", UserAgent.getRandomUserAgent());
                    waitRandomly(maxWaitMillis);
                    String html = pageReader.m1051read().toString();

                    Matcher rangeMatcher = RESULT_RANGE_PATTERN.matcher(html);
                    if (rangeMatcher.find()) {
                        int lastShown = Integer.parseInt(rangeMatcher.group(2));
                        int total = Integer.parseInt(rangeMatcher.group(3).replace(",", ""));
                        lastPageReached = lastShown >= total;
                        offset += RESULTS_PER_PAGE;

                        Matcher rowMatcher = RESULT_ROW_PATTERN.matcher(html);
                        while (rowMatcher.find()) {
                            builder.addRow(parseResultRow(rowMatcher.group(), dateFormat, linkAttribute,
                                    sourceAttribute, titleAttribute, abstractAttribute));
                        }
                    } else {
                        // No result header (no hits, changed layout, blocked request): stop
                        // paging this month instead of re-fetching the same URL forever.
                        lastPageReached = true;
                    }
                } while (!lastPageReached);
            }
        }
        return builder.build();
    }

    /**
     * Builds the search URL for one result page of one month.
     * NOTE(review): query and language are inserted verbatim (not URL-encoded), as before;
     * callers presumably pass an already encoded query -- confirm.
     */
    private static String buildUrl(String query, String language, int year, int month, int offset) {
        return "http://news.google.de/archivesearch?q=" + query + "&num=100&hl=en&lr=lang_" + language
                + "&as_ldate=" + year + "/" + month + "&as_hdate=" + year + "/" + month + "&start=" + offset;
    }

    /**
     * Sleeps for a random duration in {@code [0, maxWaitMillis)}; does nothing when the
     * bound is not positive, because {@link Random#nextInt(int)} rejects such bounds.
     */
    private void waitRandomly(int maxWaitMillis) throws OperatorException {
        if (maxWaitMillis <= 0) {
            return;
        }
        try {
            Thread.sleep(this.random.nextInt(maxWaitMillis));
        } catch (InterruptedException e) {
            // Keep the interruption visible to callers and stop crawling.
            Thread.currentThread().interrupt();
            throw new OperatorException("Interrupted while waiting between two page fetches", e);
        }
    }

    /**
     * Converts the HTML of one result entry into a data row in attribute order
     * (Date, Link, Source, Title, Abstract). Fields that cannot be extracted stay
     * {@code NaN}, i.e. missing.
     */
    private static double[] parseResultRow(String rowHtml, DateFormat dateFormat, Attribute linkAttribute,
            Attribute sourceAttribute, Attribute titleAttribute, Attribute abstractAttribute) {
        double[] row = new double[5];
        for (int column = 0; column < row.length; column++) {
            row[column] = Double.NaN;
        }

        Matcher dateMatcher = DATE_PATTERN.matcher(rowHtml);
        if (dateMatcher.find()) {
            try {
                row[0] = dateFormat.parse(dateMatcher.group(1)).getTime();
            } catch (ParseException ignored) {
                // Best effort: an unparsable date is kept as a missing value.
            }
        }

        Matcher linkMatcher = LINK_PATTERN.matcher(rowHtml);
        if (linkMatcher.find()) {
            row[1] = linkAttribute.getMapping().mapString(linkMatcher.group(1));
        }

        Matcher sourceMatcher = SOURCE_PATTERN.matcher(rowHtml);
        if (sourceMatcher.find()) {
            row[2] = sourceAttribute.getMapping().mapString(sourceMatcher.group(1));
        }

        Matcher titleMatcher = TITLE_PATTERN.matcher(rowHtml);
        if (titleMatcher.find()) {
            row[3] = titleAttribute.getMapping().mapString(StringEscapeUtils.unescapeHtml(titleMatcher.group(1)));
        }

        Matcher abstractMatcher = ABSTRACT_PATTERN.matcher(rowHtml);
        if (abstractMatcher.find()) {
            // NOTE(review): "[^>]?" matches at most one character, so only tags with a
            // single-character name (e.g. <b>, </b>) are stripped -- confirm this is intended.
            String plainText = abstractMatcher.group(1).replaceAll("<[/]*[^>]?>", "");
            row[4] = abstractAttribute.getMapping().mapString(StringEscapeUtils.unescapeHtml(plainText));
        }
        return row;
    }

    /**
     * Adds the query, language, waiting time and year/month range parameters to the
     * parameter types of the super class.
     */
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> parameterTypes = super.getParameterTypes();
        parameterTypes.add(new ParameterTypeString(PARAMETER_QUERY, "The query.", false));
        parameterTypes.add(new ParameterTypeString(PARAMETER_LANGUAGE, "The language.", false));
        parameterTypes.add(new ParameterTypeInt(PARAMETER_MAX_RANDOM_TIME, "The maximal time to wait between two page fetches in ms", 0, 10000, 500));
        // NOTE(review): JetFormat.MAX_RECORD_SIZE looks like a decompiler substitution for a
        // plain numeric lower bound of the year range -- confirm and replace with the literal.
        parameterTypes.add(new ParameterTypeInt(PARAMETER_YEAR_START, "The year to start the crawling", JetFormat.MAX_RECORD_SIZE, 2100, false));
        parameterTypes.add(new ParameterTypeInt(PARAMETER_YEAR_END, "The year to stop the crawling", JetFormat.MAX_RECORD_SIZE, 2100, false));
        parameterTypes.add(new ParameterTypeInt(PARAMETER_MONTH_START, "The month to start the crawling", 1, 12, false));
        parameterTypes.add(new ParameterTypeInt(PARAMETER_MONTH_END, "The month to stop the crawling", 1, 12, false));
        return parameterTypes;
    }
}
