package com.rapidminer.operator.web.services.google;

import java.text.DateFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;

/* loaded from: input_file:com/rapidminer/operator/web/services/google/GoogleArchiveParser.class */
/**
 * Extracts {@link GoogleNews} entries from the HTML text of a result page by
 * applying a fixed set of regular expressions (presumably written against a
 * Google News Archive result page, judging by the class name — the exact page
 * layout is not verifiable from this file).
 *
 * <p>Usage: construct with the page text, call {@link #parse()}, then iterate.
 * Entries for which any of date, title, link, source or abstract text cannot be
 * extracted are dropped; a diagnostic line is written to {@code System.err} for
 * each field that could not be extracted.
 *
 * <p>An instance keeps its results in a plain {@link ArrayList} and is not
 * designed for concurrent use. The shared date formatter, however, is guarded
 * so that separate instances may be used on separate threads.
 */
public class GoogleArchiveParser implements Iterable<GoogleNews> {
    // All patterns are compiled once; DOTALL lets '.' span line breaks inside the markup.

    /** Delimits one news entry: a table with the exact attribute string below. */
    private static final Pattern SINGLE_NEWS_PATTERN = Pattern.compile(
            "<table cellpadding=0 cellspacing=0 border=0 style=\"margin-left: 0em;\">(.*?)</table>", Pattern.DOTALL);
    /** Group 1: the date text following {@code " - "} in the black font element. */
    private static final Pattern DATE_PATTERN = Pattern.compile("size=-1 color=black> - (.*?)</font>", Pattern.DOTALL);
    /** Group 1: the non-whitespace run after the first {@code <a href=}, up to the next space. */
    private static final Pattern LINK_PATTERN = Pattern.compile("<a href=(\\S*?) ", Pattern.DOTALL);
    /** Group 1: the text between the source font element's opening tag and the next {@code <font}. */
    private static final Pattern SOURCE_PATTERN = Pattern.compile(
            "<div class=j><font size=-1 color=666666 class=l>(.*?)<font", Pattern.DOTALL);
    /** Group 1: the content of the first anchor element. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("<a[^>]*?>(.*?)</a>", Pattern.DOTALL);
    /** Group 1: the abstract markup, terminated by {@code </font></div>}. */
    private static final Pattern ABSTRACT_TEXT_PATTERN = Pattern.compile(
            "</font><font size=-1>(.*?)</font></div>", Pattern.DOTALL);
    /**
     * Matches any markup tag inside the abstract. The earlier expression
     * {@code <[/]*[^>]?>} only matched tags with at most one character in the
     * name, so longer tags such as {@code <br>} were left in the text.
     */
    private static final Pattern MARKUP_TAG_PATTERN = Pattern.compile("<[^>]*>");

    /**
     * Medium-style English date format. {@link DateFormat} is not thread-safe,
     * so every use must hold the monitor of this object.
     */
    private static final DateFormat DATE_FORMAT = DateFormat.getDateInstance(DateFormat.MEDIUM, Locale.ENGLISH);

    private final String page;
    private final List<GoogleNews> parsed = new ArrayList<>();

    /**
     * @param str the HTML text to parse; {@code null} is treated as a page
     *            without any entries
     */
    public GoogleArchiveParser(String str) {
        this.page = str;
    }

    /**
     * Discards previous results and scans the page for news entries.
     *
     * @return this parser, so that the call can be chained with iteration
     */
    public GoogleArchiveParser parse() {
        this.parsed.clear();
        if (this.page == null) {
            // Nothing to scan; previously this failed with a NullPointerException.
            return this;
        }
        Matcher entryMatcher = SINGLE_NEWS_PATTERN.matcher(this.page);
        while (entryMatcher.find()) {
            GoogleNews news = parseNewsToken(entryMatcher.group());
            if (news != null) {
                this.parsed.add(news);
            }
        }
        return this;
    }

    /**
     * Builds one news item from the markup of a single entry.
     *
     * @param str the full matched entry markup, including the enclosing table tags
     * @return the populated item, or {@code null} if any field could not be extracted
     */
    private GoogleNews parseNewsToken(String str) {
        // Every field is attempted even after a failure so that each missing
        // field produces its own diagnostic line.
        Date date = parseDate(str);
        String title = parseTitle(str);
        String link = parseLink(str);
        String source = parseSource(str);
        String abstractText = parseAbstractText(str);
        if (date == null || title == null || link == null || source == null || abstractText == null) {
            return null;
        }
        GoogleNews googleNews = new GoogleNews();
        googleNews.setDate(date);
        googleNews.setTitle(title);
        googleNews.setLink(link);
        googleNews.setSource(source);
        googleNews.setAbstractText(abstractText);
        return googleNews;
    }

    /**
     * Returns capture group 1 of the first match of {@code pattern} in
     * {@code input}, or {@code null} when there is no match.
     */
    private static String firstGroup(Pattern pattern, String input) {
        Matcher matcher = pattern.matcher(input);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * @return the entry's date, or {@code null} if no date text is found or it
     *         cannot be parsed by {@link #DATE_FORMAT}
     */
    private Date parseDate(String str) {
        String dateText = firstGroup(DATE_PATTERN, str);
        if (dateText != null) {
            try {
                // The formatter is shared by all instances and is not thread-safe.
                synchronized (DATE_FORMAT) {
                    return DATE_FORMAT.parse(dateText);
                }
            } catch (ParseException e) {
                // Unparseable date: reported below, same as a missing date.
            }
        }
        System.err.println("Can not parse date, returning null");
        return null;
    }

    /** @return the HTML-unescaped content of the first anchor, or {@code null} if there is none */
    private String parseTitle(String str) {
        String title = firstGroup(TITLE_PATTERN, str);
        if (title != null) {
            return StringEscapeUtils.unescapeHtml(title);
        }
        System.err.println("Can not parse title; returning null");
        return null;
    }

    /** @return the raw {@code href} value of the first anchor, or {@code null} if there is none */
    private String parseLink(String str) {
        String link = firstGroup(LINK_PATTERN, str);
        if (link == null) {
            System.err.println("Can not parse link; returning null");
        }
        return link;
    }

    /** @return the raw source text, or {@code null} if the source markup is not found */
    private String parseSource(String str) {
        String source = firstGroup(SOURCE_PATTERN, str);
        if (source == null) {
            System.err.println("Can not parse source; returning null");
        }
        return source;
    }

    /**
     * @return the abstract with markup tags removed and HTML entities
     *         unescaped, or {@code null} if the abstract markup is not found
     */
    private String parseAbstractText(String str) {
        String markup = firstGroup(ABSTRACT_TEXT_PATTERN, str);
        if (markup != null) {
            // Tags are stripped before unescaping so that escaped angle
            // brackets in the text itself are not mistaken for markup.
            String withoutTags = MARKUP_TAG_PATTERN.matcher(markup).replaceAll("");
            return StringEscapeUtils.unescapeHtml(withoutTags);
        }
        System.err.println("Can not parse abstract text; returning null");
        return null;
    }

    /**
     * @return an iterator over the entries found by the most recent
     *         {@link #parse()} call; empty if {@code parse()} was never called
     */
    @Override // java.lang.Iterable
    public Iterator<GoogleNews> iterator() {
        return this.parsed.iterator();
    }
}
