package org.webdatacommons.webtables.extraction;

import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Optional;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multisets;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.webdatacommons.webtables.extraction.detection.HeaderDetection;
import org.webdatacommons.webtables.extraction.detection.KeyColumnDetection;
import org.webdatacommons.webtables.extraction.model.DocumentMetadata;
import org.webdatacommons.webtables.extraction.stats.HashMapStatsData;
import org.webdatacommons.webtables.extraction.stats.StatsKeeper;
import org.webdatacommons.webtables.extraction.stats.TableStats;
import org.webdatacommons.webtables.extraction.util.LuceneNormalizer;
import org.webdatacommons.webtables.extraction.util.TableConvert;
import org.webdatacommons.webtables.tools.data.Dataset;
import org.webdatacommons.webtables.tools.data.HeaderPosition;
import org.webdatacommons.webtables.tools.data.TableOrientation;
import weka.core.TestInstances;
import weka.core.xml.XMLInstances;

/* loaded from: input_file:org/webdatacommons/webtables/extraction/BasicExtractionAlgorithm.class */
public class BasicExtractionAlgorithm implements ExtractionAlgorithm {
    // NOTE(review): the literal 2 compared against the most common per-row cell count in extract()
    // presumably originates from this constant (inlined by the compiler) - confirm.
    protected static final int TABLE_MIN_COLS = 2;
    // Fraction of (rows * columns) used by doExtract() as the limit on empty cells.
    protected static final double TABLE_MAX_SPARSENESS = 0.49d;
    // Fraction of (rows * columns) used by doExtract() as the limit on cells containing an <a> element.
    protected static final double TABLE_MAX_LINKS = 0.51d;
    // Not referenced by the code visible in this class.
    protected static final double MIN_ATTRIBUTE_SIZE_AVG = 4.0d;
    // Not referenced by the code visible in this class.
    protected static final double MAX_ATTRIBUTE_SIZE_AVG = 20.0d;
    // NOTE(review): the literal 3 compared against the <tr> count in extract() presumably
    // originates from this constant - confirm.
    protected static final int TABLE_MIN_ROWS = 3;
    // NOTE(review): the literal 250 used as word limit for the context before/after a table in
    // extract() presumably originates from these two constants - confirm.
    protected static final int CONTEXT_LENGTH_BEFORE_TABLE = 250;
    protected static final int CONTEXT_LENGTH_AFTER_TABLE = 250;
    // Receives progress reports and counter increments; returned by getStatsKeeper().
    protected StatsKeeper stats;
    // Regex for an opening <table ...> tag carrying a class or id attribute; compiled in the constructor.
    protected Pattern tablePattern1;
    // The following th_filter* / extract_content / extract_part_content / save_reference flags are
    // not read or written by the code visible in this class.
    protected boolean th_filter;
    protected boolean th_filter_strong;
    protected boolean th_filter_mark_only;
    // Set from the constructor argument; when true, termExtractor is created.
    protected boolean extract_terms;
    protected boolean extract_content;
    protected boolean extract_part_content;
    protected boolean save_reference;
    // Only initialised when extract_terms is true; otherwise left null.
    protected LuceneNormalizer termExtractor;
    // Classifier used by extract() to determine the table type.
    TableClassification tc;
    // Converts a table element into an Element[][] grid and transposes it for vertical tables.
    TableConvert myTableConvert;
    // Statistics of the table most recently processed by extract(); overwritten for each extracted table.
    TableStats myTableStats;
    // Whitespace matcher used by cleanCell() to trim and collapse whitespace.
    protected static final CharMatcher cleaner = CharMatcher.WHITESPACE;
    // Not referenced by the code visible in this class.
    protected static final Joiner joiner = Joiner.on(TestInstances.DEFAULT_SEPARATORS).skipNulls();
    // jsoup whitelist passed to Jsoup.clean() in cleanCell().
    protected static final Whitelist whitelist = Whitelist.simpleText();
    // Number of extraction runs performed by main() for its timing output.
    private static int NUM_RUNS = 50;

    /* loaded from: input_file:org/webdatacommons/webtables/extraction/BasicExtractionAlgorithm$TABLE_COUNTERS.class */
    /**
     * Counter keys reported to the {@link StatsKeeper}. The per-constant notes describe where the
     * counter is incremented in this class; constants without a note are not incremented by the
     * code visible here.
     */
    public enum TABLE_COUNTERS {
        // extract(): doExtract() returned an absent result for the table.
        TABLE_NOT_PRESENT,
        // extract(): incremented once for every <table> element encountered.
        TABLES_FOUND,
        // extract(): the table has a <form> ancestor.
        TABLES_INSIDE_FORMS,
        // extract(): the table contains at least one nested <table>.
        NON_LEAF_TABLES,
        // extract(): fewer than 3 <tr> elements, or the most common per-row cell count is below 2.
        SMALL_TABLES,
        // extract(): doExtract() produced a Dataset for the table.
        RELATIONS_FOUND,
        // asRelation(): the number of empty cells exceeded the allowed limit.
        SPARSE_TABLE,
        // asRelation(): the number of cells containing an <a> element exceeded the allowed limit.
        LINK_TABLE,
        CALENDAR_FOUND,
        // extract(): the largest per-row cell count differs from the most common one.
        NON_REGULAR_TABLES,
        LANGDETECT_EXCEPTION,
        ENGLISH,
        NON_ENGLISH,
        TO_MANY_BADWORDS,
        // extract(): the table contains a cell with a colspan or rowspan attribute.
        SPANNING_TD,
        // extract(): the table contains no <th> element.
        NO_HEADERS,
        MORE_THAN_ONE_HEADER,
        SHORT_ATTRIBUTE_NAMES,
        LONG_ATTRIBUTE_NAMES,
        // extract(): a table without <th> elements for which HeaderDetection reported a header.
        NO_HEADERS_CORRECTION
    }

    /**
     * Creates an extraction algorithm.
     *
     * @param statsKeeper sink for progress reports and counter increments
     * @param z whether term extraction is enabled; when {@code true} a {@link LuceneNormalizer}
     *     is created, otherwise {@code termExtractor} stays {@code null}
     * @param tableClassification classifier to use, or {@code null} to create one from the default
     *     {@code SimpleCart_P1.mdl} / {@code SimpleCart_P2.mdl} model paths
     */
    public BasicExtractionAlgorithm(StatsKeeper statsKeeper, boolean z, TableClassification tableClassification) {
        this.tablePattern1 = Pattern.compile("<table(.*)(class=\"(.*)\"|id=\"(.*)\")(.*)?>");
        // Build the default classifier only when none was supplied, so that callers passing their
        // own classifier do not pay for (or fail on) constructing one that is immediately discarded.
        this.tc = tableClassification != null
                ? tableClassification
                : new TableClassification("/org/webdatacommons/webtables/SimpleCart_P1.mdl", "/org/webdatacommons/webtables/SimpleCart_P2.mdl");
        this.myTableConvert = new TableConvert(2, 2);
        this.stats = statsKeeper;
        this.extract_terms = z;
        if (this.extract_terms) {
            this.termExtractor = new LuceneNormalizer();
        }
    }

    /**
     * Creates an extraction algorithm that uses the default table classifier.
     *
     * @param statsKeeper sink for progress reports and counter increments
     * @param z whether term extraction is enabled
     */
    public BasicExtractionAlgorithm(StatsKeeper statsKeeper, boolean z) {
        this(statsKeeper, z, null);
    }

    /**
     * Scans every {@code <table>} element of the document and returns one {@link Dataset} for each
     * table that passes the structural filters.
     *
     * <p>A table is skipped (and the matching {@link TABLE_COUNTERS} counter incremented) when it
     * has a {@code <form>} ancestor, contains a nested table, has fewer than
     * {@link #TABLE_MIN_ROWS} rows, has a most common per-row cell count below
     * {@link #TABLE_MIN_COLS}, has rows with more cells than the most common count, uses
     * {@code colspan}/{@code rowspan}, or is rejected by {@link #doExtract}.
     *
     * <p>For every accepted table the dataset is populated with the document metadata, caption,
     * page title, table type, orientation, up to {@link #CONTEXT_LENGTH_BEFORE_TABLE} /
     * {@link #CONTEXT_LENGTH_AFTER_TABLE} words of surrounding text, paragraphs that look like they
     * contain a time stamp, key column information and, for tables without {@code <th>} cells, the
     * result of content-based header detection.
     *
     * @param document parsed HTML document to scan
     * @param documentMetadata source information copied into every returned dataset
     * @return the extracted datasets in document order; empty when no table qualifies
     * @throws IOException declared by the {@link ExtractionAlgorithm} contract
     * @throws InterruptedException declared by the {@link ExtractionAlgorithm} contract
     */
    @Override // org.webdatacommons.webtables.extraction.ExtractionAlgorithm
    public List<Dataset> extract(Document document, DocumentMetadata documentMetadata) throws IOException, InterruptedException {
        List<Dataset> datasets = new ArrayList<>();
        int tableNum = -1;
        for (Element table : document.getElementsByTag("table")) {
            this.stats.reportProgress();
            tableNum++;
            this.stats.incCounter(TABLE_COUNTERS.TABLES_FOUND);

            // Each rejection below moves on to the next table; every table is examined exactly once.
            if (isInsideForm(table)) {
                this.stats.incCounter(TABLE_COUNTERS.TABLES_INSIDE_FORMS);
                continue;
            }

            // getElementsByTag includes the table itself, so drop it before checking for nesting.
            Elements nestedTables = table.getElementsByTag("table");
            nestedTables.remove(table);
            if (nestedTables.size() > 0) {
                this.stats.incCounter(TABLE_COUNTERS.NON_LEAF_TABLES);
                continue;
            }

            Elements rows = table.getElementsByTag("tr");
            if (rows.size() < TABLE_MIN_ROWS) {
                this.stats.incCounter(TABLE_COUNTERS.SMALL_TABLES);
                continue;
            }

            // Determine the largest and the most frequent number of cells per row.
            int maxCells = 0;
            HashMultiset<Integer> cellCounts = HashMultiset.create();
            for (Element row : rows) {
                int cells = row.select("td, th").size();
                cellCounts.add(Integer.valueOf(cells));
                maxCells = Math.max(maxCells, cells);
            }
            int typicalCells = Multisets.copyHighestCountFirst(cellCounts).entrySet().iterator().next().getElement().intValue();
            if (typicalCells < TABLE_MIN_COLS) {
                this.stats.incCounter(TABLE_COUNTERS.SMALL_TABLES);
                continue;
            }
            if (maxCells != typicalCells) {
                this.stats.incCounter(TABLE_COUNTERS.NON_REGULAR_TABLES);
                continue;
            }

            if (table.select("td[colspan], th[colspan]").size() > 0 || table.select("td[rowspan], th[rowspan]").size() > 0) {
                this.stats.incCounter(TABLE_COUNTERS.SPANNING_TD);
                continue;
            }

            boolean hasHeaderCells = table.select("th").size() != 0;
            if (!hasHeaderCells) {
                this.stats.incCounter(TABLE_COUNTERS.NO_HEADERS);
            }

            Optional<Dataset> extracted = doExtract(table, rows, typicalCells);
            if (!extracted.isPresent()) {
                this.stats.incCounter(TABLE_COUNTERS.TABLE_NOT_PRESENT);
                continue;
            }

            Dataset dataset = extracted.get();
            dataset.tableNum = tableNum;
            dataset.s3Link = documentMetadata.getS3Link();
            dataset.recordOffset = documentMetadata.getStart();
            dataset.recordEndOffset = documentMetadata.getEnd();
            dataset.url = documentMetadata.getUrl();
            dataset.lastModified = documentMetadata.getLastModified();
            dataset.hasHeader = Boolean.valueOf(hasHeaderCells);

            Elements captions = table.select("caption");
            if (captions.size() == 1) {
                dataset.setTitle(cleanCell(captions.get(0).text()));
            }
            dataset.setPageTitle(document.title());
            this.stats.incCounter(TABLE_COUNTERS.RELATIONS_FOUND);

            dataset.setTableType(this.tc.classifyTable(table).getTableType());
            Element[][] cells = this.myTableConvert.toTable(table).get();
            dataset.setTableOrientation(tableOrientation(table, dataset.getHeaderPosition().toString(), dataset.getTableType().toString(), cells));

            String bodyHtml = document.select(XMLInstances.TAG_BODY).toString();
            int tableOffset = findTableOffset(bodyHtml, table);
            attachTextContext(dataset, table, bodyHtml);
            attachTimestampContext(dataset, document, bodyHtml, tableOffset);

            if (dataset.getTableOrientation() == TableOrientation.VERTICAL) {
                cells = this.myTableConvert.transpose(cells, table);
            }
            this.myTableStats = new TableStats(cells[1].length, cells.length, cells);

            KeyColumnDetection keyColumn = new KeyColumnDetection(this.myTableStats).keyColumnDetection();
            if (keyColumn.isHasKeyColumn()) {
                dataset.setHasKeyColumn(true);
            } else {
                dataset.setHasKeyColumn(false);
            }
            dataset.setKeyColumnIndex(keyColumn.getKeyColumnIndex());

            // Tables without <th> cells get a second chance through content-based header detection.
            if (!dataset.hasHeader.booleanValue()) {
                HeaderDetection header = new HeaderDetection(this.myTableStats).HeaderDetectionBasedOnCellContentPattern();
                if (header.isHasHeader()) {
                    dataset.setHasHeader(true);
                    dataset.setHeaderPosition(HeaderPosition.FIRST_ROW);
                    dataset.setHeaderRowIndex(header.getRowIndex());
                    this.stats.incCounter(TABLE_COUNTERS.NO_HEADERS_CORRECTION);
                } else {
                    dataset.setHasHeader(false);
                    dataset.setHeaderPosition(HeaderPosition.NONE);
                    dataset.setHeaderRowIndex(header.getRowIndex());
                }
            }
            datasets.add(dataset);
        }
        return datasets;
    }

    /** Returns whether any ancestor of the table is a {@code <form>} element. */
    private static boolean isInsideForm(Element table) {
        for (Element ancestor : table.parents()) {
            if (ancestor.tagName().equals("form")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns the offset within {@code bodyHtml} of the first opening table tag (with a class or id
     * attribute) whose text contains the table's class name or id, or 0 when there is none.
     */
    private int findTableOffset(String bodyHtml, Element table) {
        Matcher matcher = this.tablePattern1.matcher(bodyHtml);
        while (matcher.find()) {
            String tag = matcher.group();
            if (tag.length() != 0 && (tag.contains(table.className().trim()) || tag.contains(table.id().trim()))) {
                return matcher.start();
            }
        }
        return 0;
    }

    /**
     * Fills {@code textBeforeTable} and {@code textAfterTable} of the dataset with the words found
     * on the body lines preceding and following the table, stripped of markup.
     */
    private void attachTextContext(Dataset dataset, Element table, String bodyHtml) {
        String[] bodyLines = bodyHtml.split("\n");
        String[] tableLines = table.toString().split("\n");
        String firstTableLine = tableLines[0].trim();

        // Locate the table by searching for its first rendered line in the body markup.
        int firstLine = 0;
        for (int i = 0; i < bodyLines.length; i++) {
            if (bodyLines[i].trim().contains(firstTableLine)) {
                firstLine = i;
                break;
            }
        }
        int lastLine = (tableLines.length + firstLine) - 1;

        // Walk away from the table in both directions and stop as soon as the word limit is hit;
        // line 0 is intentionally not visited by the backward walk.
        List<String> wordsBefore = new ArrayList<>();
        for (int i = firstLine - 1; i > 0; i--) {
            if (addContextWords(bodyLines[i], wordsBefore, CONTEXT_LENGTH_BEFORE_TABLE)) {
                break;
            }
        }
        List<String> wordsAfter = new ArrayList<>();
        for (int i = lastLine; i < bodyLines.length; i++) {
            if (addContextWords(bodyLines[i], wordsAfter, CONTEXT_LENGTH_AFTER_TABLE)) {
                break;
            }
        }

        for (String word : wordsBefore) {
            dataset.textBeforeTable += word + TestInstances.DEFAULT_SEPARATORS;
        }
        dataset.textBeforeTable = Jsoup.parse(dataset.textBeforeTable).text();
        for (String word : wordsAfter) {
            dataset.textAfterTable += word + TestInstances.DEFAULT_SEPARATORS;
        }
        dataset.textAfterTable = Jsoup.parse(dataset.textAfterTable).text();
    }

    /**
     * Appends the non-empty tokens of {@code line} to {@code sink} until {@code limit} entries have
     * been collected.
     *
     * @return {@code true} once {@code sink} holds {@code limit} entries
     */
    private static boolean addContextWords(String line, List<String> sink, int limit) {
        for (String word : line.trim().split(TestInstances.DEFAULT_SEPARATORS)) {
            if (!word.isEmpty() && !word.equals(TestInstances.DEFAULT_SEPARATORS)) {
                sink.add(word.trim());
                if (sink.size() >= limit) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Stores, keyed by their character distance to the table, the texts of all paragraphs that
     * {@link #TableContextContaningTimeStamp} selects; paragraphs located at or before
     * {@code tableOffset} go to the "before" value, the remaining ones to the "after" value.
     * Both values are {@code null} when the document has no such paragraph.
     */
    private void attachTimestampContext(Dataset dataset, Document document, String bodyHtml, int tableOffset) {
        Elements paragraphs = document.getElementsByTag("p");
        HashSet<Element> stamped = paragraphs.isEmpty() ? new HashSet<Element>() : TableContextContaningTimeStamp(paragraphs);
        if (stamped.isEmpty()) {
            dataset.setTableContextTimeStampBeforeTable(null);
            dataset.setTableContextTimeStampAfterTable(null);
            return;
        }

        HashMap<Integer, String> before = new HashMap<>();
        HashMap<Integer, String> after = new HashMap<>();
        for (Element paragraph : stamped) {
            int distance = tableOffset - bodyHtml.indexOf(paragraph.toString());
            if (distance < 0) {
                after.put(Integer.valueOf(-distance), paragraph.text());
            } else {
                before.put(Integer.valueOf(distance), paragraph.text());
            }
        }

        // A side is reported as null only when it is empty while the other side has entries.
        boolean noneBefore = before.isEmpty();
        boolean noneAfter = after.isEmpty();
        dataset.setTableContextTimeStampBeforeTable(noneBefore && !noneAfter ? null : before.toString());
        dataset.setTableContextTimeStampAfterTable(noneAfter && !noneBefore ? null : after.toString());
    }

    /**
     * Walks up from the given element and returns the first ancestor whose tag is not one of
     * {@code tbody}, {@code tr}, {@code td} or {@code table}.
     *
     * @param element element whose ancestors are inspected
     * @return the closest ancestor that is not a table-structure element
     */
    protected Element findProperParentTag(Element element) {
        List<String> tableStructureTags = Arrays.asList("tbody", "tr", "td", "table");
        Element ancestor = element.parent();
        while (tableStructureTags.contains(ancestor.tagName())) {
            ancestor = ancestor.parent();
        }
        return ancestor;
    }

    /**
     * Date, month-name, clock-time and year expressions that mark an element's text as containing
     * a time stamp. Compiled once because {@link Pattern} instances are immutable and reusable.
     * NOTE(review): the fourth expression contains a literal tab ({@code 0\t?}) where the
     * neighbouring expressions use {@code 0?}; kept unchanged - confirm whether that is intended.
     */
    private static final Pattern[] TIMESTAMP_PATTERNS = {
        Pattern.compile("[0-3]?[0-9][\\-|\\|\\.][0-3]?[0-9][\\-|\\|\\.](?:[0-9]{2})?[0-9]{2}(?=(\\s|\\.|\n|$|\\,))"),
        Pattern.compile("([J|j]an(?:uary)?|[F|f]eb(?:ruary)?|[M|m]ar(?:ch)?|[A|a]pr(?:il)?|May|[J|j]un(?:e)?|[J|j]ul(?:y)?|[A|a]ug(?:ust)?|[S|s]ep(?:tember)?|[O|o]ct(?:ober)?|[N|n]ov(?:ember)?|[D|d]ec(?:ember)?)(?=(\\s|\\.|\n|$|\\,))"),
        Pattern.compile("(1[0-2]|0?[1-9]):([0-5]?[0-9])(\\s)?([AP]M)?(?=(\\s|\n|\\.|$|\\,))"),
        Pattern.compile("(1[0-2]|0\t?[1-9]):([0-5]?[0-9]):([0-5]?[0-9])(\\s)?([AP]M)?(?=(\\s|\n|\\.|$|\\,))"),
        Pattern.compile("(([1-2][0-9]{3})(\\s?(BC|AC|B.C.|A.C.)?)|([0-9]{3})\\s?(BC|AC|B.C.|A.C.)|([0-9]{2})\\s?(BC|AC|B.C.|A.C.)|([0-9]{1})\\s?(BC|AC|B.C.|A.C.))(?=(\\s|\n|\\.|$|\\,|-|\\/))"),
        Pattern.compile("(2[0-3]|[01]?[0-9]):([0-5]?[0-9])(?=(\\s|\n|\\.|$|\\,))"),
        Pattern.compile("(2[0-3]|[01]?[0-9]):([0-5]?[0-9]):([0-5]?[0-9])(?=(\\s|\n|\\.|$|\\,))"),
        Pattern.compile("(1[0-2]|0?[1-9]):([0-5]?[0-9])(\\s)?(H|h)?(?=(\\s|\n|\\.|$|\\,))"),
        Pattern.compile("(([1]?[0-9]|[2]?[0-4])\\s?([O|o]['|`][C|c]lock))(?=(\\s|\n|\\.|$|\\,))")
    };

    /**
     * Selects the elements whose text matches at least one of the time stamp expressions.
     *
     * @param elements candidate elements (extract() passes the document's {@code <p>} elements)
     * @return the matching elements; empty when nothing matches
     */
    protected HashSet<Element> TableContextContaningTimeStamp(Elements elements) {
        HashSet<Element> stamped = new HashSet<>();
        for (Element element : elements) {
            String text = element.text();
            for (Pattern pattern : TIMESTAMP_PATTERNS) {
                // One hit is enough: the result is a set, so further matches cannot change it.
                if (hasNonEmptyMatch(pattern, text)) {
                    stamped.add(element);
                    break;
                }
            }
        }
        return stamped;
    }

    /** Returns whether the pattern matches a non-empty substring somewhere in the text. */
    private static boolean hasNonEmptyMatch(Pattern pattern, String text) {
        Matcher matcher = pattern.matcher(text);
        while (matcher.find()) {
            if (matcher.group().length() != 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Derives the empty-cell and link-cell limits from the table size and delegates to
     * {@link #asRelation}.
     *
     * @param element the table element (not used by this implementation)
     * @param elements the table's rows
     * @param i number of columns
     * @return the extracted dataset, or absent when {@link #asRelation} rejects the table
     */
    protected Optional<Dataset> doExtract(Element element, Elements elements, int i) {
        final int cellTotal = i * elements.size();
        final int emptyCellLimit = (int) (cellTotal * TABLE_MAX_SPARSENESS);
        final int linkCellLimit = (int) (cellTotal * TABLE_MAX_LINKS);
        return asRelation(elements, i, emptyCellLimit, linkCellLimit);
    }

    /**
     * Copies the cleaned cell texts of the rows into a column-major {@code String[columns][rows]}
     * relation and wraps it in a {@link Dataset}.
     *
     * <p>After each row the running totals are checked: once more than {@code i2} cells are empty
     * the table is counted as {@code SPARSE_TABLE}, once more than {@code i3} cells contain an
     * {@code <a>} element it is counted as {@code LINK_TABLE}; in both cases nothing is returned.
     *
     * @param elements the table's rows
     * @param i number of columns of the relation; shorter rows are padded with empty strings
     * @param i2 maximum tolerated number of empty cells
     * @param i3 maximum tolerated number of cells containing a link
     * @return the dataset holding the relation and header position, or absent when rejected
     */
    protected Optional<Dataset> asRelation(Elements elements, int i, int i2, int i3) {
        int emptyCells = 0;
        int linkCells = 0;
        final int rowCount = elements.size();
        final String[][] relation = new String[i][rowCount];

        for (int row = 0; row < rowCount; row++) {
            Elements cells = elements.get(row).select("td, th");
            final int cellCount = cells.size();
            int col;
            for (col = 0; col < cellCount; col++) {
                Element cell = cells.get(col);
                if (!cell.select("a").isEmpty()) {
                    linkCells++;
                }
                String text = cleanCell(cell.text());
                if (text.isEmpty()) {
                    emptyCells++;
                }
                relation[col][row] = text;
            }
            // Pad rows that have fewer cells than the relation has columns.
            for (; col < i; col++) {
                relation[col][row] = "";
            }

            if (emptyCells > i2) {
                this.stats.incCounter(TABLE_COUNTERS.SPARSE_TABLE);
                return Optional.absent();
            }
            if (linkCells > i3) {
                this.stats.incCounter(TABLE_COUNTERS.LINK_TABLE);
                return Optional.absent();
            }
        }

        Dataset result = new Dataset();
        result.relation = relation;
        result.headerPosition = headerPosition(elements);
        return Optional.of(result);
    }

    /**
     * Determines the orientation of a table.
     *
     * <p>The header position decides first ({@code "FIRST_COLUMN"} gives VERTICAL,
     * {@code "FIRST_ROW"} gives HORIZONTAL), then the table type ({@code "MATRIX"} gives MIXED).
     * Only when neither applies are the cell text lengths of at most the first 10 rows and
     * 10 columns compared: the table is HORIZONTAL when the average standard deviation of the
     * lengths within a column does not exceed the average standard deviation within a row,
     * otherwise VERTICAL.
     *
     * @param element the table element (not used by this implementation)
     * @param str header position name
     * @param str2 table type name
     * @param elementArr table cells indexed {@code [row][column]}; {@code null} cells count as
     *     length 0. The column count is read from the second row.
     * @return the detected orientation
     */
    protected TableOrientation tableOrientation(Element element, String str, String str2, Element[][] elementArr) {
        // Header position and table type take precedence, so settle those before doing any
        // statistics (which would otherwise be computed and thrown away).
        if (str.equals("FIRST_COLUMN")) {
            return TableOrientation.VERTICAL;
        }
        if (str.equals("FIRST_ROW")) {
            return TableOrientation.HORIZONTAL;
        }
        if (str2.equals("MATRIX")) {
            return TableOrientation.MIXED;
        }

        final int sampleLimit = 10;
        final int rows = Math.min(elementArr.length, sampleLimit);
        final int cols = Math.min(elementArr[1].length, sampleLimit);

        // Average, over the sampled rows, of the deviation of cell lengths within one row.
        double[] lengthsInRow = new double[cols];
        double rowDeviationSum = 0.0d;
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < cols; c++) {
                lengthsInRow[c] = textLength(elementArr[r][c]);
            }
            rowDeviationSum += StandardDeviation(lengthsInRow);
        }
        double avgRowDeviation = rowDeviationSum / rows;

        // Average, over the sampled columns, of the deviation of cell lengths within one column.
        double[] lengthsInColumn = new double[rows];
        double columnDeviationSum = 0.0d;
        for (int c = 0; c < cols; c++) {
            for (int r = 0; r < rows; r++) {
                lengthsInColumn[r] = textLength(elementArr[r][c]);
            }
            columnDeviationSum += StandardDeviation(lengthsInColumn);
        }
        double avgColumnDeviation = columnDeviationSum / cols;

        return avgColumnDeviation <= avgRowDeviation ? TableOrientation.HORIZONTAL : TableOrientation.VERTICAL;
    }

    /** Returns the length of the cell's text, or 0 for a missing cell. */
    private static double textLength(Element cell) {
        return cell == null ? 0.0d : cell.text().length();
    }

    /**
     * Computes the sample standard deviation (divisor {@code n - 1}) of the given values.
     *
     * @param dArr the values
     * @return the square root of the summed squared deviations from the mean divided by
     *     {@code dArr.length - 1}; {@code NaN} for a single value
     */
    protected double StandardDeviation(double[] dArr) {
        final int count = dArr.length;

        double total = 0.0d;
        for (int i = 0; i < count; i++) {
            total += dArr[i];
        }
        final double mean = total / count;

        double squaredDeviationSum = 0.0d;
        for (int i = 0; i < count; i++) {
            squaredDeviationSum += Math.pow(dArr[i] - mean, 2.0d);
        }
        return Math.sqrt(squaredDeviationSum / (count - 1));
    }

    /**
     * Classifies where {@code <th>} cells appear in the table.
     *
     * <p>The first row counts as a header row when all of its children except the first are
     * {@code th}; the first column counts as a header column when every row after the first starts
     * with a {@code th} child.
     *
     * @param elements the table's rows; must contain at least one row
     * @return MIXED when both hold, FIRST_ROW or FIRST_COLUMN when only one holds, otherwise NONE
     */
    protected HeaderPosition headerPosition(Elements elements) {
        Elements firstRowCells = elements.get(0).children();
        boolean headerRow = true;
        for (int col = 1; headerRow && col < firstRowCells.size(); col++) {
            headerRow = firstRowCells.get(col).tag().getName().equals("th");
        }

        boolean headerColumn = true;
        final int rowCount = elements.size();
        for (int row = 1; headerColumn && row < rowCount; row++) {
            Elements cells = elements.get(row).children();
            headerColumn = cells.size() != 0 && cells.get(0).tag().getName().equals("th");
        }

        if (headerRow && headerColumn) {
            return HeaderPosition.MIXED;
        }
        if (headerRow) {
            return HeaderPosition.FIRST_ROW;
        }
        if (headerColumn) {
            return HeaderPosition.FIRST_COLUMN;
        }
        return HeaderPosition.NONE;
    }

    /**
     * Normalises a cell text: runs it through {@code Jsoup.clean} with the simple-text whitelist,
     * unescapes HTML entities, then trims it and collapses whitespace runs to a single space.
     *
     * @param str raw cell text
     * @return the cleaned text
     */
    protected static String cleanCell(String str) {
        String sanitized = Jsoup.clean(str, whitelist);
        String unescaped = StringEscapeUtils.unescapeHtml4(sanitized);
        return cleaner.trimAndCollapseFrom(unescaped, ' ');
    }

    /**
     * Returns the stats keeper this algorithm reports its progress and counters to.
     *
     * @return the stats keeper passed to the constructor
     */
    @Override // org.webdatacommons.webtables.extraction.ExtractionAlgorithm
    public StatsKeeper getStatsKeeper() {
        return stats;
    }

    /**
     * Manual benchmark: downloads a fixed page once, runs the extraction {@code NUM_RUNS} times on
     * it, prints every extracted dataset and finally the average time per run in milliseconds.
     *
     * @param strArr ignored
     * @throws MalformedURLException if the hard-coded URL is invalid
     * @throws IOException if the page cannot be downloaded or parsed
     * @throws InterruptedException propagated from {@link #extract}
     */
    public static void main(String[] strArr) throws MalformedURLException, IOException, InterruptedException {
        BasicExtractionAlgorithm algorithm = new BasicExtractionAlgorithm(new HashMapStatsData(), true, null);
        for (String str : new String[]{"https://en.wikipedia.org/wiki/BRIC"}) {
            // Buffer the page once: a network stream can only be consumed a single time, so parsing
            // it directly inside the run loop would feed every run after the first an empty input.
            byte[] html;
            try (InputStream in = new URL(str).openStream()) {
                java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
                byte[] chunk = new byte[8192];
                int read;
                while ((read = in.read(chunk)) != -1) {
                    buffer.write(chunk, 0, read);
                }
                html = buffer.toByteArray();
            }
            long nanoTime = System.nanoTime();
            for (int i = 0; i < NUM_RUNS; i++) {
                Document document = Jsoup.parse(new java.io.ByteArrayInputStream(html), (String) null, "");
                for (Dataset dataset : algorithm.extract(document, new DocumentMetadata(0L, 0L, "", "", ""))) {
                    System.out.println(Arrays.deepToString(dataset.relation));
                    System.out.println("Has Header: " + dataset.getHasHeader());
                    System.out.println("Header Index: " + dataset.getHeaderRowIndex());
                    System.out.println("Header Position: " + dataset.getHeaderPosition());
                    System.out.println("Table Orientation: " + dataset.getTableOrientation());
                    System.out.println("Table Context Before Table(Time Stamp): " + dataset.getTableContextTimeStampBeforeTable());
                    System.out.println("Table Context After Table(Time Stamp): " + dataset.getTableContextTimeStampAfterTable());
                    System.out.println("Table Context Before Table: " + dataset.getTextBeforeTable());
                    System.out.println("Table Context After Table: " + dataset.getTextAfterTable());
                    System.out.println("Has Key Column: " + dataset.getHasKeyColumn());
                    System.out.println("Key Column Index: " + dataset.getKeyColumnIndex());
                    System.out.println("Table Type: " + dataset.getTableType());
                    System.out.println("Page Title: " + dataset.getPageTitle());
                    System.out.println("Page URL: " + dataset.getUrl());
                    System.out.println("Table Number: " + dataset.getTableNum());
                    System.out.println("Table Caption: " + dataset.getTitle());
                    System.out.println("");
                    System.out.println("-----------------------------------------------------------------------------------------");
                    System.out.println("");
                }
            }
            System.out.println("Time: " + ((((float) (System.nanoTime() - nanoTime)) / NUM_RUNS) / 1000000.0f));
        }
    }
}
