package org.webdatacommons.webtables.extraction.util;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multisets;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.webdatacommons.webtables.tools.cleaning.CustomAnalyzer;

/* loaded from: input_file:org/webdatacommons/webtables/extraction/util/LuceneNormalizer.class */
/**
 * Splits free text into terms by running it through a Lucene {@link Analyzer}
 * and exposes the resulting terms either as a plain set or as the N most
 * frequent terms.
 *
 * <p>The exact tokenization rules (lower-casing, stop words, stemming, ...)
 * are whatever the configured {@link #analyzer} implements; this class only
 * drives the token stream and collects its output.
 */
public class LuceneNormalizer {
    /** Not read by any method of this class; kept for subclasses. */
    protected Set<String> stopWords;

    /** Not read by any method of this class; kept for subclasses. */
    protected int minTermLength = 0;

    /** Analyzer used to tokenize every input string. */
    Analyzer analyzer = new CustomAnalyzer();

    /**
     * Returns the distinct terms the analyzer produces for the given text.
     *
     * @param str text to tokenize; {@code null} is treated like the empty string
     * @return a new, mutable set of distinct terms; empty if {@code str} is
     *         {@code null} or empty
     * @throws IOException if the underlying token stream fails
     */
    public Set<String> setOfTerms(String str) throws IOException {
        return new HashSet<>(countTerms(str).elementSet());
    }

    /**
     * Returns up to {@code i} distinct terms of the given text, chosen in order
     * of descending occurrence count.
     *
     * @param str text to tokenize; {@code null} is treated like the empty string
     * @param i   maximum number of terms to return; {@code 0} yields an empty
     *            set, and a negative value imposes no limit (all distinct terms
     *            are returned)
     * @return a new, mutable set holding the selected terms; the set itself does
     *         not retain the frequency ordering
     * @throws IOException if the underlying token stream fails
     */
    public Set<String> topNTerms(String str, int i) throws IOException {
        Set<String> topTerms = new HashSet<>();
        HashMultiset<String> termCounts = countTerms(str);
        // Counts down towards zero; a negative start never reaches zero, which
        // is what makes a negative limit mean "no limit".
        int remaining = i;
        for (String term : Multisets.copyHighestCountFirst(termCounts).elementSet()) {
            if (remaining == 0) {
                break;
            }
            topTerms.add(term);
            remaining--;
        }
        return topTerms;
    }

    /**
     * Tokenizes the text with {@link #analyzer} and counts how often each term occurs.
     *
     * @param str text to tokenize; may be {@code null}
     * @return a new multiset of terms; empty if {@code str} is {@code null} or
     *         empty (the analyzer is not invoked in that case)
     * @throws IOException if the underlying token stream fails
     */
    private HashMultiset<String> countTerms(String str) throws IOException {
        HashMultiset<String> termCounts = HashMultiset.create();
        if (str == null || str.length() == 0) {
            return termCounts;
        }
        // try-with-resources guarantees close() even when reset() or
        // incrementToken() throws, so the stream is never left open.
        try (TokenStream tokenStream = this.analyzer.tokenStream((String) null, new StringReader(str))) {
            CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                // The attribute instance is reused across tokens, so copy its
                // current text out as a String before advancing.
                termCounts.add(termAttribute.toString());
            }
            tokenStream.end();
        }
        return termCounts;
    }
}
