package edu.pitt.dbmi.edda.operator.regexop.document;

import edu.pitt.dbmi.edda.util.FileUtils;
import java.io.File;
import java.io.Serializable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/pitt/dbmi/edda/operator/regexop/document/LabeledDocument.class */
/**
 * A document on disk paired with a label and some bookkeeping fields.
 *
 * <p>The text is loaded lazily by {@link #getContent()} and cached in
 * {@link #documentContent}. That field is {@code transient}, so it is not part of the
 * serialized form and is re-read from {@link #file} on the next {@link #getContent()} call.
 */
public class LabeledDocument implements Serializable {
    private static final long serialVersionUID = 1;

    /** Marker searched for when only the abstract is wanted (presumably a MEDLINE-style "AB" field tag; confirm). */
    private static final String ABSTRACT_MARKER = "AB  -";

    /**
     * Matches one run of whitespace. Compiled once and shared; the flags are the named
     * equivalent of the literal {@code 34} used previously.
     */
    private static final Pattern WHITESPACE_RUN =
            Pattern.compile("\\s+", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** Label attached to this document. */
    public String label;

    /** File the content is read from. */
    public File file;

    /** Cached content; {@code null} until loaded by {@link #getContent()} or after {@link #clearContent()}. */
    public transient String documentContent;

    public Integer documentNumber;
    public String key;

    /** Number of whitespace runs in the loaded content, never less than 1. Only refreshed when content is loaded. */
    public int numberOfSpaceTokens = 1;

    /** When true, {@link #getContent()} keeps only the text following the abstract marker. */
    public Boolean isUsingAbstractsOnly = false;

    /**
     * Returns the document text, loading it from {@link #file} via
     * {@code FileUtils.getContents} on first use and caching the result.
     *
     * <p>Loading also refreshes {@link #numberOfSpaceTokens}. When
     * {@link #isUsingAbstractsOnly} is true, the cached text is reduced to the part that
     * follows the first abstract marker (or left whole if the marker is absent).
     *
     * @return the cached (possibly abstract-only) content
     */
    public String getContent() {
        if (this.documentContent == null) {
            String loaded = FileUtils.getContents(this.file);
            // Boolean.TRUE.equals tolerates a null flag on this public boxed field.
            if (Boolean.TRUE.equals(this.isUsingAbstractsOnly)) {
                loaded = extractAbstractFromContent(loaded);
            }
            this.documentContent = loaded;
            this.numberOfSpaceTokens = Math.max(countSpaceTokens(loaded), 1);
        }
        return this.documentContent;
    }

    /** Drops the cached content so the next {@link #getContent()} call reloads it. */
    public void clearContent() {
        this.documentContent = null;
    }

    /**
     * Returns the text after the first occurrence of the abstract marker.
     *
     * @param str full document text
     * @return the text following the marker, or {@code str} unchanged when the marker is absent
     */
    private String extractAbstractFromContent(String str) {
        int markerStart = str.indexOf(ABSTRACT_MARKER);
        if (markerStart == -1) {
            return str;
        }
        return str.substring(markerStart + ABSTRACT_MARKER.length());
    }

    /**
     * Counts the runs of whitespace in {@code str}.
     *
     * @param str text to scan
     * @return number of non-overlapping whitespace runs
     */
    private int countSpaceTokens(String str) {
        // A single Matcher must be reused: creating a new one per iteration restarts the
        // search at index 0 every time, which never terminates once any whitespace exists.
        Matcher matcher = WHITESPACE_RUN.matcher(str);
        int count = 0;
        while (matcher.find()) {
            count++;
        }
        return count;
    }

    /**
     * @return {@code "<label> ==> <file name>"}; the file name is rendered as {@code null}
     *         when no file is set
     */
    @Override
    public String toString() {
        String fileName = this.file == null ? null : this.file.getName();
        return this.label + " ==> " + fileName;
    }
}
