package org.apache.tika.parser.pdf;

import com.rapidminer.operator.web.features.construction.WebserviceBasedAttributeConstruction;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSDictionary;
import org.apache.pdfbox.cos.COSInteger;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSObject;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot;
import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.text.PDFMarkedContentExtractor;
import org.apache.pdfbox.text.TextPosition;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.xmpbox.XmpConstants;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:org/apache/tika/parser/pdf/PDFMarkedContent2XHTML.class */
public class PDFMarkedContent2XHTML extends PDF2XHTML {
    /** Maximum nesting depth tolerated while walking the structure tree. */
    private static final int MAX_RECURSION_DEPTH = 1000;

    /** Element name used when a structure type has no dedicated HTML tag. */
    private static final String DIV = "div";

    /** Lower-cased structure type name to the HTML tag emitted for it; filled by the static initializer. */
    private static final Map<String, HtmlTag> COMMON_TAG_MAP = new HashMap<>();

    /** Mutable extraction state; assigned once in the constructor. */
    private final State state;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/tika/parser/pdf/PDFMarkedContent2XHTML$HtmlTag.class */
    public static class HtmlTag {
        /** Element name, e.g. {@code "p"} or {@code "div"}. */
        private final String tag;
        /** Value for the {@code class} attribute; empty when none applies. */
        private final String clazz;

        /** Creates a tag with an empty element name and an empty class. */
        HtmlTag() {
            this("");
        }

        /** Creates a tag with the given element name and an empty class. */
        HtmlTag(String str) {
            this(str, "");
        }

        /**
         * @param str  element name
         * @param str2 class attribute value
         */
        HtmlTag(String str, String str2) {
            this.tag = str;
            this.clazz = str2;
        }

        /** Two tags are equal when they have the same runtime class, element name and class value. */
        @Override
        public boolean equals(Object obj) {
            if (obj == this) {
                return true;
            }
            if (obj == null || obj.getClass() != getClass()) {
                return false;
            }
            HtmlTag other = (HtmlTag) obj;
            return Objects.equals(tag, other.tag) && Objects.equals(clazz, other.clazz);
        }

        /** Hash consistent with {@link #equals(Object)}; a null component contributes 0. */
        @Override
        public int hashCode() {
            int tagHash = Objects.hashCode(tag);
            int clazzHash = Objects.hashCode(clazz);
            return 31 * tagHash + clazzHash;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/tika/parser/pdf/PDFMarkedContent2XHTML$MCID.class */
    public static class MCID {
        /** Reference to the page object this marked-content id belongs to. */
        private final ObjectRef objectRef;
        /** Marked-content identifier as read from the PDF. */
        private final int mcid;

        /**
         * @param objectRef reference of the owning page object
         * @param i         marked-content identifier
         */
        public MCID(ObjectRef objectRef, int i) {
            this.objectRef = objectRef;
            this.mcid = i;
        }

        /** Equal when the runtime class, the identifier and the page reference all match. */
        @Override
        public boolean equals(Object obj) {
            if (obj == this) {
                return true;
            }
            if (obj == null || obj.getClass() != getClass()) {
                return false;
            }
            MCID other = (MCID) obj;
            if (mcid != other.mcid) {
                return false;
            }
            return Objects.equals(objectRef, other.objectRef);
        }

        /** Hash consistent with {@link #equals(Object)}. */
        @Override
        public int hashCode() {
            return Objects.hash(objectRef, mcid);
        }

        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("MCID{objectRef=");
            text.append(objectRef);
            text.append(", mcid=");
            text.append(mcid);
            text.append('}');
            return text.toString();
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/tika/parser/pdf/PDFMarkedContent2XHTML$ObjectRef.class */
    public static class ObjectRef {
        /** Object number of the referenced PDF object. */
        private final long objId;
        /** Second component of the reference (callers in this file pass the generation number). */
        private final int version;

        /**
         * @param j object number
         * @param i version / generation number
         */
        public ObjectRef(long j, int i) {
            this.objId = j;
            this.version = i;
        }

        /** Equal when the runtime class, object number and version all match. */
        @Override
        public boolean equals(Object obj) {
            if (obj == this) {
                return true;
            }
            if (obj == null || obj.getClass() != getClass()) {
                return false;
            }
            ObjectRef other = (ObjectRef) obj;
            if (objId != other.objId) {
                return false;
            }
            return version == other.version;
        }

        /** Hash consistent with {@link #equals(Object)}. */
        @Override
        public int hashCode() {
            return Objects.hash(objId, version);
        }

        @Override
        public String toString() {
            StringBuilder text = new StringBuilder("ObjectRef{objId=");
            text.append(objId);
            text.append(", version=");
            text.append(version);
            text.append('}');
            return text.toString();
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/tika/parser/pdf/PDFMarkedContent2XHTML$State.class */
    public static class State {
        /** Marked-content ids whose text has already been written out. */
        Set<MCID> processedMCIDs = new HashSet<>();
        /** True while the walk is inside a link structure element. */
        boolean inLink = false;
        int tableDepth = 0;
        /** Accumulates the text of the link currently being walked. */
        private StringBuilder hrefAnchorBuilder = new StringBuilder();
        /** Target of the current link, or null when none has been seen. */
        private String uri = null;
        private int tdDepth = 0;

        /** Only the enclosing class creates instances; all fields start at their declared defaults. */
        private State() {
        }
    }

    /**
     * Builds an extractor by passing every argument straight to the superclass
     * constructor and then creating a fresh {@link State}.
     * Private: instances are only created by {@code process}.
     *
     * @throws IOException propagated from the superclass constructor
     */
    private PDFMarkedContent2XHTML(PDDocument pDDocument, ContentHandler contentHandler, ParseContext parseContext, Metadata metadata, PDFParserConfig pDFParserConfig) throws IOException {
        super(pDDocument, contentHandler, parseContext, metadata, pDFParserConfig);
        this.state = new State();
    }

    /**
     * Entry point: creates an extractor for the document and runs it, sending
     * output to the given content handler.
     *
     * @throws SAXException  when extraction fails with an {@link IOException} whose cause is a SAXException
     * @throws TikaException when the extractor cannot be constructed, when extraction fails with any
     *                       other IOException, or when the extractor recorded at least one exception
     *                       (the first recorded one becomes the cause)
     */
    public static void process(PDDocument pDDocument, ContentHandler contentHandler, ParseContext parseContext, Metadata metadata, PDFParserConfig pDFParserConfig) throws SAXException, TikaException {
        final PDFMarkedContent2XHTML extractor;
        try {
            extractor = new PDFMarkedContent2XHTML(pDDocument, contentHandler, parseContext, metadata, pDFParserConfig);
        } catch (IOException initFailure) {
            throw new TikaException("couldn't initialize PDFMarkedContent2XHTML", initFailure);
        }

        // writeText requires a Writer; this one discards everything it is given.
        Writer discardingWriter = new Writer() {
            @Override
            public void write(char[] cArr, int i, int i2) {
            }

            @Override
            public void flush() {
            }

            @Override
            public void close() {
            }
        };

        try {
            extractor.writeText(pDDocument, discardingWriter);
        } catch (IOException extractionFailure) {
            Throwable cause = extractionFailure.getCause();
            if (cause instanceof SAXException) {
                // Surface the underlying SAX failure rather than its IOException wrapper.
                throw (SAXException) cause;
            }
            throw new TikaException("Unable to extract PDF content", extractionFailure);
        }

        if (!extractor.exceptions.isEmpty()) {
            throw new TikaException("Unable to extract PDF content", extractor.exceptions.get(0));
        }
    }

    /**
     * Converts a structure-tree role map into a map from role name to HTML tag.
     * Only entries whose value is a {@link String} are used. The value is
     * lower-cased; if a tag with that name (and empty class) is one of the
     * values of {@code COMMON_TAG_MAP} it is used directly, otherwise the role
     * becomes a {@code div} carrying the lower-cased value as its class.
     *
     * <p>The result is always a fresh, mutable map, even for a null input:
     * {@code getTag} stores fallback tags into the map it is handed, which
     * would fail on an immutable empty map.
     *
     * @param map role map as read from the document; may be null
     * @return mutable map of role name to tag, never null
     */
    private static Map<String, HtmlTag> loadRoleMap(Map<String, Object> map) {
        Map<String, HtmlTag> roleMap = new HashMap<>();
        if (map == null) {
            return roleMap;
        }
        for (Map.Entry<String, Object> entry : map.entrySet()) {
            Object value = entry.getValue();
            if (!(value instanceof String)) {
                continue;
            }
            String lowerCase = ((String) value).toLowerCase(Locale.US);
            HtmlTag plainTag = new HtmlTag(lowerCase);
            if (COMMON_TAG_MAP.containsValue(plainTag)) {
                roleMap.put(entry.getKey(), plainTag);
            } else {
                roleMap.put(entry.getKey(), new HtmlTag(DIV, lowerCase));
            }
        }
        return roleMap;
    }

    /**
     * Walks a {@code Kids} array depth-first and appends an {@link ObjectRef}
     * for every entry that is an indirect object resolving to a dictionary
     * whose {@code Type} is {@code Page}. Dictionaries that are not pages but
     * have a {@code Kids} entry are descended into. Anything else is ignored.
     *
     * @param cOSBase candidate {@code Kids} value; nothing happens unless it is a {@link COSArray}
     * @param list    receives the page references in traversal order
     */
    private static void findPages(COSBase cOSBase, List<ObjectRef> list) {
        if (!(cOSBase instanceof COSArray)) {
            return;
        }
        for (COSBase kid : (COSArray) cOSBase) {
            if (!(kid instanceof COSObject)) {
                continue;
            }
            COSObject kidRef = (COSObject) kid;
            COSBase resolved = kidRef.getObject();
            if (!(resolved instanceof COSDictionary)) {
                continue;
            }
            COSDictionary dict = (COSDictionary) resolved;
            if (dict.containsKey(COSName.TYPE) && COSName.PAGE.equals(dict.getCOSName(COSName.TYPE))) {
                // Record the indirect reference itself; MCIDs are later keyed by it.
                list.add(new ObjectRef(kidRef.getObjectNumber(), kidRef.getGenerationNumber()));
            } else if (dict.containsKey(COSName.KIDS)) {
                findPages(dict.getItem(COSName.KIDS), list);
            }
        }
    }

    /**
     * Emits the document's text by walking its logical structure tree instead
     * of its pages.
     *
     * <p>Steps: collect one page reference per page, load the text of every
     * marked-content sequence keyed by page reference and MCID, walk the
     * structure tree writing text and tags, flush any link text still
     * buffered, write every marked-content text the walk did not reach as its
     * own {@code p} element, and finally call {@code startPage}/{@code endPage}
     * for each page.
     *
     * @param pDPageTree not used; pages are read from {@code pdDocument}
     * @throws IOException if the number of page references differs from the page count, if the
     *                     document has no structure tree root, or wrapping any {@link SAXException}
     *                     raised while writing
     */
    @Override // org.apache.tika.parser.pdf.AbstractPDF2XHTML
    protected void processPages(PDPageTree pDPageTree) throws IOException {
        List<ObjectRef> pageRefs = new ArrayList<>();
        findPages(this.pdDocument.getPages().getCOSObject().getItem(COSName.KIDS), pageRefs);
        int pageCount = this.pdDocument.getNumberOfPages();
        if (pageRefs.size() != pageCount) {
            throw new IOException(new TikaException("Couldn't find the right number of page refs (" + pageRefs.size() + ") for pages (" + pageCount + ")"));
        }

        PDStructureTreeRoot structureTreeRoot = this.pdDocument.getDocumentCatalog().getStructureTreeRoot();
        if (structureTreeRoot == null) {
            // Fail with a descriptive, checked error instead of a NullPointerException below.
            throw new IOException(new TikaException("Document has no structure tree root; cannot extract marked content"));
        }

        Map<String, HtmlTag> roleMap = loadRoleMap(structureTreeRoot.getRoleMap());
        Map<MCID, String> textByMcid = loadTextByMCID(pageRefs);
        try {
            recurse(structureTreeRoot.getK(), null, 0, textByMcid, roleMap);

            // Link text that was buffered but never written by the walk.
            if (this.state.hrefAnchorBuilder.length() > 0) {
                this.xhtml.startElement("p");
                writeString(this.state.hrefAnchorBuilder.toString());
                this.xhtml.endElement("p");
            }

            // Marked content the structure tree never referenced still gets written.
            for (Map.Entry<MCID, String> entry : textByMcid.entrySet()) {
                if (this.state.processedMCIDs.contains(entry.getKey())) {
                    continue;
                }
                this.xhtml.startElement("p");
                writeString(entry.getValue());
                this.xhtml.endElement("p");
            }

            for (PDPage page : this.pdDocument.getPages()) {
                startPage(page);
                endPage(page);
            }
        } catch (SAXException e) {
            throw new IOException(e);
        }
    }

    /**
     * Recursively walks one node of the structure tree and writes its content.
     *
     * <ul>
     *   <li>{@link COSArray}: every element is visited at the same depth.</li>
     *   <li>{@link COSInteger}: treated as an MCID on the current page; if text is known for it,
     *       the text is buffered (inside a link) or written, and the MCID is marked processed.</li>
     *   <li>{@link COSDictionary}: an {@code A} dictionary supplies the link URI; otherwise the
     *       walk continues into {@code K}, or failing that into {@code Obj}.</li>
     *   <li>{@link COSObject}: treated as a structure element. Its {@code S} name is mapped to an
     *       HTML tag, its {@code Pg} entry (when an indirect object) becomes the current page,
     *       and its {@code K} children are visited between the start and end tag.</li>
     * </ul>
     * Any other node type is ignored.
     *
     * @param cOSBase   node to visit
     * @param objectRef reference of the page that MCIDs found here belong to; may be null
     * @param i         current depth
     * @param map       marked-content text keyed by page reference and MCID
     * @param map2      role map; {@code getTag} may add entries to it
     * @throws IOException  if the depth exceeds {@code MAX_RECURSION_DEPTH}, or from writing
     * @throws SAXException from the XHTML handler
     */
    private void recurse(COSBase cOSBase, ObjectRef objectRef, int i, Map<MCID, String> map, Map<String, HtmlTag> map2) throws IOException, SAXException {
        if (i > MAX_RECURSION_DEPTH) {
            throw new IOException(new TikaException("Exceeded max recursion depth " + MAX_RECURSION_DEPTH));
        }

        if (cOSBase instanceof COSArray) {
            for (COSBase kid : (COSArray) cOSBase) {
                recurse(kid, objectRef, i, map, map2);
            }
            return;
        }

        if (cOSBase instanceof COSInteger) {
            MCID mcid = new MCID(objectRef, ((COSInteger) cOSBase).intValue());
            if (map.containsKey(mcid)) {
                if (this.state.inLink) {
                    // Inside a link the text is collected and written later by writeLink().
                    this.state.hrefAnchorBuilder.append(map.get(mcid));
                } else {
                    try {
                        writeString(map.get(mcid));
                    } catch (IOException e) {
                        handleCatchableIOE(e);
                    }
                }
                this.state.processedMCIDs.add(mcid);
            }
            return;
        }

        if (cOSBase instanceof COSDictionary) {
            COSDictionary dict = (COSDictionary) cOSBase;
            COSDictionary action = dict.getCOSDictionary(COSName.A);
            if (action != null) {
                this.state.uri = action.getString(COSName.URI);
            } else if (dict.containsKey(COSName.K)) {
                recurse(dict.getDictionaryObject(COSName.K), objectRef, i + 1, map, map2);
            } else if (dict.containsKey(COSName.OBJ)) {
                recurse(dict.getDictionaryObject(COSName.OBJ), objectRef, i + 1, map, map2);
            }
            return;
        }

        if (!(cOSBase instanceof COSObject)) {
            return;
        }

        COSObject element = (COSObject) cOSBase;

        // An object reference is followed to the object it points at before anything else.
        COSBase type = element.getItem(COSName.TYPE);
        if (type instanceof COSName && "OBJR".equals(((COSName) type).getName())) {
            recurse(element.getDictionaryObject(COSName.OBJ), objectRef, i + 1, map, map2);
        }

        COSBase structureType = element.getItem(COSName.S);
        String name = structureType instanceof COSName ? ((COSName) structureType).getName() : "";

        COSBase kids = element.getItem(COSName.K);
        if (kids == null) {
            return;
        }

        // A Pg entry switches the page that MCIDs below this element are resolved against.
        ObjectRef pageRef = objectRef;
        COSBase page = element.getItem(COSName.PG);
        if (page instanceof COSObject) {
            COSObject pageObject = (COSObject) page;
            pageRef = new ObjectRef(pageObject.getObjectNumber(), pageObject.getGenerationNumber());
        }

        HtmlTag tag = getTag(name, map2);
        boolean startedLink = "link".equals(tag.clazz);
        if (startedLink) {
            this.state.inLink = true;
        }

        boolean ignoreTag = false;
        if (!this.state.inLink) {
            // span elements and list bodies get no wrapper element of their own.
            ignoreTag = "span".equals(tag.tag) || "lbody".equals(tag.clazz);
            if (!ignoreTag) {
                if (tag.clazz != null && !tag.clazz.trim().isEmpty()) {
                    this.xhtml.startElement(tag.tag, "class", tag.clazz);
                } else {
                    this.xhtml.startElement(tag.tag);
                }
            }
        }

        recurse(kids, pageRef, i + 1, map, map2);

        if (startedLink) {
            writeLink();
        }
        // Mirror of the start-tag condition: close only when no link is open or was opened here.
        if (!this.state.inLink && !startedLink && !ignoreTag) {
            this.xhtml.endElement(tag.tag);
        }
    }

    /**
     * Flushes the buffered link text. With a non-blank URI the text is written
     * inside an {@code a} element with that {@code href}; otherwise it is
     * written as plain text. The link state (buffer, flag and URI) is then
     * reset.
     *
     * @throws SAXException from the XHTML handler
     * @throws IOException  if {@code handleCatchableIOE} rethrows a write failure
     */
    private void writeLink() throws SAXException, IOException {
        String anchorText = this.state.hrefAnchorBuilder.toString();
        String href = this.state.uri;
        boolean hasHref = href != null && !href.trim().isEmpty();

        if (hasHref) {
            this.xhtml.startElement("a", "href", href);
            this.xhtml.characters(anchorText);
            this.xhtml.endElement("a");
        } else {
            try {
                writeString(anchorText);
            } catch (IOException e) {
                handleCatchableIOE(e);
            }
        }

        this.state.hrefAnchorBuilder.setLength(0);
        this.state.inLink = false;
        this.state.uri = null;
    }

    /**
     * Resolves a structure type name to the HTML tag to emit for it.
     * Lookup order: the document's role map (exact name), then
     * {@code COMMON_TAG_MAP} (lower-cased name). If neither matches, a
     * {@code div} with the lower-cased name as its class is created, stored in
     * the role map under the exact name, and returned.
     *
     * @param str structure type name
     * @param map role map; gains an entry when the fallback is used
     */
    private HtmlTag getTag(String str, Map<String, HtmlTag> map) {
        if (map.containsKey(str)) {
            return map.get(str);
        }
        String lowered = str.toLowerCase(Locale.US);
        HtmlTag common = COMMON_TAG_MAP.get(lowered);
        if (common != null) {
            return common;
        }
        HtmlTag fallback = new HtmlTag(DIV, lowered);
        map.put(str, fallback);
        return fallback;
    }

    /**
     * Extracts the text of every marked-content sequence on every page, keyed
     * by the page's object reference and the sequence's MCID.
     *
     * <p>The text of a sequence is the concatenation of the non-null unicode
     * strings of its {@link TextPosition} entries; it is trimmed when the
     * sequence's tag is {@code P}. Sequences with a negative MCID on the same
     * page are merged, separated by a newline; for non-negative MCIDs a later
     * sequence replaces an earlier one.
     *
     * <p>The page index advances for every page, including pages whose
     * extraction fails, so each later page stays paired with its own entry in
     * {@code list}.
     *
     * @param list page references in page order, one per page of the document
     * @return text keyed by (page reference, MCID)
     * @throws IOException if {@code handleCatchableIOE} rethrows a page failure
     */
    private Map<MCID, String> loadTextByMCID(List<ObjectRef> list) throws IOException {
        Map<MCID, String> textByMcid = new HashMap<>();
        int pageIndex = 0;
        for (PDPage page : this.pdDocument.getPages()) {
            ObjectRef pageRef = list.get(pageIndex);
            pageIndex++;

            PDFMarkedContentExtractor extractor = new PDFMarkedContentExtractor();
            try {
                extractor.processPage(page);
            } catch (IOException e) {
                // If the handler returns instead of rethrowing, this page contributes no text.
                handleCatchableIOE(e);
                continue;
            }

            for (PDMarkedContent markedContent : extractor.getMarkedContents()) {
                StringBuilder buffer = new StringBuilder();
                for (Object item : markedContent.getContents()) {
                    if (item instanceof TextPosition) {
                        String unicode = ((TextPosition) item).getUnicode();
                        if (unicode != null) {
                            buffer.append(unicode);
                        }
                    }
                }

                int mcid = markedContent.getMCID();
                MCID key = new MCID(pageRef, mcid);
                String text = buffer.toString();
                if ("P".equals(markedContent.getTag())) {
                    text = text.trim();
                }
                if (mcid < 0 && textByMcid.containsKey(key)) {
                    text = textByMcid.get(key) + "\n" + text;
                }
                textByMcid.put(key, text);
            }
        }
        return textByMcid;
    }

    /**
     * Forwards unchanged to the inherited {@code processPage}; marked by the
     * decompiler as a compiler-generated bridge.
     */
    @Override // org.apache.tika.parser.pdf.PDF2XHTML
    public /* bridge */ /* synthetic */ void processPage(PDPage pDPage) throws IOException {
        super.processPage(pDPage);
    }

    /**
     * Forwards unchanged to the inherited {@code setStartPage}; marked by the
     * decompiler as a compiler-generated bridge.
     */
    @Override // org.apache.tika.parser.pdf.AbstractPDF2XHTML
    public /* bridge */ /* synthetic */ void setStartPage(int i) {
        super.setStartPage(i);
    }

    /**
     * Returns the inherited {@code getStartPage} value unchanged; marked by
     * the decompiler as a compiler-generated bridge.
     */
    @Override // org.apache.tika.parser.pdf.AbstractPDF2XHTML
    public /* bridge */ /* synthetic */ int getStartPage() {
        return super.getStartPage();
    }

    /**
     * Forwards unchanged to the inherited {@code setEndBookmark}; marked by
     * the decompiler as a compiler-generated bridge.
     */
    @Override // org.apache.tika.parser.pdf.AbstractPDF2XHTML
    public /* bridge */ /* synthetic */ void setEndBookmark(PDOutlineItem pDOutlineItem) {
        super.setEndBookmark(pDOutlineItem);
    }

    /**
     * Forwards unchanged to the inherited {@code setStartBookmark}; marked by
     * the decompiler as a compiler-generated bridge.
     */
    @Override // org.apache.tika.parser.pdf.AbstractPDF2XHTML
    public /* bridge */ /* synthetic */ void setStartBookmark(PDOutlineItem pDOutlineItem) {
        super.setStartBookmark(pDOutlineItem);
    }

    /**
     * Returns the inherited {@code getCurrentPageNo} value unchanged; marked
     * by the decompiler as a compiler-generated bridge.
     */
    @Override // org.apache.tika.parser.pdf.AbstractPDF2XHTML
    public /* bridge */ /* synthetic */ int getCurrentPageNo() {
        return super.getCurrentPageNo();
    }

    // Populates COMMON_TAG_MAP: lower-cased PDF structure type -> HTML tag.
    // Tag names are written as plain literals so this table does not depend on
    // constants that belong to unrelated libraries.
    static {
        COMMON_TAG_MAP.put("document", new HtmlTag("body"));
        COMMON_TAG_MAP.put(DIV, new HtmlTag(DIV));
        COMMON_TAG_MAP.put("p", new HtmlTag("p"));
        COMMON_TAG_MAP.put("span", new HtmlTag("span"));
        COMMON_TAG_MAP.put("table", new HtmlTag("table"));
        COMMON_TAG_MAP.put("thead", new HtmlTag("thead"));
        COMMON_TAG_MAP.put("tbody", new HtmlTag("tbody"));
        COMMON_TAG_MAP.put("tr", new HtmlTag("tr"));
        COMMON_TAG_MAP.put("th", new HtmlTag("th"));
        COMMON_TAG_MAP.put("td", new HtmlTag("td"));
        // The PDF list type "L" is rendered as an unordered list.
        COMMON_TAG_MAP.put("l", new HtmlTag("ul"));
        COMMON_TAG_MAP.put("li", new HtmlTag("li"));
        COMMON_TAG_MAP.put("h1", new HtmlTag("h1"));
        COMMON_TAG_MAP.put("h2", new HtmlTag("h2"));
        COMMON_TAG_MAP.put("h3", new HtmlTag("h3"));
        COMMON_TAG_MAP.put("h4", new HtmlTag("h4"));
        COMMON_TAG_MAP.put("h5", new HtmlTag("h5"));
        COMMON_TAG_MAP.put("h6", new HtmlTag("h6"));
    }
}
