package com.openkm.extractor;

import com.openkm.core.Config;
import com.openkm.util.FileUtils;
import com.openkm.util.ReportUtils;
import com.openkm.util.WebUtils;
import java.io.BufferedInputStream;
import java.io.CharArrayReader;
import java.io.CharArrayWriter;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang.RandomStringUtils;
import org.apache.jackrabbit.extractor.AbstractTextExtractor;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
import org.apache.pdfbox.util.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:com/openkm/extractor/PdfTextExtractor.class */
/**
 * Text extractor for PDF documents.
 *
 * <p>The text layer of the PDF is read with {@link PDFTextStripper}. When that yields no usable
 * text, or when {@code Config.SYSTEM_PDF_FORCE_OCR} is set, every image found in the pages'
 * resources is written to a temporary file and passed to whichever OCR extractor is registered
 * (see {@link #doOcr(File)}).
 */
public class PdfTextExtractor extends AbstractTextExtractor {
    private static final Logger log = LoggerFactory.getLogger(PdfTextExtractor.class);

    /** {@link File#createTempFile(String, String)} rejects prefixes shorter than this. */
    private static final int MIN_TEMP_PREFIX_LENGTH = 3;

    /**
     * Creates the extractor, handing {@code ReportUtils.MIME_PDF} to the superclass as the only
     * entry of its content type array.
     */
    public PdfTextExtractor() {
        super(new String[]{ReportUtils.MIME_PDF});
    }

    /**
     * Extracts the text of a PDF, falling back to OCR of the embedded images.
     *
     * <p>Any {@link Exception} raised while parsing, stripping or running OCR is logged and an
     * empty reader is returned instead. The given stream is always closed before returning.
     *
     * @param inputStream PDF content; closed by this method
     * @param str not used by this implementation (NOTE(review): content type in the Jackrabbit
     *            {@code TextExtractor} contract - confirm against the version in use)
     * @param str2 not used by this implementation (NOTE(review): encoding in the Jackrabbit
     *             {@code TextExtractor} contract - confirm against the version in use)
     * @return a reader over the text layer, over the OCR result, or over an empty string when
     *         extraction failed
     * @throws IOException if closing {@code inputStream} fails
     */
    public Reader extractText(InputStream inputStream, String str, String str2) throws IOException {
        try {
            PDFParser parser = new PDFParser(new BufferedInputStream(inputStream));
            try {
                parser.parse();
                PDDocument document = parser.getPDDocument();
                CharArrayWriter textLayer = new CharArrayWriter();
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setLineSeparator("\n");
                stripper.writeText(document, textLayer);
                String stripped = textLayer.toString().trim();
                log.debug("TextStripped: '{}'", stripped);

                if (!Config.SYSTEM_PDF_FORCE_OCR && stripped.length() > 1) {
                    // The untrimmed text layer is returned; the trimmed copy is only used to
                    // decide whether there is any text at all.
                    return new CharArrayReader(textLayer.toCharArray());
                }

                log.warn("PDF does not contains text layer");
                return new StringReader(ocrEmbeddedImages(document));
            } finally {
                closeDocument(parser);
            }
        } catch (Exception e) {
            // Best effort: a broken PDF or a failing OCR engine yields no text, not an error.
            log.warn("Failed to extract PDF text content", e);
            return new StringReader(WebUtils.EMPTY_STRING);
        } finally {
            inputStream.close();
        }
    }

    /**
     * Runs OCR over every image referenced by the pages of the document and joins the results,
     * each followed by a single space.
     */
    private String ocrEmbeddedImages(PDDocument document) throws Exception {
        StringBuilder ocrText = new StringBuilder();
        List<?> pages = document.getDocumentCatalog().getAllPages();

        for (Object pageObject : pages) {
            PDPage page = (PDPage) pageObject;

            // A page without resources has no images; skipping it avoids a
            // NullPointerException that would discard the text already collected.
            if (page.getResources() == null) {
                continue;
            }

            Map<?, ?> images = page.getResources().getImages();
            if (images == null) {
                continue;
            }

            for (Map.Entry<?, ?> entry : images.entrySet()) {
                String imageName = (String) entry.getKey();
                PDXObjectImage image = (PDXObjectImage) entry.getValue();
                ocrText.append(ocrImage(imageName, image)).append(" ");
            }
        }

        return ocrText.toString();
    }

    /**
     * Writes one image to a temporary file, runs OCR on it and removes the file again, whether
     * or not OCR succeeded.
     */
    private String ocrImage(String imageName, PDXObjectImage image) throws Exception {
        File imageFile = null;

        try {
            imageFile = File.createTempFile(tempFilePrefix(imageName), "." + image.getSuffix());
            log.debug("Writing image: {}", imageFile.getPath());
            image.write2file(imageFile);
            String text = doOcr(imageFile);
            log.debug("OCR Extracted: {}", text);
            return text;
        } finally {
            if (imageFile != null) {
                FileUtils.deleteQuietly(imageFile);
            }
        }
    }

    /**
     * Builds a temporary file prefix from an image name, appending random letters to names that
     * are too short for {@link File#createTempFile(String, String)}.
     */
    private static String tempFilePrefix(String imageName) {
        int missing = MIN_TEMP_PREFIX_LENGTH - imageName.length();
        if (missing <= 0) {
            return imageName;
        }

        // At least two letters are appended, as before; an empty name gets three so that the
        // prefix always reaches the minimum length.
        return imageName.concat(RandomStringUtils.randomAlphabetic(Math.max(missing, 2)));
    }

    /**
     * Closes the document held by the parser. The document is requested from the parser here so
     * that closing is also attempted when parsing or stripping failed; a failure to obtain or
     * close it is only logged.
     */
    private static void closeDocument(PDFParser parser) {
        try {
            PDDocument document = parser.getPDDocument();
            if (document != null) {
                document.close();
            }
        } catch (IOException e) {
            log.debug("Could not close PDF document", e);
        }
    }

    /**
     * Runs OCR on an image file with the first registered engine, checked in this order:
     * Cuneiform, Tesseract 3, Abby.
     *
     * @param file image to recognise
     * @return the text returned by the engine, or an empty string when no engine is registered
     * @throws Exception whatever the selected engine throws
     */
    private String doOcr(File file) throws Exception {
        if (RegisteredExtractors.isRegistered(CuneiformTextExtractor.class.getCanonicalName())) {
            return new CuneiformTextExtractor().doOcr(file);
        }

        if (RegisteredExtractors.isRegistered(Tesseract3TextExtractor.class.getCanonicalName())) {
            return new Tesseract3TextExtractor().doOcr(file);
        }

        if (RegisteredExtractors.isRegistered(AbbyTextExtractor.class.getCanonicalName())) {
            return new AbbyTextExtractor().doOcr(file);
        }

        log.warn("No OCR engine configured");
        return WebUtils.EMPTY_STRING;
    }

    static {
        // References the PDFBox parser class when this class is initialised.
        // NOTE(review): presumably so a missing PDFBox library fails at load time - confirm.
        PDFParser.class.getName();
    }
}
