contract-review/qingyun-service/src/main/java/com/qingyun/service/compare/FetchAllPagesContent.java

package com.qingyun.service.compare;

import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.qingyun.common.exception.ServiceException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutableTriple;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.*;

/**
 * Extracts text from PDF and image files by posting them, Base64-encoded, to the
 * OCR / layout-parsing HTTP endpoint configured under {@code ocr.url}.
 *
 * <p>PDFs are split into groups of up to {@link #PAGES_PER_GROUP} pages; each group is sent
 * as one request. The list returned for a PDF always has exactly one entry per source page
 * (blank when a page could not be recognised), so that list index == page index.
 *
 * <p>Both public methods take ownership of the file passed in and delete it.
 */
@Component
public class FetchAllPagesContent {

    /** Maximum number of PDF pages sent to the OCR service in a single request. */
    private static final int PAGES_PER_GROUP = 10;

    /** {@code fileType} request value: payload is a PDF file. */
    private static final int FILE_TYPE_PDF = 0;

    /** {@code fileType} request value: payload is an image file. */
    private static final int FILE_TYPE_IMAGE = 1;

    /** {@code errorCode} the service uses to report a failed request. */
    private static final int OCR_SERVER_ERROR_CODE = 500;

    /** Base URL of the OCR service (property {@code ocr.url}). */
    @Value("${ocr.url}")
    private String orcServerUrl;

    // Logger
    private static final Logger log = LoggerFactory.getLogger(FetchAllPagesContent.class);

    /**
     * Runs OCR over every page of a PDF and returns the recognised text page by page.
     *
     * <p>A failure for one page group is logged and yields blank strings for that group's
     * pages; it does not abort the remaining groups.
     *
     * @param pdfFile the PDF to process; it is deleted once it has been split, whether or
     *                not splitting succeeded
     * @return one string per source page, in page order; {@code ""} for pages without text
     * @throws Exception if the PDF cannot be read or contains no pages
     */
    public List<String> extractPDFPagesContent(File pdfFile) throws Exception {
        List<Triple<String, Integer, Boolean>> pageGroups;
        try {
            pageGroups = splitPdfToBase64Pages(pdfFile);
        } finally {
            // The original PDF is no longer needed after splitting; remove it on failure too
            // so that unreadable/empty uploads do not pile up on disk.
            deleteQuietly(pdfFile);
        }
        if (CollectionUtil.isEmpty(pageGroups)) {
            throw new Exception("无法获取页面内容");
        }

        List<String> contentList = new ArrayList<>();
        for (int i = 0; i < pageGroups.size(); i++) {
            Triple<String, Integer, Boolean> group = pageGroups.get(i);
            int expectedPages = group.getMiddle();
            // NOTE(review): scanned groups are still PDF bytes but are sent with the "image"
            // file type — confirm the OCR service accepts this combination.
            int fileType = group.getRight() ? FILE_TYPE_IMAGE : FILE_TYPE_PDF;

            // Collect this group's pages separately so a failure half-way through cannot
            // leave partial entries in the overall result.
            List<String> groupTexts = new ArrayList<>(expectedPages);
            try {
                JSONArray results = requestLayoutParsingResults(group.getLeft(), fileType);
                // Never take more results than the group has pages; extras would shift
                // every following page.
                int usable = Math.min(results.size(), expectedPages);
                for (int j = 0; j < usable; j++) {
                    groupTexts.add(markdownTextOf(results.getJSONObject(j)));
                }
            } catch (Exception e) {
                log.error("获取页面 {} 内容失败", i + 1, e);
                groupTexts.clear();
            }
            // Pad with blanks so the group contributes exactly expectedPages entries.
            while (groupTexts.size() < expectedPages) {
                groupTexts.add("");
            }
            contentList.addAll(groupTexts);
        }
        return contentList;
    }

    /**
     * Runs OCR over a single image and returns the recognised text.
     *
     * @param imgFile the image to process; it is always deleted before this method returns
     * @return the recognised text, or {@code ""} when the service reports an error with code
     *         {@value #OCR_SERVER_ERROR_CODE} or returns no text
     * @throws ServiceException if the image is empty or unreadable, or if the request or the
     *                          response handling fails
     */
    public String extractImgPagesContent(File imgFile) throws Exception {
        try {
            byte[] imageBytes = Files.readAllBytes(imgFile.toPath());
            String base64 = Base64.getEncoder().encodeToString(imageBytes);
            if (StrUtil.isBlank(base64)) {
                throw new ServiceException("无法获取图片内容");
            }

            JSONArray results = requestLayoutParsingResults(base64, FILE_TYPE_IMAGE);
            if (results.isEmpty()) {
                return "";
            }
            // An image is a single page, so only the first result is relevant.
            return markdownTextOf(results.getJSONObject(0));
        } catch (ServiceException e) {
            // Already carries a specific message; do not replace it with the generic one.
            log.error("获取图片内容失败", e);
            throw e;
        } catch (Exception e) {
            log.error("获取图片内容失败", e);
            throw new ServiceException("获取图片内容失败");
        } finally {
            deleteQuietly(imgFile);
        }
    }

    /**
     * Posts one Base64 payload to the OCR service and returns its
     * {@code result.layoutParsingResults} array.
     *
     * @param base64   Base64-encoded file content
     * @param fileType {@link #FILE_TYPE_PDF} or {@link #FILE_TYPE_IMAGE}
     * @return the parsing results; an empty array when the service reports error code
     *         {@value #OCR_SERVER_ERROR_CODE} (logged) or returns no results; never {@code null}
     * @throws IllegalStateException if the response is empty or has no {@code result} object
     */
    private JSONArray requestLayoutParsingResults(String base64, int fileType) {
        // Build the request body
        Map<String, Object> requestBody = new HashMap<>();
        requestBody.put("file", base64);
        requestBody.put("fileType", fileType);  // 0 = PDF file, 1 = image file

        HttpRequest post = HttpUtil.createPost(orcServerUrl);
        post.header("Content-Type", "application/json");
        post.body(JSONObject.toJSONString(requestBody));
        HttpResponse response = post.execute();

        JSONObject payload = JSONObject.parseObject(response.body());
        if (payload == null) {
            throw new IllegalStateException("OCR service returned an empty response body");
        }
        if (payload.containsKey("errorCode") && payload.getIntValue("errorCode") == OCR_SERVER_ERROR_CODE) {
            log.error("请求失败，错误码：{}，错误信息：{}", payload.getIntValue("errorCode"), payload.getString("errorMsg"));
            return new JSONArray();
        }
        JSONObject result = payload.getJSONObject("result");
        if (result == null) {
            // The body is deliberately not included in the message: it may contain document text.
            throw new IllegalStateException(
                    "OCR response has no \"result\" object (errorCode=" + payload.get("errorCode") + ")");
        }
        JSONArray layoutParsingResults = result.getJSONArray("layoutParsingResults");
        return layoutParsingResults == null ? new JSONArray() : layoutParsingResults;
    }

    /**
     * Reads {@code markdown.text} from one layout-parsing result.
     *
     * @param layoutParsingResult one element of {@code layoutParsingResults}; may be {@code null}
     * @return the text, or {@code ""} when the element, its markdown object or its text is
     *         missing or blank
     */
    private static String markdownTextOf(JSONObject layoutParsingResult) {
        if (layoutParsingResult == null) {
            return "";
        }
        JSONObject markdown = layoutParsingResult.getJSONObject("markdown");
        if (markdown == null) {
            return "";
        }
        String text = markdown.getString("text");
        return StringUtils.isBlank(text) ? "" : text;
    }

    /**
     * Splits a PDF into groups of up to {@link #PAGES_PER_GROUP} consecutive pages.
     *
     * <p>Each returned triple holds: left = Base64 of the group's PDF bytes, middle = number
     * of pages in the group, right = {@code true} when no text could be extracted from the
     * group (i.e. it is treated as a scanned document).
     *
     * @param pdfFile the PDF to split
     * @return one triple per page group, in page order; empty when the PDF has no pages
     * @throws IOException if the PDF cannot be loaded, or a group cannot be written or read back
     */
    private static List<Triple<String, Integer, Boolean>> splitPdfToBase64Pages(File pdfFile) throws IOException {
        List<Triple<String, Integer, Boolean>> pageGroups = new ArrayList<>();
        try (PDDocument document = Loader.loadPDF(pdfFile)) {
            int pageCount = document.getNumberOfPages();
            for (int startPage = 0; startPage < pageCount; startPage += PAGES_PER_GROUP) {
                int endPage = Math.min(startPage + PAGES_PER_GROUP, pageCount);

                File tempGroupFile = null;
                // try-with-resources closes groupDoc before the finally block removes the file.
                try (PDDocument groupDoc = new PDDocument()) {
                    for (int i = startPage; i < endPage; i++) {
                        PDPage page = document.getPage(i);
                        groupDoc.addPage(page);
                    }

                    // Files.createTempFile generates a unique name and, where the file system
                    // supports it, owner-only permissions.
                    tempGroupFile = Files.createTempFile("page_" + startPage + "_", ".pdf").toFile();
                    groupDoc.save(tempGroupFile);

                    String base64 = Base64.getEncoder().encodeToString(Files.readAllBytes(tempGroupFile.toPath()));
                    boolean scannedPdf = !hasExtractableText(tempGroupFile);

                    // The middle element records how many pages this group contains.
                    pageGroups.add(ImmutableTriple.of(base64, endPage - startPage, scannedPdf));
                } finally {
                    deleteQuietly(tempGroupFile);
                }
            }
        }
        return pageGroups;
    }

    /**
     * Tells whether any non-whitespace text can be extracted from a PDF file.
     *
     * @param pdf the PDF file to inspect
     * @return {@code true} if the text stripper yields at least one non-whitespace character
     * @throws IOException if the file cannot be loaded or parsed
     */
    private static boolean hasExtractableText(File pdf) throws IOException {
        try (PDDocument doc = Loader.loadPDF(pdf)) {
            String text = new PDFTextStripper().getText(doc);
            return text != null && !text.trim().isEmpty();
        }
    }

    /**
     * Deletes a file if it exists; an I/O failure is logged instead of thrown so that
     * cleanup can never hide the exception that is actually being propagated.
     *
     * @param file the file to delete; {@code null} is ignored
     */
    private static void deleteQuietly(File file) {
        if (file == null) {
            return;
        }
        try {
            Files.deleteIfExists(file.toPath());
        } catch (IOException e) {
            log.error("删除文件失败", e);
        }
    }
}