first commit

2026-01-30 14:25:12 +08:00
commit 8dd8d2668a
899 changed files with 90844 additions and 0 deletions
--- a/qingyun-service/src/main/java/com/qingyun/service/compare/FetchAllPagesContent.java
+++ b/qingyun-service/src/main/java/com/qingyun/service/compare/FetchAllPagesContent.java
@@ -0,0 +1,225 @@
+package com.qingyun.service.compare;
+
+import cn.hutool.core.collection.CollectionUtil;
+import cn.hutool.core.util.StrUtil;
+import cn.hutool.http.HttpRequest;
+import cn.hutool.http.HttpResponse;
+import cn.hutool.http.HttpUtil;
+import com.alibaba.fastjson.JSONArray;
+import com.alibaba.fastjson.JSONObject;
+import com.qingyun.common.exception.ServiceException;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.commons.lang3.tuple.ImmutableTriple;
+import org.apache.commons.lang3.tuple.Triple;
+import org.apache.pdfbox.Loader;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.*;
+
+@Component
+public class FetchAllPagesContent {
+
+    @Value("${ocr.url}")
+    private String orcServerUrl;
+
+    // 日志
+    private static final Logger log = LoggerFactory.getLogger(FetchAllPagesContent.class);
+
+    // 获取所有页面内容
+    public List<String> extractPDFPagesContent(File pdfFile) throws Exception {
+        List<Triple<String, Integer, Boolean>> base64Pages = splitPdfToBase64Pages(pdfFile);
+        if (CollectionUtil.isEmpty(base64Pages)) {
+            throw new Exception("无法获取页面内容");
+        }
+        Files.deleteIfExists(pdfFile.toPath()); // 清理原始 PDF
+
+        List<String> contentList = new ArrayList<>();
+
+        for (int i = 0; i < base64Pages.size(); i++) {
+            int pageNum = i + 1;
+            Triple<String, Integer, Boolean> base64Page = base64Pages.get(i);
+            String base64 = base64Page.getLeft();
+            Integer pageData = base64Page.getMiddle(); // 获取页数信息
+            Boolean scannedPdf = base64Page.getRight();
+
+            try {
+                // 构造请求体
+                Map<String, Object> requestBody = new HashMap<>();
+                requestBody.put("file", base64);
+                requestBody.put("fileType", scannedPdf ? 1 : 0);  // 文件类型。0表示PDF文件，1表示图像文件
+                HttpRequest post = HttpUtil.createPost(orcServerUrl);
+                post.header("Content-Type", "application/json");
+                post.body(JSONObject.toJSONString(requestBody));
+                HttpResponse response = post.execute();
+
+                JSONObject jsonObject = JSONObject.parseObject(response.body());
+                if (jsonObject.containsKey("errorCode") && jsonObject.getIntValue("errorCode") == 500) {
+                    log.error("请求失败，错误码：{}，错误信息：{}", jsonObject.getIntValue("errorCode"), jsonObject.getString("errorMsg"));
+                    // 请求失败，为每一页添加空字符串
+                    for (int j = 0; j < pageData; j++) {
+                        contentList.add("");
+                    }
+                    continue;
+                }
+                JSONObject result = jsonObject.getJSONObject("result");
+                JSONArray layoutParsingResults = result.getJSONArray("layoutParsingResults");
+                if (layoutParsingResults == null || layoutParsingResults.isEmpty()) {
+                    // 没有识别结果，为每一页添加空字符串
+                    for (int j = 0; j < pageData; j++) {
+                        contentList.add("");
+                    }
+                    continue;
+                }
+                // 处理每一页的识别结果
+                for (int j = 0; j < layoutParsingResults.size(); j++) {
+                    JSONObject layoutParsingResult = layoutParsingResults.getJSONObject(j);
+                    JSONObject markdown = layoutParsingResult.getJSONObject("markdown");
+                    if (markdown == null) {
+                        contentList.add("");
+                        continue;
+                    }
+                    String text = markdown.getString("text");
+                    if (StringUtils.isBlank(text)) {
+                        contentList.add("");
+                        continue;
+                    }
+                    contentList.add(text);
+                }
+                // 如果返回的页数少于预期页数，补充空字符串
+                if (layoutParsingResults.size() < pageData) {
+                    for (int j = layoutParsingResults.size(); j < pageData; j++) {
+                        contentList.add("");
+                    }
+                }
+            } catch (Exception e) {
+                log.error("获取页面 {} 内容失败", pageNum, e);
+                // 出错时为每一页添加空字符串保持页数一致
+                for (int j = 0; j < pageData; j++) {
+                    contentList.add("");
+                }
+            }
+        }
+        return contentList;
+    }
+
+    // 获取所有页面内容
+    public String extractImgPagesContent(File imgFile) throws Exception {
+        try {
+            byte[] pageBytes = Files.readAllBytes(imgFile.toPath());
+            String base64 = Base64.getEncoder().encodeToString(pageBytes);
+            if (StrUtil.isBlank(base64)) {
+                throw new ServiceException("无法获取图片内容");
+            }
+            Files.deleteIfExists(imgFile.toPath()); // 清理文件
+
+            // 构造请求体
+            Map<String, Object> requestBody = new HashMap<>();
+            requestBody.put("file", base64);
+            requestBody.put("fileType", 1);  // 文件类型。0表示PDF文件，1表示图像文件
+            HttpRequest post = HttpUtil.createPost(orcServerUrl);
+            post.header("Content-Type", "application/json");
+            post.body(JSONObject.toJSONString(requestBody));
+            HttpResponse response = post.execute();
+
+            JSONObject jsonObject = JSONObject.parseObject(response.body());
+            if (jsonObject.containsKey("errorCode") && jsonObject.getIntValue("errorCode") == 500) {
+                log.error("请求失败，错误码：{}，错误信息：{}", jsonObject.getIntValue("errorCode"), jsonObject.getString("errorMsg"));
+                return "";
+            }
+            JSONObject result = jsonObject.getJSONObject("result");
+            JSONArray layoutParsingResults = result.getJSONArray("layoutParsingResults");
+            if (layoutParsingResults == null || layoutParsingResults.isEmpty()) {
+                return "";
+            }
+            JSONObject layoutParsingResults_0 = layoutParsingResults.getJSONObject(0);
+            JSONObject markdown = layoutParsingResults_0.getJSONObject("markdown");
+            if (markdown == null) {
+                return "";
+            }
+            String text = markdown.getString("text");
+            if (StringUtils.isBlank(text)) {
+                return "";
+            }
+            return text;
+        } catch (Exception e) {
+            log.error("获取图片内容失败", e);
+            throw new ServiceException("获取图片内容失败");
+        } finally {
+            try {
+                if (imgFile != null && imgFile.exists()) {
+                    Files.deleteIfExists(imgFile.toPath());
+                }
+            } catch (IOException e) {
+                log.error("删除文件失败", e);
+            }
+        }
+    }
+
+    /**
+     * L base64 M 临时文件 R 是否内容为图片
+     * @param pdfFile
+     * @return
+     * @throws IOException
+     */
+    private static List<Triple<String, Integer, Boolean>> splitPdfToBase64Pages(File pdfFile) throws IOException {
+        List<Triple<String, Integer, Boolean>> base64Pages = new ArrayList<>();
+        try (PDDocument document = Loader.loadPDF(pdfFile)) {
+            int pageCount = document.getNumberOfPages();
+            // 每10页为一组进行切割
+            for (int startPage = 0; startPage < pageCount; startPage += 10) {
+                int endPage = Math.min(startPage + 10, pageCount);
+                PDDocument pageDoc = new PDDocument();
+
+                // 添加10页内容到新文档
+                for (int i = startPage; i < endPage; i++) {
+                    PDPage page = document.getPage(i);
+                    pageDoc.addPage(page);
+                }
+
+                // 使用 try-with-resources 确保临时文件被正确处理
+                File tempPageFile = null;
+                try {
+                    // 创建唯一的临时文件
+                    tempPageFile = File.createTempFile("page_" + System.currentTimeMillis() + "_" + startPage, ".pdf");
+
+                    // 保存页面到临时文件
+                    pageDoc.save(tempPageFile);
+
+                    // 转换为 Base64
+                    byte[] pageBytes = Files.readAllBytes(tempPageFile.toPath());
+                    String base64 = Base64.getEncoder().encodeToString(pageBytes);
+
+                    // 判断是否扫描件
+                    boolean scannedPdf = true;
+                    try (PDDocument temp = Loader.loadPDF(tempPageFile)) {
+                        PDFTextStripper stripper = new PDFTextStripper();
+                        String text = stripper.getText(temp);
+                        if (text != null && !text.trim().isEmpty()) {
+                            scannedPdf = false;
+                        }
+                    }
+                    // 中间字段记录这组包含的页数
+                    int pagesInGroup = endPage - startPage;
+                    base64Pages.add(ImmutableTriple.of(base64, pagesInGroup, scannedPdf));
+
+                } finally {
+                    // 确保在任何情况下都关闭并删除临时文件
+                    pageDoc.close();
+                    if (tempPageFile != null && tempPageFile.exists()) {
+                        Files.deleteIfExists(tempPageFile.toPath());
+                    }
+                }
+            }
+        }
+        return base64Pages;
+    }
+}