package com.qingyun.service.compare; import cn.hutool.core.collection.CollectionUtil; import cn.hutool.core.util.StrUtil; import cn.hutool.http.HttpRequest; import cn.hutool.http.HttpResponse; import cn.hutool.http.HttpUtil; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; import com.qingyun.common.exception.ServiceException; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutableTriple; import org.apache.commons.lang3.tuple.Triple; import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.text.PDFTextStripper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Component; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.util.*; @Component public class FetchAllPagesContent { @Value("${ocr.url}") private String orcServerUrl; // 日志 private static final Logger log = LoggerFactory.getLogger(FetchAllPagesContent.class); // 获取所有页面内容 public List extractPDFPagesContent(File pdfFile) throws Exception { List> base64Pages = splitPdfToBase64Pages(pdfFile); if (CollectionUtil.isEmpty(base64Pages)) { throw new Exception("无法获取页面内容"); } Files.deleteIfExists(pdfFile.toPath()); // 清理原始 PDF List contentList = new ArrayList<>(); for (int i = 0; i < base64Pages.size(); i++) { int pageNum = i + 1; Triple base64Page = base64Pages.get(i); String base64 = base64Page.getLeft(); Integer pageData = base64Page.getMiddle(); // 获取页数信息 Boolean scannedPdf = base64Page.getRight(); try { // 构造请求体 Map requestBody = new HashMap<>(); requestBody.put("file", base64); requestBody.put("fileType", scannedPdf ? 1 : 0); // 文件类型。0表示PDF文件,1表示图像文件 HttpRequest post = HttpUtil.createPost(orcServerUrl); post.header("Content-Type", "application/json"); post.body(JSONObject.toJSONString(requestBody)); HttpResponse response = post.execute(); JSONObject jsonObject = JSONObject.parseObject(response.body()); if (jsonObject.containsKey("errorCode") && jsonObject.getIntValue("errorCode") == 500) { log.error("请求失败,错误码:{},错误信息:{}", jsonObject.getIntValue("errorCode"), jsonObject.getString("errorMsg")); // 请求失败,为每一页添加空字符串 for (int j = 0; j < pageData; j++) { contentList.add(""); } continue; } JSONObject result = jsonObject.getJSONObject("result"); JSONArray layoutParsingResults = result.getJSONArray("layoutParsingResults"); if (layoutParsingResults == null || layoutParsingResults.isEmpty()) { // 没有识别结果,为每一页添加空字符串 for (int j = 0; j < pageData; j++) { contentList.add(""); } continue; } // 处理每一页的识别结果 for (int j = 0; j < layoutParsingResults.size(); j++) { JSONObject layoutParsingResult = layoutParsingResults.getJSONObject(j); JSONObject markdown = layoutParsingResult.getJSONObject("markdown"); if (markdown == null) { contentList.add(""); continue; } String text = markdown.getString("text"); if (StringUtils.isBlank(text)) { contentList.add(""); continue; } contentList.add(text); } // 如果返回的页数少于预期页数,补充空字符串 if (layoutParsingResults.size() < pageData) { for (int j = layoutParsingResults.size(); j < pageData; j++) { contentList.add(""); } } } catch (Exception e) { log.error("获取页面 {} 内容失败", pageNum, e); // 出错时为每一页添加空字符串保持页数一致 for (int j = 0; j < pageData; j++) { contentList.add(""); } } } return contentList; } // 获取所有页面内容 public String extractImgPagesContent(File imgFile) throws Exception { try { byte[] pageBytes = Files.readAllBytes(imgFile.toPath()); String base64 = Base64.getEncoder().encodeToString(pageBytes); if (StrUtil.isBlank(base64)) { throw new ServiceException("无法获取图片内容"); } Files.deleteIfExists(imgFile.toPath()); // 清理文件 // 构造请求体 Map requestBody = new HashMap<>(); requestBody.put("file", base64); requestBody.put("fileType", 1); // 文件类型。0表示PDF文件,1表示图像文件 HttpRequest post = HttpUtil.createPost(orcServerUrl); post.header("Content-Type", "application/json"); post.body(JSONObject.toJSONString(requestBody)); HttpResponse response = post.execute(); JSONObject jsonObject = JSONObject.parseObject(response.body()); if (jsonObject.containsKey("errorCode") && jsonObject.getIntValue("errorCode") == 500) { log.error("请求失败,错误码:{},错误信息:{}", jsonObject.getIntValue("errorCode"), jsonObject.getString("errorMsg")); return ""; } JSONObject result = jsonObject.getJSONObject("result"); JSONArray layoutParsingResults = result.getJSONArray("layoutParsingResults"); if (layoutParsingResults == null || layoutParsingResults.isEmpty()) { return ""; } JSONObject layoutParsingResults_0 = layoutParsingResults.getJSONObject(0); JSONObject markdown = layoutParsingResults_0.getJSONObject("markdown"); if (markdown == null) { return ""; } String text = markdown.getString("text"); if (StringUtils.isBlank(text)) { return ""; } return text; } catch (Exception e) { log.error("获取图片内容失败", e); throw new ServiceException("获取图片内容失败"); } finally { try { if (imgFile != null && imgFile.exists()) { Files.deleteIfExists(imgFile.toPath()); } } catch (IOException e) { log.error("删除文件失败", e); } } } /** * L base64 M 临时文件 R 是否内容为图片 * @param pdfFile * @return * @throws IOException */ private static List> splitPdfToBase64Pages(File pdfFile) throws IOException { List> base64Pages = new ArrayList<>(); try (PDDocument document = Loader.loadPDF(pdfFile)) { int pageCount = document.getNumberOfPages(); // 每10页为一组进行切割 for (int startPage = 0; startPage < pageCount; startPage += 10) { int endPage = Math.min(startPage + 10, pageCount); PDDocument pageDoc = new PDDocument(); // 添加10页内容到新文档 for (int i = startPage; i < endPage; i++) { PDPage page = document.getPage(i); pageDoc.addPage(page); } // 使用 try-with-resources 确保临时文件被正确处理 File tempPageFile = null; try { // 创建唯一的临时文件 tempPageFile = File.createTempFile("page_" + System.currentTimeMillis() + "_" + startPage, ".pdf"); // 保存页面到临时文件 pageDoc.save(tempPageFile); // 转换为 Base64 byte[] pageBytes = Files.readAllBytes(tempPageFile.toPath()); String base64 = Base64.getEncoder().encodeToString(pageBytes); // 判断是否扫描件 boolean scannedPdf = true; try (PDDocument temp = Loader.loadPDF(tempPageFile)) { PDFTextStripper stripper = new PDFTextStripper(); String text = stripper.getText(temp); if (text != null && !text.trim().isEmpty()) { scannedPdf = false; } } // 中间字段记录这组包含的页数 int pagesInGroup = endPage - startPage; base64Pages.add(ImmutableTriple.of(base64, pagesInGroup, scannedPdf)); } finally { // 确保在任何情况下都关闭并删除临时文件 pageDoc.close(); if (tempPageFile != null && tempPageFile.exists()) { Files.deleteIfExists(tempPageFile.toPath()); } } } } return base64Pages; } }