Files
contract-review/qingyun-service/src/main/java/com/qingyun/service/compare/FetchAllPagesContent.java
2026-01-30 14:25:12 +08:00

226 lines
9.8 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package com.qingyun.service.compare;
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.qingyun.common.exception.ServiceException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutableTriple;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.*;
/**
 * Extracts text from PDF and image files by posting them, Base64-encoded, to the
 * OCR / layout-parsing HTTP service configured under {@code ocr.url}.
 *
 * <p>Both public entry points consume their input file: the image file is always
 * deleted, the PDF file is deleted once it has been split successfully.
 */
@Component
public class FetchAllPagesContent {

    private static final Logger log = LoggerFactory.getLogger(FetchAllPagesContent.class);

    /** {@code fileType} request value announcing a PDF payload. */
    private static final int FILE_TYPE_PDF = 0;

    /** {@code fileType} request value announcing an image payload. */
    private static final int FILE_TYPE_IMAGE = 1;

    /** Number of PDF pages bundled into a single OCR request. */
    private static final int PAGES_PER_GROUP = 10;

    @Value("${ocr.url}")
    private String orcServerUrl;

    /**
     * Returns the recognised text of every page of a PDF, in page order.
     *
     * <p>The PDF is split into groups of up to {@value #PAGES_PER_GROUP} pages and each
     * group is sent to the OCR service separately. The returned list always contains
     * exactly one entry per page: pages without recognised text, and every page of a
     * group whose request failed, are represented by an empty string.
     *
     * @param pdfFile the PDF to process; deleted after it has been split successfully
     *                (it is left in place when splitting fails or yields no pages)
     * @return one string per page, never {@code null}
     * @throws Exception if the PDF cannot be loaded, contains no pages, or cannot be deleted
     */
    public List<String> extractPDFPagesContent(File pdfFile) throws Exception {
        List<Triple<String, Integer, Boolean>> base64Pages = splitPdfToBase64Pages(pdfFile);
        if (CollectionUtil.isEmpty(base64Pages)) {
            throw new Exception("无法获取页面内容");
        }
        Files.deleteIfExists(pdfFile.toPath()); // the original PDF is no longer needed

        List<String> contentList = new ArrayList<>();
        for (int i = 0; i < base64Pages.size(); i++) {
            Triple<String, Integer, Boolean> group = base64Pages.get(i);
            int pagesInGroup = group.getMiddle();
            // NOTE(review): the payload is always a PDF, yet text-less (scanned) groups are
            // announced as images. Kept as in the original; confirm the OCR service accepts this.
            int fileType = Boolean.TRUE.equals(group.getRight()) ? FILE_TYPE_IMAGE : FILE_TYPE_PDF;
            try {
                // Texts are collected per group and appended only on success, so a failure
                // half-way through a group can never leave partial entries behind.
                contentList.addAll(recognizeGroup(group.getLeft(), fileType, pagesInGroup));
            } catch (Exception e) {
                // The logged number is the 1-based group index, not an individual page number.
                log.error("获取页面 {} 内容失败", i + 1, e);
                // Keep the page count consistent: one blank entry per page of the failed group.
                contentList.addAll(Collections.nCopies(pagesInGroup, ""));
            }
        }
        return contentList;
    }

    /**
     * Returns the recognised text of a single image.
     *
     * @param imgFile the image to process; always deleted before this method returns
     * @return the recognised text, or an empty string when the service reports error 500
     *         or recognises nothing
     * @throws ServiceException if the image is empty or unreadable, or the OCR request
     *                          or response handling fails
     */
    public String extractImgPagesContent(File imgFile) throws Exception {
        try {
            byte[] imageBytes = Files.readAllBytes(imgFile.toPath());
            String base64 = Base64.getEncoder().encodeToString(imageBytes);
            if (StrUtil.isBlank(base64)) {
                throw new ServiceException("无法获取图片内容");
            }
            JSONArray layoutParsingResults = requestLayoutParsingResults(base64, FILE_TYPE_IMAGE);
            if (layoutParsingResults.isEmpty()) {
                return "";
            }
            return extractMarkdownText(layoutParsingResults.getJSONObject(0));
        } catch (ServiceException e) {
            // Already carries a specific message; do not replace it with the generic one below.
            log.error("获取图片内容失败", e);
            throw e;
        } catch (Exception e) {
            log.error("获取图片内容失败", e);
            throw new ServiceException("获取图片内容失败");
        } finally {
            deleteQuietly(imgFile);
        }
    }

    /**
     * Sends one page group to the OCR service and returns exactly {@code expectedPages} texts.
     *
     * @param base64        Base64-encoded PDF containing the group's pages
     * @param fileType      {@code fileType} value to send with the request
     * @param expectedPages number of pages in the group
     * @return a list of size {@code expectedPages}; missing results are padded with empty
     *         strings and surplus results are dropped
     */
    private List<String> recognizeGroup(String base64, int fileType, int expectedPages) {
        JSONArray layoutParsingResults = requestLayoutParsingResults(base64, fileType);
        if (layoutParsingResults.size() > expectedPages) {
            log.warn("OCR 返回 {} 页结果,超出预期的 {} 页,多余部分已忽略",
                    layoutParsingResults.size(), expectedPages);
        }
        int usable = Math.min(layoutParsingResults.size(), expectedPages);
        List<String> texts = new ArrayList<>(expectedPages);
        for (int j = 0; j < usable; j++) {
            texts.add(extractMarkdownText(layoutParsingResults.getJSONObject(j)));
        }
        // Fewer results than pages: pad so the caller's page numbering stays aligned.
        while (texts.size() < expectedPages) {
            texts.add("");
        }
        return texts;
    }

    /**
     * Posts a Base64 payload to the OCR service and returns its {@code layoutParsingResults}.
     *
     * @param base64   Base64-encoded file content
     * @param fileType {@code fileType} value to send with the request
     * @return the results array; empty (never {@code null}) when the service reports
     *         error 500 or returns no results
     * @throws IllegalStateException if the response body is empty or has no {@code result} object
     */
    private JSONArray requestLayoutParsingResults(String base64, int fileType) {
        Map<String, Object> requestBody = new HashMap<>();
        requestBody.put("file", base64);
        requestBody.put("fileType", fileType);

        HttpRequest post = HttpUtil.createPost(orcServerUrl);
        post.header("Content-Type", "application/json");
        post.body(JSONObject.toJSONString(requestBody));
        HttpResponse response = post.execute();

        JSONObject jsonObject = JSONObject.parseObject(response.body());
        if (jsonObject == null) {
            throw new IllegalStateException("OCR 服务返回了空响应");
        }
        if (jsonObject.containsKey("errorCode") && jsonObject.getIntValue("errorCode") == 500) {
            log.error("请求失败,错误码:{},错误信息:{}", jsonObject.getIntValue("errorCode"), jsonObject.getString("errorMsg"));
            return new JSONArray();
        }
        JSONObject result = jsonObject.getJSONObject("result");
        if (result == null) {
            // Fail with a descriptive message instead of a bare NullPointerException.
            throw new IllegalStateException("OCR 响应缺少 result 字段,errorCode="
                    + jsonObject.get("errorCode") + ",errorMsg=" + jsonObject.getString("errorMsg"));
        }
        JSONArray layoutParsingResults = result.getJSONArray("layoutParsingResults");
        return layoutParsingResults == null ? new JSONArray() : layoutParsingResults;
    }

    /**
     * Reads {@code markdown.text} from one layout-parsing result.
     *
     * @param layoutParsingResult a single element of {@code layoutParsingResults}; may be {@code null}
     * @return the text, or an empty string when the element, its {@code markdown} object
     *         or the text itself is missing or blank
     */
    private static String extractMarkdownText(JSONObject layoutParsingResult) {
        if (layoutParsingResult == null) {
            return "";
        }
        JSONObject markdown = layoutParsingResult.getJSONObject("markdown");
        if (markdown == null) {
            return "";
        }
        String text = markdown.getString("text");
        return StringUtils.isBlank(text) ? "" : text;
    }

    /** Deletes {@code file} if it exists; an I/O failure is logged rather than propagated. */
    private static void deleteQuietly(File file) {
        if (file == null) {
            return;
        }
        try {
            Files.deleteIfExists(file.toPath());
        } catch (IOException e) {
            log.error("删除文件失败", e);
        }
    }

    /**
     * Splits a PDF into groups of up to {@value #PAGES_PER_GROUP} pages and encodes each
     * group as a standalone Base64 PDF.
     *
     * <p>Triple layout: left = Base64 of the group's PDF, middle = number of pages in the
     * group, right = {@code true} when no text could be extracted from the group (it is
     * then treated as a scanned document).
     *
     * @param pdfFile the PDF to split
     * @return one triple per group, in page order; empty when the PDF has no pages
     * @throws IOException if the PDF cannot be read or a temporary file cannot be written
     */
    private static List<Triple<String, Integer, Boolean>> splitPdfToBase64Pages(File pdfFile) throws IOException {
        List<Triple<String, Integer, Boolean>> base64Pages = new ArrayList<>();
        try (PDDocument document = Loader.loadPDF(pdfFile)) {
            int pageCount = document.getNumberOfPages();
            for (int startPage = 0; startPage < pageCount; startPage += PAGES_PER_GROUP) {
                int endPage = Math.min(startPage + PAGES_PER_GROUP, pageCount);
                File tempPageFile = null;
                // pageDoc is closed by try-with-resources before the finally block removes the temp file.
                try (PDDocument pageDoc = new PDDocument()) {
                    for (int i = startPage; i < endPage; i++) {
                        PDPage sourcePage = document.getPage(i);
                        // importPage copies the page; addPage would re-parent the source
                        // document's page and drop attributes inherited from its page tree.
                        PDPage importedPage = pageDoc.importPage(sourcePage);
                        // Resources inherited from the source page tree are not part of the
                        // page dictionary, so carry them over explicitly when they went missing.
                        if (importedPage.getResources() == null && sourcePage.getResources() != null) {
                            importedPage.setResources(sourcePage.getResources());
                        }
                    }

                    tempPageFile = File.createTempFile("page_" + System.currentTimeMillis() + "_" + startPage, ".pdf");
                    pageDoc.save(tempPageFile);

                    byte[] pageBytes = Files.readAllBytes(tempPageFile.toPath());
                    String base64 = Base64.getEncoder().encodeToString(pageBytes);

                    // A group counts as scanned unless some non-blank text can be extracted from it.
                    boolean scannedPdf = true;
                    try (PDDocument temp = Loader.loadPDF(tempPageFile)) {
                        String text = new PDFTextStripper().getText(temp);
                        if (text != null && !text.trim().isEmpty()) {
                            scannedPdf = false;
                        }
                    }

                    int pagesInGroup = endPage - startPage;
                    base64Pages.add(ImmutableTriple.of(base64, pagesInGroup, scannedPdf));
                } finally {
                    if (tempPageFile != null) {
                        Files.deleteIfExists(tempPageFile.toPath());
                    }
                }
            }
        }
        return base64Pages;
    }
}