first commit

This commit is contained in:
2026-01-30 14:25:12 +08:00
commit 8dd8d2668a
899 changed files with 90844 additions and 0 deletions

View File

@@ -0,0 +1,225 @@
package com.qingyun.service.compare;
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.qingyun.common.exception.ServiceException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutableTriple;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.*;
/**
 * Extracts text from PDF and image files by posting them, Base64-encoded, to the OCR HTTP
 * service configured under {@code ocr.url}.
 *
 * <p>Both public entry points consume the file they are given: it is deleted (best effort)
 * once its bytes have been read, whether or not recognition succeeds.
 */
@Component
public class FetchAllPagesContent {

    private static final Logger log = LoggerFactory.getLogger(FetchAllPagesContent.class);

    /** Number of source pages bundled into each PDF chunk that is sent to the OCR service. */
    private static final int PAGES_PER_GROUP = 10;

    /** {@code fileType} request value announcing a PDF payload. */
    private static final int FILE_TYPE_PDF = 0;

    /** {@code fileType} request value announcing an image payload. */
    private static final int FILE_TYPE_IMAGE = 1;

    // Endpoint of the OCR service. The field name keeps its historical spelling ("orc").
    @Value("${ocr.url}")
    private String orcServerUrl;

    /**
     * Runs OCR over every page of a PDF.
     *
     * <p>The PDF is split into chunks of up to {@value #PAGES_PER_GROUP} pages and each chunk is
     * sent to the OCR service separately. The returned list always holds exactly one entry per
     * page of the PDF, in page order; pages whose chunk failed, or for which the service
     * returned nothing, are represented by an empty string.
     *
     * @param pdfFile the PDF to process; it is deleted before this method returns or throws
     * @return the recognised text of each page, {@code ""} where no text could be obtained
     * @throws Exception if the PDF cannot be read or contains no pages
     */
    public List<String> extractPDFPagesContent(File pdfFile) throws Exception {
        List<Triple<String, Integer, Boolean>> chunks;
        try {
            chunks = splitPdfToBase64Pages(pdfFile);
        } finally {
            // All chunk bytes are in memory at this point (or splitting failed), so the source
            // file is no longer needed on any path.
            deleteQuietly(pdfFile);
        }
        if (CollectionUtil.isEmpty(chunks)) {
            throw new Exception("无法获取页面内容");
        }

        List<String> contentList = new ArrayList<>();
        for (int i = 0; i < chunks.size(); i++) {
            Triple<String, Integer, Boolean> chunk = chunks.get(i);
            int expectedPages = chunk.getMiddle();
            boolean scannedPdf = chunk.getRight();

            // Collect the chunk's texts first and only then append to the result, so that a
            // failure half-way through a chunk can never leave partial entries behind.
            List<String> pageTexts;
            try {
                // NOTE(review): chunks without a text layer are announced as images although the
                // payload is still PDF bytes - confirm the OCR service accepts this combination.
                pageTexts = requestOcr(chunk.getLeft(), scannedPdf ? FILE_TYPE_IMAGE : FILE_TYPE_PDF);
            } catch (Exception e) {
                // The logged number is the 1-based chunk index, not a page number.
                log.error("获取页面 {} 内容失败", i + 1, e);
                pageTexts = Collections.emptyList();
            }

            if (pageTexts.size() > expectedPages) {
                log.warn("OCR 返回 {} 条结果,超出预期页数 {},多余结果已忽略", pageTexts.size(), expectedPages);
            }
            // Pad with blanks or drop surplus results so the chunk contributes exactly one
            // entry per page and later chunks stay aligned with their page numbers.
            for (int j = 0; j < expectedPages; j++) {
                contentList.add(j < pageTexts.size() ? pageTexts.get(j) : "");
            }
        }
        return contentList;
    }

    /**
     * Runs OCR over a single image file.
     *
     * @param imgFile the image to process; it is deleted before this method returns or throws
     * @return the recognised text of the first OCR result, or {@code ""} if the service reported
     *         an error or produced no text
     * @throws ServiceException if the image is empty, cannot be read, or the OCR call fails
     */
    public String extractImgPagesContent(File imgFile) throws Exception {
        try {
            byte[] imageBytes = Files.readAllBytes(imgFile.toPath());
            String base64 = Base64.getEncoder().encodeToString(imageBytes);
            if (StrUtil.isBlank(base64)) {
                throw new ServiceException("无法获取图片内容");
            }
            List<String> texts = requestOcr(base64, FILE_TYPE_IMAGE);
            return texts.isEmpty() ? "" : texts.get(0);
        } catch (ServiceException e) {
            // Already carries a specific message; do not replace it with the generic one below.
            throw e;
        } catch (Exception e) {
            log.error("获取图片内容失败", e);
            throw new ServiceException("获取图片内容失败");
        } finally {
            deleteQuietly(imgFile);
        }
    }

    /**
     * Posts one Base64 payload to the OCR service and extracts the markdown text of every entry
     * in {@code result.layoutParsingResults}.
     *
     * @param base64   Base64-encoded file content
     * @param fileType {@link #FILE_TYPE_PDF} or {@link #FILE_TYPE_IMAGE}
     * @return one string per returned result, {@code ""} for results without text; an empty list
     *         if the service answered with {@code errorCode} 500 or returned no results
     * @throws IllegalStateException if the response is empty or has no {@code result} object
     */
    private List<String> requestOcr(String base64, int fileType) {
        Map<String, Object> requestBody = new HashMap<>();
        requestBody.put("file", base64);
        requestBody.put("fileType", fileType);

        // NOTE(review): no timeout is configured on this request - confirm the global HTTP
        // defaults are acceptable for long-running OCR calls.
        HttpRequest post = HttpUtil.createPost(orcServerUrl);
        post.header("Content-Type", "application/json");
        post.body(JSONObject.toJSONString(requestBody));
        HttpResponse response = post.execute();

        JSONObject json = JSONObject.parseObject(response.body());
        if (json == null) {
            throw new IllegalStateException("OCR 服务返回内容为空");
        }
        if (json.containsKey("errorCode") && json.getIntValue("errorCode") == 500) {
            log.error("请求失败,错误码:{},错误信息:{}", json.getIntValue("errorCode"), json.getString("errorMsg"));
            return Collections.emptyList();
        }
        JSONObject result = json.getJSONObject("result");
        if (result == null) {
            throw new IllegalStateException("OCR 服务响应缺少 result 字段");
        }
        JSONArray layoutParsingResults = result.getJSONArray("layoutParsingResults");
        if (layoutParsingResults == null || layoutParsingResults.isEmpty()) {
            return Collections.emptyList();
        }

        List<String> texts = new ArrayList<>(layoutParsingResults.size());
        for (int i = 0; i < layoutParsingResults.size(); i++) {
            JSONObject parsed = layoutParsingResults.getJSONObject(i);
            JSONObject markdown = parsed == null ? null : parsed.getJSONObject("markdown");
            String text = markdown == null ? null : markdown.getString("text");
            texts.add(StringUtils.isBlank(text) ? "" : text);
        }
        return texts;
    }

    /**
     * Splits a PDF into chunks of up to {@value #PAGES_PER_GROUP} pages.
     *
     * <p>Each returned triple describes one chunk: left is the Base64-encoded chunk PDF, middle
     * is the number of pages in the chunk, right is {@code true} when no text could be
     * extracted from the chunk (i.e. it is treated as a scanned document).
     *
     * @param pdfFile the PDF to split
     * @return the chunks in page order; empty if the PDF has no pages
     * @throws IOException if the PDF cannot be loaded or a chunk cannot be written or re-read
     */
    private static List<Triple<String, Integer, Boolean>> splitPdfToBase64Pages(File pdfFile) throws IOException {
        List<Triple<String, Integer, Boolean>> chunks = new ArrayList<>();
        try (PDDocument source = Loader.loadPDF(pdfFile)) {
            int pageCount = source.getNumberOfPages();
            for (int startPage = 0; startPage < pageCount; startPage += PAGES_PER_GROUP) {
                int endPage = Math.min(startPage + PAGES_PER_GROUP, pageCount);
                File tempChunkFile = null;
                // try-with-resources closes the chunk document before the finally block
                // removes the temp file, even if copying a page fails.
                try (PDDocument chunkDoc = new PDDocument()) {
                    for (int i = startPage; i < endPage; i++) {
                        PDPage page = source.getPage(i);
                        // importPage copies the page into the chunk document instead of
                        // re-parenting the source page, and carries over media box, crop box
                        // and rotation. Resources inherited from the source page tree are not
                        // imported automatically, so they are set explicitly.
                        PDPage imported = chunkDoc.importPage(page);
                        if (page.getResources() != null) {
                            imported.setResources(page.getResources());
                        }
                    }

                    tempChunkFile = File.createTempFile("page_" + System.currentTimeMillis() + "_" + startPage, ".pdf");
                    chunkDoc.save(tempChunkFile);

                    byte[] chunkBytes = Files.readAllBytes(tempChunkFile.toPath());
                    String base64 = Base64.getEncoder().encodeToString(chunkBytes);
                    boolean scannedPdf = !hasExtractableText(tempChunkFile);
                    chunks.add(ImmutableTriple.of(base64, endPage - startPage, scannedPdf));
                } finally {
                    deleteQuietly(tempChunkFile);
                }
            }
        }
        return chunks;
    }

    /**
     * Reports whether a PDF yields any non-whitespace text when run through
     * {@link PDFTextStripper}.
     *
     * @param pdf the PDF file to inspect
     * @return {@code true} if at least one non-whitespace character was extracted
     * @throws IOException if the PDF cannot be loaded or parsed
     */
    private static boolean hasExtractableText(File pdf) throws IOException {
        try (PDDocument doc = Loader.loadPDF(pdf)) {
            String text = new PDFTextStripper().getText(doc);
            return text != null && !text.trim().isEmpty();
        }
    }

    /**
     * Deletes a file if it exists, logging instead of throwing on failure so that cleanup in a
     * {@code finally} block never hides the exception that is already propagating.
     *
     * @param file the file to delete; {@code null} is ignored
     */
    private static void deleteQuietly(File file) {
        if (file == null) {
            return;
        }
        try {
            Files.deleteIfExists(file.toPath());
        } catch (IOException | RuntimeException e) {
            log.error("删除文件失败", e);
        }
    }
}