226 lines
9.8 KiB
Java
226 lines
9.8 KiB
Java
package com.qingyun.service.compare;
|
||
|
||
import cn.hutool.core.collection.CollectionUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.qingyun.common.exception.ServiceException;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutableTriple;
import org.apache.commons.lang3.tuple.Triple;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.*;
|
||
|
||
@Component
|
||
public class FetchAllPagesContent {
|
||
|
||
@Value("${ocr.url}")
|
||
private String orcServerUrl;
|
||
|
||
// 日志
|
||
private static final Logger log = LoggerFactory.getLogger(FetchAllPagesContent.class);
|
||
|
||
// 获取所有页面内容
|
||
public List<String> extractPDFPagesContent(File pdfFile) throws Exception {
|
||
List<Triple<String, Integer, Boolean>> base64Pages = splitPdfToBase64Pages(pdfFile);
|
||
if (CollectionUtil.isEmpty(base64Pages)) {
|
||
throw new Exception("无法获取页面内容");
|
||
}
|
||
Files.deleteIfExists(pdfFile.toPath()); // 清理原始 PDF
|
||
|
||
List<String> contentList = new ArrayList<>();
|
||
|
||
for (int i = 0; i < base64Pages.size(); i++) {
|
||
int pageNum = i + 1;
|
||
Triple<String, Integer, Boolean> base64Page = base64Pages.get(i);
|
||
String base64 = base64Page.getLeft();
|
||
Integer pageData = base64Page.getMiddle(); // 获取页数信息
|
||
Boolean scannedPdf = base64Page.getRight();
|
||
|
||
try {
|
||
// 构造请求体
|
||
Map<String, Object> requestBody = new HashMap<>();
|
||
requestBody.put("file", base64);
|
||
requestBody.put("fileType", scannedPdf ? 1 : 0); // 文件类型。0表示PDF文件,1表示图像文件
|
||
HttpRequest post = HttpUtil.createPost(orcServerUrl);
|
||
post.header("Content-Type", "application/json");
|
||
post.body(JSONObject.toJSONString(requestBody));
|
||
HttpResponse response = post.execute();
|
||
|
||
JSONObject jsonObject = JSONObject.parseObject(response.body());
|
||
if (jsonObject.containsKey("errorCode") && jsonObject.getIntValue("errorCode") == 500) {
|
||
log.error("请求失败,错误码:{},错误信息:{}", jsonObject.getIntValue("errorCode"), jsonObject.getString("errorMsg"));
|
||
// 请求失败,为每一页添加空字符串
|
||
for (int j = 0; j < pageData; j++) {
|
||
contentList.add("");
|
||
}
|
||
continue;
|
||
}
|
||
JSONObject result = jsonObject.getJSONObject("result");
|
||
JSONArray layoutParsingResults = result.getJSONArray("layoutParsingResults");
|
||
if (layoutParsingResults == null || layoutParsingResults.isEmpty()) {
|
||
// 没有识别结果,为每一页添加空字符串
|
||
for (int j = 0; j < pageData; j++) {
|
||
contentList.add("");
|
||
}
|
||
continue;
|
||
}
|
||
// 处理每一页的识别结果
|
||
for (int j = 0; j < layoutParsingResults.size(); j++) {
|
||
JSONObject layoutParsingResult = layoutParsingResults.getJSONObject(j);
|
||
JSONObject markdown = layoutParsingResult.getJSONObject("markdown");
|
||
if (markdown == null) {
|
||
contentList.add("");
|
||
continue;
|
||
}
|
||
String text = markdown.getString("text");
|
||
if (StringUtils.isBlank(text)) {
|
||
contentList.add("");
|
||
continue;
|
||
}
|
||
contentList.add(text);
|
||
}
|
||
// 如果返回的页数少于预期页数,补充空字符串
|
||
if (layoutParsingResults.size() < pageData) {
|
||
for (int j = layoutParsingResults.size(); j < pageData; j++) {
|
||
contentList.add("");
|
||
}
|
||
}
|
||
} catch (Exception e) {
|
||
log.error("获取页面 {} 内容失败", pageNum, e);
|
||
// 出错时为每一页添加空字符串保持页数一致
|
||
for (int j = 0; j < pageData; j++) {
|
||
contentList.add("");
|
||
}
|
||
}
|
||
}
|
||
return contentList;
|
||
}
|
||
|
||
// 获取所有页面内容
|
||
public String extractImgPagesContent(File imgFile) throws Exception {
|
||
try {
|
||
byte[] pageBytes = Files.readAllBytes(imgFile.toPath());
|
||
String base64 = Base64.getEncoder().encodeToString(pageBytes);
|
||
if (StrUtil.isBlank(base64)) {
|
||
throw new ServiceException("无法获取图片内容");
|
||
}
|
||
Files.deleteIfExists(imgFile.toPath()); // 清理文件
|
||
|
||
// 构造请求体
|
||
Map<String, Object> requestBody = new HashMap<>();
|
||
requestBody.put("file", base64);
|
||
requestBody.put("fileType", 1); // 文件类型。0表示PDF文件,1表示图像文件
|
||
HttpRequest post = HttpUtil.createPost(orcServerUrl);
|
||
post.header("Content-Type", "application/json");
|
||
post.body(JSONObject.toJSONString(requestBody));
|
||
HttpResponse response = post.execute();
|
||
|
||
JSONObject jsonObject = JSONObject.parseObject(response.body());
|
||
if (jsonObject.containsKey("errorCode") && jsonObject.getIntValue("errorCode") == 500) {
|
||
log.error("请求失败,错误码:{},错误信息:{}", jsonObject.getIntValue("errorCode"), jsonObject.getString("errorMsg"));
|
||
return "";
|
||
}
|
||
JSONObject result = jsonObject.getJSONObject("result");
|
||
JSONArray layoutParsingResults = result.getJSONArray("layoutParsingResults");
|
||
if (layoutParsingResults == null || layoutParsingResults.isEmpty()) {
|
||
return "";
|
||
}
|
||
JSONObject layoutParsingResults_0 = layoutParsingResults.getJSONObject(0);
|
||
JSONObject markdown = layoutParsingResults_0.getJSONObject("markdown");
|
||
if (markdown == null) {
|
||
return "";
|
||
}
|
||
String text = markdown.getString("text");
|
||
if (StringUtils.isBlank(text)) {
|
||
return "";
|
||
}
|
||
return text;
|
||
} catch (Exception e) {
|
||
log.error("获取图片内容失败", e);
|
||
throw new ServiceException("获取图片内容失败");
|
||
} finally {
|
||
try {
|
||
if (imgFile != null && imgFile.exists()) {
|
||
Files.deleteIfExists(imgFile.toPath());
|
||
}
|
||
} catch (IOException e) {
|
||
log.error("删除文件失败", e);
|
||
}
|
||
}
|
||
}
|
||
|
||
/**
|
||
* L base64 M 临时文件 R 是否内容为图片
|
||
* @param pdfFile
|
||
* @return
|
||
* @throws IOException
|
||
*/
|
||
private static List<Triple<String, Integer, Boolean>> splitPdfToBase64Pages(File pdfFile) throws IOException {
|
||
List<Triple<String, Integer, Boolean>> base64Pages = new ArrayList<>();
|
||
try (PDDocument document = Loader.loadPDF(pdfFile)) {
|
||
int pageCount = document.getNumberOfPages();
|
||
// 每10页为一组进行切割
|
||
for (int startPage = 0; startPage < pageCount; startPage += 10) {
|
||
int endPage = Math.min(startPage + 10, pageCount);
|
||
PDDocument pageDoc = new PDDocument();
|
||
|
||
// 添加10页内容到新文档
|
||
for (int i = startPage; i < endPage; i++) {
|
||
PDPage page = document.getPage(i);
|
||
pageDoc.addPage(page);
|
||
}
|
||
|
||
// 使用 try-with-resources 确保临时文件被正确处理
|
||
File tempPageFile = null;
|
||
try {
|
||
// 创建唯一的临时文件
|
||
tempPageFile = File.createTempFile("page_" + System.currentTimeMillis() + "_" + startPage, ".pdf");
|
||
|
||
// 保存页面到临时文件
|
||
pageDoc.save(tempPageFile);
|
||
|
||
// 转换为 Base64
|
||
byte[] pageBytes = Files.readAllBytes(tempPageFile.toPath());
|
||
String base64 = Base64.getEncoder().encodeToString(pageBytes);
|
||
|
||
// 判断是否扫描件
|
||
boolean scannedPdf = true;
|
||
try (PDDocument temp = Loader.loadPDF(tempPageFile)) {
|
||
PDFTextStripper stripper = new PDFTextStripper();
|
||
String text = stripper.getText(temp);
|
||
if (text != null && !text.trim().isEmpty()) {
|
||
scannedPdf = false;
|
||
}
|
||
}
|
||
// 中间字段记录这组包含的页数
|
||
int pagesInGroup = endPage - startPage;
|
||
base64Pages.add(ImmutableTriple.of(base64, pagesInGroup, scannedPdf));
|
||
|
||
} finally {
|
||
// 确保在任何情况下都关闭并删除临时文件
|
||
pageDoc.close();
|
||
if (tempPageFile != null && tempPageFile.exists()) {
|
||
Files.deleteIfExists(tempPageFile.toPath());
|
||
}
|
||
}
|
||
}
|
||
}
|
||
return base64Pages;
|
||
}
|
||
}
|