first commit
This commit is contained in:
@@ -0,0 +1,225 @@
|
||||
package com.qingyun.service.compare;
|
||||
|
||||
import cn.hutool.core.collection.CollectionUtil;
|
||||
import cn.hutool.core.util.StrUtil;
|
||||
import cn.hutool.http.HttpRequest;
|
||||
import cn.hutool.http.HttpResponse;
|
||||
import cn.hutool.http.HttpUtil;
|
||||
import com.alibaba.fastjson.JSONArray;
|
||||
import com.alibaba.fastjson.JSONObject;
|
||||
import com.qingyun.common.exception.ServiceException;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.tuple.ImmutableTriple;
|
||||
import org.apache.commons.lang3.tuple.Triple;
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.*;
|
||||
|
||||
@Component
|
||||
public class FetchAllPagesContent {
|
||||
|
||||
@Value("${ocr.url}")
|
||||
private String orcServerUrl;
|
||||
|
||||
// 日志
|
||||
private static final Logger log = LoggerFactory.getLogger(FetchAllPagesContent.class);
|
||||
|
||||
// 获取所有页面内容
|
||||
public List<String> extractPDFPagesContent(File pdfFile) throws Exception {
|
||||
List<Triple<String, Integer, Boolean>> base64Pages = splitPdfToBase64Pages(pdfFile);
|
||||
if (CollectionUtil.isEmpty(base64Pages)) {
|
||||
throw new Exception("无法获取页面内容");
|
||||
}
|
||||
Files.deleteIfExists(pdfFile.toPath()); // 清理原始 PDF
|
||||
|
||||
List<String> contentList = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < base64Pages.size(); i++) {
|
||||
int pageNum = i + 1;
|
||||
Triple<String, Integer, Boolean> base64Page = base64Pages.get(i);
|
||||
String base64 = base64Page.getLeft();
|
||||
Integer pageData = base64Page.getMiddle(); // 获取页数信息
|
||||
Boolean scannedPdf = base64Page.getRight();
|
||||
|
||||
try {
|
||||
// 构造请求体
|
||||
Map<String, Object> requestBody = new HashMap<>();
|
||||
requestBody.put("file", base64);
|
||||
requestBody.put("fileType", scannedPdf ? 1 : 0); // 文件类型。0表示PDF文件,1表示图像文件
|
||||
HttpRequest post = HttpUtil.createPost(orcServerUrl);
|
||||
post.header("Content-Type", "application/json");
|
||||
post.body(JSONObject.toJSONString(requestBody));
|
||||
HttpResponse response = post.execute();
|
||||
|
||||
JSONObject jsonObject = JSONObject.parseObject(response.body());
|
||||
if (jsonObject.containsKey("errorCode") && jsonObject.getIntValue("errorCode") == 500) {
|
||||
log.error("请求失败,错误码:{},错误信息:{}", jsonObject.getIntValue("errorCode"), jsonObject.getString("errorMsg"));
|
||||
// 请求失败,为每一页添加空字符串
|
||||
for (int j = 0; j < pageData; j++) {
|
||||
contentList.add("");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
JSONObject result = jsonObject.getJSONObject("result");
|
||||
JSONArray layoutParsingResults = result.getJSONArray("layoutParsingResults");
|
||||
if (layoutParsingResults == null || layoutParsingResults.isEmpty()) {
|
||||
// 没有识别结果,为每一页添加空字符串
|
||||
for (int j = 0; j < pageData; j++) {
|
||||
contentList.add("");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
// 处理每一页的识别结果
|
||||
for (int j = 0; j < layoutParsingResults.size(); j++) {
|
||||
JSONObject layoutParsingResult = layoutParsingResults.getJSONObject(j);
|
||||
JSONObject markdown = layoutParsingResult.getJSONObject("markdown");
|
||||
if (markdown == null) {
|
||||
contentList.add("");
|
||||
continue;
|
||||
}
|
||||
String text = markdown.getString("text");
|
||||
if (StringUtils.isBlank(text)) {
|
||||
contentList.add("");
|
||||
continue;
|
||||
}
|
||||
contentList.add(text);
|
||||
}
|
||||
// 如果返回的页数少于预期页数,补充空字符串
|
||||
if (layoutParsingResults.size() < pageData) {
|
||||
for (int j = layoutParsingResults.size(); j < pageData; j++) {
|
||||
contentList.add("");
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error("获取页面 {} 内容失败", pageNum, e);
|
||||
// 出错时为每一页添加空字符串保持页数一致
|
||||
for (int j = 0; j < pageData; j++) {
|
||||
contentList.add("");
|
||||
}
|
||||
}
|
||||
}
|
||||
return contentList;
|
||||
}
|
||||
|
||||
// 获取所有页面内容
|
||||
public String extractImgPagesContent(File imgFile) throws Exception {
|
||||
try {
|
||||
byte[] pageBytes = Files.readAllBytes(imgFile.toPath());
|
||||
String base64 = Base64.getEncoder().encodeToString(pageBytes);
|
||||
if (StrUtil.isBlank(base64)) {
|
||||
throw new ServiceException("无法获取图片内容");
|
||||
}
|
||||
Files.deleteIfExists(imgFile.toPath()); // 清理文件
|
||||
|
||||
// 构造请求体
|
||||
Map<String, Object> requestBody = new HashMap<>();
|
||||
requestBody.put("file", base64);
|
||||
requestBody.put("fileType", 1); // 文件类型。0表示PDF文件,1表示图像文件
|
||||
HttpRequest post = HttpUtil.createPost(orcServerUrl);
|
||||
post.header("Content-Type", "application/json");
|
||||
post.body(JSONObject.toJSONString(requestBody));
|
||||
HttpResponse response = post.execute();
|
||||
|
||||
JSONObject jsonObject = JSONObject.parseObject(response.body());
|
||||
if (jsonObject.containsKey("errorCode") && jsonObject.getIntValue("errorCode") == 500) {
|
||||
log.error("请求失败,错误码:{},错误信息:{}", jsonObject.getIntValue("errorCode"), jsonObject.getString("errorMsg"));
|
||||
return "";
|
||||
}
|
||||
JSONObject result = jsonObject.getJSONObject("result");
|
||||
JSONArray layoutParsingResults = result.getJSONArray("layoutParsingResults");
|
||||
if (layoutParsingResults == null || layoutParsingResults.isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
JSONObject layoutParsingResults_0 = layoutParsingResults.getJSONObject(0);
|
||||
JSONObject markdown = layoutParsingResults_0.getJSONObject("markdown");
|
||||
if (markdown == null) {
|
||||
return "";
|
||||
}
|
||||
String text = markdown.getString("text");
|
||||
if (StringUtils.isBlank(text)) {
|
||||
return "";
|
||||
}
|
||||
return text;
|
||||
} catch (Exception e) {
|
||||
log.error("获取图片内容失败", e);
|
||||
throw new ServiceException("获取图片内容失败");
|
||||
} finally {
|
||||
try {
|
||||
if (imgFile != null && imgFile.exists()) {
|
||||
Files.deleteIfExists(imgFile.toPath());
|
||||
}
|
||||
} catch (IOException e) {
|
||||
log.error("删除文件失败", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* L base64 M 临时文件 R 是否内容为图片
|
||||
* @param pdfFile
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
private static List<Triple<String, Integer, Boolean>> splitPdfToBase64Pages(File pdfFile) throws IOException {
|
||||
List<Triple<String, Integer, Boolean>> base64Pages = new ArrayList<>();
|
||||
try (PDDocument document = Loader.loadPDF(pdfFile)) {
|
||||
int pageCount = document.getNumberOfPages();
|
||||
// 每10页为一组进行切割
|
||||
for (int startPage = 0; startPage < pageCount; startPage += 10) {
|
||||
int endPage = Math.min(startPage + 10, pageCount);
|
||||
PDDocument pageDoc = new PDDocument();
|
||||
|
||||
// 添加10页内容到新文档
|
||||
for (int i = startPage; i < endPage; i++) {
|
||||
PDPage page = document.getPage(i);
|
||||
pageDoc.addPage(page);
|
||||
}
|
||||
|
||||
// 使用 try-with-resources 确保临时文件被正确处理
|
||||
File tempPageFile = null;
|
||||
try {
|
||||
// 创建唯一的临时文件
|
||||
tempPageFile = File.createTempFile("page_" + System.currentTimeMillis() + "_" + startPage, ".pdf");
|
||||
|
||||
// 保存页面到临时文件
|
||||
pageDoc.save(tempPageFile);
|
||||
|
||||
// 转换为 Base64
|
||||
byte[] pageBytes = Files.readAllBytes(tempPageFile.toPath());
|
||||
String base64 = Base64.getEncoder().encodeToString(pageBytes);
|
||||
|
||||
// 判断是否扫描件
|
||||
boolean scannedPdf = true;
|
||||
try (PDDocument temp = Loader.loadPDF(tempPageFile)) {
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
String text = stripper.getText(temp);
|
||||
if (text != null && !text.trim().isEmpty()) {
|
||||
scannedPdf = false;
|
||||
}
|
||||
}
|
||||
// 中间字段记录这组包含的页数
|
||||
int pagesInGroup = endPage - startPage;
|
||||
base64Pages.add(ImmutableTriple.of(base64, pagesInGroup, scannedPdf));
|
||||
|
||||
} finally {
|
||||
// 确保在任何情况下都关闭并删除临时文件
|
||||
pageDoc.close();
|
||||
if (tempPageFile != null && tempPageFile.exists()) {
|
||||
Files.deleteIfExists(tempPageFile.toPath());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return base64Pages;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user