Files
contract-review/qingyun-service/src/main/java/com/qingyun/service/compare/PageFingerprinter.java
2026-01-30 14:25:12 +08:00

32 lines
1.1 KiB
Java

package com.qingyun.service.compare;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Extracts a per-page text summary ("fingerprint") from a PDF document.
 */
public class PageFingerprinter {

    /**
     * Returns one whitespace-free text string per page, in page order.
     *
     * <p>Each page's text is extracted with {@link PDFTextStripper}, then every run of
     * characters matched by the regex {@code \s+} is removed. The complete page text is
     * kept; it is not truncated to a fixed length.
     *
     * @param document the PDF to read; this method does not close it
     * @return a new list holding the stripped text of each page, index 0 being the first page
     * @throws IOException if PDFBox reports an I/O failure while extracting text
     */
    public static List<String> extractPageSummaries(PDDocument document) throws IOException {
        int pageCount = document.getNumberOfPages();
        List<String> summaries = new ArrayList<>(pageCount);

        // A single stripper is reused for all pages; only its page window moves per iteration.
        PDFTextStripper stripper = new PDFTextStripper();

        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
            // The stripper's page range is 1-based and inclusive, so start == end selects one page.
            stripper.setStartPage(pageNumber);
            stripper.setEndPage(pageNumber);
            String text = stripper.getText(document);

            // Drop spaces, tabs, line breaks and form feeds so layout differences do not
            // affect the summary.
            summaries.add(text.replaceAll("\\s+", ""));
        }
        return summaries;
    }
}