first commit

This commit is contained in:
2026-01-30 14:25:12 +08:00
commit 8dd8d2668a
899 changed files with 90844 additions and 0 deletions

View File

@@ -0,0 +1,31 @@
package com.qingyun.service.compare;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Extracts a per-page text summary ("fingerprint") from a PDF document.
 */
public class PageFingerprinter {

    /** Matches runs of whitespace; compiled once instead of on every page. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Builds one summary string per page of the given document, in page order.
     *
     * <p>Each summary is the text PDFBox extracts for that single page with every
     * run of whitespace matched by {@code \s+} removed (spaces, tabs, line feeds,
     * carriage returns, form feeds). The full stripped text is kept; no length
     * truncation is applied.
     *
     * <p>NOTE(review): Java's default {@code \s} class covers ASCII whitespace only,
     * so characters such as the full-width space (U+3000) or the non-breaking space
     * are left in the summary. Confirm whether that is intended for the compared
     * documents.
     *
     * @param document the already-loaded PDF document to read; it is not closed here
     * @return a mutable list holding one entry per page; empty when the document has no pages
     * @throws IOException if PDFBox fails while extracting text from a page
     */
    public static List<String> extractPageSummaries(PDDocument document) throws IOException {
        int pageCount = document.getNumberOfPages();
        List<String> summaries = new ArrayList<>(pageCount);

        // A single stripper is reused for every page; only its page window moves.
        PDFTextStripper stripper = new PDFTextStripper();

        // PDFTextStripper page numbers are 1-based and the range is inclusive,
        // so start == end restricts extraction to exactly one page.
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
            stripper.setStartPage(pageNumber);
            stripper.setEndPage(pageNumber);
            String pageText = stripper.getText(document);
            summaries.add(WHITESPACE.matcher(pageText).replaceAll(""));
        }
        return summaries;
    }
}