package com.qingyun.service.compare;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.regex.Pattern;

/**
 * Extracts a per-page summary ("fingerprint") from a PDF document.
 *
 * <p>The fingerprint of a page is the text extracted from that page with every
 * run of whitespace removed.
 */
public class PageFingerprinter {

    /**
     * Matches runs of whitespace (spaces, tabs, carriage returns, line feeds,
     * form feeds). Compiled once instead of on every page.
     */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Builds one fingerprint string per page of the given document.
     *
     * @param document the PDF document to read; must not be {@code null}
     * @return a new mutable list holding one entry per page, in page order;
     *         each entry is that page's extracted text with all whitespace removed
     *         (an empty string when no text is extracted for the page)
     * @throws IOException if text extraction fails
     * @throws NullPointerException if {@code document} is {@code null}
     */
    public static List<String> extractPageSummaries(PDDocument document) throws IOException {
        Objects.requireNonNull(document, "document");

        int pageCount = document.getNumberOfPages();
        List<String> summaries = new ArrayList<>(pageCount);

        // A single stripper is reused; only the page window changes per iteration.
        PDFTextStripper stripper = new PDFTextStripper();
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
            // PDFTextStripper page numbers are 1-based and the end page is inclusive,
            // so start == end restricts extraction to exactly one page.
            stripper.setStartPage(pageNumber);
            stripper.setEndPage(pageNumber);
            String text = stripper.getText(document);

            // Drop all whitespace so layout differences do not affect the fingerprint.
            // The full normalized text is kept; truncate here if a shorter
            // fingerprint is ever wanted.
            summaries.add(WHITESPACE.matcher(text).replaceAll(""));
        }
        return summaries;
    }
}