32 lines
1.1 KiB
Java
32 lines
1.1 KiB
Java
package com.qingyun.service.compare;
|
|
|
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
|
import org.apache.pdfbox.pdmodel.PDPage;
|
|
import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
|
import java.io.IOException;
|
|
import java.util.ArrayList;
|
|
import java.util.List;
|
|
|
|
/**
|
|
* 提取每页的摘要信息(指纹)
|
|
*/
|
|
public class PageFingerprinter {
|
|
public static List<String> extractPageSummaries(PDDocument document) throws IOException, IOException {
|
|
List<String> summaries = new ArrayList<>();
|
|
for (int i = 0; i < document.getNumberOfPages(); i++) {
|
|
PDPage page = document.getPage(i);
|
|
PDFTextStripper stripper = new PDFTextStripper();
|
|
stripper.setStartPage(i + 1);
|
|
stripper.setEndPage(i + 1);
|
|
String text = stripper.getText(document);
|
|
// 去掉回车符、空格、换行符、换页符、缩进符、'\r'
|
|
text = text.replaceAll("\\s+", "");
|
|
// 取前 100 字作为指纹(可根据需要扩展)
|
|
//String summary = text.length() > 500 ? text.substring(0, 500) : text;
|
|
summaries.add(text);
|
|
}
|
|
return summaries;
|
|
}
|
|
}
|