first commit
This commit is contained in:
@@ -0,0 +1,31 @@
|
||||
package com.qingyun.service.compare;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 提取每页的摘要信息(指纹)
|
||||
*/
|
||||
public class PageFingerprinter {
|
||||
public static List<String> extractPageSummaries(PDDocument document) throws IOException, IOException {
|
||||
List<String> summaries = new ArrayList<>();
|
||||
for (int i = 0; i < document.getNumberOfPages(); i++) {
|
||||
PDPage page = document.getPage(i);
|
||||
PDFTextStripper stripper = new PDFTextStripper();
|
||||
stripper.setStartPage(i + 1);
|
||||
stripper.setEndPage(i + 1);
|
||||
String text = stripper.getText(document);
|
||||
// 去掉回车符、空格、换行符、换页符、缩进符、'\r'
|
||||
text = text.replaceAll("\\s+", "");
|
||||
// 取前 100 字作为指纹(可根据需要扩展)
|
||||
//String summary = text.length() > 500 ? text.substring(0, 500) : text;
|
||||
summaries.add(text);
|
||||
}
|
||||
return summaries;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user