需要依赖
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.6</version>
</dependency>
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>free.spire.doc</artifactId>
<version>2.7.3</version>
</dependency>
<!-- PDF识别 -->
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>spire.pdf.free</artifactId>
<version>2.2.2</version>
</dependency>
查找关键字并生成关键字高亮的新文件
package cn.com.demo.util;
import java.awt.Color;
import java.io.File;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import com.spire.doc.Document;
import com.spire.doc.FileFormat;
import com.spire.doc.Section;
import com.spire.doc.documents.TextSelection;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import com.spire.pdf.general.find.PdfTextFind;
/**
 * Keyword helpers for Word and PDF files: collect the first few passages that
 * contain a keyword, and save a copy of a document with every occurrence of the
 * keyword highlighted in yellow.
 */
public class KeyWordFromOfficeAndSaveFile {

    /** Maximum number of matching paragraphs/lines collected by the lookup methods. */
    private static final int MAX_MATCHES = 3;

    /**
     * Finds content by keyword in a Word document.
     *
     * <p>Walks every paragraph of every section and collects the text of the
     * first {@value #MAX_MATCHES} paragraphs that contain the keyword. Each
     * collected paragraph is followed by {@code "...\n"}.
     *
     * @param sourceFileName path of the Word document to read
     * @param keyWord        text to look for (plain substring match)
     * @return the collected paragraphs, or an empty string when nothing matches
     */
    public static String getContentByKeyWord(String sourceFileName, String keyWord) {
        StringBuilder matches = new StringBuilder();
        Document doc = new Document(sourceFileName);
        int count = 0;
        for (int i = 0; i < doc.getSections().getCount(); i++) {
            Section section = doc.getSections().get(i);
            for (int j = 0; j < section.getParagraphs().getCount(); j++) {
                String text = section.getParagraphs().get(j).getText();
                if (text != null && text.contains(keyWord)) {
                    matches.append(text).append("...\n");
                    // Enough context collected: stop scanning the whole document.
                    if (++count >= MAX_MATCHES) {
                        return matches.toString();
                    }
                }
            }
        }
        return matches.toString();
    }

    /**
     * Saves a copy of a Word document with every occurrence of the keyword
     * highlighted in yellow.
     *
     * @param sourceFileName path of the Word document to read
     * @param keyWord        text to highlight
     * @param destFileName   path of the highlighted copy (written as Docx 2013)
     */
    public static void saveAndHighlightWordText(String sourceFileName, String keyWord, String destFileName) {
        // Load the Word document.
        Document document = new Document(sourceFileName);
        // Find every occurrence of the keyword (both boolean options disabled).
        TextSelection[] textSelections = document.findAllString(keyWord, false, false);
        // NOTE(review): guard in case the library returns null when nothing matches -
        // confirm against the Spire.Doc documentation. The copy is still saved.
        if (textSelections != null) {
            for (TextSelection selection : textSelections) {
                System.out.println(selection.getSelectedText());
                selection.getAsOneRange().getCharacterFormat().setHighlightColor(Color.YELLOW);
            }
        }
        // Save the document.
        document.saveToFile(destFileName, FileFormat.Docx_2013);
    }

    /**
     * Saves a copy of a PDF document with every occurrence of the keyword
     * highlighted in yellow.
     *
     * <p>A failure to highlight a single occurrence is printed and skipped so
     * that the remaining occurrences are still processed. The document is
     * closed even when loading, searching or saving fails.
     *
     * @param sourceFileName path of the PDF document to read
     * @param keyWord        text to highlight
     * @param destFileName   path of the highlighted copy
     */
    @SuppressWarnings("unchecked")
    public static void saveAndHighlightPdfText(String sourceFileName, String keyWord, String destFileName) {
        PdfDocument pdf = new PdfDocument();
        try {
            // Load the source PDF.
            pdf.loadFromFile(sourceFileName);
            // A temporary page is appended here and removed again before saving.
            // NOTE(review): presumably a workaround for the free Spire.PDF edition - confirm.
            pdf.getPages().add();
            // Visit every page of the document.
            for (PdfPageBase page : (Iterable<PdfPageBase>) pdf.getPages()) {
                // Find every occurrence of the keyword on this page.
                PdfTextFind[] finds = page.findText(keyWord).getFinds();
                for (PdfTextFind find : finds) {
                    try {
                        // Highlight the match.
                        find.applyHighLight(Color.yellow);
                    } catch (Exception e) {
                        // Best effort: one failed highlight must not abort the others.
                        e.printStackTrace();
                    }
                }
            }
            // Drop the temporary page appended above (it is the last one).
            pdf.getPages().remove(pdf.getPages().get(pdf.getPages().getCount() - 1));
            // Save the document.
            pdf.saveToFile(destFileName);
        } finally {
            pdf.close();
        }
    }

    /**
     * Finds content by keyword in a PDF document.
     *
     * <p>Extracts the document text, splits it into lines and collects the first
     * {@value #MAX_MATCHES} lines that contain the keyword. Each collected line
     * is followed by {@code "......\n"}. The result is also printed to standard
     * output.
     *
     * @param fileName path of the PDF document to read
     * @param keyWord  text to look for (plain substring match)
     * @return the collected lines; an empty string when nothing matches, when
     *         the document is encrypted, or when it cannot be read (the error
     *         is printed, not thrown)
     */
    public static String getContentByKeyPdf(String fileName, String keyWord) {
        // Start with an empty result so encrypted or unreadable files yield ""
        // instead of failing on a null buffer.
        StringBuilder matches = new StringBuilder();
        try (PDDocument document = PDDocument.load(new File(fileName))) {
            if (!document.isEncrypted()) {
                String pdfFileInText = new PDFTextStripper().getText(document);
                int count = 0;
                for (String line : pdfFileInText.split("\\r?\\n")) {
                    if (line.contains(keyWord)) {
                        matches.append(line).append("......\n");
                        if (++count >= MAX_MATCHES) {
                            break;
                        }
                    }
                }
            }
        } catch (Exception e) {
            // Best effort: report the problem and fall through with what we have.
            e.printStackTrace();
        }
        String result = matches.toString();
        System.out.println(result);
        return result;
    }
}