/**
 * Extracts the plain-text content of a Word document.
 *
 * <p>Paths ending in {@code .doc} are read with {@code WordExtractor}; paths ending in
 * {@code .docx} are read with {@code XWPFWordExtractor}. The extension check is
 * case-insensitive.
 *
 * @param path path of the Word file; may be {@code null}
 * @return the extracted text, or {@code null} when {@code path} is {@code null}, has neither
 *     extension, or the document could not be read (the stack trace is printed in that case)
 */
private static String getWordText(String path) {
    if (path == null) {
        return null;
    }
    // Lower-case once so that ".DOC" / ".Docx" are recognised as well.
    String lowerPath = path.toLowerCase(java.util.Locale.ROOT);
    String resultText = null;
    if (lowerPath.endsWith(".doc")) {
        try (FileInputStream is = new FileInputStream(new File(path));
                WordExtractor re = new WordExtractor(is)) {
            resultText = re.getText();
        } catch (IOException e) {
            e.printStackTrace();
        }
    } else if (lowerPath.endsWith(".docx")) {
        try {
            OPCPackage opcPackage = POIXMLDocument.openPackage(path);
            XWPFWordExtractor extractor;
            try {
                extractor = new XWPFWordExtractor(opcPackage);
            } catch (Exception e) {
                // The extractor never took ownership of the package, so release it here.
                // revert() is used instead of close() so nothing is written back to the file.
                opcPackage.revert();
                throw e;
            }
            // Closing the extractor also releases the underlying package, even if getText() throws.
            try (XWPFWordExtractor closing = extractor) {
                resultText = closing.getText();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return resultText;
}
/**
 * Extracts the plain-text content of a PDF file.
 *
 * @param file the PDF file to read; may be {@code null}
 * @return the text produced by {@code PDFTextStripper}, or {@code null} when {@code file} is
 *     {@code null} or the document could not be loaded or read (the stack trace is printed
 *     in that case)
 */
private static String getPdfText(File file) {
    if (file == null) {
        return null;
    }
    String resultText = null;
    // try-with-resources guarantees the document is closed even if text stripping fails.
    try (PDDocument pdDocument = PDDocument.load(file)) {
        PDFTextStripper stripper = new PDFTextStripper();
        resultText = stripper.getText(pdDocument);
    } catch (IOException e) {
        // FileNotFoundException is a subclass of IOException, so one handler covers both.
        e.printStackTrace();
    }
    return resultText;
}
WORD/PDF纯文本提取
最新推荐文章于 2024-04-23 18:52:22 发布