/**
 * Extracts the plain text of an uploaded Word document and normalizes the
 * "开始时间" (start time) field found in it.
 *
 * <p>The field is expected to look like
 * {@code 工作开始时间: ___1999___年_9___月__9__日__0__时__0__分 结束时间}
 * and is normalized to {@code 1999-9-9 0:0}.
 *
 * <p>NOTE(review): the method returns {@code void}, so the normalized value is
 * computed and validated but not handed to the caller; expose or store it once
 * a consumer exists.
 *
 * @param file the uploaded file; handled only when its original name ends in
 *             {@code .doc} or {@code .docx} (case-insensitive). For any other
 *             name a message is printed and the method returns.
 * @throws ParseException if the start-time section cannot be located in the text
 * @throws IOException    if the upload cannot be read or parsed by POI
 */
public static void parseWord(MultipartFile file) throws ParseException, IOException {
    String fileName = file.getOriginalFilename();
    String text;
    // Only the InputStream is managed here: it is the resource this method opens,
    // and try-with-resources guarantees it is closed even when POI throws.
    if (hasExtension(fileName, ".doc")) {
        try (InputStream stream = file.getInputStream()) {
            text = new WordExtractor(stream).getText();
        }
    } else if (hasExtension(fileName, ".docx")) {
        try (InputStream stream = file.getInputStream()) {
            text = new XWPFWordExtractor(new XWPFDocument(stream)).getText();
        }
    } else {
        System.out.println("此文件不是word文件!");
        // Nothing was extracted, so there is nothing to search.
        return;
    }
    extractStartTime(text);
}

/** Returns true when {@code fileName} is non-null and ends with {@code extension}, ignoring case. */
private static boolean hasExtension(String fileName, String extension) {
    // regionMatches returns false for a negative offset, so short names are handled.
    return fileName != null
            && fileName.regionMatches(true, fileName.length() - extension.length(),
                    extension, 0, extension.length());
}

/**
 * Cuts the value between "开始时间" and the following "结束时间" out of {@code text}
 * and rewrites it as {@code yyyy-M-d H:m} by replacing the Chinese unit characters
 * and dropping the fill characters ({@code _}, {@code —}) and spaces.
 */
private static String extractStartTime(String text) throws ParseException {
    final String startLabel = "开始时间";
    final String endLabel = "结束时间";

    int labelStart = text.indexOf(startLabel);
    if (labelStart < 0) {
        throw new ParseException("Start-time label not found in document text", 0);
    }
    // Search for the end label only after the start label so an earlier
    // occurrence cannot produce an inverted range.
    int sectionEnd = text.indexOf(endLabel, labelStart + startLabel.length());
    if (sectionEnd < 0) {
        throw new ParseException("End-time label not found after start-time label", labelStart);
    }
    String section = text.substring(labelStart, sectionEnd);

    // The value starts right after the label's colon; accept the ASCII colon
    // and, as a fallback, the full-width colon.
    String marker = "始时间:";
    int markerPos = section.indexOf(marker);
    if (markerPos < 0) {
        marker = "始时间:";
        markerPos = section.indexOf(marker);
    }
    if (markerPos < 0) {
        throw new ParseException("No colon found after start-time label", labelStart);
    }

    // The section already ends where "结束时间" begins, so the value runs to its end.
    String value = section.substring(markerPos + marker.length()).trim();

    // Order matters: spaces are stripped first so the single space introduced
    // for "日" (date/time separator) survives.
    return value.replace(" ", "")
            .replace("年", "-")
            .replace("月", "-")
            .replace("日", " ")
            .replace("时", ":")
            .replace("分", "")
            .replace("_", "")
            .replace("—", "");
}
思路:先解析上传的 MultipartFile 类型的 Word 文件(.doc 或 .docx),得到全文字符串;再从该字符串中定位"开始时间"与"结束时间"之间的内容,用 indexOf/substring 截取并整理成所需的格式。