首先,介绍一下 Apache POI:它是 Apache 软件基金会提供的开源 Java 库,用于读写 Microsoft Office 格式的文档。
将 .doc 或 .docx 文件转换为纯文本的方法如下:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
/**
 * Extracts the plain text of Microsoft Word documents using Apache POI.
 */
class WordToText {

    /**
     * Returns the text content of a {@code .doc} or {@code .docx} file.
     *
     * <p>The format is chosen from the file extension (compared
     * case-insensitively). This method is best-effort: if the path is
     * {@code null}, the extension is not supported, or reading/parsing fails,
     * the failure is printed to standard error (for read/parse failures) and
     * an empty string is returned instead of propagating the exception.
     *
     * @param filepath path of the Word document to read
     * @return the extracted text, or {@code ""} when nothing could be extracted
     * @throws Exception declared for compatibility with existing callers;
     *         failures raised while reading are caught and reported here
     */
    public static String wordToText(String filepath) throws Exception {
        String returnstr = "";
        if (filepath == null) {
            return returnstr;
        }
        try {
            if (hasExtension(filepath, ".doc")) {
                // The stream is only needed for the binary .doc format; close it
                // as soon as the text has been extracted.
                try (FileInputStream fis = new FileInputStream(filepath)) {
                    WordExtractor doc = new WordExtractor(fis);
                    returnstr = doc.getText(); // extract the body text of the .doc file
                }
            } else if (hasExtension(filepath, ".docx")) {
                XWPFWordExtractor docx =
                        new XWPFWordExtractor(POIXMLDocument.openPackage(filepath));
                try {
                    returnstr = docx.getText();
                } finally {
                    // Closing the extractor releases the package opened above,
                    // so the file handle on the .docx is not left open.
                    docx.close();
                }
            }
        } catch (Exception e) {
            // Best-effort contract: report the problem and fall through to
            // return whatever was extracted so far (normally "").
            e.printStackTrace();
        }
        return returnstr;
    }

    /** Tells whether {@code path} ends with {@code ext}, ignoring case. */
    private static boolean hasExtension(String path, String ext) {
        // regionMatches returns false for a negative offset, so paths shorter
        // than the extension are handled without an explicit length check.
        return path.regionMatches(true, path.length() - ext.length(), ext, 0, ext.length());
    }
}