POI提取word出错
代码如下
package com.util.extract;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.RichTextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
/**
 * Extracts plain text from Word (.doc / .docx) and PDF files, choosing the
 * parser from the file name's extension.
 */
public class FileExtractor {

    /**
     * Returns the part of {@code fileName} that follows its last '.'.
     * When the name contains no '.', the whole name is returned.
     *
     * @param fileName file name or path
     * @return the text after the last dot
     */
    public static String getSuffix(String fileName) {
        return fileName.substring(fileName.lastIndexOf(".") + 1);
    }

    /**
     * Extracts the text content of a .doc, .docx or .pdf file.
     * The extension is matched case-insensitively.
     *
     * <p>Unchecked exceptions thrown by the underlying parsers are not caught
     * here and propagate to the caller.
     *
     * @param fileName path of the file to read
     * @return the extracted text; an empty string when the extension is not
     *         one of doc/docx/pdf; {@code null} when an {@link IOException},
     *         {@code XmlException} or {@code OpenXML4JException} occurs (the
     *         stack trace is printed to standard error)
     */
    public static String wordExtractor(String fileName) {
        try {
            // Opened up front for every extension so that a missing or
            // unreadable file is reported as an IOException (null result).
            InputStream in = new FileInputStream(fileName);
            try {
                String suffix = getSuffix(fileName);
                String text = "";
                if (suffix.equalsIgnoreCase("doc")) {
                    WordExtractor wordExtractor = new WordExtractor(in);
                    text = wordExtractor.getText();
                } else if (suffix.equalsIgnoreCase("docx")) {
                    OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
                    try {
                        POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);
                        text = ex.getText();
                    } finally {
                        // revert() releases the package without writing
                        // anything back to the source file.
                        opcPackage.revert();
                    }
                } else if (suffix.equalsIgnoreCase("pdf")) {
                    PDFParser p = new PDFParser(in);
                    p.parse();
                    try {
                        PDFTextStripper ts = new PDFTextStripper();
                        text = ts.getText(p.getPDDocument());
                    } finally {
                        // Release the resources held by the parsed document.
                        p.getDocument().close();
                    }
                }
                return text;
            } finally {
                // Close the stream on every path, including parser failures.
                in.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } catch (XmlException e) {
            e.printStackTrace();
            return null;
        } catch (OpenXML4JException e) {
            e.printStackTrace();
            return null;
        }
    }
}
测试代码为
/**
 * Manual driver: runs {@code FileExtractor.wordExtractor} on "test.doc" and
 * prints the returned text to standard output.
 */
public class TestExtractor {

    /**
     * Extracts the text of "test.doc" and prints it.
     *
     * @param args unused
     * @throws IOException declared by the original signature
     */
    public static void main(String[] args) throws IOException {
        System.out.println(FileExtractor.wordExtractor("test.doc"));
    }
}
提取的文件大小为1.6M,并且不是纯文本,word中还有批注、目录等。
异常为
Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 204138
at org.apache.poi.util.LittleEndian.getShort(LittleEndian.java:46)
at org.apache.poi.hwpf.model.ListLevel.<init>(ListLevel.java:120)
at org.apache.poi.hwpf.model.ListFormatOverrideLevel.<init>(ListFormatOverrideLevel.java:48)
at org.apache.poi.hwpf.model.ListTables.<init>(ListTables.java:91)
at org.apache.poi.hwpf.HWPFDocument.<init>(HWPFDocument.java:236)
at org.apache.poi.hwpf.HWPFDocument.<init>(HWPFDocument.java:131)
at org.apache.poi.hwpf.extractor.WordExtractor.<init>(WordExtractor.java:61)
at org.apache.poi.hwpf.extractor.WordExtractor.<init>(WordExtractor.java:53)
at com.util.extract.FileExtractor.wordExtractor(FileExtractor.java:41)
at com.util.extract.TestExtractor.main(TestExtractor.java:17)
不明白是什么原因,猜测可能是因为word文件太大?