1、通过POI项目来索引。
public class POIWordDocHandler implements DocumentHandler{
public Document getDocument(InputStream is)throws DocumentHandlerException{
StringbodyText=null;
try{//提取文本字段,接收word文档的inputstream对象,允许把原文写入writer类,从wrtier类提取
WordDocument wd=new wordDocument(is);
StringWriterdocTextWriter=new StringWriter();
wd.writeAllText(bew PrintWriter(docTextWriter));
docTextWriter.close();
bodyText=docTextWriter.toString();
}
catch(Exception e){
throw new DocumentHandlerException("cannot extracttext from a word document",e);
}
if((bodyText!=null)&&(bodyText.trim().length()>0)){
Document doc=new Document();
doc.add(Fie
.UnStored("body",bodyText));return doc;
}
returnnull;
}
public static void main() throwsException{
POIWordDocHandler handler=new POIWordDocHan
er();Documentdoc=handler.getDocument(new FileInputStream(newFile(args[0])));
System.out.println(doc);
}
}
2、使用TextMining.org包的API来索引,支持从Word 6/95格式的文档中提取文本。
public class TextMiningWordDocHan
er implementsDocumentHan er{publicDocument getDocument throws DocumentHandlerException(){
String bodyText=null;
try{
bodyText=newWordExtractor().extractText(is);//从InputStream对象中提取文本
}
catch (Exception e){
throw new DocumentHandlerException("cannot extract text from a worddocument",e);
}
if((bodyText!=null)&&(bodyText.trim().length()>0)){
Document doc=new Document();
doc.add(Fie
.unStored("body",bodyText));return doc;
}
return null;
}
publicstatic void main(String[] args) throws Exception{
TextMiningWordDocHandler handler=newTextMiningWordDocHandler();
Document doc=handler.getDocument(new FileInputStream(newFile(args[0])));
System.out.println(doc);
}
}