Tika是2008年才产生的一个apache的项目,主要用于打开各种不同的文档,1.0
public class IndexUtil {
public void index() {
try {
File f = new File("d:/lucene/example2/MyBatis 3 User Guide Simplified Chinese.pdf");
Directory dir = FSDirectory.open(new File("d:/lucene/file2"));
IndexWriter writer = new IndexWriter(dir,new IndexWriterConfig(Version.LUCENE_35, new MMSegAnalyzer()));
writer.deleteAll();
Document doc = new Document();
doc.add(new Field("content",new Tika().parse(f) ));
writer.addDocument(doc);
writer.close();
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (LockObtainFailedException e) {
e.printStackTrace();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
public String tikaTool(File f) throws IOException, TikaException {
Tika tika = new Tika();
Metadata metadata = new Metadata();
metadata.set(Metadata.AUTHOR, "空号");
metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName());
String str = tika.parseToString(new FileInputStream(f),metadata);
for(String name:metadata.names() ) {
System.out.println(name+":"+metadata.get(name));
}
return str;
}
public String fileToTxt(File f) {
Parser parser = new AutoDetectParser();
InputStream is = null;
try {
Metadata metadata = new Metadata();
metadata.set(Metadata.AUTHOR, "空号");
metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName());
is = new FileInputStream(f);
ContentHandler handler = new BodyContentHandler();
ParseContext context = new ParseContext();
context.set(Parser.class,parser);
parser.parse(is,handler, metadata,context);
for(String name:metadata.names()) {
System.out.println(name+":"+metadata.get(name));
}
return handler.toString();
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (SAXException e) {
e.printStackTrace();
} catch (TikaException e) {
e.printStackTrace();
} finally {
try {
if(is!=null) is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return null;
}
}