Maven 坐标
<!-- Declares the Apache Tika artifact org.apache.tika:tika-server, version 1.20. -->
<!-- NOTE(review): the test code in this note only uses parser classes; tika-parsers may be the narrower artifact to depend on. Confirm before changing. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-server</artifactId>
<version>1.20</version>
</dependency>
测试代码
/**
 * Parses a hard-coded {@code .doc} file with {@link AutoDetectParser} and prints
 * every metadata name/value pair collected during the parse to standard output.
 *
 * <p>The input stream is opened in a try-with-resources block so the file handle
 * is released even when {@code parser.parse} throws.
 *
 * @throws IOException if the file cannot be opened or read
 * @throws TikaException if thrown by {@code parser.parse}
 * @throws org.xml.sax.SAXException if thrown by {@code parser.parse}
 */
@Test
public void parseToPlainText() throws IOException, TikaException, org.xml.sax.SAXException {
    BodyContentHandler handler = new BodyContentHandler();
    File file = new File("C:/Users/yunxun/Desktop/开发总纲.doc");

    // Record the file name in the metadata before parsing.
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());

    // Detect the file type automatically.
    Parser parser = new AutoDetectParser();

    // Register the parser in the context.
    // NOTE(review): presumably so embedded documents are handled by the same parser; confirm against the Tika docs.
    ParseContext content = new ParseContext();
    content.set(Parser.class, parser);

    // Close the stream deterministically; the original never closed it.
    try (InputStream stream = new FileInputStream(file)) {
        parser.parse(stream, handler, metadata, content);
    }

    // To also print the extracted body text, uncomment:
    // System.out.println("数据内容:\n" + handler.toString());
    System.out.println("元数据:");
    String[] metadataNames = metadata.names();
    for (String name : metadataNames) {
        System.out.println(name + ": " + metadata.get(name));
    }
}
结果: