<!-- Maven dependency on the Apache Tika parsers artifact used by the test code below.
     Coordinates: https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.1</version>
</dependency>
以上是 Tika 解析器(tika-parsers)的 Maven 依赖。
下面是测试代码,均已验证可以正常运行:
package com.crsri.tika.tes;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.SAXException;
import com.crsri.TgdsmApplicationTests;
/**
 * Exploratory tests showing how to extract text and metadata with Apache Tika.
 *
 * <p>Every test reads from a hard-coded URL or local Windows path, so the
 * referenced resources must exist on the machine running the tests.
 *
 * @author liufei
 */
public class TikaTest extends TgdsmApplicationTests {

    /**
     * Extracts the text content of a web page addressed by URL and prints it.
     *
     * @throws MalformedURLException if the URL literal cannot be parsed
     * @throws IOException if the page cannot be read
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void tikaTest1() throws MalformedURLException, IOException, TikaException {
        Tika tika = new Tika();
        String text = tika.parseToString(new URL("https://www.baidu.com"));
        System.out.println(text);
    }

    /**
     * Extracts the text of a Word (.docx) document and prints it.
     *
     * @throws IOException if the file cannot be read
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void tikaTest2() throws IOException, TikaException {
        Tika tika = new Tika();
        File file = new File("D:\\caomao2.docx");
        String text = tika.parseToString(file);
        System.out.println(text);
    }

    /**
     * Extracts the text of an Excel (.xlsx) workbook and prints it.
     *
     * @throws IOException if the file cannot be read
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void tikaTest3() throws IOException, TikaException {
        Tika tika = new Tika();
        File file = new File("D:\\工作簿1.xlsx");
        String text = tika.parseToString(file);
        System.out.println(text);
    }

    /**
     * Extracts the text of a plain-text (.txt) file and prints it.
     *
     * @throws IOException if the file cannot be read
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void tikaTest4() throws IOException, TikaException {
        Tika tika = new Tika();
        File file = new File("D:\\base64.txt");
        String text = tika.parseToString(file);
        System.out.println(text);
    }

    /**
     * Extracts the text of a PDF document and prints it.
     *
     * @throws IOException if the file cannot be read
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void tikaTest5() throws IOException, TikaException {
        Tika tika = new Tika();
        File file = new File("F:\\猫眼\\UML_Reference_Manual.pdf");
        String text = tika.parseToString(file);
        System.out.println(text);
    }

    /**
     * Uses the lower-level parser API to obtain the title, type and body text
     * of a document, then prints all three.
     *
     * @throws IOException if the file cannot be read
     * @throws SAXException if the content handler reports a SAX error
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void test10() throws IOException, SAXException, TikaException {
        // Collects the body text emitted by the parser.
        BodyContentHandler textHandler = new BodyContentHandler();
        // Receives document metadata such as author and title.
        Metadata metadata = new Metadata();
        // Detects the document type and delegates to a matching parser; the input
        // here is a .docx file, but other formats (pdf, word, html, ...) work too.
        AutoDetectParser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();

        // try-with-resources closes the stream even when parsing throws.
        try (FileInputStream input = new FileInputStream(new File("D:\\窗前明月光.docx"))) {
            parser.parse(input, textHandler, metadata, context);
        }

        System.out.println("Title: " + metadata.get(Metadata.TITLE));
        System.out.println("Type: " + metadata.get(Metadata.TYPE));
        System.out.println("Body: " + textHandler.toString());
    }

    /**
     * Extracts the text of a document supplied as an input stream and prints it.
     *
     * @throws IOException if the file cannot be read
     * @throws SAXException declared for signature compatibility with {@link #test10()}
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void test11() throws IOException, SAXException, TikaException {
        Tika tika = new Tika();
        // try-with-resources guarantees the stream is released on every path.
        try (FileInputStream input = new FileInputStream(new File("D:\\窗前明月光.docx"))) {
            String text = tika.parseToString(input);
            // Print the result like the other tests instead of silently discarding it.
            System.out.println(text);
        }
    }
}