用 Tika 来解析 PDF、Word、Excel、TXT 和超链接

	<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
	<!-- Apache Tika parser bundle used by the test class below. -->
	<!-- NOTE(review): version 1.1 looks very old; confirm whether a newer release can be used. -->
	<!-- The tests rely on Metadata.TITLE / Metadata.TYPE, so verify those constants still exist before upgrading. -->
	<dependency>
	   <groupId>org.apache.tika</groupId>
       <artifactId>tika-parsers</artifactId>
       <version>1.1</version>
	</dependency>
	这是tika的解析包

下面是测试代码,都有效

package com.crsri.tika.tes;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.SAXException;
import com.crsri.TgdsmApplicationTests;
/**
 * Manual smoke tests showing how to extract text and metadata with Apache Tika.
 *
 * <p>Each test reads a hard-coded URL or local Windows path and prints whatever
 * Tika extracted. Nothing is asserted, so a test only fails when fetching or
 * parsing throws.
 *
 * @author liufei
 */
public class TikaTest extends TgdsmApplicationTests {

    /**
     * Extracts the text of a web page addressed by URL and prints it.
     *
     * @throws MalformedURLException if the URL literal cannot be parsed
     * @throws IOException if the page cannot be read
     * @throws TikaException if the content cannot be parsed
     */
    @Test
    public void tikaTest1() throws MalformedURLException, IOException, TikaException {
        Tika tika = new Tika();
        String text = tika.parseToString(new URL("https://www.baidu.com"));
        System.out.println(text);
    }

    /**
     * Extracts and prints the text of a Word (.docx) document.
     *
     * @throws IOException if the file cannot be read
     * @throws TikaException if the content cannot be parsed
     */
    @Test
    public void tikaTest2() throws IOException, TikaException {
        printExtractedText("D:\\caomao2.docx");
    }

    /**
     * Extracts and prints the text of an Excel (.xlsx) workbook.
     *
     * @throws IOException if the file cannot be read
     * @throws TikaException if the content cannot be parsed
     */
    @Test
    public void tikaTest3() throws IOException, TikaException {
        printExtractedText("D:\\工作簿1.xlsx");
    }

    /**
     * Extracts and prints the text of a plain-text (.txt) file.
     *
     * @throws IOException if the file cannot be read
     * @throws TikaException if the content cannot be parsed
     */
    @Test
    public void tikaTest4() throws IOException, TikaException {
        printExtractedText("D:\\base64.txt");
    }

    /**
     * Extracts and prints the text of a PDF document.
     *
     * @throws IOException if the file cannot be read
     * @throws TikaException if the content cannot be parsed
     */
    @Test
    public void tikaTest5() throws IOException, TikaException {
        printExtractedText("F:\\猫眼\\UML_Reference_Manual.pdf");
    }

    /**
     * Parses a document with the lower-level parser API and prints its title,
     * type and body text.
     *
     * @throws IOException if the file cannot be read
     * @throws SAXException if the content handler rejects the parse events
     * @throws TikaException if the content cannot be parsed
     */
    @Test
    public void test10() throws IOException, SAXException, TikaException {
        // Collects the body text of the document.
        // NOTE(review): the no-arg BodyContentHandler is documented to cap its
        // buffer (100,000 characters) and fail with a SAXException beyond that;
        // confirm for the Tika version in use if large files are expected.
        BodyContentHandler textHandler = new BodyContentHandler();
        // Receives metadata such as title and type during parsing.
        Metadata metadata = new Metadata();
        // Detects the document type itself, so the same code works for
        // Word, PDF, HTML and other inputs; here the input is a .docx file.
        AutoDetectParser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();

        // try-with-resources releases the file handle even when parse() throws;
        // previously close() was only reached on the success path.
        try (FileInputStream input = new FileInputStream(new File("D:\\窗前明月光.docx"))) {
            parser.parse(input, textHandler, metadata, context);
        }

        System.out.println("Title: " + metadata.get(Metadata.TITLE));
        System.out.println("Type: " + metadata.get(Metadata.TYPE));
        System.out.println("Body: " + textHandler.toString());
    }

    /**
     * Extracts text from a document supplied as an input stream and prints it.
     *
     * @throws IOException if the file cannot be read
     * @throws SAXException declared for signature compatibility with the original test
     * @throws TikaException if the content cannot be parsed
     */
    @Test
    public void test11() throws IOException, SAXException, TikaException {
        // Tika.parseToString(InputStream) is documented to close the stream it
        // is given; try-with-resources additionally covers a failure that
        // happens before that call. Closing a FileInputStream twice is harmless.
        try (FileInputStream input = new FileInputStream(new File("D:\\窗前明月光.docx"))) {
            Tika tika = new Tika();
            String text = tika.parseToString(input);
            // Print the result like the other tests; it used to be discarded.
            System.out.println(text);
        }
    }

    /**
     * Extracts the text of the file at {@code path} with the Tika facade and
     * prints it to standard output.
     *
     * @param path location of the document to parse
     * @throws IOException if the file cannot be read
     * @throws TikaException if the content cannot be parsed
     */
    private static void printExtractedText(String path) throws IOException, TikaException {
        Tika tika = new Tika();
        String text = tika.parseToString(new File(path));
        System.out.println(text);
    }

}

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值