参考来源
https://blog.csdn.net/majianxiong_lzu/article/details/90024608
案例补充:
/**
 * Demonstrates four Apache Tika features against a sample PDF: media-type
 * detection, plain-text extraction, charset detection and language
 * identification. Each result is printed to standard output.
 *
 * @throws Exception if the sample file cannot be read or Tika fails to parse it
 */
private static void test() throws Exception {
    String src = "E:\\src\\测试数据\\pdf\\02.pdf";
    File f = new File(src);
    Tika tika = new Tika(); // Tika facade with the default configuration

    // Detect the real media type. The File overload is used on purpose:
    // detect(String) only looks at the file NAME and never opens the file,
    // so a renamed or mislabeled file would be reported incorrectly.
    System.out.println("filetype:" + tika.detect(f));

    // Extract the document's text content.
    System.out.println("content:" + tika.parseToString(f));

    // Detect the character encoding. Both the stream and the reader are
    // declared as resources so the file handle is released even when the
    // AutoDetectReader constructor itself throws.
    try (FileInputStream in = new FileInputStream(f);
         AutoDetectReader dr = new AutoDetectReader(in)) {
        System.out.println("charset:" + dr.getCharset().name());
    }

    // Identify the language of a string with LanguageIdentifier.
    // NOTE(review): this class appears to be deprecated in later Tika 1.x
    // releases and absent from 2.x; confirm against the Tika version in use.
    LanguageIdentifier identifier = new LanguageIdentifier("this is English");
    System.out.println("language:" + identifier.getLanguage());
}