Tika 提取正文通常不会乱码,但当正文内容特别少时(比如只有一个汉字)就会出现乱码。推测它的编码识别方法是基于一种根据正文内容进行计算的策略,所以当内容特别少时,编码计算容易失败。估计它是使用统计学和启发式方法对网页源码进行编码探测;ICU4J 就是基于这种(统计学与启发式)探测方式的类库,由 IBM 提供。Tika 支持的编码检测器有:HtmlEncodingDetector、UniversalEncodingDetector、Icu4jEncodingDetector。
package com.jiepu.tika_demo;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import org.apache.tika.Tika;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.Icu4jEncodingDetector;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
/**
 * Demo driver for Apache Tika.
 *
 * <p>{@link #main} walks a directory and, for every regular file, prints the
 * media type Tika derives from the file name, the charset guessed by
 * {@link Icu4jEncodingDetector}, and the text extracted by
 * {@link Tika#parseToString(File)}. The class also carries small helpers for
 * reading/writing text files in a given encoding and two parser experiments.
 */
public class App {
    /** Directory scanned by {@link #main} when no command-line argument is given. */
    private static final String DEFAULT_SAMPLE_DIR = "g:\\测试数据\\天猫\\";

    /**
     * Scans a directory and prints type, detected encoding and extracted text
     * for each regular file in it.
     *
     * <p>Other facade calls the author experimented with at this point:
     * {@code tika.detect(name)} on bare names such as {@code "x.html"} or
     * {@code "110.mp3"}, {@code tika.parseToString(File)} on a .docx, and
     * {@code tika.translate(text, targetLanguage)}.
     *
     * @param args optional; {@code args[0]} overrides the directory to scan
     * @throws Exception if Tika fails to detect or parse a file
     */
    public static void main(String[] args) throws Exception {
        Tika tika = new Tika();
        File dir = new File(args.length > 0 ? args[0] : DEFAULT_SAMPLE_DIR);

        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] files = dir.listFiles();
        if (files == null) {
            System.err.println("Not a readable directory: " + dir.getAbsolutePath());
            return;
        }

        // Alternatives: HtmlEncodingDetector, UniversalEncodingDetector.
        // Created once: the detector does not depend on the file being examined.
        EncodingDetector encodingDetector = new Icu4jEncodingDetector();

        for (File file : files) {
            // Sub-directories cannot be opened as streams; skip them.
            if (!file.isFile()) {
                continue;
            }
            System.out.print(file.getAbsolutePath() + " ");
            String type = tika.detect(file.getAbsolutePath());
            System.out.println(type);

            InputStream fileInputStream = new FileInputStream(file);
            try {
                // The detector needs mark/reset support, hence the BufferedInputStream.
                Charset encode = encodingDetector.detect(
                        new BufferedInputStream(fileInputStream), new Metadata());
                // The detector may yield no charset (e.g. for very short content).
                System.out.println(encode == null ? "encoding not detected" : encode.name());
            } finally {
                fileInputStream.close();
            }

            String content = tika.parseToString(file);
            if (content.isEmpty()) {
                System.out.println("content is empty");
            } else {
                System.out.println(content);
            }
        }
        System.out.println("Hello World!");
    }

    /**
     * Reads a whole text file, decoding it with the given charset.
     *
     * <p>Every line is terminated with {@code '\n'} in the result regardless of
     * the line separator used in the file. Failures are printed to stderr and
     * whatever was read up to that point (possibly the empty string) is returned.
     *
     * @param fileName path of the file to read
     * @param encoding charset name, e.g. {@code "gb2312"}
     * @return the decoded content, never {@code null}
     */
    public static String read(String fileName, String encoding) {
        StringBuilder text = new StringBuilder();
        InputStream raw = null;
        try {
            // Opened separately so it is still closed when the charset name is invalid.
            raw = new FileInputStream(fileName);
            BufferedReader in = new BufferedReader(new InputStreamReader(raw, encoding));
            String line;
            while ((line = in.readLine()) != null) {
                text.append(line).append('\n');
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            closeResource(raw);
        }
        return text.toString();
    }

    /**
     * Writes a string to a file using the given charset, replacing any
     * existing content. Failures are printed to stderr.
     *
     * @param fileName path of the file to write
     * @param encoding charset name used to encode {@code str}
     * @param str      text to write
     */
    public static void write(String fileName, String encoding, String str) {
        FileOutputStream raw = null;
        try {
            raw = new FileOutputStream(fileName);
            Writer out = new BufferedWriter(new OutputStreamWriter(raw, encoding));
            out.write(str);
            // Closed inside the try so that a failed flush is reported by the catch below.
            out.close();
        } catch (Exception ex) {
            ex.printStackTrace();
        } finally {
            // Releases the file handle when the write failed before out.close().
            closeResource(raw);
        }
    }

    /** Parser experiment: extracts and prints the body text of a sample .docx file. */
    private static void testdoc() {
        parseAndPrint("G:\\测试数据\\test-documents\\EmbeddedDocument.docx");
    }

    /** Parser experiment: lets AutoDetectParser handle a sample .m4a file. */
    private static void test001() {
        parseAndPrint("G:/测试数据/test-documents/testMP4.m4a");
    }

    /**
     * Parses one file with {@link AutoDetectParser} and prints the text
     * collected by a {@link BodyContentHandler}; errors go to stderr.
     */
    private static void parseAndPrint(String path) {
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        InputStream stream = null;
        try {
            stream = new FileInputStream(path);
            parser.parse(stream, handler, metadata, new ParseContext());
            System.out.println(handler.toString());
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeResource(stream);
        }
    }

    /** Closes a resource if it was opened, reporting (not propagating) close failures. */
    private static void closeResource(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception ex) {
            ex.printStackTrace();
        }
    }
}
package com.jiepu.tika_demo;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.txt.CharsetDetector;
import org.apache.tika.parser.txt.CharsetMatch;
import org.apache.tika.parser.txt.UniversalEncodingDetector;
/**
 * Obtains the character encoding of a document with Tika's
 * {@link CharsetDetector} (described by the original author as the
 * ICU4J-based detector).
 */
public class EncodeDetector {
    /** File examined by {@link #main} when no command-line argument is given. */
    private static final String DEFAULT_SAMPLE_FILE = "G:\\测试数据\\天猫\\002.txt";

    /**
     * Guesses the encoding of an in-memory document.
     *
     * @param data raw bytes of the document
     * @param url  not used by the detection; kept for existing callers
     * @return name of the best-matching charset, or {@code null} when the
     *         detector finds no match (likely for very short input)
     */
    public static String getEncode(byte[] data, String url) {
        CharsetDetector detector = new CharsetDetector();
        detector.setText(data);
        return reportBestMatch(detector);
    }

    /**
     * Guesses the encoding of a document supplied as a stream.
     *
     * <p>The detector relies on mark/reset, so a stream that does not support
     * it is wrapped in a {@link BufferedInputStream} first; in that case the
     * bytes consumed for detection are not pushed back into {@code data}.
     *
     * @param data stream positioned at the start of the document
     * @param url  not used by the detection; kept for existing callers
     * @return name of the best-matching charset, or {@code null} when the
     *         detector finds no match
     * @throws IOException if reading from the stream fails
     */
    public static String getEncode(InputStream data, String url) throws IOException {
        CharsetDetector detector = new CharsetDetector();
        InputStream markable = data.markSupported() ? data : new BufferedInputStream(data);
        detector.setText(markable);
        return reportBestMatch(detector);
    }

    /**
     * Runs the configured detector, prints the winning charset and returns
     * its name; returns {@code null} without printing when nothing matched.
     */
    private static String reportBestMatch(CharsetDetector detector) {
        CharsetMatch match = detector.detect();
        if (match == null) {
            return null;
        }
        String encoding = match.getName();
        System.out.println("The Content in " + encoding);
        // Header kept so console output stays unchanged; the per-candidate
        // listing (name + confidence from detectAll()) is intentionally disabled.
        System.out.println("All possibilities");
        return encoding;
    }

    /**
     * Prints the encoding of a sample file as seen by {@link CharsetDetector}
     * and by {@link UniversalEncodingDetector}.
     *
     * @param args optional; {@code args[0]} overrides the file to examine
     * @throws Exception if the file cannot be opened or read
     */
    public static void main(String[] args) throws Exception {
        File file = new File(args.length > 0 ? args[0] : DEFAULT_SAMPLE_FILE);

        InputStream first = new FileInputStream(file);
        try {
            String encode = getEncode(new BufferedInputStream(first), "");
            System.out.println(encode);
        } finally {
            first.close();
        }

        InputStream stream = new FileInputStream(file);
        try {
            EncodingDetector detector = new UniversalEncodingDetector();
            Charset charset = detector.detect(new BufferedInputStream(stream), new Metadata());
            // The detector may return no charset at all for undecidable input.
            System.out.println("编码:" + (charset == null ? "unknown" : charset.name()));
        } finally {
            stream.close();
        }
    }
}
http://www.cnblogs.com/chenying99/archive/2013/03/07/2947296.html
http://www.pipetips.com/names/2013/03/20/246647.html