package com.linzl.cn.convert;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.sql.Timestamp;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.SAXException;
/**
 * Extracts the plain-text content of documents using Apache Tika.
 *
 * <p>Tika distributions can be downloaded from http://archive.apache.org/dist/tika/
 *
 * @author linzl
 */
public class GetPlainTextUtil {

    /** Directory prefix of the sample documents used by the no-argument methods. */
    private static final String BASE_DIR = "D:/测试目录/pureText/";

    /**
     * Extracts the text of the sample archive {@code openWindow.zip} via the {@link Tika} facade.
     *
     * @return the text returned by {@link Tika#parseToString(InputStream)}
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException declared for backward compatibility with existing callers
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseToString() throws IOException, SAXException, TikaException {
        return parseToString(new File(BASE_DIR + "openWindow.zip"));
    }

    /**
     * Extracts the text of the given file via the {@link Tika} facade.
     *
     * @param file the document to read; must not be {@code null}
     * @return the text returned by {@link Tika#parseToString(InputStream)}
     * @throws IOException if the file cannot be opened or read
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseToString(File file) throws IOException, TikaException {
        // try-with-resources closes the stream on both the normal and the exceptional path.
        try (InputStream stream = new FileInputStream(file)) {
            return new Tika().parseToString(stream);
        }
    }

    /**
     * Extracts the body text of the sample template {@code 2007.xltx} using
     * {@link AutoDetectParser}.
     *
     * <p>Other samples previously tried here: {@code Zip.zip}, {@code html.html},
     * {@code Java.java}, {@code Odt.odt}, {@code 2007.dotx}, {@code 2007.potx}. Pass any of
     * them to {@link #parseToPlainText(File)} instead of editing this method.
     *
     * @return the text collected by the {@link BodyContentHandler}
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException if the content handler rejects the parser's SAX events
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseToPlainText() throws IOException, SAXException, TikaException {
        return parseToPlainText(new File(BASE_DIR + "2007.xltx"));
    }

    /**
     * Extracts the body text of the given file using {@link AutoDetectParser}.
     *
     * @param file the document to read; must not be {@code null}
     * @return the text collected by the {@link BodyContentHandler}
     * @throws IOException if the file cannot be opened or read
     * @throws SAXException if the content handler rejects the parser's SAX events
     * @throws TikaException if Tika fails to parse the document
     */
    public String parseToPlainText(File file) throws IOException, SAXException, TikaException {
        // NOTE(review): the no-arg BodyContentHandler is documented by Tika as capping output
        // (100k characters) and raising SAXException beyond that - confirm this is acceptable
        // for the documents being processed.
        BodyContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        AutoDetectParser parser = new AutoDetectParser();
        try (InputStream stream = new FileInputStream(file)) {
            parser.parse(stream, handler, metadata);
            return handler.toString();
        }
    }

    /**
     * Demo entry point: extracts the default sample document, prints its text and then the
     * elapsed wall-clock time in milliseconds.
     *
     * @param args ignored
     * @throws IOException if the sample file cannot be opened or read
     * @throws SAXException if the content handler rejects the parser's SAX events
     * @throws TikaException if Tika fails to parse the document
     */
    public static void main(String[] args) throws IOException, SAXException, TikaException {
        long start = System.currentTimeMillis();
        // The extracted plain text contains a large number of line breaks and still needs
        // post-processing.
        String content = new GetPlainTextUtil().parseToPlainText();
        System.out.println(content);
        long end = System.currentTimeMillis();
        System.out.println("时间:" + (end - start));
    }
}
// Extracting plain text from documents in Java.
// Web-page residue: "latest recommended article published 2024-03-09 11:51:00".