import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
/**
 * Utility for extracting text from documents with Apache Tika.
 */
public class TikaUtil {
    /**
     * Parses the given file with an {@link AutoDetectParser} and returns the
     * text collected by a {@link BodyContentHandler}.
     *
     * @param f the file to parse
     * @return the text the content handler wrote while parsing
     * @throws Exception if the file cannot be opened, read or closed, or if
     *         parsing fails
     */
    public static String parseFile(File f) throws Exception {
        // The parser is chosen automatically from the document itself.
        // Format-specific alternatives noted by the original author:
        // PDFParser, HtmlParser, OOXMLParser (Office 2010) and
        // OfficeParser (Office 2003 and earlier).
        Parser parser = new AutoDetectParser();

        // Write straight into the StringWriter: it is already in memory, and an
        // intermediate BufferedWriter would only add a buffer that has to be
        // flushed before the result can be read.
        StringWriter stringWriter = new StringWriter();
        ContentHandler handler = new BodyContentHandler(stringWriter);

        InputStream iStream = new BufferedInputStream(new FileInputStream(f));
        try {
            parser.parse(iStream, handler, new Metadata(), new ParseContext());
        } finally {
            // Release the file handle whether or not parsing succeeded.
            iStream.close();
        }
        return stringWriter.toString();
    }
}
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
public class TikaUtil {
public static String parseFile(File f ) throws Exception{
// Parser parser = new PDFParser();
}
}