package office;
/**
* 读取Doc,Excel,PDF,html,生成Txt文件,读取Txt生成Excel文件
* @author JavaAlpha
* @date 2011-8-1
* @version V 1.0
*/
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import org.textmining.text.extraction.WordExtractor;public class ReadOffice {
/**
 * Program entry point: runs the full-HTML reader against a fixed sample file.
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    // Alternative sample invocations that are currently disabled:
    //   readDoc("e:/1.doc");
    //   readExcel("e:/1.xls");
    //   readPDF("e:/1.pdf");
    //   readHtml("e:/1.html");
    final String sampleHtmlPath = "e:/1.html";
    readHtmlAll(sampleHtmlPath);
}
/**
 * Creates (or replaces) the text file at {@code path} and writes {@code text}
 * into it, encoded as GB2312.
 *
 * <p>This is a best-effort helper: any failure (a {@code null} argument, an
 * unsupported charset, an I/O error) is reported on standard output and is not
 * propagated to the caller. On success a confirmation line is printed.
 *
 * @param text the content to write
 * @param path location of the output file; an existing file there is deleted
 *             before the new content is written
 */
static void createTXTAndWriteDoc(String text, String path) {
    FileOutputStream out = null;
    try {
        // Encode before touching the file system, so that a null text or an
        // unsupported charset cannot wipe an existing file or leave an empty
        // one behind.
        byte[] encoded = text.getBytes("GB2312");

        File target = new File(path);
        // Remove any previous file so the output starts from a fresh file.
        if (target.exists()) {
            target.delete();
        }

        // Exactly one stream is opened on the target; a second handle on the
        // same file would only hold an extra, unused descriptor.
        out = new FileOutputStream(target);
        out.write(encoded);
        out.flush();
        System.out.println("文件生成...");
    } catch (Exception e) {
        // Kept broad on purpose: the method's contract is to report failures
        // on standard output and never propagate them.
        System.out.println("出现异常: " + e);
    } finally {
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
* 读取DOC文件
*
* @param dir
* @throws Exception
*/
static void readDoc(String dir) {
// 创建输入流读取doc文件