package office;
/**
* 读取Doc,Excel,PDF,html,生成Txt文件,读取Txt生成Excel文件
* @author JavaAlpha
* @date 2011-8-1
* @version V 1.0
*/
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import javax.swing.text.BadLocationException;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import org.textmining.text.extraction.WordExtractor;
/**
 * Extracts plain text from Word (.doc), Excel (.xls), PDF, RTF, HTML and
 * text files, writes the extracted text to GB2312-encoded text files, and can
 * convert a fixed-layout text file into a two-column Excel sheet.
 *
 * <p>All output locations used by the static {@code read*} helpers are
 * hard-coded under {@code e:/}. Most helpers report failures on the console
 * instead of throwing.
 */
public class ReadOffice {

    /** Charset used for the text files written here and for HTML / line-oriented input. */
    private static final String TEXT_ENCODING = "GB2312";

    /**
     * Maximum number of rows {@link #createExcel(Map)} writes; further entries
     * are silently dropped.
     * NOTE(review): the .xls format allows far more rows; 256 looks like it was
     * taken from the column limit - confirm whether the cap is intentional.
     */
    private static final int MAX_EXCEL_ROWS = 256;

    /** Number of leading characters of a line that {@link #readFileByLine(String)} uses as the key. */
    private static final int KEY_WIDTH = 3;

    /** Index at which the value starts in a line parsed by {@link #readFileByLine(String)}. */
    private static final int VALUE_OFFSET = 4;

    /**
     * Demo entry point: strips the tags from {@code e:/1.html}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Other available demos:
        // readDoc("e:/1.doc");
        // readExcel("e:/1.xls");
        // readPDF("e:/1.pdf");
        // readHtml("e:/1.html");
        readHtmlAll("e:/1.html");
    }

    /**
     * Writes {@code text} to {@code path} as GB2312, replacing any existing file.
     *
     * <p>Never throws for I/O problems; failures are reported on the console.
     * When {@code text} or {@code path} is {@code null} (for example because
     * the preceding extraction failed) nothing is written and the target file
     * is left untouched.
     *
     * @param text content to write; may be {@code null}
     * @param path destination file; may be {@code null}
     */
    static void createTXTAndWriteDoc(String text, String path) {
        if (text == null || path == null) {
            System.out.println("出现异常: nothing to write (text=" + (text == null ? "null" : "present")
                    + ", path=" + path + ")");
            return;
        }
        FileOutputStream out = null;
        try {
            // Encode first so an encoding failure cannot leave a truncated file behind.
            byte[] bytes = text.getBytes(TEXT_ENCODING);
            // FileOutputStream truncates an existing file, so no explicit delete is needed.
            out = new FileOutputStream(new File(path));
            out.write(bytes);
            out.flush();
            // Close before announcing success so a failing close is reported as a failure.
            out.close();
            System.out.println("文件生成...");
        } catch (IOException e) {
            System.out.println("出现异常: " + e);
        } finally {
            // No-op when the stream was already closed above.
            closeQuietly(out);
        }
    }

    /**
     * Extracts the text of a Word document, prints it, and writes it to
     * {@code e:/doc.txt}.
     *
     * @param dir path of the .doc file
     */
    static void readDoc(String dir) {
        FileInputStream in = null;
        String text = null;
        try {
            in = new FileInputStream(new File(dir));
            text = new WordExtractor().extractText(in);
            System.out.println("text1:" + text);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        // text stays null when extraction failed; the writer skips null content.
        createTXTAndWriteDoc(text, "e:/doc.txt");
    }

    /**
     * Collects the numeric and string cell values of every sheet in an Excel
     * workbook, separated by spaces, and writes them to {@code e:/excel.txt}.
     *
     * @param dir path of the .xls file
     */
    @SuppressWarnings("deprecation")
    static void readExcel(String dir) {
        StringBuffer buff = new StringBuffer();
        FileInputStream in = null;
        try {
            in = new FileInputStream(dir);
            HSSFWorkbook wb = new HSSFWorkbook(in);
            for (int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets(); sheetIndex++) {
                HSSFSheet sheet = wb.getSheetAt(sheetIndex);
                if (sheet == null) {
                    continue;
                }
                for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
                    HSSFRow row = sheet.getRow(rowIndex);
                    if (row == null) {
                        continue;
                    }
                    appendRowText(row, buff);
                    // Extra separator after each existing row.
                    buff.append(' ');
                }
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        createTXTAndWriteDoc(buff.toString(), "e:/excel.txt");
    }

    /** Appends the numeric and string cells of {@code row} to {@code buff}, each followed by a space. */
    @SuppressWarnings("deprecation")
    private static void appendRowText(HSSFRow row, StringBuffer buff) {
        // The inclusive bound is kept from the original code; indexes without a
        // cell yield null from getCell and are skipped.
        for (int cellIndex = 0; cellIndex <= row.getLastCellNum(); cellIndex++) {
            HSSFCell cell = row.getCell((short) cellIndex);
            if (cell == null) {
                continue;
            }
            switch (cell.getCellType()) {
            case HSSFCell.CELL_TYPE_NUMERIC:
                buff.append(cell.getNumericCellValue()).append(' ');
                break;
            case HSSFCell.CELL_TYPE_STRING:
                buff.append(cell.getStringCellValue()).append(' ');
                break;
            default:
                // Formula cells and every other cell type are not exported.
                break;
            }
        }
    }

    /**
     * Placeholder for PowerPoint extraction; currently does nothing.
     *
     * @param dir path of the presentation file (unused)
     */
    static void readPPT(String dir) {
    }

    /**
     * Extracts the text of a PDF file and writes it to {@code e:/pdf.txt}.
     *
     * @param dir path of the PDF file
     */
    static void readPDF(String dir) {
        String result = null;
        FileInputStream is = null;
        PDDocument document = null;
        try {
            is = new FileInputStream(dir);
            PDFParser parser = new PDFParser(is);
            parser.parse();
            document = parser.getPDDocument();
            result = new PDFTextStripper().getText(document);
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } finally {
            closeQuietly(is);
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        // result stays null when extraction failed; the writer skips null content.
        createTXTAndWriteDoc(result, "e:/pdf.txt");
    }

    /**
     * Extracts the text of a PDF, prints it to standard output, and writes it
     * as GB2312 to a {@code .txt} file named after the PDF (the last four
     * characters of the name are replaced by {@code .txt}).
     *
     * <p>If {@code file} parses as a URL the output file is created in the
     * working directory using only the file name; otherwise it is created next
     * to the input path.
     *
     * @param file path (or URL string) of the PDF
     * @throws IllegalArgumentException if {@code file} is {@code null} or too
     *         short to derive an output file name from
     * @throws Exception if loading the PDF or writing the text fails
     */
    public void readPdf(String file) throws Exception {
        if (file == null) {
            throw new IllegalArgumentException("file must not be null");
        }
        final boolean sort = false;
        final int startPage = 1;
        final int endPage = Integer.MAX_VALUE;
        String textFile = null;
        Writer output = null;
        PDDocument document = null;
        try {
            try {
                // Try to interpret the argument as a URL first, only to derive the output name.
                // NOTE(review): the document itself is still loaded from the raw string,
                // exactly as before - confirm this works for real URLs.
                URL url = new URL(file);
                document = PDDocument.load(file);
                String fileName = url.getFile();
                if (fileName.length() > 4) {
                    textFile = new File(fileName.substring(0, fileName.length() - 4) + ".txt").getName();
                }
            } catch (MalformedURLException e) {
                // Not a URL: treat the argument as a local file path.
                document = PDDocument.load(file);
                if (file.length() > 4) {
                    textFile = file.substring(0, file.length() - 4) + ".txt";
                }
            }
            if (textFile == null) {
                // Previously surfaced as an opaque NullPointerException from FileOutputStream.
                throw new IllegalArgumentException("Cannot derive a .txt output name from: " + file);
            }
            output = new OutputStreamWriter(new FileOutputStream(textFile), TEXT_ENCODING);
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(sort);
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            // Echo the extracted text to the console, then write it to the file.
            System.out.print(stripper.getText(document));
            stripper.writeText(document, output);
        } finally {
            // Nested so the document is closed even when closing the writer fails.
            try {
                if (output != null) {
                    output.close();
                }
            } finally {
                if (document != null) {
                    document.close();
                }
            }
        }
    }

    /**
     * Reads a text file using the platform default charset and joins its lines,
     * appending a single space after every line.
     *
     * @param filePath path of the text file
     * @return the file content with line breaks replaced by spaces
     * @throws Exception if the file cannot be opened or read
     */
    public String getTextFromTxt(String filePath) throws Exception {
        BufferedReader br = new BufferedReader(new FileReader(filePath));
        try {
            StringBuffer buff = new StringBuffer();
            String line;
            while ((line = br.readLine()) != null) {
                buff.append(line).append(" ");
            }
            return buff.toString();
        } finally {
            // Close on every path so a read failure does not leak the file handle.
            br.close();
        }
    }

    /**
     * Reads the text content of an RTF file.
     *
     * @param filePath path of the RTF file
     * @return the extracted text, or {@code null} if reading failed
     */
    public String getTextFromRtf(String filePath) {
        String result = null;
        InputStream is = null;
        try {
            is = new FileInputStream(new File(filePath));
            DefaultStyledDocument styledDoc = new DefaultStyledDocument();
            new RTFEditorKit().read(is, styledDoc, 0);
            // Round-trip through ISO8859_1 bytes; per the original author this is
            // required to read Chinese text without garbling.
            result = new String(styledDoc.getText(0, styledDoc.getLength()).getBytes("ISO8859_1"));
        } catch (IOException e) {
            e.printStackTrace();
        } catch (BadLocationException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(is);
        }
        return result;
    }

    /**
     * Reads an HTML file as GB2312, concatenates its lines without line
     * breaks, writes the result to {@code e:/html.txt} and returns it.
     *
     * @param filePath path of the HTML file
     * @return the complete markup read so far (empty or partial if reading
     *         failed); never {@code null}
     */
    public static String readHtml(String filePath) {
        BufferedReader br = null;
        StringBuffer sb = new StringBuffer();
        try {
            br = new BufferedReader(new InputStreamReader(new FileInputStream(filePath), TEXT_ENCODING));
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } finally {
            closeQuietly(br);
        }
        String html = sb.toString();
        createTXTAndWriteDoc(html, "e:/html.txt");
        return html;
    }

    /**
     * Reads an HTML file via {@link #readHtml(String)}, keeps only the text
     * found between a {@code '>'} and the next {@code '<'}, and writes it to
     * {@code e:/htmlAll.txt}.
     *
     * @param filePath path of the HTML file
     */
    public static void readHtmlAll(String filePath) {
        String html = readHtml(filePath);
        createTXTAndWriteDoc(extractTextBetweenTags(html), "e:/htmlAll.txt");
    }

    /**
     * Returns the concatenation of every non-empty run of characters that lies
     * between a {@code '>'} and the following {@code '<'}.
     */
    private static String extractTextBetweenTags(String html) {
        StringBuffer buff = new StringBuffer();
        int from = 0;
        while (true) {
            int gt = html.indexOf('>', from);
            if (gt < 0) {
                // No further tag end. The old loop treated -1 as a valid index
                // and could spin forever here.
                break;
            }
            int lt = html.indexOf('<', gt + 1);
            if (lt < 0) {
                // Trailing text without a following tag is not "between" tags;
                // the old loop restarted from index 0 in this case.
                break;
            }
            if (lt - gt > 1) {
                buff.append(html.substring(gt + 1, lt));
            }
            from = lt + 1;
        }
        return buff.toString();
    }

    /**
     * Reads a GB2312 text file line by line, splits every non-empty line into
     * a key (first three characters) and a value (everything from index 4),
     * both trimmed, and passes the resulting map to {@link #createExcel(Map)}.
     * Later lines with the same key overwrite earlier ones.
     *
     * <p>Lines too short to contain a value are stored with an empty value
     * instead of aborting the whole run.
     *
     * @param filePath path of the text file
     */
    public static void readFileByLine(String filePath) {
        File file = new File(filePath);
        BufferedReader reader = null;
        Map<String, String> entries = new HashMap<String, String>();
        try {
            // Decoding as GB2312 is essential for the Chinese content.
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), TEXT_ENCODING));
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.length() == 0) {
                    continue;
                }
                // Bounds are clamped so short lines no longer throw
                // StringIndexOutOfBoundsException.
                String key = line.substring(0, Math.min(KEY_WIDTH, line.length())).trim();
                String value = line.length() > VALUE_OFFSET ? line.substring(VALUE_OFFSET).trim() : "";
                entries.put(key, value);
            }
            createExcel(entries);
        } catch (IOException e) {
            // Also covers FileNotFoundException.
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
        }
    }

    /**
     * Writes the map to {@code e:/2.xls} as one sheet with two string columns
     * (key, value), one row per entry, replacing any existing file. At most
     * {@link #MAX_EXCEL_ROWS} rows are written.
     *
     * <p>Never throws; failures are reported on the console.
     *
     * @param map entries to export
     */
    @SuppressWarnings("deprecation")
    static void createExcel(Map<String, String> map) {
        FileOutputStream fOut = null;
        try {
            HSSFWorkbook workbook = new HSSFWorkbook();
            HSSFSheet sheet = workbook.createSheet("联系人用户名和电话");
            int rowIndex = 0;
            for (Map.Entry<String, String> entry : map.entrySet()) {
                if (rowIndex >= MAX_EXCEL_ROWS) {
                    break;
                }
                HSSFRow row = sheet.createRow((short) rowIndex++);
                HSSFCell keyCell = row.createCell((short) 0);
                HSSFCell valueCell = row.createCell((short) 1);
                keyCell.setCellType(HSSFCell.CELL_TYPE_STRING);
                valueCell.setCellType(HSSFCell.CELL_TYPE_STRING);
                keyCell.setCellValue(entry.getKey() == null ? "" : entry.getKey());
                valueCell.setCellValue(entry.getValue() == null ? "" : entry.getValue());
            }
            // Open the target only once the workbook is complete. The stream
            // truncates an existing file, so the file must NOT be deleted after
            // opening: the old code did that and could lose the output entirely.
            fOut = new FileOutputStream("e:/2.xls");
            workbook.write(fOut);
            fOut.flush();
            // Close before announcing success so a failing close is reported as a failure.
            fOut.close();
            System.out.println("文件生成...");
        } catch (Exception e) {
            System.out.println("出现异常: " + e);
        } finally {
            // No-op when the stream was already closed above.
            closeQuietly(fOut);
        }
    }

    /** Closes {@code resource} if it is non-null, printing (not propagating) any I/O failure. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}