一个可读取 Excel、HTML、PDF、TXT、Word 文件的工具类:
package com.topsoft.info.services;
/**
* Created by IntelliJ IDEA.
* User: * Date: 2009-10-9
* Time: 10:40:12
* To change this template use File | Settings | File Templates.
*/
import java.io.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.FormTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.util.PDFTextStripper;
public class FileParseDomainImpl implements FileParseDomain {
private static final Log log = LogFactory
.getLog(FileParseDomainImpl.class);
/**
* 读取Excel类型的文件
*
* @param filePath 文件路径
* @return 读取后返回的饿字符串
*/
public String readExcel(String filePath) {
String content = "";// 字符串
File file = new File(filePath);
if (!file.exists()) {
return "";
}
InputStream in = null;
try {
in = new FileInputStream(filePath);
HSSFWorkbook workbook = new HSSFWorkbook(in);
ExcelExtractor extractor = new ExcelExtractor(workbook);
extractor.setFormulasNotResults(false);
extractor.setIncludeSheetNames(false);
content = extractor.getText();
} catch (Exception ex) {
log.debug("读取excel文件出错" + ex.getMessage(), ex);
throw new RuntimeException("读取excel文件出错" + ex.getMessage());
// ex.printStackTrace();
} finally {
try {
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return content.trim();
}
/**
* 读取html类型的文件
*
* @param filePath 文件路径
* @return 读取后返回的字符串
*/
public String readHtml(String filePath) {
StringBuffer content = new StringBuffer("");
String line = null;// 行
try {
File file = new File(filePath);
if (!file.exists()) {
return "";
}
FileInputStream fis = null;
fis = new FileInputStream(file);
BufferedReader reader = new BufferedReader(new InputStreamReader(
fis, "gbk"));// 这里的字符编码要注意,要对上html头文件的一致,否则会出乱码
while ((line = reader.readLine()) != null) {
content.append(line + "\n");
}
reader.close();
} catch (Exception ex) {
log.debug("读取html或htm文件出错" + ex.getMessage(), ex);
throw new RuntimeException("读取html或htm文件出错" + ex.getMessage());
// ex.printStackTrace();
}
return content.toString();
}
/**
* 读取pdf类型的文件
*
* @param filePath 文件路径
* @return 读取后返回的字符串
*/
public String readPDF(String filePath) {
File file = new File(filePath);
if (!file.exists()) {
return "";
}
StringBuffer content = new StringBuffer("");// 文档内容
FileInputStream fis = null;
try {
fis = new FileInputStream(filePath);
PDFParser p = new PDFParser(fis);
p.parse();
PDFTextStripper ts = new PDFTextStripper();
content.append(ts.getText(p.getPDDocument()));
} catch (Exception ex) {
log.debug("读取pdf文件出错" + ex.getMessage(), ex);
throw new RuntimeException("读取pdf文件出错" + ex.getMessage());
// ex.printStackTrace();
} finally {
try {
fis.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return content.toString();
}
/**
* 读取文本类型的文件
*
* @param filePath 文件路径
* @return 读取后的字符串
*/
public String readTxt(String filePath) {
File file = new File(filePath);
if (!file.exists()) {
return "";
}
StringBuffer content = new StringBuffer("");// 文档内容
String s1 = null;
try {
FileReader reader = new FileReader(filePath);
BufferedReader br = new BufferedReader(reader);
while ((s1 = br.readLine()) != null) {
content.append(s1 + "\r");
}
br.close();
reader.close();
} catch (Exception e) {
log.debug("读取文本文件出错" + e.getMessage(), e);
throw new RuntimeException("读取文本文件出错" + e.getMessage());
// e.printStackTrace();
}
return content.toString().trim();
}
/**
* 读取word类型的文件
*
* @param filePath 文件路径
* @return 读取后的字符串
*/
public String readWord(String filePath) {
File file = new File(filePath);
if (!file.exists()) {
return "";
}
StringBuffer text = new StringBuffer("");
FileInputStream in = null;
try {
in = new FileInputStream(filePath);
HWPFDocument doc = new HWPFDocument(in);
Range range = doc.getRange();
int paragraphCount = range.numParagraphs();// 段落
for (int i = 0; i < paragraphCount; i++) {// 遍历段落读取数据
Paragraph pp = range.getParagraph(i);
text.append(pp.text());
}
in.close();
} catch (Exception ex) {
log.debug("读取word文件出错" + ex.getMessage(), ex);
throw new RuntimeException("读取word文件出错" + ex.getMessage());
// ex.printStackTrace();
} finally {
try {
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
String str = text.toString();
str = str.replaceAll(".*?", "");
str = str.replaceAll("\\d*?", "");
str = str.replaceAll("[]", "");
// str = str.replaceAll("\\r{2}\\r+", "\\r");
// System.out.println(str);
return str;
}
/**
* 读取html类温江并过滤标签
*/
public String readHtmlText(String filePath) {
File file = new File(filePath);
if (!file.exists()) {
return "";
}
String htmlText = readHtml(filePath);
Parser myParser;
NodeList nodeList = null;
StringBuffer result = new StringBuffer();
myParser = Parser.createParser(htmlText, "GBK");
NodeFilter textFilter = new NodeClassFilter(TextNode.class);
NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
NodeFilter metaFilter = new NodeClassFilter(MetaTag.class);
NodeFilter styleFilter = new NodeClassFilter(StyleTag.class);
NodeFilter divFilter = new NodeClassFilter(Div.class);
NodeFilter formFilter = new NodeClassFilter(FormTag.class);
OrFilter lastFilter = new OrFilter();
lastFilter.setPredicates(new NodeFilter[]{textFilter, linkFilter,
metaFilter, styleFilter, divFilter, formFilter});
try {
nodeList = myParser.parse(lastFilter);
} catch (ParserException e) {
log.debug("html过滤出错(readHtmlText)" + e.getMessage(), e);
throw new RuntimeException("html过滤出错(readHtmlText)"
+ e.getMessage());
// log.info("html过滤出错(readHtmlText)");
// e.printStackTrace(); //To change body of catch statement use File
// | Settings | File Templates.
}
Node[] nodes = nodeList.toNodeArray();
String line = "";
for (int i = 0; i < nodes.length; i++) {
Node anode = nodes[i];
if (anode instanceof TextNode) {
TextNode textnode = (TextNode) anode;
line = textnode.getText();
} else if (anode instanceof LinkTag) {
LinkTag linknode = (LinkTag) anode;
line = linknode.getLink();
}
if (isTrimEmpty(line))
continue;
result.append(line);
}
return result.toString().replaceAll(" ", "");
}
// public String officeToHtml(String paths, String savepaths) {
// File d = new File(paths);
// String filename = d.getName();
// // 判断是否为doc文件
// File s = new File(savepaths + "\\temp");
// boolean fl = s.exists();
// if (!fl) {
// s.mkdir();
// }
// String tpFile = savepaths + "\\temp\\" + filename.substring(0,
// (filename.length() - 4)) + ".html";
// boolean tpexist = new File(tpFile).exists();
// if ((paths.endsWith(".doc") || paths.endsWith(".xls")) && !tpexist) {
// String type = "";
// String property = "";
// Variant variant = null;
// if (paths.endsWith(".doc")) {
// type = "Word.Application";
// property = "Documents";
// variant = new Variant(8);
// } else if (paths.endsWith(".xls")) {
// type = "Excel.Application";
// property = "Workbooks";
// variant = new Variant(44);
// }
// // 打印当前目录路径
// ActiveXComponent app = new ActiveXComponent(type);
// System.out.println("启动word");
// // 要转换的word文件
// // HTML文件
// try {
// app.setProperty("Visible", new Variant(false));
// // 设置word不可见
// Dispatch docs = app.getProperty(property).toDispatch();
// Dispatch doc = Dispatch.invoke(docs, "Open", Dispatch.Method, new
// Object[]{paths, new Variant(false), new Variant(true)}, new
// int[1]).toDispatch();
// // 打开word文件
// Dispatch.invoke(doc, "SaveAs", Dispatch.Method, new Object[]{tpFile,
// variant}, new int[1]);
// // 作为html格式保存到临时文件
// Variant f = new Variant(false);
// Dispatch.call(doc, "Close", f);
// } catch (Exception e) {
// e.printStackTrace();
// } finally {
// app.invoke("Quit", new Variant[]{});
// }
// System.out.println("转化完毕!");
// }
// System.out.println("开始读取");
// FileParseImpl imp = new FileParseImpl();
// return imp.readHtml(tpFile);
// }
private boolean isTrimEmpty(String astr) {
return (null == astr) || (astr.length() == 0) || isBlank(astr.trim());
}
/**
* 字符串是否为空:null或者长度为0.
*
* @param astr 源字符串.
* @return boolean
*/
private static boolean isBlank(String astr) {
return (null == astr) || (astr.length() == 0);
}
}
所用到的包在附件中。