package com;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;
import com.jacob.com.Variant;
/**
 * Command-line helpers for working with Microsoft Word documents.
 *
 * <p>Two independent facilities live here:
 * <ul>
 *   <li>plain-text extraction from {@code .doc} / {@code .docx} files via Apache POI
 *       ({@link #extractTextFromDOC(InputStream)}, {@link #extractTextFromDOC2007(String)});</li>
 *   <li>Word-to-HTML conversion by driving a local Word installation through JACOB/COM
 *       ({@link #wordToHtml(String, String)}, {@link #change(String, String)}).</li>
 * </ul>
 *
 * <p>The class keeps a static, ever-growing list of discovered files, so it is not
 * designed for concurrent use.
 */
public class Test {
    /** Sample path of a {@code .docx} document; not referenced by the code in this class. */
    public static String fileToRead2007 = "f:/tt.docx";

    /** Sample path of a {@code .doc} document; not referenced by the code in this class. */
    public static String fileToRead2003 = "f:/spring2.5 学习笔记.doc";

    /** File-format code handed to Word's {@code SaveAs} to request HTML output. */
    public static final int WORD_HTML = 8;

    /** Directory scanned by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_SCAN_DIR = "f:\\word";

    private static final String DOC_SUFFIX = ".doc";
    private static final String DOCX_SUFFIX = ".docx";

    /** Absolute paths collected by {@link #refreshFileList(String)}; entries are only ever appended. */
    private static final ArrayList<String> filelist = new ArrayList<String>();

    /**
     * Scans a directory tree and prints the plain text of every Word document found,
     * followed by the elapsed time in milliseconds.
     *
     * @param args optional; {@code args[0]} overrides the directory to scan
     *             (defaults to {@code f:\word})
     * @throws Exception if any document cannot be opened or parsed; processing stops at
     *                   the first failure
     */
    public static void main(String[] args) throws Exception {
        long startedAt = System.currentTimeMillis();
        String scanRoot = (args != null && args.length > 0) ? args[0] : DEFAULT_SCAN_DIR;
        refreshFileList(scanRoot);
        for (String path : filelist) {
            if (endsWithIgnoreCase(path, DOC_SUFFIX)) {
                // Binary Word 97-2003 document.
                System.out.println("filelist:" + path);
                FileInputStream in = new FileInputStream(path);
                try {
                    String wordText2003 = Test.extractTextFromDOC(in);
                    System.out.println("wordText2003=======" + wordText2003);
                } finally {
                    // Release the handle even when extraction fails.
                    in.close();
                }
            } else if (endsWithIgnoreCase(path, DOCX_SUFFIX)) {
                // OOXML Word 2007+ document.
                System.out.println("filelist:" + path);
                String wordText2007 = Test.extractTextFromDOC2007(path);
                System.out.println("wordText2007=======" + wordText2007);
            }
        }
        System.out.println(System.currentTimeMillis() - startedAt);
    }

    /**
     * Converts one Word document to HTML by automating Word through JACOB.
     *
     * <p>Failures during the conversion are printed to standard error and otherwise
     * ignored (best effort); the Word instance is always asked to quit afterwards.
     *
     * @param docfile  path of the document to open
     * @param htmlfile path the HTML output is saved to
     */
    public static void wordToHtml(String docfile, String htmlfile) {
        // Launches Word; done outside the try so a failed launch is not followed by a Quit call.
        ActiveXComponent app = new ActiveXComponent("Word.Application");
        try {
            // Keep the Word window hidden.
            app.setProperty("Visible", new Variant(false));
            Dispatch docs = app.getProperty("Documents").toDispatch();
            // Open arguments: file name, then two flags -- presumably ConfirmConversions=false
            // and ReadOnly=true per the Word object model (verify against the Word docs).
            Dispatch doc = Dispatch.invoke(
                    docs,
                    "Open",
                    Dispatch.Method,
                    new Object[] { docfile, new Variant(false), new Variant(true) },
                    new int[1]).toDispatch();
            Dispatch.invoke(
                    doc,
                    "SaveAs",
                    Dispatch.Method,
                    new Object[] { htmlfile, new Variant(WORD_HTML) },
                    new int[1]);
            // Close with "false" -- presumably SaveChanges=false.
            Dispatch.call(doc, "Close", new Variant(false));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            app.invoke("Quit", new Variant[] {});
        }
    }

    /**
     * Recursively converts every {@code .doc} file under a directory to HTML using
     * {@link #wordToHtml(String, String)}.
     *
     * <p>Does nothing beyond the initial log line when {@code paths} is not a readable
     * directory. All output files are written flat, regardless of how deep the source
     * document sits in the tree.
     *
     * @param paths     directory to scan
     * @param savepaths prefix prepended verbatim to each document's base name (the file
     *                  name without {@code .doc}) to form the output path; pass a
     *                  directory path ending in a separator to write into that directory
     */
    public static void change(String paths, String savepaths) {
        File d = new File(paths);
        System.out.println("d:" + d);
        // listFiles() yields null for a missing/unreadable directory or a non-directory.
        File[] lists = d.listFiles();
        if (lists == null) {
            return;
        }
        for (File entry : lists) {
            if (!entry.isFile()) {
                // Descend using the real child path so the separator is never lost.
                change(entry.getPath(), savepaths);
                continue;
            }
            String fileName = entry.getName();
            if (!endsWithIgnoreCase(fileName, DOC_SUFFIX)) {
                continue;
            }
            String baseName = fileName.substring(0, fileName.length() - DOC_SUFFIX.length());
            System.out.println("当前正在转换......");
            System.out.println(paths);
            System.out.println(baseName);
            // Absolute path: Word resolves relative names against its own working directory.
            wordToHtml(entry.getAbsolutePath(), savepaths + baseName);
            System.out.println("转化完毕!");
        }
    }

    /**
     * Extracts the plain text of a Word 2007+ ({@code .docx}) document.
     *
     * @param fileName path of the document
     * @return the text reported by POI's {@code XWPFWordExtractor}
     * @throws IOException        if the file cannot be read
     * @throws OpenXML4JException if the OOXML package is malformed
     * @throws XmlException       if the document XML cannot be parsed
     */
    public static String extractTextFromDOC2007(String fileName)
            throws IOException, OpenXML4JException, XmlException {
        OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
        try {
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            return extractor.getText();
        } finally {
            // revert() releases the file without writing anything back; close() would
            // save a package that was opened read/write.
            opcPackage.revert();
        }
    }

    /**
     * Extracts the plain text of a Word 97-2003 ({@code .doc}) document.
     *
     * <p>The stream is not closed here; the caller remains responsible for it.
     *
     * @param is stream positioned at the start of the document
     * @return the text reported by POI's {@code WordExtractor.getTextFromPieces()}
     * @throws IOException if the stream cannot be read
     */
    public static String extractTextFromDOC(InputStream is) throws IOException {
        WordExtractor extractor = new WordExtractor(is);
        return extractor.getTextFromPieces();
    }

    /**
     * Recursively appends the absolute path of every non-directory entry under
     * {@code strPath} to the internal file list.
     *
     * <p>Repeated calls accumulate; the list is never cleared. A path that is not a
     * readable directory is silently ignored.
     *
     * @param strPath directory to walk
     */
    public static void refreshFileList(String strPath) {
        File[] children = new File(strPath).listFiles();
        if (children == null) {
            return;
        }
        for (File child : children) {
            if (child.isDirectory()) {
                refreshFileList(child.getAbsolutePath());
            } else {
                filelist.add(child.getAbsolutePath());
            }
        }
    }

    /**
     * Case-insensitive {@code endsWith}; returns {@code false} when {@code text} is
     * shorter than {@code suffix}.
     */
    private static boolean endsWithIgnoreCase(String text, String suffix) {
        int offset = text.length() - suffix.length();
        // regionMatches returns false (rather than throwing) for a negative offset.
        return text.regionMatches(true, offset, suffix, 0, suffix.length());
    }
}
// Source article: "Traversing a folder with POI to read Word documents"
// Latest recommended article published 2022-10-21 22:52:46