POI 遍历文件夹读取 Word 文档

package com;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;

import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;
import com.jacob.com.Variant;

/**
 * Walks a directory tree and either extracts plain text from Word documents
 * (through Apache POI) or converts them to HTML (by driving Word through JACOB).
 *
 * <p>All members are static. The list of discovered files is shared static state
 * and is not synchronized.
 */
public class Test {

    /** Sample .docx path; not referenced by any method of this class. */
    public static String fileToRead2007 = "f:/tt.docx";

    /** Sample .doc path; not referenced by any method of this class. */
    public static String fileToRead2003 = "f:/spring2.5 学习笔记.doc";

    /**
     * File-format argument handed to Word's {@code SaveAs} call.
     * NOTE(review): 8 is presumably {@code wdFormatHTML} in Word's WdSaveFormat
     * enumeration - confirm against the Word object model reference.
     */
    public static final int WORD_HTML = 8;

    /** Absolute paths collected by {@link #refreshFileList(String)}; entries accumulate across calls. */
    private static List<String> filelist = new ArrayList<String>();

    /**
     * Scans {@code f:\word} recursively, prints the text of every .doc and .docx
     * file found, then prints the elapsed time in milliseconds.
     *
     * @param args unused
     * @throws Exception if a document cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        long startMillis = System.currentTimeMillis();
        refreshFileList("f:\\word");

        for (String path : filelist) {
            if (hasExtension(path, "doc")) {
                // Word 2003 binary format.
                System.out.println("filelist:" + path);
                FileInputStream in = new FileInputStream(path);
                String wordText2003;
                try {
                    wordText2003 = Test.extractTextFromDOC(in);
                } finally {
                    // Release the file handle even when extraction fails.
                    in.close();
                }
                System.out.println("wordText2003=======" + wordText2003);
            } else if (hasExtension(path, "docx")) {
                // Word 2007+ OOXML format.
                System.out.println("filelist:" + path);
                String wordText2007 = Test.extractTextFromDOC2007(path);
                System.out.println("wordText2007=======" + wordText2007);
            }
        }
        System.out.println(System.currentTimeMillis() - startMillis);
    }

    /**
     * Converts one Word document to HTML by automating a hidden Word instance.
     *
     * <p>Best effort: any exception raised while opening, saving or closing the
     * document is printed and swallowed, and Word is asked to quit in all cases.
     *
     * @param docfile  path of the Word document to open
     * @param htmlfile path the HTML output is saved to
     */
    public static void wordToHtml(String docfile, String htmlfile) {
        // Start Word.
        ActiveXComponent app = new ActiveXComponent("Word.Application");

        try {
            // Keep the Word window hidden.
            app.setProperty("Visible", new Variant(false));
            Dispatch docs = app.getProperty("Documents").toDispatch();
            // NOTE(review): the two Variants presumably map to ConfirmConversions=false
            // and ReadOnly=true in Documents.Open - confirm against the Word API.
            Dispatch doc = Dispatch.invoke(
                    docs,
                    "Open",
                    Dispatch.Method,
                    new Object[] { docfile, new Variant(false), new Variant(true) },
                    new int[1]).toDispatch();
            Dispatch.invoke(doc, "SaveAs", Dispatch.Method,
                    new Object[] { htmlfile, new Variant(WORD_HTML) }, new int[1]);
            Dispatch.call(doc, "Close", new Variant(false));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            app.invoke("Quit", new Variant[] {});
        }
    }

    /**
     * Recursively converts every .doc file under a directory to HTML using
     * {@link #wordToHtml(String, String)}.
     *
     * <p>The extension match is case-insensitive and requires the dot, so names
     * such as {@code "mydoc"} or names shorter than the extension are skipped
     * instead of causing an exception. A directory that cannot be listed is
     * skipped silently.
     *
     * @param paths     directory to scan
     * @param savepaths prefix prepended verbatim to each document's base name to
     *                  form the output path; include a trailing separator when it
     *                  denotes a directory
     */
    public static void change(String paths, String savepaths) {
        File d = new File(paths);
        System.out.println("d:" + d);

        // listFiles() yields null when the path is not a directory or cannot be read.
        File[] entries = d.listFiles();
        if (entries == null) {
            return;
        }

        for (File entry : entries) {
            if (!entry.isFile()) {
                // Descend using the entry's own path so the separator between
                // parent and child is always present.
                change(entry.getPath(), savepaths);
                continue;
            }

            String fileName = entry.getName();
            if (!hasExtension(fileName, "doc")) {
                continue;
            }

            System.out.println("当前正在转换......");
            // Directory currently being processed.
            System.out.println(paths);
            // Document name without the ".doc" suffix.
            String baseName = fileName.substring(0, fileName.length() - 4);
            System.out.println(baseName);

            wordToHtml(entry.getAbsolutePath(), savepaths + baseName);
            System.out.println("转化完毕!");
        }
    }

    /**
     * Extracts the plain text of a Word 2007 (.docx) document.
     *
     * @param fileName path of the .docx file
     * @return the text returned by the POI extractor
     * @throws IOException        if the file cannot be read
     * @throws OpenXML4JException if the package is not a valid OOXML container
     * @throws XmlException       if the document XML cannot be parsed
     */
    public static String extractTextFromDOC2007(String fileName)
            throws IOException, OpenXML4JException, XmlException {
        OPCPackage opcPackage = POIXMLDocument.openPackage(fileName);
        try {
            POIXMLTextExtractor ex = new XWPFWordExtractor(opcPackage);
            return ex.getText();
        } finally {
            // revert() releases the package without saving; close() on a package
            // opened read/write would write it back to the source file.
            opcPackage.revert();
        }
    }

    /**
     * Extracts the plain text of a Word 2003 (.doc) document.
     *
     * <p>The stream is not closed here; the caller remains responsible for it.
     *
     * @param is input stream positioned at the start of the .doc file
     * @return the text returned by {@code WordExtractor.getTextFromPieces()}
     * @throws IOException if the stream cannot be read
     */
    public static String extractTextFromDOC(InputStream is) throws IOException {
        WordExtractor ex = new WordExtractor(is);
        return ex.getTextFromPieces();
    }

    /**
     * Recursively appends the absolute path of every non-directory entry under
     * {@code strPath} to the shared file list.
     *
     * <p>The list is never cleared, so repeated calls accumulate entries. A path
     * that cannot be listed contributes nothing.
     *
     * @param strPath directory to scan
     */
    public static void refreshFileList(String strPath) {
        File[] files = new File(strPath).listFiles();
        if (files == null) {
            return;
        }

        for (File file : files) {
            if (file.isDirectory()) {
                refreshFileList(file.getAbsolutePath());
            } else {
                filelist.add(file.getAbsolutePath());
            }
        }
    }

    /** Reports whether {@code name} ends with {@code "." + extension}, ignoring case. */
    private static boolean hasExtension(String name, String extension) {
        return name.toLowerCase(Locale.ROOT).endsWith("." + extension);
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值