前言
需求:通过读取word文件来获取文档字数,便于实施业务。
一开始用Tika来做,但特殊字符解析得不太正确;换成POI来读取后也有些问题,
最后改用Jacob来读取解析,才得到预期的结果。在此记录一下,有不对的地方欢迎指正。
Tika的实现方式
引入依赖
<!-- Apache Tika core artifact; all three Tika artifacts below are pinned to the same version (1.20). -->
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-core -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.20</version>
</dependency>
<!-- Apache Tika parsers artifact. -->
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-parsers -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.20</version>
</dependency>
<!-- NOTE(review): tika-app is presumably the standalone bundle and may duplicate the classes of the two artifacts above; confirm it is really needed. -->
<!-- https://mvnrepository.com/artifact/org.apache.tika/tika-app -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-app</artifactId>
<version>1.20</version>
</dependency>
通过parseToString方法取文件内容
/**
 * Extracts the plain-text content of a document with Tika.
 *
 * @param file the document to read; must exist and end in .doc, .docx, .pdf or .txt
 * @return the extracted text, or an empty string when Tika fails to parse the file
 *         (the failure is logged, not rethrown)
 * @throws AppException if the file is null, does not exist, or has an unsupported suffix
 */
public static String extract(File file) {
    boolean missing = (file == null) || !file.exists();
    if (missing) {
        throw new AppException("文件不存在");
    }

    String ext = getSuffix(file);
    boolean supported = ".doc".equals(ext)
            || ".docx".equals(ext)
            || ".pdf".equals(ext)
            || ".txt".equals(ext);
    if (!supported) {
        throw new AppException("不支持的文件格式");
    }

    Tika parser = new Tika();
    String content = "";
    try {
        content = parser.parseToString(file);
    } catch (IOException | TikaException e) {
        // Best effort: report the problem and fall back to an empty result.
        logger.error("cannot extract file content ", e);
    }
    return content;
}
/**
 * Returns the lower-cased suffix (including the leading dot) of a regular file's name.
 *
 * @param file the file whose name is inspected
 * @return the suffix such as {@code ".docx"}, or {@code null} when {@code file} is not a
 *         regular file or its name contains no dot
 */
public static String getSuffix(File file) {
    if (!file.isFile()) {
        return null;
    }
    String cleanedName = Utils.trimilSpace(file.getName());
    int dotAt = cleanedName.lastIndexOf(".");
    if (dotAt < 0) {
        return null;
    }
    return cleanedName.substring(dotAt).toLowerCase();
}
得到文档通过Tika解析的字符串,来做统计处理
此实现方式参考 https://www.cnblogs.com/caer/p/6036408.html
/**
 * Estimates a word count for mixed Chinese / non-Chinese text.
 *
 * <p>Every character kept by the first pattern (CJK ideographs and the listed punctuation)
 * counts as one. Everything outside the second pattern's character set is blanked out, and
 * each remaining space-delimited, non-empty token counts as one.
 *
 * @param context the text to analyse
 * @return the sum of the two counts
 */
public static int getMSWordsCount(String context) {
    // Part 1: delete everything that is not in the Chinese set; survivors count one each.
    int chineseTotal = context
            .replaceAll("[^(\\u4e00-\\u9fa5,。《》?;’‘:“”【】、)(……¥!·)]", "")
            .length();

    // Part 2: turn everything outside the non-Chinese set into a space, then count tokens.
    String[] pieces = context
            .replaceAll("[^(a-zA-Z0-9`\\-=\';.,/~!@#$%^&*()_+|}{\":><?\\[\\])]", " ")
            .split(" ");
    int tokenTotal = 0;
    for (int i = 0; i < pieces.length; i++) {
        if (!pieces[i].trim().isEmpty()) {
            tokenTotal++;
        }
    }

    return chineseTotal + tokenTotal;
}
最终实现了word字数的统计,但结果与在office中打开文档时显示的字数有出入,不得不另谋途径(若文件内没有特殊属性,可以用此方法)。
POI读取方式
方式1
此方式参考 https://gitee.com/chunsw/codes/epjix5938htz2krgfo0ln29 ,感兴趣的同学请移步查看
maven依赖
<!-- Apache POI scratchpad; presumably the artifact that supplies org.apache.poi.hwpf.extractor.WordExtractor imported by the code below. -->
<!-- NOTE(review): the XWPFWordExtractor / OPCPackage classes used in 方式2 are likely not covered by this artifact (poi-ooxml?); confirm. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.14</version>
</dependency>
<!-- Apache Commons Lang 3; the code below imports org.apache.commons.lang3.ArrayUtils from it. -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.3.2</version>
</dependency>
import org.apache.commons.lang3.ArrayUtils;
import org.apache.poi.hwpf.extractor.WordExtractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
/**
 * Counts the characters and space-separated tokens of a legacy Word ({@code .doc}) document
 * whose paragraphs are read through POI's {@code WordExtractor}.
 */
public class CountDoc {

    /**
     * Counts characters and tokens in a {@code .doc} file.
     *
     * @param doc     path of the {@code .doc} file to read
     * @param isDebug when {@code true}, CR/LF characters are kept in the text and the cleaned
     *                text, the character count and the elapsed time are printed to stdout
     * @return {@code [character count, space-separated token count, elapsed milliseconds]}
     * @throws Exception if the file cannot be opened or parsed
     */
    static int[] wordCountNew(String doc, boolean isDebug) throws Exception {
        long time = System.currentTimeMillis();
        int cnt = 0;      // total number of characters left after cleaning
        int sumCount = 0; // total number of space-separated tokens
        StringBuilder builder = new StringBuilder();

        // Both the stream and the extractor are closed even when parsing fails.
        try (InputStream is = new FileInputStream(new File(doc));
             WordExtractor ex = new WordExtractor(is)) {
            for (String paragraph : ex.getParagraphText()) {
                String text = cleanParagraph(paragraph, isDebug);
                if (isDebug) {
                    builder.append(text);
                }
                cnt += text.length();
                sumCount += text.split(" ").length;
            }
        }

        int t = (int) (System.currentTimeMillis() - time);
        if (isDebug) {
            System.out.println(builder.toString());
            System.out.println(cnt);
            System.out.println(t + " ms");
        }
        return new int[] { cnt, sumCount, t };
    }

    /** Strips control characters, field codes and very long space runs from one paragraph. */
    private static String cleanParagraph(String text, boolean isDebug) {
        // In debug mode line breaks are kept so the printed text stays readable.
        if (isDebug) {
            text = trimAllChars(text, new char[] { '\u0007', '\f', '\b', '\u0015' });
        } else {
            text = trimAllChars(text, new char[] { '\u0007', '\f', '\b', '\u0015', '\r', '\n' });
        }

        // Drop the table-of-contents field prefix.
        String prefix = " TOC \\o \\u \u0014";
        if (text.startsWith(prefix)) {
            text = text.substring(prefix.length());
        }

        // Remove embedded-object fields, which run from U+0013 to the pair U+0014 U+0001
        // (for example " EMBED Visio.Drawing.11 " or " EMBED Word.Document.12 \s ").
        int start = text.indexOf("\u0013");
        int end = text.indexOf("\u0014\u0001");
        if (start >= 0 && end > start) {
            text = text.replaceAll("\u0013[^\u0014\u0001]+\u0014\u0001", "");
        }
        // Remove any remaining field code spanning U+0013 .. U+0014.
        text = text.replaceAll("\u0013[^\u0014\u0013]+\u0014", "");

        // A HYPERLINK field that is still present: keep the text before it plus the part
        // following the first " U+0014" separator. Without a separator there is nothing to
        // splice in, so the text is left untouched instead of failing on arr[1].
        int pos = text.indexOf("\u0013 HYPERLINK");
        if (pos >= 0) {
            String[] arr = text.split(" \u0014");
            if (arr.length > 1) {
                text = text.substring(0, pos) + arr[1];
            }
        }

        // Runs of 767 or more consecutive spaces are removed. The original author's note says
        // that, for the .doc format, such over-long runs are not included in the count.
        if (text.length() >= 767) {
            text = text.replaceAll(" {767,}", "");
        }
        return text;
    }

    /**
     * Returns {@code text} with every occurrence of the given characters removed.
     *
     * @param text  the input; {@code null} or empty input is returned unchanged
     * @param chars the characters to drop
     * @return the filtered text
     */
    private static String trimAllChars(String text, char[] chars) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        StringBuilder builder = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (!ArrayUtils.contains(chars, c)) {
                builder.append(c);
            }
        }
        return builder.toString();
    }
}
方式2
/**
 * Reads the text content of a Word file with POI.
 *
 * <p>Paths ending in {@code .doc} are read with {@code WordExtractor}; paths ending in
 * {@code .docx} are read with {@code XWPFWordExtractor}. Any other path prints a notice to
 * stdout. Failures are printed via {@code printStackTrace()} and not rethrown.
 *
 * @param path path of the Word file
 * @return the extracted text, or an empty string when the path is not a Word file or
 *         extraction fails
 */
public static String readWord(String path) {
    String buffer = "";
    try {
        if (path.endsWith(".doc")) {
            // try-with-resources closes the stream and the extractor even if getText() throws.
            try (InputStream is = new FileInputStream(new File(path));
                 WordExtractor ex = new WordExtractor(is)) {
                buffer = ex.getText();
            }
        } else if (path.endsWith(".docx")) {
            // Match the dot as well, consistent with the ".doc" branch above.
            OPCPackage opcPackage = POIXMLDocument.openPackage(path);
            // Only the extractor is closed here, exactly as the original code did.
            try (POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage)) {
                buffer = extractor.getText();
            }
        } else {
            System.out.println("此文件不是word文件!");
        }
    } catch (Exception e) {
        // Best effort: report the failure and return whatever was read so far.
        e.printStackTrace();
    }
    return buffer;
}
再用此方式统计解析 https://www.cnblogs.com/caer/p/6036408.html
以上两种方式的统计结果都与我的预期有些出入
Jacob方式(完全符合)
Jacob简介使用信息我就不在此阐述了,直接上实现方式
下载jar包
http://download.csdn.net/download/javashixiaofei/9696752
如上是jacob-1.17-M2.jar对应的jar包和dll文件…但是我在maven仓库中并没有发现jacob-1.17版本的.
所以如果使用maven项目的话推荐下载jacob-1.14版本的jar包和dll文件.
http://download.csdn.net/detail/ab873497717/5853741 这个是dll文件,jar包文件可以去Maven仓库中找。
重点
需要说明的一点:关于office里宏的问题,请参考下面两个链接
详细参考https://www.iteye.com/blog/men4661273-2097871
有关于 BuiltInDocumentProperties里面的属性 参考http://blog.sina.com.cn/s/blog_803215760102xjt3.html
更多宏元素属性https://docs.microsoft.com/zh-cn/office/vba/api/Word.Selection
/**
 * Counts the words of a document by automating Microsoft Word through JACOB.
 *
 * <p>The document is opened in an invisible Word instance, built-in document property 15
 * is read from the active document, and the document is closed without saving.
 *
 * @param path path of the document; its suffix must be .doc, .docx, .pdf or .txt
 * @return the value Word reports, or {@code null} when the automation fails
 * @throws AppException if the suffix is missing or unsupported
 */
public static Integer wordCount(String path) {
    ActiveXComponent wordCom = null;
    String suffix = getSuffix(path);
    if (!".doc".equals(suffix) && !".docx".equals(suffix) &&
            !".pdf".equals(suffix) && !".txt".equals(suffix)) {
        throw new AppException("不支持的文件格式");
    }
    try {
        // Create the ActiveX component for Word and keep its window hidden.
        wordCom = new ActiveXComponent("Word.Application");
        wordCom.setProperty("Visible", false);
        // "Documents" is the collection of documents opened in this Word instance.
        Dispatch wrdDocs = wordCom.getProperty("Documents").toDispatch();
        // Documents.Open(path, false, true, false).
        // NOTE(review): presumably ConfirmConversions=false, ReadOnly=true,
        // AddToRecentFiles=false per the Documents.Open parameter order; confirm.
        Dispatch wordDoc = Dispatch.call(wrdDocs, "Open", path, false, true, false).toDispatch();
        Dispatch activeDocument = Dispatch.get(wordCom, "ActiveDocument").toDispatch();
        // NOTE(review): 15 is presumably wdPropertyWords (the word count); confirm.
        int count = Dispatch.call(activeDocument, "BuiltInDocumentProperties", new Variant(15)).toInt();
        // Close the document without saving.
        Dispatch.call(wordDoc, "Close", new Variant(false));
        return count;
    } catch (Exception e) {
        // Pass the exception along so the cause is not lost from the log.
        logger.warn("Failed to convert '{}'.", path, e);
        // On failure the Word process is force-killed.
        // NOTE(review): the image-name pattern looks like it would match every running
        // WINWORD process on the machine, not only this one; confirm that is intended.
        try {
            Runtime.getRuntime().exec("taskkill /f /im WINWORD.exe*");
        } catch (IOException ie) {
            logger.warn("failed to kill winword ", ie);
        }
    } finally {
        if (wordCom != null) {
            try {
                // Ask the Word application object to quit.
                wordCom.invoke("Quit", new Variant[0]);
            } catch (Exception e) {
                logger.warn("Quit word app failed", e);
            }
        }
        ComThread.Release();
    }
    return null;
}
/**
 * Returns the lower-cased suffix (including the leading dot) of a file name or path.
 *
 * @param fileName the file name or path to inspect
 * @return the text from the last dot onwards, lower-cased and passed through
 *         {@code trimilSpace}
 * @throws AppException if {@code fileName} contains no dot
 */
public static String getSuffix(String fileName) {
    int dotAt = fileName.lastIndexOf(".");
    if (dotAt < 0) {
        throw new AppException("文件没有后缀");
    }
    String lowered = fileName.substring(dotAt).toLowerCase();
    return trimilSpace(lowered);
}