<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.13</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.13</version>
</dependency>
/**
 * Builds the Excel workbook: creates a sheet with a header row and then fills it with
 * the content gathered by {@code getText}.
 *
 * @param srcPath path of the folder that holds the Word files
 * @return the workbook containing the header row and the rows added by {@code getText}
 */
public Workbook getData(String srcPath){
    XSSFWorkbook workbook = new XSSFWorkbook();
    Sheet infoSheet = workbook.createSheet("信息");

    // Row 0 is the header: one cell per column title, written left to right.
    Row headerRow = infoSheet.createRow(0);
    int column = 0;
    for (String title : new String[]{"文件名", "正文"}) {
        headerRow.createCell(column).setCellValue(title);
        column++;
    }

    // Data rows start directly below the header, at row index 1.
    getText(srcPath, infoSheet, 1);
    return workbook;
}
/**
 * Extracts the plain text of every Word document found under a folder (sub-folders
 * included) and appends one row per document to the sheet: column 0 receives the file
 * name, column 1 onwards receive the text.
 *
 * <p>A single Excel cell holds at most 32,767 characters, so longer text is split over
 * consecutive cells of the same row instead of being rejected when the cell is written.
 * Only the first text column is covered by the header written elsewhere.
 *
 * <p>Documents that cannot be parsed are reported on standard error and skipped; they
 * do not consume a row.
 *
 * @param srcPath folder that contains the Word files
 * @param sheet sheet that receives the rows
 * @param i index of the first row to write
 * @return index of the next free row after all documents have been written
 */
private int getText(String srcPath, Sheet sheet, int i) {
    // A single Tika facade is shared by the whole directory walk.
    return appendWordTexts(new File(srcPath), new Tika(), sheet, i);
}

/** Walks {@code dir} recursively, writes one row per Word document and returns the next free row index. */
private int appendWordTexts(File dir, Tika tika, Sheet sheet, int i) {
    File[] entries = dir.listFiles();
    // listFiles() returns null when dir is not a directory or cannot be read.
    if (entries == null) {
        return i;
    }
    for (File entry : entries) {
        try {
            if (entry.isDirectory()) {
                i = appendWordTexts(entry, tika, sheet, i);
                continue;
            }
            String fileName = entry.getName();
            // Names containing "~$" are skipped (presumably Word's temporary lock files).
            if (fileName.contains("~$") || !isWordFile(fileName)) {
                continue;
            }
            // Parse before creating the row so a failed document leaves no half-written row.
            String text = tika.parseToString(entry);
            writeRow(sheet.createRow(i), fileName, text);
            i++;
        } catch (Exception e) {
            // Best effort: report the file that failed and carry on with the rest.
            System.err.println("Skipping " + entry.getAbsolutePath() + ": " + e);
        }
    }
    return i;
}

/** Returns true when the file name ends with one of the handled suffixes, ignoring case. */
private boolean isWordFile(String fileName) {
    for (String suffix : new String[]{"doc", "docx", "wps"}) {
        int start = fileName.length() - suffix.length();
        if (start >= 0 && fileName.regionMatches(true, start, suffix, 0, suffix.length())) {
            return true;
        }
    }
    return false;
}

/** Writes the file name to column 0 and the text to column 1 onwards, at most 32,767 characters per cell. */
private void writeRow(Row row, String fileName, String text) {
    final int maxCellLength = 32767;
    row.createCell(0).setCellValue(fileName);
    int column = 1;
    int start = 0;
    // do/while so that an empty text still produces an (empty) cell in column 1.
    do {
        int end = Math.min(start + maxCellLength, text.length());
        // Never cut a surrogate pair in half at a chunk boundary.
        if (end < text.length() && Character.isHighSurrogate(text.charAt(end - 1))) {
            end--;
        }
        row.createCell(column).setCellValue(text.substring(start, end));
        column++;
        start = end;
    } while (start < text.length());
}
/**
 * Converts the Word files under {@code src} into a workbook and saves it to {@code des}.
 *
 * <p>Any failure while building or writing the workbook is printed with
 * {@code printStackTrace} and not propagated to the caller.
 *
 * @param src folder that contains the Word files
 * @param des full name of the file to write, e.g. test.xlsx
 */
public void saveLocal(String src, String des){
    // The workbook is built before the output file is opened, so a failure while
    // collecting the data does not leave an empty file behind. try-with-resources
    // closes the stream first and then the workbook, even when write() throws.
    try (Workbook workbook = getData(src);
         OutputStream outputStream = new FileOutputStream(des)) {
        workbook.write(outputStream);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
问题:
1、旧版本的word文件不能转换,比如word2003;
2、Excel 单元格最多只能存放 32,767 个字符,尝试合并单元格并不能解决这个问题。