1.table->word
/**
* 获取表格数据
* @param filePath 文档url * @param picPath 图片存储地址
* @throws Exception */ public static List<String> getWordExcel2007(String filePath,String picPath,XWPFDocument doc) throws Exception { //文件名 String fileName = filePath.substring(filePath.lastIndexOf("\\")+1, filePath.length()-5); if (picPath != null && picPath.trim().length() > 0) { // 建立图片文件目录 File imgFile = new File(picPath); if (!imgFile.exists()) { imgFile.mkdir(); } } InputStream is = new FileInputStream(filePath); List<String> list = new ArrayList<String>(); doc = new XWPFDocument(is); XWPFParagraph[] paras = doc.getParagraphs().toArray(new XWPFParagraph[0]); //创建一个表格 List<XWPFTable> tables = doc.getTables(); //System.out.println(tables.size()); XWPFDocument doc2 = new XWPFDocument(); doc2.createTable(); //设置页边距 setDocumentMargin(doc2,"1797", "1440", "1797", "1440"); OutputStream os = null; String fileOutPath = null; String picoutpath = null; //将表格抽取并单独写成word for (int i =0;i< tables.size();i++) { //设置表格宽度 tables.get(i).setWidth(8000); doc2.setTable(0, tables.get(i)); fileOutPath = picPath+File.separator+fileName+"_table"+ i +".docx"; picoutpath = picPath+File.separator+fileName+"pic"+ i +".png"; list.add(picoutpath); os = new FileOutputStream(fileOutPath); //写入文件 doc2.write(os); //文档转换为图片 picPath = picPath.replace("\\", "/"); word2img(fileOutPath,picPath); } os.close(); return list; }
2.word->pdf->图片
/**
 * Derives the PDF target path for a table document and delegates the conversion to
 * {@code wordToPDF}.
 *
 * <p>The PDF is named after the document: everything following the last backslash of
 * {@code fileOutPath}, minus its last five characters, with ".pdf" appended, placed
 * inside {@code picPath}.
 *
 * @param fileOutPath path of the table document
 * @param picPath     directory in which the converted output is stored
 * @throws Exception if {@code wordToPDF} fails
 */
private static void word2img(String fileOutPath, String picPath) throws Exception {
    final int nameStart = fileOutPath.lastIndexOf("\\") + 1;
    final int nameEnd = fileOutPath.length() - 5;
    final String baseName = fileOutPath.substring(nameStart, nameEnd);
    wordToPDF(fileOutPath, picPath + File.separator + baseName + ".pdf");
}
/**
 * Converts a Word document to PDF through Word COM automation and then renders the PDF
 * into an image via {@code pdf2multiImage}. The image path is {@code toFileName} with its
 * last four characters replaced by ".png"; at most 20 pages are rendered.
 *
 * <p>Conversion errors are printed, not rethrown. Word and the COM thread are released
 * even when Word could not be started or the document could not be opened.
 *
 * @param sfileName  path of the Word document to convert
 * @param toFileName path of the PDF to create; an existing file is deleted first
 * @return always 1, including when the conversion failed
 * @throws Exception only if closing the document or quitting Word fails
 */
public static int wordToPDF(String sfileName, String toFileName) throws Exception {
    System.out.println("启动Word...");
    long start = System.currentTimeMillis();
    ActiveXComponent app = null;
    Dispatch doc = null;
    try {
        app = new ActiveXComponent("Word.Application");
        // Keep the Word window hidden.
        app.setProperty("Visible", new Variant(false));
        Dispatch docs = app.getProperty("Documents").toDispatch();
        if (docs != null) {
            // Open the source document.
            doc = Dispatch.invoke(docs, "Open", Dispatch.Method,
                    new Object[] {sfileName, new Variant(false), new Variant(true)},
                    new int[1]).toDispatch();
            System.out.println("打开文档..." + sfileName);
            System.out.println("转换文档到PDF..." + toFileName);
            File tofile = new File(toFileName);
            if (tofile.exists()) {
                tofile.delete();
            }
            // SaveAs format codes: 8 = Word to HTML, 7 = Word to text,
            // 44 = Excel to HTML, 17 = Word to PDF.
            Dispatch.invoke(doc, "SaveAs", Dispatch.Method,
                    new Object[] {toFileName, new Variant(17)}, new int[1]);
            long end = System.currentTimeMillis();
            System.out.println("转换完成..用时:" + (end - start) + "ms.");
        }
        // Render the PDF into a single image.
        pdf2multiImage(toFileName, toFileName.substring(0, toFileName.length() - 4) + ".png", 20);
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("========Error:文档转换失败:" + e.getMessage());
    } catch (Throwable t) {
        t.printStackTrace();
    } finally {
        // Each cleanup step runs even if the previous one throws; doc and app are null
        // when Word could not be started or the document could not be opened.
        try {
            if (doc != null) {
                Dispatch.call(doc, "Close", false);
                System.out.println("关闭文档");
            }
        } finally {
            try {
                if (app != null) {
                    app.invoke("Quit", new Variant[] {});
                }
            } finally {
                // Without this call the winword.exe process is not terminated.
                ComThread.Release();
            }
        }
    }
    return 1;
}
考虑到页面显示的问题,这里把多页的 PDF 拼接成了一张长图;如果想把每一页单独保存为一张图片,可以自行搜索相应的做法。
/**
 * Renders the first {@code maxPage} pages of a PDF and merges them into one image
 * through {@code yPic}.
 *
 * <p>I/O errors are printed and otherwise ignored. The PDF document and the underlying
 * file stream are always closed.
 *
 * @param pdfFile path of the PDF file
 * @param outpath path (including file name) of the image to write
 * @param maxPage maximum number of pages to render; when it exceeds the number of pages
 *                in the PDF, all pages are rendered
 */
private static void pdf2multiImage(String pdfFile, String outpath, int maxPage) {
    try (InputStream is = new FileInputStream(pdfFile)) {
        PDDocument pdf = PDDocument.load(is, true);
        try {
            List<PDPage> pages = pdf.getDocumentCatalog().getAllPages();
            // Never render more pages than the PDF actually has.
            int pageCount = Math.min(maxPage, pages.size());
            List<BufferedImage> piclist = new ArrayList<BufferedImage>();
            for (int i = 0; i < pageCount; i++) {
                piclist.add(pages.get(i).convertToImage());
            }
            yPic(piclist, outpath);
        } finally {
            // Close the document even when rendering fails.
            pdf.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
 * Stacks images vertically, in list order, into a single image and writes it to
 * {@code outPath}.
 *
 * <p>The result is as wide as the first image and as tall as the sum of all image heights.
 * Images are expected to share the same width: a wider image is cropped to the width of
 * the first one, a narrower one leaves the remaining columns black.
 *
 * <p>The image format follows the extension of {@code outPath} when ImageIO has a writer
 * for it (for example "png" or "jpg"); otherwise JPEG is written. Failures are printed and
 * not rethrown.
 *
 * @param piclist images to merge; when null or empty a message is printed and nothing is
 *                written
 * @param outPath path (including file name) of the merged image
 */
public static void yPic(List<BufferedImage> piclist, String outPath) {
    if (piclist == null || piclist.size() <= 0) {
        System.out.println("图片数组为空!");
        return;
    }
    try {
        int width = piclist.get(0).getWidth();
        int totalHeight = 0;
        for (BufferedImage page : piclist) {
            totalHeight += page.getHeight();
        }

        BufferedImage imageResult = new BufferedImage(width, totalHeight, BufferedImage.TYPE_INT_RGB);
        // Top edge of the next image = sum of the heights of all images placed before it.
        int offsetY = 0;
        for (BufferedImage page : piclist) {
            int pageHeight = page.getHeight();
            int copyWidth = Math.min(width, page.getWidth());
            int[] rgb = page.getRGB(0, 0, copyWidth, pageHeight, null, 0, copyWidth);
            imageResult.setRGB(0, offsetY, copyWidth, pageHeight, rgb, 0, copyWidth);
            offsetY += pageHeight;
        }

        // Write the format the file name promises; fall back to JPEG for unknown suffixes.
        String format = "jpg";
        int dot = outPath.lastIndexOf('.');
        if (dot >= 0 && dot < outPath.length() - 1) {
            String suffix = outPath.substring(dot + 1).toLowerCase(java.util.Locale.ROOT);
            if (ImageIO.getImageWritersByFormatName(suffix).hasNext()) {
                format = suffix;
            }
        }
        if (!ImageIO.write(imageResult, format, new File(outPath))) {
            System.out.println("图片写入失败:" + outPath);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
3.内容处理
/**
 * Reads a Word 2007 (.docx) document and builds an HTML-like string of its body content.
 *
 * <p>Body elements are processed in document order. Each paragraph contributes its text
 * (with the first "," removed) followed by {@code "</br>"}. Each table contributes an
 * {@code <img>} tag whose source is
 * {@code picPath + File.separator + fileName + "pic" + n + ".png"}, where {@code n} is the
 * zero-based index of the table. Entries are joined with ", ". The tables themselves are
 * exported by {@code getWordExcel2007}.
 *
 * <p>Failures are printed and yield an empty map. The input stream is always closed.
 *
 * @param filePath path of the document, passed on to {@code getWordExcel2007}
 * @param in       stream with the document content; closed by this method
 * @param picPath  directory of the table images
 * @param fileName key of the returned entry and name prefix of the table images
 * @return a map holding the generated string under {@code fileName}; empty when the
 *         document has no paragraphs or tables, or when processing failed
 * @throws Exception if closing the stream or the document fails
 */
public static Map<String,Object> getWordDir2007(String filePath, InputStream in, String picPath, String fileName)
        throws Exception {
    Map<String,Object> map = new HashMap<String, Object>();
    InputStream is = in;
    XWPFDocument doc = null;
    try {
        doc = new XWPFDocument(is);
        List<String> list = new ArrayList<String>();
        // Running table index, so that every table refers to its own image.
        int tableNo = 0;
        for (IBodyElement element : doc.getBodyElements()) {
            BodyElementType elementType = element.getElementType();
            if (BodyElementType.TABLE.equals(elementType)) {
                list.add("<img src='" + picPath + File.separator + fileName + "pic" + tableNo
                        + ".png' width='400' height='200' />");
                tableNo++;
            } else if (BodyElementType.PARAGRAPH.equals(elementType)) {
                String text = ((XWPFParagraph) element).getParagraphText() + "</br>";
                // Kept from the original: drop the first comma of every paragraph.
                list.add(text.replaceFirst(",", ""));
            }
        }
        // Export the tables as images.
        getWordExcel2007(filePath, picPath, doc);
        if (!list.isEmpty()) {
            // List.toString() yields "[a, b, c]"; strip the surrounding brackets.
            String str = list.toString();
            str = str.substring(1, str.length() - 1);
            map.put(fileName, str);
        }
    } catch (Exception e) {
        e.printStackTrace();
        return map;
    } finally {
        // Close the document even when closing the stream fails.
        try {
            if (is != null) {
                is.close();
            }
        } finally {
            if (doc != null) {
                doc.close();
            }
        }
    }
    return map;
}
文档表格抽取用到了 jacob,Maven 依赖如下:
<!-- word2pdf-->
<dependency>
<groupId>net.sf.jacob-project</groupId>
<artifactId>jacob</artifactId>
<version>1.14.3</version>
</dependency>
此外,还需要把 jacob-1.14.3-x64.dll 放到 jdk/jre/bin 目录下;这个 DLL 可以在网上搜索下载。