docx文档表格抽取(word)

最新推荐文章于 2024-08-06 04:31:47 发布

somnus9298

最新推荐文章于 2024-08-06 04:31:47 发布

阅读量1.4k

点赞数

分类专栏：文档解析 word poi 文章标签： word 文档解析表格提取

本文链接：https://blog.csdn.net/somnus9298/article/details/73920514

版权

文档解析同时被 3 个专栏收录

3 篇文章 0 订阅

订阅专栏

word

3 篇文章 0 订阅

订阅专栏

poi

2 篇文章 0 订阅

订阅专栏

1.table->word

/**
    * 获取表格数据
    * @param filePath 文档url    * @param picPath 图片存储地址
    * @throws Exception    */   public static List<String> getWordExcel2007(String filePath,String picPath,XWPFDocument doc) throws Exception {      //文件名      String fileName = filePath.substring(filePath.lastIndexOf("\\")+1, filePath.length()-5);      if (picPath != null && picPath.trim().length() > 0) {         // 建立图片文件目录         File imgFile = new File(picPath);         if (!imgFile.exists()) {            imgFile.mkdir();         }      }      InputStream is = new FileInputStream(filePath);      List<String> list = new ArrayList<String>();      doc = new XWPFDocument(is);      XWPFParagraph[] paras = doc.getParagraphs().toArray(new XWPFParagraph[0]);      //创建一个表格      List<XWPFTable> tables = doc.getTables();      //System.out.println(tables.size());      XWPFDocument doc2 = new XWPFDocument();      doc2.createTable();      //设置页边距      setDocumentMargin(doc2,"1797", "1440", "1797", "1440");      OutputStream os = null;      String fileOutPath = null;      String picoutpath = null;      //将表格抽取并单独写成word      for (int i =0;i< tables.size();i++) {         //设置表格宽度         tables.get(i).setWidth(8000);         doc2.setTable(0, tables.get(i));         fileOutPath = picPath+File.separator+fileName+"_table"+ i +".docx";         picoutpath = picPath+File.separator+fileName+"pic"+ i +".png";         list.add(picoutpath);         os = new FileOutputStream(fileOutPath);         //写入文件         doc2.write(os);         //文档转换为图片         picPath = picPath.replace("\\", "/");         word2img(fileOutPath,picPath);      }      os.close();      return list;   }

2.word->pdf->图片

/**
 * Converts a single-table document to an image by delegating to {@code wordToPDF},
 * which is given a PDF target inside {@code picPath} named after the document.
 *
 * @param fileOutPath path of the table document
 * @param picPath     directory that receives the converted output
 * @throws Exception propagated from {@code wordToPDF}
 */
private static void word2img(String fileOutPath, String picPath) throws Exception {
    // Strip directory and extension; accept '\' as well as '/' so that mixed-separator
    // paths do not end up with the whole path embedded in the file name.
    int sep = Math.max(fileOutPath.lastIndexOf('\\'), fileOutPath.lastIndexOf('/'));
    int dot = fileOutPath.lastIndexOf('.');
    String fileName = fileOutPath.substring(sep + 1, dot > sep ? dot : fileOutPath.length());
    String toFileName = picPath + File.separator + fileName + ".pdf";
    wordToPDF(fileOutPath, toFileName);
}

 /**
  * Opens a document in Word through COM automation, saves it as PDF and then
  * renders up to 20 pages of that PDF into a PNG-named file next to it.
  *
  * <p>Failures are printed and swallowed; the method returns 1 in every case.
  *
  * @param sfileName  path of the document to open
  * @param toFileName path of the PDF to create; an existing file is deleted first
  * @return always 1
  * @throws Exception declared for callers; conversion errors themselves are caught here
  */
 public static int wordToPDF(String sfileName, String toFileName) throws Exception {
     System.out.println("启动Word...");
     long start = System.currentTimeMillis();
     ActiveXComponent app = null;
     Dispatch doc = null;
     try {
         app = new ActiveXComponent("Word.Application");
         // Keep the Word window hidden.
         app.setProperty("Visible", new Variant(false));
         Dispatch docs = app.getProperty("Documents").toDispatch();
         if (docs != null) {
             // NOTE(review): the two flags are presumably ConfirmConversions=false and
             // ReadOnly=true — confirm against the Word object model.
             doc = Dispatch.invoke(docs, "Open", Dispatch.Method,
                     new Object[] {sfileName, new Variant(false), new Variant(true)},
                     new int[1]).toDispatch();
             System.out.println("打开文档..." + sfileName);
             System.out.println("转换文档到PDF..." + toFileName);
             File tofile = new File(toFileName);
             if (tofile.exists()) {
                 tofile.delete();
             }
             // Format code 17 selects PDF output (8 = HTML, 7 = plain text).
             Dispatch.invoke(doc, "SaveAs", Dispatch.Method,
                     new Object[] {toFileName, new Variant(17)}, new int[1]);

             long end = System.currentTimeMillis();
             System.out.println("转换完成..用时：" + (end - start) + "ms.");
         }
         // Render the PDF to an image stored beside it.
         pdf2multiImage(toFileName, toFileName.substring(0, toFileName.length() - 4) + ".png", 20);
     } catch (Exception e) {
         e.printStackTrace();
         System.out.println("========Error:文档转换失败：" + e.getMessage());
     } catch (Throwable t) {
         t.printStackTrace();
     } finally {
         // Each cleanup step runs even if the previous one fails. Previously a null doc
         // made the Close call throw, which skipped Quit and ComThread.Release().
         try {
             if (doc != null) {
                 Dispatch.call(doc, "Close", false);
                 System.out.println("关闭文档");
             }
         } finally {
             try {
                 if (app != null) {
                     app.invoke("Quit", new Variant[] {});
                 }
             } finally {
                 // Without this call the winword.exe process is not terminated.
                 ComThread.Release();
             }
         }
     }
     return 1;
 }

考虑到页面显示效果，这里把多页 PDF 拼接成了一张长图；如果需要把每一页单独保存为一张图片，可以自行搜索相关实现。

/**
 * Renders the first pages of a PDF and stitches them into one image via {@code yPic}.
 *
 * <p>I/O errors are printed and swallowed.
 *
 * @param pdfFile path of the PDF to read
 * @param outpath path (including file name) of the image to write
 * @param maxPage maximum number of pages to render; if the PDF has fewer pages,
 *                all of its pages are rendered
 */
private static void pdf2multiImage(String pdfFile, String outpath, int maxPage) {
    PDDocument pdf = null;
    try (InputStream is = new FileInputStream(pdfFile)) {
        pdf = PDDocument.load(is, true);
        List<PDPage> pages = pdf.getDocumentCatalog().getAllPages();
        List<BufferedImage> piclist = new ArrayList<BufferedImage>();
        // Never ask for more pages than the document actually has.
        int pageCount = Math.min(maxPage, pages.size());
        for (int i = 0; i < pageCount; i++) {
            piclist.add(pages.get(i).convertToImage());
        }
        yPic(piclist, outpath);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // The parsed document holds its own resources and must be closed explicitly,
        // on the failure path as well.
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Stacks images vertically, in list order, into a single image and writes it to disk.
 *
 * <p>The width of the first image is used for the result; every image is read with
 * that width, so all images are expected to be equally wide. Heights may differ.
 * The output format follows the extension of {@code outPath} when an ImageIO writer
 * exists for it, and falls back to JPEG otherwise. Errors are printed and swallowed.
 *
 * @param piclist images to stack; a null or empty list only prints a message
 * @param outPath path of the image file to write
 */
public static void yPic(List<BufferedImage> piclist, String outPath) {
    if (piclist == null || piclist.isEmpty()) {
        System.out.println("图片数组为空!");
        return;
    }
    try {
        int width = piclist.get(0).getWidth();
        int totalHeight = 0;
        for (BufferedImage page : piclist) {
            totalHeight += page.getHeight();
        }

        BufferedImage stitched = new BufferedImage(width, totalHeight, BufferedImage.TYPE_INT_RGB);
        int offsetY = 0;
        for (BufferedImage page : piclist) {
            int pageHeight = page.getHeight();
            int[] rgb = page.getRGB(0, 0, width, pageHeight, null, 0, width);
            stitched.setRGB(0, offsetY, width, pageHeight, rgb, 0, width);
            // Advance by the height of the image just written, so the next one starts
            // directly below it even when the heights differ.
            offsetY += pageHeight;
        }

        // Pick the encoder from the file extension so the content matches the file name.
        String format = "jpg";
        int sep = Math.max(outPath.lastIndexOf('\\'), outPath.lastIndexOf('/'));
        int dot = outPath.lastIndexOf('.');
        if (dot > sep && dot < outPath.length() - 1) {
            String ext = outPath.substring(dot + 1).toLowerCase(java.util.Locale.ROOT);
            if (ImageIO.getImageWritersByFormatName(ext).hasNext()) {
                format = ext;
            }
        }
        File outFile = new File(outPath);
        if (!ImageIO.write(stitched, format, outFile) && !"jpg".equals(format)) {
            // No writer accepted this image type for the chosen format; keep the old behaviour.
            ImageIO.write(stitched, "jpg", outFile);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

3.内容处理

    /**
     * Builds an HTML-like text rendering of a .docx document: body paragraphs become
     * text followed by {@code </br>}, and each body-level table is replaced by an
     * {@code <img>} tag pointing at {@code picPath/<fileName>pic<n>.png}, where
     * {@code n} counts tables from 0 in document order. The table images themselves
     * are generated by calling {@code getWordExcel2007}.
     *
     * <p>NOTE(review): the visible conversion pipeline appears to write images named
     * {@code <name>_table<n>.png}; confirm that the {@code pic<n>} names used in the
     * tags exist.
     *
     * <p>Errors are printed and swallowed; in that case the map returned so far
     * (usually empty) is returned. The supplied stream is closed before returning.
     *
     * @param filePath path of the source .docx, passed on to {@code getWordExcel2007}
     * @param in       stream the document is read from; closed by this method
     * @param picPath  directory for the table images, also used in the image tags
     * @param fileName key of the result entry and name prefix of the table images
     * @return a map with one entry {@code fileName -> rendered text}, or an empty map
     *         if the document yields no content or processing fails
     * @throws Exception if closing the stream or the document fails
     */
    public static Map<String,Object> getWordDir2007(String filePath, InputStream in,String picPath,String fileName)
            throws Exception {
        Map<String, Object> map = new HashMap<String, Object>();
        InputStream is = in;
        XWPFDocument doc = null;
        try {
            doc = new XWPFDocument(is);
            List<String> list = new ArrayList<String>();

            // Walk the body in document order so each table's image tag lands where the
            // table was. The counter lives outside the loop so every table gets its own number.
            int tableNo = 0;
            for (IBodyElement element : doc.getBodyElements()) {
                BodyElementType type = element.getElementType();
                if (BodyElementType.TABLE.equals(type)) {
                    list.add("<img src='" + picPath + File.separator + fileName
                            + "pic" + tableNo + ".png' width='400' height='200' />");
                    tableNo++;
                } else if (BodyElementType.PARAGRAPH.equals(type)) {
                    String text = ((XWPFParagraph) element).getParagraphText() + "</br>";
                    // The first comma of each paragraph is dropped, as before.
                    list.add(text.replaceFirst(",", ""));
                }
            }

            // Generate the table documents and images; the returned paths are not needed here.
            getWordExcel2007(filePath, picPath, doc);

            if (!list.isEmpty()) {
                // List.toString() joins the pieces with ", "; strip the surrounding brackets.
                String str = list.toString();
                map.put(fileName, str.substring(1, str.length() - 1));
            }
        } catch (Exception e) {
            e.printStackTrace();
            return map;
        } finally {
            // Close the document even if closing the stream fails.
            try {
                if (is != null) {
                    is.close();
                }
            } finally {
                if (doc != null) {
                    doc.close();
                }
            }
        }
        return map;
    }

文档表格抽取用到了 jacob

<!-- word2pdf-->
<dependency>
    <groupId>net.sf.jacob-project</groupId>
    <artifactId>jacob</artifactId>
    <version>1.14.3</version>
</dependency>

此外，还需要把 jacob-1.14.3-x64.dll 放到 JDK 的 jre/bin 目录下；这个 DLL 在网上很容易搜索到并下载。