使用Aspose实现doc转pdf或者pdf转doc，同时支持目录检测

feitian633

已于 2022-07-28 16:48:31 修改

阅读量1.1k

点赞数

分类专栏： doc pdf 文章标签： java

于 2022-07-28 16:45:37 首次发布

本文链接：https://blog.csdn.net/feitian633/article/details/126038480

版权

doc 同时被 2 个专栏收录

1 篇文章 0 订阅

订阅专栏

pdf

1 篇文章 0 订阅

订阅专栏

package com.word;

import java.io.File;
import java.io.IOException;

import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;

/**
 * Converts a Word document to PDF by driving a locally installed Microsoft Word
 * instance through the JACOB COM bridge.
 *
 * <p>Notes carried over from the original author: the conversion quality is good and
 * page numbers come out correctly; the tool must run on Windows, and the
 * {@code jacob-1.20-x64.dll} shipped in the {@code dll} folder has to be copied into
 * the {@code bin} directory of the Java installation (the original note says
 * "jdb/bin", presumably meaning the JDK's {@code bin} directory).
 *
 * @time 2021-11-17 14:18:33
 */
public class Word2PdfUtil {

    /** Value passed to Word's {@code Quit} call: do not save pending changes. */
    static final int wdDoNotSaveChanges = 0;
    /** Value passed to Word's {@code SaveAs} call to select the PDF output format. */
    static final int wdFormatPDF = 17;

    /**
     * Sample entry point that converts {@code C:\aa.docx} to {@code C:\bb.pdf}.
     *
     * @param args ignored
     * @throws IOException declared by the original signature; not thrown by this body
     */
    public static void main(String[] args) throws IOException {
        String source1 = "C:\\aa.docx";
        String target1 = "C:\\bb.pdf";
        Word2PdfUtil.word2pdf(source1, target1);
    }

    /**
     * Opens {@code source} in Word and saves it as a PDF at {@code target}.
     *
     * <p>Progress and errors are reported on standard output. Failures are never
     * propagated as exceptions: every error, including one raised while shutting
     * Word down, is reported and reflected only in the return value or a log line.
     *
     * @param source
     *            path of the Word document to convert
     * @param target
     *            path of the PDF to produce; an existing file at this path is deleted first
     * @return {@code true} if the document was saved and closed without an exception,
     *         {@code false} otherwise
     */
    public static boolean word2pdf(String source, String target) {
        System.out.println("Word转PDF开始启动...");
        long start = System.currentTimeMillis();
        ActiveXComponent app = null;
        try {
            app = new ActiveXComponent("Word.Application");
            app.setProperty("Visible", false);
            Dispatch docs = app.getProperty("Documents").toDispatch();
            System.out.println("打开文档：" + source);
            // Positional arguments after the file name are presumably ConfirmConversions=false
            // and ReadOnly=true (per Word's Documents.Open) -- confirm against the Word object model.
            Dispatch doc = Dispatch.call(docs, "Open", source, false, true).toDispatch();
            System.out.println("转换文档到PDF：" + target);
            File tofile = new File(target);
            if (tofile.exists() && !tofile.delete()) {
                // Best effort: keep going, but say why the following SaveAs may fail.
                System.out.println("无法删除已存在的目标文件：" + target);
            }
            Dispatch.call(doc, "SaveAs", target, wdFormatPDF);
            Dispatch.call(doc, "Close", false);
            long end = System.currentTimeMillis();
            System.out.println("转换完成，用时：" + (end - start) + "ms");
            return true;
        } catch (Exception e) {
            System.out.println("Word转PDF出错：" + e.getMessage());
            return false;
        } finally {
            if (app != null) {
                try {
                    app.invoke("Quit", wdDoNotSaveChanges);
                } catch (Exception e) {
                    // A failing shutdown must not replace the boolean result with an exception.
                    System.out.println("关闭Word出错：" + e.getMessage());
                }
            }
        }
    }

}

提取pdf目录：

package com.pdf.search;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Checks the printed table of contents of a PDF against the document itself.
 *
 * <p>The text of every page is extracted with PDFBox, lines that look like
 * table-of-contents entries ("title ....... page") are collected, and each entry is
 * handed to {@code PdfKeywordFinder.getPageNums} together with the raw PDF bytes.
 * Entries for which that call returns {@code false} are reported on standard output.
 */
public class PdfMulu {

   /** Delimiter between the title part and the page part of an entry built by {@link #getTitles}. */
   private static final String ENTRY_SEPARATOR = "%%%%%%";

   /** PDF that is checked when no path is passed on the command line. */
   private static final String DEFAULT_SOURCE = "D://功能说明 (1).pdf";

   /**
    * Matches page text whose first line contains both 目 and 录. The pattern is compiled
    * without MULTILINE/DOTALL, so only the first line of the page is inspected.
    */
   private static final Pattern TOC_HEADING = Pattern.compile("^(?=.*目)(?=.*录).+");

   /** A line containing a run of at least seven commas or seven dots. */
   private static final Pattern TOC_LINE = Pattern.compile("(.*)(\\,{7,1000}|\\.{7,1000})(.*){1,1000}");

   /** The dot leader that separates the title from the page number. */
   private static final Pattern DOT_LEADER = Pattern.compile("(\\.{7,1000})");

   /** Any line break, so extracted text is split correctly whatever the platform line separator is. */
   private static final Pattern LINE_BREAK = Pattern.compile("\\r\\n|\\r|\\n");

   /**
    * Runs the table-of-contents check and prints the elapsed time.
    *
    * @param args optional; {@code args[0]} is the PDF to check, otherwise a built-in default path is used
    * @throws Exception declared by the original signature; failures are caught and printed here
    */
   public static void main(String[] args) throws Exception {
      long old = System.currentTimeMillis();
      try {
         String srcFile = (args != null && args.length > 0) ? args[0] : DEFAULT_SOURCE;
         List<String> titles = collectTitles(readPDF(srcFile));
         // Find out whether each table-of-contents entry really is on the page it names.
         // readAllBytes is used because a single InputStream.read(byte[]) call is not
         // guaranteed to fill the buffer.
         byte[] pdfData = java.nio.file.Files.readAllBytes(new File(srcFile).toPath());
         verifyTitles(titles, pdfData);
      } catch (Exception e) {
         e.printStackTrace();
      }
      long now = System.currentTimeMillis();
      System.out.println("共耗时：" + ((now - old) / 1000.0) + "秒");
   }

   /**
    * Gathers entry candidates from every page and announces the first page whose
    * first line looks like a table-of-contents heading.
    */
   private static List<String> collectTitles(List<String> pages) {
      List<String> titles = new ArrayList<>();
      boolean headingSeen = false;
      for (String page : pages) {
         if (!headingSeen && TOC_HEADING.matcher(page).find()) {
            headingSeen = true;
            System.out.println("开始目录检测：");
         }
         // Entries are collected from every page, not only from the heading page onwards.
         getTitles(page, titles);
      }
      return titles;
   }

   /**
    * Checks every collected entry against the PDF and reports mismatches. Entries that
    * have no page part or a non-numeric page are reported and skipped, so one
    * malformed line does not abort the whole run.
    */
   private static void verifyTitles(List<String> titles, byte[] pdfData) throws Exception {
      for (String title : titles) {
         String[] parts = title.split(ENTRY_SEPARATOR);
         if (parts.length < 2) {
            System.out.println("目录：" + title + "；缺少页码，已跳过。");
            continue;
         }
         String shownTitle = parts[0];
         String ptitle = shownTitle.replaceAll("\\s*", "");
         String pnum = parts[1].trim();
         int page;
         try {
            page = Integer.parseInt(pnum);
         } catch (NumberFormatException e) {
            System.out.println("目录：" + shownTitle + "；页码无法解析：" + pnum + "，已跳过。");
            continue;
         }
         // NOTE(review): getPageNums presumably tells whether the title text occurs on the
         // given page -- confirm against PdfKeywordFinder.
         if (!PdfKeywordFinder.getPageNums(pdfData, ptitle, page)) {
            System.out.println("目录：" + shownTitle + "；显示页码为：" + page + "，与实际不符。");
         }
      }
   }

   /**
    * Extracts the text of a PDF page by page.
    *
    * @param file path of the PDF file
    * @return one string per page, in page order, extracted with position sorting enabled
    * @throws IOException if the file cannot be read or parsed
    */
   public static List<String> readPDF(String file) throws IOException {
      List<String> result = new ArrayList<>();
      try (FileInputStream is = new FileInputStream(file)) {
         PDFParser parser = new PDFParser(new RandomAccessBuffer(is));
         parser.parse();
         try (PDDocument doc = parser.getPDDocument()) {
            PDFTextStripper textStripper = new PDFTextStripper();
            textStripper.setSortByPosition(true); // read the lines in visual order
            int pageCount = doc.getNumberOfPages();
            for (int i = 1; i <= pageCount; i++) {
               textStripper.setStartPage(i);
               textStripper.setEndPage(i);
               result.add(textStripper.getText(doc));
            }
         }
      }
      return result;
   }

   /**
    * Appends the table-of-contents entries found in {@code content} to {@code titles}.
    *
    * <p>A line counts as an entry when it contains a dot leader of at least seven dots
    * with text after it. Each entry is stored as the text before the leader, the
    * marker {@code %%%%%%}, and the text after the leader (the displayed page number,
    * not yet trimmed). Lines that only contain a comma run are matched but not stored.
    *
    * @param content text of one page; {@code null} or empty adds nothing
    * @param titles  list that receives the entries; must be non-null and modifiable
    * @return the same {@code titles} instance
    */
   public static List<String> getTitles(String content, List<String> titles) {
      if (content == null || content.isEmpty()) {
         return titles;
      }
      for (String line : LINE_BREAK.split(content)) {
         Matcher m = TOC_LINE.matcher(line);
         if (!m.find()) {
            continue;
         }
         // Split the same trimmed string that is indexed below, so a line that ends
         // right after the dot leader is ignored instead of causing an index error.
         String[] parts = DOT_LEADER.split(m.group().trim());
         if (parts.length > 1) {
            titles.add(parts[0] + ENTRY_SEPARATOR + parts[1]);
         }
      }
      return titles;
   }

   /**
    * Prints the outline (bookmark) tree below {@code bookmark}, one item per line,
    * indenting each nesting level by four more spaces.
    *
    * <p>When an item resolves to a page, either through its destination or through a
    * go-to action, the value of {@code retrievePageNumber()} is appended after
    * {@code ------}. Per the PDFBox documentation that value is a zero-based page
    * index and is negative when the page cannot be determined; in that case only the
    * title is printed.
    *
    * @param bookmark    outline node whose children are printed
    * @param indentation prefix printed before the titles of the direct children
    * @throws IOException if PDFBox fails while resolving a destination or action
    */
   public void printBookmarks(PDOutlineNode bookmark, String indentation) throws IOException {
      PDOutlineItem current = bookmark.getFirstChild();
      while (current != null) {
         int pageIndex = -1;
         if (current.getDestination() instanceof PDPageDestination) {
            PDPageDestination pd = (PDPageDestination) current.getDestination();
            pageIndex = pd.retrievePageNumber();
         }
         // A go-to action, when present, takes precedence over the plain destination.
         if (current.getAction() instanceof PDActionGoTo) {
            PDActionGoTo gta = (PDActionGoTo) current.getAction();
            if (gta.getDestination() instanceof PDPageDestination) {
               PDPageDestination pd = (PDPageDestination) gta.getDestination();
               pageIndex = pd.retrievePageNumber();
            }
         }
         if (pageIndex < 0) {
            System.out.println(indentation + current.getTitle());
         } else {
            System.out.println(indentation + current.getTitle() + "------" + pageIndex);
         }
         printBookmarks(current, indentation + "    ");
         current = current.getNextSibling();
      }
   }

   /**
    * Prints the page number of a bookmark map and, recursively, of its children.
    *
    * <p>Only maps whose {@code "Action"} entry equals {@code "GoTo"} are processed. The
    * page number is the leading integer of the {@code "Page"} entry, i.e. everything
    * before the first space. Children are read from the {@code "Kids"} entry.
    * NOTE(review): children of a bookmark that is not a "GoTo" bookmark are never
    * visited -- confirm that this is intended.
    *
    * @param bookmark bookmark map; {@code null} is ignored
    * @throws NumberFormatException if the leading part of the {@code "Page"} entry is not an integer
    */
   public static void getPageNumbers(HashMap<String, Object> bookmark) {
      if (bookmark == null) {
         return;
      }
      if (!"GoTo".equals(bookmark.get("Action"))) {
         return;
      }
      String page = (String) bookmark.get("Page");
      if (page != null) {
         page = page.trim();
         int idx = page.indexOf(' ');
         // The two messages differ by one space; both are kept exactly as they were.
         if (idx < 0) {
            System.out.println("pageNum :" + Integer.parseInt(page));
         } else {
            System.out.println("pageNum:" + Integer.parseInt(page.substring(0, idx)));
         }
      }
      @SuppressWarnings("unchecked")
      ArrayList<HashMap<String, Object>> kids = (ArrayList<HashMap<String, Object>>) bookmark.get("Kids");
      if (kids == null) {
         return;
      }
      for (HashMap<String, Object> kid : kids) {
         getPageNumbers(kid);
      }
   }
}

下载地址：使用Aspose实现doc转pdf或者pdf转doc，含license.xml文件，还有word、pdf目录检测等功能。-Java文档类资源-CSDN下载