package com.word;

import java.io.File;
import java.io.IOException;

import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;

/**
 * Word-to-PDF conversion utility that drives Microsoft Word through the JACOB COM bridge.
 *
 * <p>According to the original author's note: the conversion quality is good and page numbers
 * come out right; {@code jacob-1.20-x64.dll} (shipped in the {@code dll} folder) has to be copied
 * into the JDK's {@code bin} directory (the original note says "jdb/bin" - presumably a typo for
 * the JDK bin directory, TODO confirm); and the code is expected to run on Windows.
 *
 * @time 2021-11-17 14:18:33
 */
public class Word2PdfUtil {

    /** Value passed to Word's {@code Quit}: do not save pending changes. */
    static final int wdDoNotSaveChanges = 0;

    /** Value passed to Word's {@code SaveAs}: save in PDF format. */
    static final int wdFormatPDF = 17;

    /**
     * Command-line entry point.
     *
     * @param args optional {@code [source, target]} paths; when omitted the sample paths
     *             {@code C:\aa.docx} and {@code C:\bb.pdf} are used
     * @throws IOException declared for backward compatibility; not thrown by this implementation
     */
    public static void main(String[] args) throws IOException {
        String source1 = (args != null && args.length > 0) ? args[0] : "C:\\aa.docx";
        String target1 = (args != null && args.length > 1) ? args[1] : "C:\\bb.pdf";
        Word2PdfUtil.word2pdf(source1, target1);
    }

    /**
     * Converts a Word document to PDF by automating a hidden Word instance.
     *
     * <p>An existing file at {@code target} is deleted before saving. The document and the Word
     * application are always released, even when the conversion fails part-way.
     *
     * @param source path of the Word document
     * @param target path of the PDF to generate
     * @return {@code true} when the PDF was saved and the document closed; {@code false} when an
     *         argument is null/empty or any step raised an exception
     */
    public static boolean word2pdf(String source, String target) {
        System.out.println("Word转PDF开始启动...");
        // Reject unusable arguments before paying the cost of starting Word.
        if (source == null || source.isEmpty() || target == null || target.isEmpty()) {
            System.out.println("Word转PDF出错:" + "source/target 不能为空");
            return false;
        }

        long start = System.currentTimeMillis();
        ActiveXComponent app = null;
        Dispatch doc = null;
        try {
            app = new ActiveXComponent("Word.Application");
            app.setProperty("Visible", false);
            Dispatch docs = app.getProperty("Documents").toDispatch();

            System.out.println("打开文档:" + source);
            // Presumably ConfirmConversions=false, ReadOnly=true per Word's Documents.Open
            // parameter order - TODO confirm against the Word object model reference.
            doc = Dispatch.call(docs, "Open", source, false, true).toDispatch();

            System.out.println("转换文档到PDF:" + target);
            File tofile = new File(target);
            if (tofile.exists() && !tofile.delete()) {
                // Keep going: the save may still succeed, but make the stale file visible.
                System.out.println("无法删除已存在的目标文件:" + target);
            }

            Dispatch.call(doc, "SaveAs", target, wdFormatPDF);
            Dispatch.call(doc, "Close", false);
            doc = null; // closed normally; nothing left for the finally block to release

            long end = System.currentTimeMillis();
            System.out.println("转换完成,用时:" + (end - start) + "ms");
            return true;
        } catch (Exception e) {
            System.out.println("Word转PDF出错:" + e.getMessage());
            return false;
        } finally {
            closeQuietly(doc);
            quitQuietly(app);
        }
    }

    /** Closes a still-open document without saving; failures are reported but never propagated. */
    private static void closeQuietly(Dispatch doc) {
        if (doc == null) {
            return;
        }
        try {
            Dispatch.call(doc, "Close", false);
        } catch (RuntimeException e) {
            System.out.println("Word转PDF出错:" + e.getMessage());
        }
    }

    /** Quits Word without saving; a failure here must not override the conversion result. */
    private static void quitQuietly(ActiveXComponent app) {
        if (app == null) {
            return;
        }
        try {
            app.invoke("Quit", wdDoNotSaveChanges);
        } catch (RuntimeException e) {
            System.out.println("Word转PDF出错:" + e.getMessage());
        }
    }
}
提取pdf目录:
package com.pdf.search;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineNode;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Table-of-contents checker for PDF files.
 *
 * <p>Extracts the text of every page, collects lines that look like table-of-contents entries
 * ("title ....... page"), and asks {@code PdfKeywordFinder} whether each title really occurs on
 * the page the entry claims.
 */
public class PdfMulu {

    /** Separator placed between the title and the page field of a collected entry. */
    private static final String FIELD_SEPARATOR = "%%%%%%";

    /** Matches page text whose first line contains both characters of the TOC heading. */
    private static final Pattern TOC_HEADING = Pattern.compile("^(?=.*目)(?=.*录).+");

    /** A run of dot leaders separating a title from its page number. */
    private static final Pattern DOT_LEADER = Pattern.compile("\\.{7,1000}");

    /** Any common line terminator, so extraction works regardless of the platform separator. */
    private static final Pattern LINE_BREAK = Pattern.compile("\\r?\\n|\\r");

    /** Extra indentation added for each nesting level when printing bookmarks. */
    private static final String CHILD_INDENT = "    ";

    /**
     * Runs the table-of-contents check and prints every entry whose page number does not match.
     *
     * @param args optional {@code [pdfPath]}; defaults to the sample path used by the original code
     * @throws Exception declared for backward compatibility; failures are caught and printed
     */
    public static void main(String[] args) throws Exception {
        long old = System.currentTimeMillis();
        try {
            String srcFile = (args != null && args.length > 0) ? args[0] : "D://功能说明 (1).pdf";
            List<String> pages = readPDF(srcFile);
            List<String> titles = new ArrayList<>();

            boolean tocAnnounced = false;
            for (String page : pages) {
                Matcher m = TOC_HEADING.matcher(page);
                if (!tocAnnounced && m.find()) {
                    tocAnnounced = true;
                    System.out.println("开始目录检测:");
                }
                // Entries are collected from every page, exactly as before; the heading match
                // only controls the one-time announcement above.
                getTitles(page, titles);
            }

            // Read the whole file; a single InputStream.read call may return fewer bytes.
            byte[] pdfData = Files.readAllBytes(new File(srcFile).toPath());

            for (String title : titles) {
                String[] fields = title.split(FIELD_SEPARATOR);
                if (fields.length < 2) {
                    continue; // no page field to verify
                }
                String ptitle = fields[0].replaceAll("\\s*", "");
                String pnum = fields[1].trim();
                int pageNumber;
                try {
                    pageNumber = Integer.parseInt(pnum);
                } catch (NumberFormatException e) {
                    // A non-numeric page field must not abort the remaining checks.
                    System.out.println("目录:" + fields[0] + ";页码无法解析:" + pnum + ",已跳过。");
                    continue;
                }
                if (!PdfKeywordFinder.getPageNums(pdfData, ptitle, pageNumber)) {
                    System.out.println("目录:" + fields[0] + ";显示页码为:" + pageNumber + ",与实际不符。");
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        long now = System.currentTimeMillis();
        System.out.println("共耗时:" + ((now - old) / 1000.0) + "秒");
    }

    /**
     * Extracts the text of a PDF, one string per page, with text sorted by position.
     *
     * @param file path of the PDF file
     * @return page texts in page order (index 0 holds page 1)
     * @throws IOException if the file cannot be read or parsed
     */
    public static List<String> readPDF(String file) throws IOException {
        List<String> result = new ArrayList<>();
        try (FileInputStream is = new FileInputStream(file)) {
            PDFParser parser = new PDFParser(new RandomAccessBuffer(is));
            parser.parse();
            try (PDDocument doc = parser.getPDDocument()) {
                PDFTextStripper textStripper = new PDFTextStripper();
                textStripper.setSortByPosition(true); // read lines in visual order
                int pageCount = doc.getNumberOfPages();
                for (int i = 1; i <= pageCount; i++) {
                    textStripper.setStartPage(i);
                    textStripper.setEndPage(i);
                    result.add(textStripper.getText(doc));
                }
            }
        }
        return result;
    }

    /**
     * Collects table-of-contents entries from one page of text.
     *
     * <p>A line counts as an entry when a run of at least seven dots splits it into a leading
     * part and a following part. Each entry is appended as
     * {@code leadingPart + "%%%%%%" + followingPart}.
     *
     * @param content text of one page; {@code null} or empty yields no entries
     * @param titles  list to append to; a new list is created when {@code null}
     * @return the list the entries were appended to
     */
    public static List<String> getTitles(String content, List<String> titles) {
        List<String> collected = (titles != null) ? titles : new ArrayList<String>();
        if (content == null || content.isEmpty()) {
            return collected;
        }
        for (String rawLine : LINE_BREAK.split(content)) {
            // Trim first and index the same split that was length-checked, so a line ending in
            // dot leaders plus whitespace is skipped rather than failing.
            String[] parts = DOT_LEADER.split(rawLine.trim());
            if (parts.length > 1) {
                collected.add(parts[0] + FIELD_SEPARATOR + parts[1]);
            }
        }
        return collected;
    }

    /**
     * Prints the outline (bookmark) tree below {@code bookmark}, one title per line, followed by
     * {@code "------"} and the 1-based page number when the item resolves to a page.
     * Children are indented one level deeper than their parent.
     *
     * @param bookmark    outline node whose children are printed; {@code null} prints nothing
     * @param indentation prefix for the direct children; {@code null} is treated as empty
     * @throws IOException if PDFBox fails while resolving a destination or action
     */
    public void printBookmarks(PDOutlineNode bookmark, String indentation) throws IOException {
        if (bookmark == null) {
            return;
        }
        String prefix = (indentation == null) ? "" : indentation;
        PDOutlineItem current = bookmark.getFirstChild();
        while (current != null) {
            // retrievePageNumber() is 0-based and yields -1 when unknown, so -1 (not 0) must be
            // the "no page" sentinel; otherwise bookmarks to the first page lose their number.
            int pageIndex = -1;
            if (current.getDestination() instanceof PDPageDestination) {
                PDPageDestination pd = (PDPageDestination) current.getDestination();
                pageIndex = pd.retrievePageNumber();
            }
            if (current.getAction() instanceof PDActionGoTo) {
                PDActionGoTo gta = (PDActionGoTo) current.getAction();
                if (gta.getDestination() instanceof PDPageDestination) {
                    PDPageDestination pd = (PDPageDestination) gta.getDestination();
                    pageIndex = pd.retrievePageNumber();
                }
            }
            if (pageIndex < 0) {
                System.out.println(prefix + current.getTitle());
            } else {
                System.out.println(prefix + current.getTitle() + "------" + (pageIndex + 1));
            }
            printBookmarks(current, prefix + CHILD_INDENT);
            current = current.getNextSibling();
        }
    }

    /**
     * Prints the page number of a bookmark map and, recursively, of all its children.
     *
     * <p>The map is expected to carry the keys {@code "Action"}, {@code "Page"} and
     * {@code "Kids"}. Only bookmarks whose action is {@code "GoTo"} print a page number, but
     * children are visited for every bookmark.
     *
     * @param bookmark bookmark map; {@code null} is ignored
     * @throws NumberFormatException if a {@code "Page"} value does not start with an integer
     */
    public static void getPageNumbers(HashMap<String, Object> bookmark) {
        if (bookmark == null) {
            return;
        }
        if ("GoTo".equals(bookmark.get("Action"))) {
            String page = (String) bookmark.get("Page");
            if (page != null) {
                page = page.trim();
                // The page value may be followed by a space and further text; only the leading
                // number is parsed.
                int idx = page.indexOf(' ');
                if (idx < 0) {
                    System.out.println("pageNum :" + Integer.parseInt(page));
                } else {
                    System.out.println("pageNum:" + Integer.parseInt(page.substring(0, idx)));
                }
            }
        }
        // Descend regardless of the parent's action so children of non-GoTo bookmarks are not lost.
        Object kids = bookmark.get("Kids");
        if (!(kids instanceof List)) {
            return;
        }
        for (Iterator<?> it = ((List<?>) kids).iterator(); it.hasNext(); ) {
            Object kid = it.next();
            if (kid instanceof HashMap) {
                @SuppressWarnings("unchecked") // bookmark maps are keyed by String by convention
                HashMap<String, Object> child = (HashMap<String, Object>) kid;
                getPageNumbers(child);
            }
        }
    }
}
下载地址:使用 Aspose 实现 doc 转 pdf 或者 pdf 转 doc,含 license.xml 文件,还有 Word、PDF 目录检测等功能。—— Java 文档类资源(CSDN 下载)