使用 Java 进行 PDF 转 Word 实战

package com.ljl; /**
 * @author  
 * @Package_name PACKAGE_NAME
 * @Date 2019/4/1 0001 上午 8:30
 * @Description
 */

import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
//import org.apache.pdfbox.util.PDFTextStripper;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.awt.image.RenderedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

public class Pdf2word
{
    public static void main(String[] args)
    {
        try{
            String pdfFile = "E:/Hadoop2.pdf";
            PDDocument doc = PDDocument.load(new File(pdfFile));
            int pagenumber = doc.getNumberOfPages();
            pdfFile = pdfFile.substring(0, pdfFile.lastIndexOf("."));
            String fileName = pdfFile + "_new.doc";
            File file = new File(fileName);
            if (!file.exists())
            {
                file.createNewFile();
            }
            FileOutputStream fos = new FileOutputStream(fileName);
            Writer writer = new OutputStreamWriter(fos, "UTF-8");
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.setSortByPosition(true);// 排序
            stripper.setStartPage(1);// 设置转换的开始页
            stripper.setEndPage(pagenumber);// 设置转换的结束页
            stripper.writeText(doc, writer);
            writer.close();
            doc.close();
            System.out.println("pdf转换word成功!");
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        try {
            String pdfFile = "E:/Hadoop2.pdf";
            PDDocument doc = PDDocument.load(new File(pdfFile));
            List<PDImageXObject> imagelist=  getImageListFromPDF(doc,0);
            System.out.println("图片个数 : "+imagelist.size());

            for (int i=0;i<imagelist.size();i++){
                writeImageInputStream(imagelist.get(i));//写入文件系统
            }
        }catch (Exception e){
            e.printStackTrace();
        }
        
    }
     /**
       * 从pdf文档中读取所有的图片信息
       * 
       * @return
       * @throws Exception 
       */
        public static List<PDImageXObject> getImageListFromPDF(PDDocument document, Integer startPage) throws Exception {
          List<PDImageXObject> imageList = new ArrayList<PDImageXObject>();
          if(null != document){
           PDPageTree pages = document.getPages();
           startPage = startPage == null ? 0 : startPage;
           int len = pages.getCount();
              System.out.println("页数 "+len);
           if(startPage < len){
            for(int i=startPage;i<len;i++){
             PDPage page = pages.get(i);
             if(page!=null) {
                 if( page.getResources()!=null) {
                     Iterable<COSName> objectNames = page.getResources().getXObjectNames();

                     for (COSName imageObjectName : objectNames) {
                         if (page.getResources().isImageXObject(imageObjectName)) {
                             imageList.add((PDImageXObject) page.getResources().getXObject(imageObjectName));
                         }
                     }
                 }else {
                     System.out.println("当前页面没有图片 page.getResources() is null ");
                 }
             }else {
                 System.out.println("page is null ");
             }

            }
           }
          }
          return imageList;
         }
    /**
       * 读取图片文件流信息
       * @param image
       * @return
       * @throws Exception 
       */
         public static InputStream getImageInputStream(PDImageXObject image) throws Exception
         {
         if(null!=image && null!= image.getImage())
          {
           BufferedImage bufferImage = image.getImage();
           ByteArrayOutputStream os = new ByteArrayOutputStream();
           ImageIO.write(bufferImage, image.getSuffix(), os);
           return new ByteArrayInputStream(os.toByteArray());
          }
            return null;
         }

    /**
     * 写入文件系统
     * @param image
     * @throws Exception
     */
    public static void writeImageInputStream(PDImageXObject image) throws Exception
    {
        if(null!=image && null!= image.getImage()) {
            //粗略写入到文件系统
            Date date=new Date();
            String name = date.getTime()+"_image" ;
            File imgFile = new File("E:/images/" + name + "." + image.getSuffix());//写入的地址
            FileOutputStream fout = new FileOutputStream(imgFile);
            ByteArrayOutputStream os = new ByteArrayOutputStream();

            BufferedImage imageb = image.getImage();
            ImageIO.write(imageb, image.getSuffix(), os);
            InputStream is = new ByteArrayInputStream(os.toByteArray());
            int byteCount = 0;
            byte[] bytes = new byte[1024];
            while ((byteCount = is.read(bytes)) > 0) {
                fout.write(bytes, 0, byteCount);
            }
            fout.close();
            is.close();
        }
    }
 
}

上面是使用 pdfbox-2.0.11.jar 将 PDF 转换为 Word 的代码,以及读取 PDF 中的图片并写入文件系统的代码。注意:该代码只是把提取出的纯文本写入扩展名为 .doc 的文件,并不会保留原 PDF 的版式。下面附上 pom 配置:

<dependencies>
    <!-- Test-scoped only; not imported by the Pdf2word class. -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
    <!-- Not imported by Pdf2word. Presumably added for PDFBox's handling of encrypted or
         signed PDFs; TODO confirm it is needed and that this artifact/version suits PDFBox 2.0.11. -->
    <dependency>
      <groupId>org.bouncycastle</groupId>
      <artifactId>bcprov-jdk16</artifactId>
      <version>1.46</version>
    </dependency>
    
    <!-- Not imported by Pdf2word. Presumably the logging library PDFBox uses at runtime; verify. -->
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.2</version>
    </dependency>

    <!-- Not imported by Pdf2word. NOTE(review): unclear whether PDFBox 2.0.11 needs this; confirm. -->
    <dependency>
      <groupId>com.ibm.icu</groupId>
      <artifactId>icu4j</artifactId>
      <version>4.8</version>
    </dependency>

    <!-- Provides the org.apache.pdfbox.* classes imported by Pdf2word. -->
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>pdfbox</artifactId>
      <version>2.0.11</version>
    </dependency>
    <!-- Declared with the same version as pdfbox; presumably PDFBox's font library. Verify
         whether it is already resolved transitively. -->
    <dependency>
      <groupId>org.apache.pdfbox</groupId>
      <artifactId>fontbox</artifactId>
      <version>2.0.11</version>
    </dependency>
  </dependencies>

 

 

 

jar 包下载地址:https://download.csdn.net/download/javabuilt/10575651

https://download.csdn.net/download/warren_one/10131767

  • 2
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值