pdf转html？pdf转图片

最新推荐文章于 2024-07-01 08:59:26 发布

赏烟雨じ觞离愁

最新推荐文章于 2024-07-01 08:59:26 发布

阅读量619

点赞数

分类专栏：笔记　文章标签：java 后端 intellij idea

本文链接：https://blog.csdn.net/asuala/article/details/106273375

版权

笔记专栏收录该内容

3 篇文章 0 订阅

订阅专栏

项目有需求，需要在微信直接浏览pdf文件。ios是可以的，安卓微信端就需要先下载，再用其他工具打开pdf，比如QQ浏览器（坑）。

有需求，就要想解决办法。最初的方案是把pdf转成html，这是先前提供的思路（后面发现被带到坑里去了）。于是万事找度娘，马上就找到了方法：使用pdf2dom（PDFDom）进行转换。

引用包

	implementation 'net.sf.cssbox:pdf2dom:1.9'
	implementation 'org.apache.pdfbox:pdfbox:2.0.19'
	implementation 'org.apache.pdfbox:pdfbox-tools:2.0.19'

工具类（找的太多了，就不写引用地址了）

import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;

import java.io.*;

/**
 * Utility that converts a PDF file into a single HTML file using pdf2dom
 * ({@link PDFDomTree}) on top of PDFBox.
 *
 * <p>Created 2020/05/15.
 */
public class PDFToHTMLUtils {
    /**
     * Converts the PDF at {@code filePath} to HTML.
     *
     * <p>The HTML is written next to the source file, using the same name with the
     * extension replaced by {@code html} (or with {@code .html} appended when the
     * file name has no extension). Failures are reported on standard error and are
     * not propagated; the output path is returned in every case, so callers that
     * need to know whether the conversion succeeded must check that the file exists.
     *
     * @param filePath path of the PDF file to convert; must not be {@code null}
     * @return the path of the HTML file the conversion writes to
     */
    public static String pdfToHtml(String filePath)  {
        String outputPath = toHtmlPath(filePath);
        byte[] bytes = getBytes(filePath);
        if (bytes == null) {
            // The source could not be read (getBytes already reported why).
            // Bail out before opening the writer so no empty .html file is left behind.
            return outputPath;
        }
        // Resources declared in try(...) are closed automatically, in reverse order.
        // The document is loaded first so that an unparsable PDF does not create an output file.
        try (PDDocument document = PDDocument.load(bytes);
             BufferedWriter out = new BufferedWriter(
                     new OutputStreamWriter(new FileOutputStream(new File(outputPath)), "UTF-8"))) {
            PDFDomTree pdfDomTree = new PDFDomTree();
            pdfDomTree.writeText(document, out);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return outputPath;
    }

    /**
     * Derives the HTML output path from the PDF path.
     *
     * @param filePath path of the source file
     * @return {@code filePath} with its extension replaced by {@code html}, or with
     *         {@code .html} appended when the file name itself contains no dot
     */
    private static String toHtmlPath(String filePath) {
        int dot = filePath.lastIndexOf('.');
        int separator = Math.max(filePath.lastIndexOf('/'), filePath.lastIndexOf('\\'));
        // Only a dot located after the last path separator belongs to the file name.
        if (dot > separator) {
            return filePath.substring(0, dot + 1) + "html";
        }
        return filePath + ".html";
    }

    /**
     * Reads the whole file into a byte array.
     *
     * @param filePath path of the file to read
     * @return the file content, or {@code null} when the file cannot be opened or read
     */
    private static byte[] getBytes(String filePath){
        try (FileInputStream fis = new FileInputStream(new File(filePath));
             ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            byte[] chunk = new byte[4 * 1024];
            int n;
            while ((n = fis.read(chunk)) != -1) {
                bos.write(chunk, 0, n);
            }
            return bos.toByteArray();
        } catch (IOException e) {
            // Covers FileNotFoundException as well; callers treat null as "unreadable".
            e.printStackTrace();
            return null;
        }
    }
}

这块代码丝毫没有问题，唯一的缺点就是效率了。1MB的pdf转换需要30秒左右，体积大了9倍，10MB的需要一分钟以上，体积大了五倍。而且会有格式错乱问题。
解决思路是按页解析，这块没找到代码，不知道是不支持还是啥。看源码就算了，懒。还有一个利用jacob.jar这个包完成按页解析的，只是这个jar包支持不太好，还要改环境（以前解析Excel研究过，没成功，看了一会代码，直接pass）

后来看见第三方服务转换的结果，有了想法。转html不行，可以转图片啊。就度娘了一下，果然方法超级多。

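一开始用的是 PDFRenderer 这个库（即 com.sun.pdfview 包里的 PDFFile / PDFPage，注意不是后面 PDFBox 里同名的 PDFRenderer 类；下面的代码还需要额外 import 这两个类），说是效率高，就是对中文字符不太友好，需要系统安装字库（没研究）。自己试了，表格文字大把缺失，pass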



import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;

/**
 * Converts a PDF document into one JPEG image per page using {@code PDFFile} /
 * {@code PDFPage}.
 *
 * <p>Images are written to a folder named after the PDF (without extension) located
 * next to the PDF, as {@code 1.jpg}, {@code 2.jpg}, ... If that folder already
 * contains JPEG files the PDF is not rendered again and the existing files are
 * returned.
 *
 * <p>Created 2020/05/21.
 */
public class PDFToImage {
    /** Upper bound on the number of pages rendered for one document. */
    private static final int MAX_PAGE = 30;

    /**
     * Renders the PDF at {@code PDFPath} page by page and reports the produced images.
     *
     * @param PDFPath path of the PDF, using {@code /} as separator
     * @return a map with {@code "fileName"} (the image folder name, a {@code String})
     *         and {@code "imageNames"} (the sorted image file names, a {@code String[]});
     *         the raw {@code Map} type is kept so existing callers keep compiling
     * @throws IllegalArgumentException if {@code PDFPath} is {@code null} or empty
     */
    public Map change(String PDFPath) {
        if (PDFPath == null || "".equals(PDFPath)) {
            throw new IllegalArgumentException("PDFPath must not be null or empty");
        }
        PDFFile pdfFile = this.getPdfFile(PDFPath);
        int slash = PDFPath.lastIndexOf("/");
        int dot = PDFPath.lastIndexOf(".");
        String path = PDFPath.substring(0, slash + 1);
        // Strip the extension only when the dot belongs to the file name itself;
        // otherwise (no extension) use the whole file name as the folder name.
        String imageFile = dot > slash
                ? PDFPath.substring(slash + 1, dot)
                : PDFPath.substring(slash + 1);
        return this.pdf2Images(pdfFile, path, imageFile);
    }

    /**
     * Loads a PDF document from disk.
     *
     * @param filePath path of the PDF file to read
     * @return the loaded {@code PDFFile}, or {@code null} when loading fails
     */
    private PDFFile getPdfFile(String filePath) {
        // The mapped buffer stays valid after the channel is closed, so both the
        // file and the channel can be released as soon as the mapping exists.
        try (RandomAccessFile raf = new RandomAccessFile(new File(filePath), "r");
             FileChannel channel = raf.getChannel()) {
            ByteBuffer buf = channel.map(FileChannel.MapMode.READ_ONLY, 0, channel.size());
            return new PDFFile(buf);
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return null;
    }

    /**
     * Renders the pages of {@code pdfFile} to JPEG files unless the target folder
     * already contains images, then lists the images found there.
     *
     * @param pdfFile       the loaded document; may be {@code null}, in which case
     *                      nothing is rendered and only existing images are listed
     * @param imageSavePath parent directory of the image folder, ending with {@code /}
     * @param fileName      name of the image folder
     * @return a map with {@code "fileName"} and the sorted {@code "imageNames"}
     */
    private Map<String, Object> pdf2Images(PDFFile pdfFile, String imageSavePath, String fileName) {
        String path = imageSavePath + fileName + "/";
        File folder = new File(path);
        if (!folder.exists()) {
            folder.mkdirs();
        }

        String[] imageNames = listImages(folder);
        if (imageNames.length == 0 && pdfFile != null) {
            try {
                // Render at most MAX_PAGE pages; page numbers start at 1.
                int pages = Math.min(pdfFile.getNumPages(), MAX_PAGE);
                for (int i = 1; i <= pages; i++) {
                    renderPage(pdfFile.getPage(i), path + i + ".jpg");
                }
            } catch (Exception ex) {
                // Best effort: pages written before the failure are still returned.
                ex.printStackTrace();
            }
            imageNames = listImages(folder);
        }
        Arrays.sort(imageNames, new FileNameComparator());

        Map<String, Object> map = new HashMap<>();
        map.put("fileName", fileName);
        map.put("imageNames", imageNames);
        return map;
    }

    /**
     * Draws one page at its default size and writes it as a JPEG file.
     *
     * @param page      the page to render
     * @param imagePath destination file path
     * @throws java.io.IOException if the image file cannot be written
     */
    private void renderPage(PDFPage page, String imagePath) throws java.io.IOException {
        // Width and height of the page at the default zoom.
        Rectangle rect = new Rectangle(0,
                0,
                (int) page.getBBox().getWidth(),
                (int) page.getBBox().getHeight());
        Image img = page.getImage(rect.width, rect.height, // width & height
                rect, // clip rect
                null, // null for the ImageObserver
                true, // fill background with white
                true  // block until drawing is done
        );

        BufferedImage tag = new BufferedImage(rect.width, rect.height, BufferedImage.TYPE_INT_RGB);
        Graphics graphics = tag.getGraphics();
        try {
            graphics.drawImage(img, 0, 0, rect.width, rect.height, null);
        } finally {
            graphics.dispose();
        }

        try (FileOutputStream out = new FileOutputStream(imagePath)) {
            ImageIO.write(tag, "jpg", out);
        }
    }

    /**
     * Lists the JPEG file names in {@code folder}.
     *
     * @param folder directory to scan
     * @return the matching names; an empty array when the directory cannot be listed
     */
    private String[] listImages(File folder) {
        String[] names = folder.list(new ImageFilter());
        return names != null ? names : new String[0];
    }

    /**
     * Orders image file names by the page number before the first dot
     * ({@code 2.jpg} before {@code 10.jpg}). Names without a numeric prefix sort
     * after all numbered pages; ties are broken by plain string comparison.
     */
    class FileNameComparator implements Comparator<String> {
        @Override
        public final int compare(String first, String second) {
            int firstPage = pageNumber(first);
            int secondPage = pageNumber(second);
            if (firstPage != secondPage) {
                return Integer.compare(firstPage, secondPage);
            }
            return first.compareTo(second);
        }

        /** Returns the numeric prefix of the name, or Integer.MAX_VALUE when there is none. */
        private int pageNumber(String fileName) {
            try {
                return Integer.parseInt(fileName.split("\\.")[0]);
            } catch (NumberFormatException ex) {
                return Integer.MAX_VALUE;
            }
        }
    }

    /** File name filter that accepts names ending in "jpg", ignoring case. */
    class ImageFilter implements FilenameFilter {
        public boolean isImageFile(String fileName) {
            return fileName.toLowerCase().endsWith("jpg");
        }

        @Override
        public boolean accept(File dir, String name) {
            return isImageFile(name);
        }
    }
}

PDFBox，这是目前我最满意的方案，转换速度，大小，清晰度都很满意。（如果有更好的方法，一起交流）

关于引入的jar包：自己随便建了一个java文件测试时，总是报找不到类文件，出错原因是没有引入commons-logging这个jar包，谨记。（如果使用框架，应该不会因为这个出现bug，脑阔疼）

import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;

/**
 * Renders PDF pages to PNG images with PDFBox's {@link PDFRenderer}.
 *
 * <p>Created 2020/05/21.
 */
public class PdfBox {

    /** Placeholder input path used by {@link #toImage()}. */
    private static final String DEFAULT_PDF_PATH = "F:/**.pdf";
    /** Placeholder output directory used by {@link #toImage()}. */
    private static final String DEFAULT_OUTPUT_DIR = "F:/**/";
    /** Rendering resolution used by {@link #toImage()}. */
    private static final float DEFAULT_DPI = 90;

    /**
     * Renders the first page of the default PDF to {@code 0.png} in the default
     * output directory at 90 DPI. Errors are printed to standard error.
     */
    public void toImage() {
        toImage(DEFAULT_PDF_PATH, DEFAULT_OUTPUT_DIR, DEFAULT_DPI, 1);
    }

    /**
     * Renders up to {@code maxPages} leading pages of a PDF to PNG files named
     * {@code 0.png}, {@code 1.png}, ... inside {@code outputDir}.
     *
     * <p>The page count is printed before rendering and {@code over} is printed
     * after the document has been closed successfully. I/O errors are printed to
     * standard error and are not propagated.
     *
     * @param pdfPath   path of the PDF file to render
     * @param outputDir directory that receives the PNG files
     * @param dpi       rendering resolution in dots per inch
     * @param maxPages  maximum number of pages to render, counted from the first page
     */
    public void toImage(String pdfPath, String outputDir, float dpi, int maxPages) {
        try {
            // Temp-file-only memory setting keeps the document data out of main memory.
            try (PDDocument doc = PDDocument
                    .load(new File(pdfPath), MemoryUsageSetting.setupTempFileOnly())) {
                int pageCount = doc.getNumberOfPages();
                System.out.println(pageCount);
                PDFRenderer pdfRenderer = new PDFRenderer(doc);
                // Never ask for more pages than the document has (also covers empty documents).
                int limit = Math.min(pageCount, maxPages);
                for (int i = 0; i < limit; i++) {
                    BufferedImage image = pdfRenderer.renderImageWithDPI(i, dpi, ImageType.RGB);
                    File target = new File(outputDir, i + ".png");
                    try (OutputStream outputStream =
                                 new BufferedOutputStream(new FileOutputStream(target))) {
                        ImageIO.write(image, "png", outputStream);
                    }
                }
            }
            System.out.println("over");
        } catch (IOException e) {
            // Covers FileNotFoundException as well.
            e.printStackTrace();
        }
    }
}