实现DOC、DOCX转换为PDF 再将PDF转换为图片

实现DOC、DOCX转换为PDF 再将PDF转换为图片

首先导入需要的依赖

<!-- Hutool utility library (the Java classes in this article import cn.hutool.* helpers) -->
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>5.4.7</version>
</dependency>

<!-- PDF conversion tools -->
<dependency>
    <groupId>net.sf.cssbox</groupId>
    <artifactId>pdf2dom</artifactId>
    <version>1.7</version>
</dependency>
<!-- PDFBox: provides org.apache.pdfbox.pdmodel.PDDocument and the PDFRenderer used to rasterize pages -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.12</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox-tools</artifactId>
    <version>2.0.12</version>
</dependency>
<!-- iText 2.x: provides com.lowagie.text.pdf.PdfReader -->
<dependency>
    <groupId>com.lowagie</groupId>
    <artifactId>itext</artifactId>
    <version>2.0.7</version>
</dependency>
<!-- Document conversion tool (Aspose.Words) -->
<!-- system scope: the jar is resolved from the project's local lib/ directory, not from a Maven repository -->
<dependency>
    <groupId>com.aspose</groupId>
    <artifactId>aspose-words</artifactId>
    <version>15.8.0</version>
    <scope>system</scope>
    <systemPath>${basedir}/lib/aspose-words-15.8.0-jdk16.jar</systemPath>
</dependency>

其中aspose-words-15.8.0-jdk16.jar需要从外部引入:链接:https://pan.baidu.com/s/1eqMR_6lvt8HHIAxTvs09fA 提取码:o5ct

资源目录下添加License.xml

<License>
    <Data>
        <Products>
            <Product>Aspose.Total for Java</Product>
            <Product>Aspose.Words for Java</Product>
        </Products>
        <EditionType>Enterprise</EditionType>
        <SubscriptionExpiry>20991231</SubscriptionExpiry>
        <LicenseExpiry>20991231</LicenseExpiry>
        <SerialNumber>8bfe198c-7f0c-4ef8-8ff0-acc3237bf0d7</SerialNumber>
    </Data>
    <Signature>
        sNLLKGMUdF0r8O1kKilWAGdgfs2BvJb/2Xp8p5iuDVfZXmhppo+d0Ran1P9TKdjV4ABwAgKXxJ3jcQTqE/2IRfqwnPf8itN8aFZlV3TJPYeD3yWE7IT55Gz6EijUpC7aKeoohTb4w2fpox58wWoF3SNp6sK6jDfiAUGEHYJ9pjU=
    </Signature>
</License>

将word转换为PDF的工具类

import cn.hutool.core.collection.CollUtil;
import cn.hutool.system.OsInfo;
import cn.hutool.system.SystemUtil;
import com.aspose.words.Document;
import com.aspose.words.FontSettings;
import com.aspose.words.License;
import com.aspose.words.SaveFormat;
import lombok.extern.slf4j.Slf4j;

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.util.Arrays;

@Slf4j
public class AsposeUtil {

    private static final String[] WORD = {"doc", "docx", "wps", "wpt", "txt"};
    private static final String[] EXCEL = {"xls", "xlsx", "et", "xlsm"};
    private static final String[] PPT = {"ppt", "pptx"};
    private static final String[] PDF = {"pdf"};
    private static final String[] IMG = {"bmp", "jpg", "png", "tif", "gif", "pcx", "tga", "exif", "fpx", "svg", "psd", "cdr", "pcd", "dxf", "ufo", "eps", "ai", "raw", "WMF", "webp", "avif", "apng"};

    private static final String TYPE_UNSUPPORT = "不支持的格式";
    private static final String TYPE_WORD = "TYPE_WORD";
    private static final String TYPE_EXCEL = "TYPE_EXCEL";
    private static final String TYPE_PPT = "TYPE_PPT";
    private static final String TYPE_PDF = "TYPE_PDF";
    private static final String TYPE_IMG = "TYPE_IMG";


    private boolean judgeLicense() {
        boolean result = false;
        try {
         InputStream is = AsposeUtil.class.getClassLoader().getResourceAsStream("License.xml");
            License aposeLic = new License();
            aposeLic.setLicense(is);
            result = true;
        } catch (Exception e) {
            log.error("Aspose License 文档转换失败!", e);
        }
        return result;
    }

    /**
     * 根据文件名判断文件类型
     */
    private String getType(String fileName) {
        String suffix = fileName.substring(fileName.lastIndexOf(".") + 1).toLowerCase(); // 后缀
        if (CollUtil.contains(Arrays.asList(WORD), suffix)) {
            return TYPE_WORD;
        } else if (CollUtil.contains(Arrays.asList(EXCEL), suffix)) {
            return TYPE_EXCEL;
        } else if (CollUtil.contains(Arrays.asList(PPT), suffix)) {
            return TYPE_PPT;
        } else if (CollUtil.contains(Arrays.asList(PDF), suffix)) {
            return TYPE_PDF;
        } else if (CollUtil.contains(Arrays.asList(IMG), suffix)) {
            return TYPE_IMG;
        } else {
            return TYPE_UNSUPPORT;
        }
    }

    public static void main(String[] args) throws Exception {
    }

    /**
     * 文件转化pdf
     * @param fileName 文件名称
     * @param in     文件输入流
     * @return 转换后的pdf地址 或 格式不支持预览
     */
    public byte[] toPdf(String fileName, InputStream in) throws Exception {
        if (!judgeLicense()) {
            throw new Exception();
        }
        String type = getType(fileName);
        if (TYPE_WORD.equals(type)) {
            try (ByteArrayOutputStream tmp = wordToPdfStream(in)) {
                return tmp.toByteArray();
            } catch (Exception e) {
                log.error("word转换pdf失败!", e);
            }
        }
        return new byte[0];
    }


    private ByteArrayOutputStream wordToPdfStream(InputStream in) throws Exception {
        Document doc = new Document(in);
        OsInfo osInfo = SystemUtil.getOsInfo();
        if (osInfo.isLinux()) {// 提前将字体安装到linux如下路径 TODO 如果是linux环境则需要将windows下字体(C:\Windows\Fonts)提前复制到这个目录下
            FontSettings.setFontsFolder("/usr/share/fonts/chinese", true);
        }
        ByteArrayOutputStream dstStream = new ByteArrayOutputStream();
        doc.save(dstStream, SaveFormat.PDF);
        return dstStream;
    }



    //private ByteArrayOutputStream excelToPdfStream(InputStream in) throws Exception {
    //    Workbook excel = new Workbook(in);
    //    ByteArrayOutputStream dstStream = new ByteArrayOutputStream();
    //    excel.save(dstStream, SaveFormat.PDF);
    //    return dstStream;
    //}
    //
    //private ByteArrayOutputStream pptToPdfStream(InputStream in) throws Exception {
    //    Presentation ppt = new Presentation (in);
    //    ByteArrayOutputStream dstStream = new ByteArrayOutputStream();
    //    ppt.save(dstStream, SaveFormat.PDF);
    //    return dstStream;
    //}
}

将PDF转换为图片的工具类

import cn.hutool.core.util.ObjectUtil;
import com.google.common.collect.Lists;
import com.lowagie.text.pdf.PdfReader;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;

@Slf4j
public class PdfUtil {

    public static final int DEFAULT_DPI = 150;

    /***
     * PDF文件转PNG图片,全部页数
     *
     * @param pdfFilePath pdf完整路径
     * @param dpi dpi越大转换后越清晰,相对转换速度越慢
     */
    public static void pdf2Image(String pdfFilePath, int dpi) {
        File file = new File(pdfFilePath);
        PDDocument pdDocument;
        try {
            String imgPdfPath = file.getParent();
            int dot = file.getName().lastIndexOf('.');
            // 获取图片文件名
            String imagePdfName = file.getName().substring(0, dot);

            pdDocument = PDDocument.load(file);
            PDFRenderer renderer = new PDFRenderer(pdDocument);
            /* dpi越大转换后越清晰,相对转换速度越慢 */
            PdfReader reader = new PdfReader(pdfFilePath);
            int pages = reader.getNumberOfPages();
            StringBuffer imgFilePath;
            for (int i = 0; i < pages; i++) {
                String imgFilePathPrefix = imgPdfPath + File.separator + imagePdfName;
                imgFilePath = new StringBuffer();
                imgFilePath.append(imgFilePathPrefix);
                imgFilePath.append("_");
                imgFilePath.append((i + 1));
                imgFilePath.append(".png");
                File dstFile = new File(imgFilePath.toString());
                BufferedImage image = renderer.renderImageWithDPI(i, dpi);
                ImageIO.write(image, "png", dstFile);
            }
            log.info("PDF文档转PNG图片成功!");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * pdf转图片
     * 多页PDF会每页转换为一张图片,下面会有多页组合成一页的方法
     * @param pdfFile pdf文件路径
     * @param outPath 图片输出路径
     * @param dpi 相当于图片的分辨率,值越大越清晰,但是转换时间变长
     */
    public static void pdf2multiImageFile(String pdfFile, String outPath, int dpi) {
        if (ObjectUtil.isEmpty(dpi)) {
            // 如果没有设置DPI,默认设置为150
            dpi = DEFAULT_DPI;
        }
        try (PDDocument pdf = PDDocument.load(new FileInputStream(pdfFile))) {
            int actSize = pdf.getNumberOfPages();
            List<BufferedImage> picList = Lists.newArrayList();
            for (int i = 0; i < actSize; i++) {
                BufferedImage image = new PDFRenderer(pdf).renderImageWithDPI(i, dpi, ImageType.RGB);
                picList.add(image);
            }
            // 组合图片
            ImageUtil.combinationImages2File(picList, outPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * pdf转图片
     * 多页PDF会每页转换为一张图片,下面会有多页组合成一页的方法
     * @param pdfBytes pdf文件字节数组
     * @param dpi 相当于图片的分辨率,值越大越清晰,但是转换时间变长
     */
    public static byte[] pdfBytes2multiImageBytes(byte[] pdfBytes, int dpi) {
        if (ArrayUtils.isEmpty(pdfBytes)) {
            return new byte[0];
        }
        if (dpi <= 0) {
            // 如果没有设置DPI,默认设置为150
            dpi = DEFAULT_DPI;
        }
        try (PDDocument pdf = PDDocument.load(pdfBytes)) {
            int actSize = pdf.getNumberOfPages();
            List<BufferedImage> picList = Lists.newArrayList();
            PDFRenderer renderer = new PDFRenderer(pdf);
            for (int i = 0; i < actSize; i++) {
                BufferedImage image = renderer.renderImageWithDPI(i, dpi, ImageType.RGB);
                picList.add(image);
            }
            // 组合图片
            return ImageUtil.combinationImages2Bytes(picList);
        } catch (IOException e) {
            log.error("pdf解析失败!", e);
        }
        return new byte[0];
    }

}

实现多张图片组合的工具类

import lombok.extern.slf4j.Slf4j;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Combines several images into one tall image by stacking them vertically.
 */
@Slf4j
public class ImageUtil {

    /**
     * Stacks the images top to bottom in list order.
     *
     * <p>The result is as wide as the first image; all images are expected to have that
     * same width. Heights may differ.
     *
     * @param picList images to stack
     * @return the combined image, or {@code null} when the list is null/empty or combining
     *         failed (the failure is logged)
     */
    private static BufferedImage yPic(List<BufferedImage> picList) {
        if (picList == null || picList.isEmpty()) {
            log.info("图片数组为空!");
            return null;
        }
        try {
            // The first image defines the width of the result.
            int width = picList.get(0).getWidth();
            int totalHeight = 0;
            for (BufferedImage pic : picList) {
                totalHeight += pic.getHeight();
            }

            BufferedImage imageResult = new BufferedImage(width, totalHeight, BufferedImage.TYPE_INT_RGB);
            // Row at which the next image starts: the sum of the heights of all images
            // already written, so it advances only AFTER each image is copied.
            int offsetHeight = 0;
            for (BufferedImage pic : picList) {
                int picHeight = pic.getHeight();
                // Copy one image at a time so only a single RGB buffer is alive at once.
                int[] rgb = pic.getRGB(0, 0, width, picHeight, null, 0, width);
                imageResult.setRGB(0, offsetHeight, width, picHeight, rgb, 0, width);
                offsetHeight += picHeight;
            }
            return imageResult;
        } catch (Exception e) {
            log.error("图片合成失败!", e);
        }
        return null;
    }

    /**
     * Combines the images into one tall image and encodes it as PNG.
     * All images must have the same width.
     *
     * @param picList images to combine
     * @return the PNG bytes, or an empty array when combining or encoding failed
     */
    public static byte[] combinationImages2Bytes(List<BufferedImage> picList) {
        BufferedImage image = yPic(picList);
        if (image != null) {
            try (ByteArrayOutputStream baos = new ByteArrayOutputStream()) {
                boolean png = ImageIO.write(image, "png", baos);
                if (png) {
                    return baos.toByteArray();
                }
            } catch (IOException e) {
                log.error("图片组合失败!", e);
            }
        }
        return new byte[0];
    }

    /**
     * Combines the images into one tall image and writes it to a PNG file.
     * All images must have the same width.
     *
     * @param picList images to combine
     * @param outPath path of the file to write
     * @throws IOException if the images could not be combined or the file could not be written
     */
    public static void combinationImages2File(List<BufferedImage> picList, String outPath) throws IOException, NullPointerException {
        BufferedImage image = yPic(picList);
        if (image == null) {
            // Fail with a clear message instead of handing a null image to ImageIO.
            throw new IOException("图片合成失败,无法输出到文件: " + outPath);
        }
        File outFile = new File(outPath);
        ImageIO.write(image, "png", outFile);
    }

}

示例代码

/**
 * Demo: turns a PDF or Word file into a single PNG snapshot.
 */
public class TestDemo {

    // Absolute path of the directory containing the file (ends with a separator)
    private static String filepath = "D:\\***\\";
    // File name
    private static String filename = "****.docx";

    /**
     * PDF input is rendered straight to {@code pdf2png.png}; any other input is first
     * converted to PDF and the result is written to {@code word2png.png}.
     */
    public static void main(String[] args) throws Exception {
        if (filename.endsWith(".pdf")) {
            // The converter needs the full file path, not just the directory.
            PdfUtil.pdf2multiImageFile(filepath + filename, "pdf2png.png", 130);
        } else {
            // Word: generate the PDF first, then the snapshot image.
            byte[] pdfBytes;
            try (FileInputStream in = new FileInputStream(filepath + filename)) {
                pdfBytes = new AsposeUtil().toPdf(filename, in);
            }
            final byte[] bytes = PdfUtil.pdfBytes2multiImageBytes(pdfBytes, 130);
            // try-with-resources flushes and closes the output file.
            try (FileOutputStream fileOutputStream = new FileOutputStream("word2png.png")) {
                fileOutputStream.write(bytes);
            }
        }
    }

}

运行完这个main方法你就会发现大功告成!

  • 0
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
要将PDF文件转换为DOCX文件,您可以使用Python的“pdfminer”和“python-docx”库。您需要安装这些库,然后按照以下步骤进行操作:

1. 导入所需的库:

```python
import io
import os
import docx
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfpage import PDFPage
```

2. 创建一个函数来将PDF文件转换为文本:

```python
def pdf_to_text(pdf_file):
    resource_manager = PDFResourceManager()
    text_stream = io.StringIO()
    codec = 'utf-8'
    laparams = pdfminer.layout.LAParams()
    converter = TextConverter(resource_manager, text_stream, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(resource_manager, converter)
    password = ""
    maxpages = 0
    caching = True
    page_nums = set()
    for page in PDFPage.get_pages(pdf_file, page_nums, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    converter.close()
    text = text_stream.getvalue()
    text_stream.close()
    return text
```

3. 创建一个函数来将文本转换为DOCX文件:

```python
def text_to_docx(text, output):
    doc = docx.Document()
    doc.add_paragraph(text)
    doc.save(output)
```

4. 最后,您可以将上面的两个函数组合在一起来实现转换:

```python
pdf_file = open('example.pdf', 'rb')
text = pdf_to_text(pdf_file)
pdf_file.close()
output = 'example.docx'
text_to_docx(text, output)
```

以上代码中,我们将PDF文件“example.pdf”转换为文本,然后将文本转换为DOCX文件“example.docx”。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值