依赖
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.4</version>
</dependency>
<dependency>
<groupId>net.coobird</groupId>
<artifactId>thumbnailator</artifactId>
<version>0.4.8</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.9</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.9</version>
</dependency>
处理图片的工具-代码
package com.example.pdf.Pdf2wordNew;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.xmlbeans.XmlException;
import org.apache.xmlbeans.XmlToken;
import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
import org.openxmlformats.schemas.drawingml.x2006.main.CTPositiveSize2D;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline;
import java.io.InputStream;
/**
 * {@link XWPFDocument} extension that can place an already-registered picture
 * inline inside a paragraph by writing the DrawingML markup directly.
 *
 * @author xlk
 */
public class MyXWPFDocument extends XWPFDocument {

    /** Number of EMUs (the DrawingML length unit) used per pixel when sizing pictures. */
    private static final int EMU_PER_PIXEL = 9525;

    /**
     * Opens an existing document from a stream.
     *
     * @param in stream containing the document package
     * @throws Exception if the superclass constructor fails to read the document
     */
    public MyXWPFDocument(InputStream in) throws Exception {
        super(in);
    }

    /** Creates a new, empty document. */
    public MyXWPFDocument() {
        super();
    }

    /**
     * Opens an existing document from an OPC package.
     *
     * @param pkg package containing the document
     * @throws Exception if the superclass constructor fails to read the document
     */
    public MyXWPFDocument(OPCPackage pkg) throws Exception {
        super(pkg);
    }

    /**
     * Appends an inline drawing that references a picture previously added to this document.
     *
     * @param id        index of the picture in {@link #getAllPictures()}; also used as the
     *                  drawing's non-visual id
     * @param width     display width in pixels
     * @param height    display height in pixels
     * @param paragraph paragraph that receives a new run containing the picture
     * @throws IndexOutOfBoundsException if {@code id} is not a valid picture index
     * @throws IllegalStateException     if the generated DrawingML cannot be parsed
     */
    public void createPicture(int id, int width, int height, XWPFParagraph paragraph) {
        width *= EMU_PER_PIXEL;
        height *= EMU_PER_PIXEL;
        String blipId = getAllPictures().get(id).getPackageRelationship().getId();

        String picXml = ""
                + "<a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">"
                + " <a:graphicData uri=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">"
                + " <pic:pic xmlns:pic=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">"
                + " <pic:nvPicPr>" + " <pic:cNvPr id=\""
                + id
                + "\" name=\"Generated\"/>"
                + " <pic:cNvPicPr/>"
                + " </pic:nvPicPr>"
                + " <pic:blipFill>"
                + " <a:blip r:embed=\""
                + blipId
                + "\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\"/>"
                + " <a:stretch>"
                + " <a:fillRect/>"
                + " </a:stretch>"
                + " </pic:blipFill>"
                + " <pic:spPr>"
                + " <a:xfrm>"
                + " <a:off x=\"0\" y=\"0\"/>"
                + " <a:ext cx=\""
                + width
                + "\" cy=\""
                + height
                + "\"/>"
                + " </a:xfrm>"
                + " <a:prstGeom prst=\"rect\">"
                + " <a:avLst/>"
                + " </a:prstGeom>"
                + " </pic:spPr>"
                + " </pic:pic>"
                + " </a:graphicData>" + "</a:graphic>";

        // Parse before touching the paragraph so a failure neither leaves an empty
        // drawing behind nor gets masked by a later failure on a null token.
        XmlToken xmlToken;
        try {
            xmlToken = XmlToken.Factory.parse(picXml);
        } catch (XmlException xe) {
            throw new IllegalStateException("Could not build DrawingML for picture " + id, xe);
        }

        CTInline inline = paragraph.createRun().getCTR().addNewDrawing().addNewInline();
        inline.addNewGraphic().addNewGraphicData();
        // Replace the placeholder graphic with the parsed markup.
        inline.set(xmlToken);
        inline.setDistT(0);
        inline.setDistB(0);
        inline.setDistL(0);
        inline.setDistR(0);

        CTPositiveSize2D extent = inline.addNewExtent();
        extent.setCx(width);
        extent.setCy(height);

        CTNonVisualDrawingProps docPr = inline.addNewDocPr();
        docPr.setId(id);
        docPr.setName("图片名称");
        docPr.setDescr("描述信息");
    }
}
开始转换
package com.example.pdf.Pdf2wordNew;
import net.coobird.thumbnailator.Thumbnails;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
public class Pdf2wordNew {
public static void main(String[] args) throws Exception {
try {
String pdfFileName = "C:\\Users\\11949\\Desktop\\新建文件夹 (2)\\面试题.pdf";
PDDocument pdf = PDDocument.load(new File(pdfFileName));
int pageNumber = pdf.getNumberOfPages();
String docFileName = pdfFileName.substring(0, pdfFileName.lastIndexOf(".")) + ".doc";
File file = new File(docFileName);
if (!file.exists()) {
file.createNewFile();
}
MyXWPFDocument document = new MyXWPFDocument();
FileOutputStream fos = new FileOutputStream(docFileName);
//提取每一页的图片和文字,添加到 word 中
for (int i = 0; i < pageNumber; i++) {
PDPage page = pdf.getPage(i);
PDResources resources = page.getResources();
Iterable<COSName> names = resources.getXObjectNames();
Iterator<COSName> iterator = names.iterator();
while (iterator.hasNext()) {
COSName cosName = iterator.next();
if (resources.isImageXObject(cosName)) {
PDImageXObject imageXObject = (PDImageXObject) resources.getXObject(cosName);
File outImgFile = new File("C:\\Users\\11949\\Desktop\\新建文件夹 (2)\\"
+ System.currentTimeMillis() + ".jpg");
Thumbnails.of(imageXObject.getImage()).scale(1).rotate(0).toFile(outImgFile);
BufferedImage bufferedImage = ImageIO.read(outImgFile);
int width = bufferedImage.getWidth();
int height = bufferedImage.getHeight();
if (width > 600) {
double ratio = Math.round((double) width / 550.0);
System.out.println("缩放比ratio:" + ratio);
width = (int) (width / ratio);
height = (int) (height / ratio);
}
System.out.println("width: " + width + ", height: " + height);
FileInputStream in = new FileInputStream(outImgFile);
byte[] ba = new byte[in.available()];
in.read(ba);
ByteArrayInputStream byteInputStream = new ByteArrayInputStream(ba);
XWPFParagraph picture = document.createParagraph();
//添加图片
document.addPictureData(byteInputStream, MyXWPFDocument.PICTURE_TYPE_JPEG);
//图片大小、位置
document.createPicture(document.getAllPictures().size() - 1, width, height, picture);
}
}
PDFTextStripper stripper = new PDFTextStripper();
stripper.setSortByPosition(true);
stripper.setStartPage(i);
stripper.setEndPage(i);
//当前页中的文字
String text = stripper.getText(pdf);
System.out.println(" ========== " + text);
XWPFParagraph textParagraph = document.createParagraph();
XWPFRun textRun = textParagraph.createRun();
// 处理换行问题
if (text.contains("\r\n")) {
String[] split = text.split("\r\n");
List<String> strsToList1 = Arrays.asList(split);
for (String str : strsToList1) {
System.out.println(str);
textRun.setText(str);
textRun.addCarriageReturn();
}
}
// textRun.setText(text);
textRun.setFontFamily("仿宋");
textRun.setFontSize(10);
//换行
// 插入换行符
textParagraph.setWordWrap(true);
}
document.write(fos);
fos.close();
pdf.close();
System.out.println("pdf转换解析结束!!----");
} catch (IOException e) {
e.printStackTrace();
}
}
}