思路:先将pdf按照页数分割成图片,再对分割出的图片做文字识别,提取文字,最后将提取到的文字解析或者保存到txt文件。
图片识别我使用的是百度开发者中心提供的图片识别接口,我在上一篇文章中有详细说明,这里直接跳过。本文只调用了百度开发者中心的通用文字识别接口,对于pdf中的表格、图像、印章无法做到识别;但是百度开发者中心已经提供了相关接口,可自行参考接口说明。百度开发者中心的相关使用和介绍也在上一篇博客中。
链接:https://blog.csdn.net/qq_30515213/article/details/102897601
我的项目是maven项目,目录结构如下(除了PdfResolveService.java,其他几个文件均在我上一篇文章中,源码也可以在上一篇文章中下载):
pom.xml内容如下:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the PDF-to-image OCR sample: produces a jar named ROOT, compiled for Java 8. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.wsk</groupId>
    <artifactId>baidu</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <properties>
        <!-- The Java sources contain non-ASCII string literals; pin the encoding so the
             build does not depend on the platform default charset. -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>
    <dependencies>
        <!-- JSON parsing of the OCR response. -->
        <!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <!-- Raised from 1.2.46: 1.2.x releases before 1.2.83 are affected by the published
                 autoType deserialization vulnerabilities. -->
            <version>1.2.83</version>
        </dependency>
        <!-- HTTP client; not referenced by PdfResolveService itself. -->
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.5</version>
        </dependency>
        <!-- Codec utilities; not referenced by PdfResolveService itself. -->
        <!-- https://mvnrepository.com/artifact/commons-codec/commons-codec -->
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>1.12</version>
        </dependency>
        <!-- Loads the PDF and renders each page to an image. -->
        <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.17</version>
        </dependency>
    </dependencies>
    <build>
        <finalName>ROOT</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
pdf服务类:PdfResolveService.java
package com.baidu.api;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import com.alibaba.fastjson.JSONObject;
/**
 * Splits a PDF into one PNG image per page and runs OCR on every page image.
 *
 * <p>For page {@code i} of {@code name.pdf} the files {@code name<i>.png} and
 * {@code name<i>.txt} are written to the output directory. The text file holds the raw
 * response returned by {@code MyImgCheck.checkFile}, which is defined outside this file.
 */
public class PdfResolveService {

    /** Resolution, in dots per inch, at which PDF pages are rasterised. */
    private static final float RENDER_DPI = 300f;

    /**
     * Command-line entry point.
     *
     * @param args optional; {@code args[0]} is the PDF path and {@code args[1]} the output
     *             directory. The built-in sample paths are used when fewer than two
     *             arguments are supplied.
     */
    public static void main(String[] args) {
        boolean hasArgs = args != null && args.length >= 2;
        String pdfUrl = hasArgs ? args[0] : "d:/home/pdf/通知.pdf";
        String imageUrl = hasArgs ? args[1] : "d:/home/image/20180125/";
        pdfToImage(pdfUrl, imageUrl);
    }

    /**
     * Renders every page of a PDF to a PNG file and runs OCR on each page.
     *
     * <p>Pages are rendered, written and recognised one at a time so that only a single
     * page image is held in memory. Processing stops at the first I/O failure and the
     * names produced so far are returned. The elapsed time in milliseconds is printed to
     * standard output when the PDF exists.
     *
     * @param pdfUrl
     *            path of the PDF file to process
     * @param imageUrl
     *            output directory; created if missing
     * @return the generated image base names (PDF name without extension followed by the
     *         zero-based page index); empty, never {@code null}, when the PDF does not
     *         exist or nothing could be produced
     */
    public static List<String> pdfToImage(String pdfUrl, String imageUrl) {
        long start = System.currentTimeMillis();
        List<String> imageNames = new ArrayList<String>();
        File pdfFile = new File(pdfUrl);
        if (!pdfFile.exists()) {
            return imageNames;
        }
        String fileName = stripExtension(pdfFile.getName());
        isFileExist(imageUrl); // make sure the output directory exists
        // try-with-resources closes the document even when rendering fails half-way.
        try (PDDocument document = PDDocument.load(pdfFile, (String) null)) {
            PDFRenderer renderer = new PDFRenderer(document);
            int size = document.getNumberOfPages();
            for (int i = 0; i < size; i++) {
                BufferedImage image = renderer.renderImageWithDPI(i, RENDER_DPI, ImageType.RGB);
                imageNames.add(writePageAndRecognize(image, imageUrl, fileName, i));
            }
        } catch (InvalidPasswordException e) {
            System.err.println("PDF is password protected: " + pdfUrl);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("Failed to process PDF: " + pdfUrl);
            e.printStackTrace();
        }
        long end = System.currentTimeMillis();
        System.out.println(end - start);
        return imageNames;
    }

    /**
     * Writes each image as a PNG file and runs OCR on it.
     *
     * <p>Processing stops at the first failure; the names produced so far are returned.
     *
     * @param piclist
     *            page images, in page order
     * @param outPath
     *            output directory; a trailing separator is added when missing
     * @param fileName
     *            base name used for the generated files
     * @return the generated image base names ({@code fileName} followed by the zero-based
     *         index); an empty list, rather than {@code null}, when {@code piclist} is
     *         {@code null} or empty
     */
    public static List<String> yPicTxtByPage(List<BufferedImage> piclist, String outPath, String fileName) {
        List<String> imageNames = new ArrayList<String>();
        if (piclist == null || piclist.isEmpty()) {
            System.out.println("图片数组为空!");
            return imageNames;
        }
        try {
            for (int i = 0; i < piclist.size(); i++) {
                imageNames.add(writePageAndRecognize(piclist.get(i), outPath, fileName, i));
            }
        } catch (IOException | RuntimeException e) {
            // Best effort, as before: report the failure and return what was produced.
            e.printStackTrace();
        }
        return imageNames;
    }

    /**
     * Runs OCR on an image file and stores the raw service response as UTF-8 text.
     *
     * <p>Does nothing when the image does not exist. The recognised lines and the elapsed
     * time are printed to standard output. When the OCR call fails an empty text file is
     * written; when the response cannot be parsed or carries no {@code words_result}, the
     * raw response is still written so the service's answer is not lost.
     *
     * @param imageUrl
     *            path of the image to recognise
     * @param txtUrl
     *            path of the text file to write
     */
    public static void pngToTxt(String imageUrl, String txtUrl) {
        File imageFile = new File(imageUrl);
        if (!imageFile.exists()) {
            return;
        }
        String result = "";
        try {
            long now = System.currentTimeMillis();
            String response = MyImgCheck.checkFile(imageUrl); // call the OCR service
            if (response != null) {
                result = response;
            }
            printRecognizedLines(response);
            System.out.println("耗时:" + (System.currentTimeMillis() - now) / 1000 + "s");
        } catch (Exception e) {
            // Boundary with the OCR helper and the JSON parser: a failed call or a
            // malformed response must not abort the remaining pages.
            e.printStackTrace();
        }
        System.out.println(result);
        try (FileOutputStream fileOut = new FileOutputStream(txtUrl);
                OutputStreamWriter writer = new OutputStreamWriter(fileOut, "UTF-8")) {
            writer.write(result);
        } catch (IOException e) {
            System.err.println("Failed to write OCR result to " + txtUrl);
            e.printStackTrace();
        }
    }

    /**
     * Ensures that a directory exists, creating it (and any missing parents) if necessary.
     *
     * @param strPath
     *            directory path
     * @return {@code true} if the path already existed, {@code false} if it was missing
     *         (whether or not it could be created)
     */
    public static boolean isFileExist(String strPath) {
        File file = new File(strPath);
        if (file.exists()) {
            return true;
        }
        if (!file.mkdirs() && !file.isDirectory()) {
            System.err.println("Could not create directory: " + strPath);
        }
        return false;
    }

    /** Writes one page image as {@code <outPath><fileName><index>.png}, runs OCR on it and returns its base name. */
    private static String writePageAndRecognize(BufferedImage page, String outPath, String fileName, int index)
            throws IOException {
        String imageName = fileName + index;
        String basePath = withTrailingSeparator(outPath) + imageName;
        File pngFile = new File(basePath + ".png");
        if (!ImageIO.write(toOpaqueCopy(page), "png", pngFile)) {
            throw new IOException("No PNG writer available for " + pngFile);
        }
        // The PNG is completely written and closed before it is handed to the OCR step.
        pngToTxt(basePath + ".png", basePath + ".txt");
        return imageName;
    }

    /** Copies the pixels into a new {@code TYPE_INT_BGR} image, which has no alpha channel. */
    private static BufferedImage toOpaqueCopy(BufferedImage source) {
        int width = source.getWidth();
        int height = source.getHeight();
        BufferedImage copy = new BufferedImage(width, height, BufferedImage.TYPE_INT_BGR);
        int[] pixels = source.getRGB(0, 0, width, height, null, 0, width);
        copy.setRGB(0, 0, width, height, pixels, 0, width);
        return copy;
    }

    /** Prints every entry of {@code words_result} from the OCR response, or a notice when there is none. */
    private static void printRecognizedLines(String response) {
        Map<?, ?> parsed = JSONObject.parseObject(response, Map.class);
        Object words = parsed == null ? null : parsed.get("words_result");
        if (!(words instanceof List)) {
            System.err.println("OCR response contains no words_result: " + response);
            return;
        }
        List<?> lines = (List<?>) words;
        for (int i = 0; i < lines.size(); i++) {
            Object line = lines.get(i);
            Object text = line instanceof Map ? ((Map<?, ?>) line).get("words") : line;
            System.out.println("第" + i + "行:" + text);
        }
    }

    /** Returns the name without its last extension; names without a dot are returned unchanged. */
    private static String stripExtension(String name) {
        int dot = name.lastIndexOf('.');
        return dot >= 0 ? name.substring(0, dot) : name;
    }

    /** Appends the platform separator unless the directory is empty or already ends with one. */
    private static String withTrailingSeparator(String dir) {
        if (dir.isEmpty() || dir.endsWith("/") || dir.endsWith(File.separator)) {
            return dir;
        }
        return dir + File.separator;
    }
}
运行结果:
解析的文字:
生成的图片和保存的txt: