java解析pdf格式文件获取文本内容

最新推荐文章于 2024-05-13 10:08:24 发布

乔妹的船长

最新推荐文章于 2024-05-13 10:08:24 发布

阅读量3.6k

点赞数 5

分类专栏：图文识别文章标签： pdf解析文字图片提取文字

本文链接：https://blog.csdn.net/qq_30515213/article/details/102921814

版权

图文识别专栏收录该内容

4 篇文章 0 订阅

订阅专栏

思路：先将pdf按照页数分割成图片，再对分割出的图片做图片识别，提取文字，最后将提取到的文字进行解析或者保存到txt文件。

图片识别我使用的是百度开发者中心提供的图片识别接口，我在上一篇文章中有详细说明，这里直接跳过。本文只调用了百度开发者中心的通用文字识别接口，对于pdf中的表格、图像、印章无法做到识别；但是百度开发者中心已经提供了相关接口，可自行参考接口说明。百度开发者中心的相关使用和介绍也在上一篇博客中。

链接：https://blog.csdn.net/qq_30515213/article/details/102897601

我的项目是maven项目，目录结构如下(除了PdfResolveService.java，其他几个文件均在我上一篇文章中，源码也可以在上一篇文章中下载)：

pom.xml内容如下：

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the com.wsk:baidu module; produces a jar whose final name is ROOT. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates. -->
    <groupId>com.wsk</groupId>
    <artifactId>baidu</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <dependencies>
        <!-- JSON parsing; PdfResolveService uses com.alibaba.fastjson.JSONObject to read the OCR response. -->
        <!-- NOTE(review): this is an old fastjson release; confirm it is still acceptable against current security advisories. -->
        <!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.46</version>
        </dependency>
        <!-- HTTP client; not referenced by PdfResolveService itself, presumably used by the OCR client classes. TODO confirm. -->
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.5</version>
        </dependency>
        <!-- Codec utilities; not referenced by PdfResolveService itself, presumably used by the OCR client classes. TODO confirm. -->
        <!-- https://mvnrepository.com/artifact/commons-codec/commons-codec -->
        <dependency>
            <groupId>commons-codec</groupId>
            <artifactId>commons-codec</artifactId>
            <version>1.12</version>
        </dependency>
       <!-- PDF loading and page rendering (PDDocument, PDFRenderer) used by PdfResolveService. -->
       <!-- https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.17</version>
        </dependency>
    </dependencies>

    <build>
        <!-- Name of the built artifact (without extension). -->
        <finalName>ROOT</finalName>
        <plugins>
            <!-- Compile sources and emit bytecode at the Java 1.8 level. -->
            <!-- NOTE(review): no source encoding is configured here, while the Java sources contain non-ASCII string literals; confirm the build encoding matches the file encoding. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>2.3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

pdf服务类：PdfResolveService.java

package com.baidu.api;

import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import javax.imageio.ImageIO;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

import com.alibaba.fastjson.JSONObject;



/**
 * Splits a PDF into one PNG image per page, sends each image to the OCR
 * helper ({@code MyImgCheck.checkFile}) and stores the returned text next to
 * the image as a UTF-8 {@code .txt} file.
 *
 * <p>All methods are static and report failures on the console instead of
 * throwing, so a failing page never aborts the caller.
 */
public class PdfResolveService {

    /** Resolution, in DPI, used when rasterizing PDF pages. */
    private static final int RENDER_DPI = 300;

    public static void main(String[] args) {
        pdfToImage("d:/home/pdf/通知.pdf","d:/home/image/20180125/");
    }

    /**
     * Renders every page of a PDF file to an image and hands the images to
     * {@link #yPicTxtByPage(List, String, String)} for writing and OCR.
     *
     * @param pdfUrl
     *            path of the PDF file to read
     * @param imageUrl
     *            output directory; created if it does not exist
     * @return base names (PDF file name without extension plus page index) of
     *         the generated images; empty, never {@code null}, when the PDF
     *         is missing, has no pages or cannot be read
     */
    public static List<String> pdfToImage(String pdfUrl, String imageUrl) {
        long start = System.currentTimeMillis();
        List<String> imageNames = new ArrayList<String>();
        File pdfFile = new File(pdfUrl);
        if (!pdfFile.exists()) {
            return imageNames;
        }
        // File.getName() honours the platform's separators; a name without
        // an extension is used as-is instead of failing in substring().
        String fileName = stripExtension(pdfFile.getName());
        try {
            List<BufferedImage> piclist = new ArrayList<BufferedImage>();
            // try-with-resources closes the document even if rendering fails.
            try (PDDocument document = PDDocument.load(pdfFile, (String) null)) {
                // One renderer is enough for the whole document.
                PDFRenderer renderer = new PDFRenderer(document);
                int size = document.getNumberOfPages();
                for (int i = 0; i < size; i++) {
                    piclist.add(renderer.renderImageWithDPI(i, RENDER_DPI, ImageType.RGB));
                }
            }
            isFileExist(imageUrl); // make sure the output directory exists
            imageNames = yPicTxtByPage(piclist, imageUrl, fileName);
        } catch (InvalidPasswordException e) {
            // The PDF is encrypted and cannot be opened with an empty password.
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        long end = System.currentTimeMillis();
        System.out.println(end-start);
        return imageNames;
    }

    /**
     * Writes each page image as {@code <fileName><index>.png} into
     * {@code outPath} and runs {@link #pngToTxt(String, String)} on it to
     * produce the matching {@code .txt} file.
     *
     * @param piclist
     *            rendered page images, in page order
     * @param outPath
     *            output directory (with or without a trailing separator)
     * @param fileName
     *            base name used for the generated files
     * @return base names of the images that were written; empty, never
     *         {@code null}, when {@code piclist} is null or empty. Processing
     *         stops at the first page that fails and the names collected so
     *         far are returned.
     */
    public static List<String> yPicTxtByPage(List<BufferedImage> piclist, String outPath, String fileName) {
        List<String> imageNames = new ArrayList<String>();
        if (piclist == null || piclist.isEmpty()) {
            System.out.println("图片数组为空!");
            return imageNames;
        }
        try {
            for (int i = 0; i < piclist.size(); i++) {
                BufferedImage page = piclist.get(i);
                int width = page.getWidth();
                int height = page.getHeight();

                // Copy the pixels into a TYPE_INT_BGR image so the written
                // PNG has the same pixel layout as before.
                BufferedImage imageResult = new BufferedImage(width, height,
                        BufferedImage.TYPE_INT_BGR);
                int[] rgb = page.getRGB(0, 0, width, height, new int[width * height], 0, width);
                imageResult.setRGB(0, 0, width, height, rgb, 0, width);

                String imageName = fileName + i;
                // Resolve against the directory so a missing trailing
                // separator in outPath does not misplace the files.
                File pngFile = new File(outPath, imageName + ".png");
                File txtFile = new File(outPath, imageName + ".txt");

                // ImageIO opens and closes the file itself, so the PNG is
                // complete on disk before OCR reads it.
                if (!ImageIO.write(imageResult, "png", pngFile)) {
                    System.err.println("No PNG writer available for " + pngFile.getPath());
                    continue;
                }
                pngToTxt(pngFile.getPath(), txtFile.getPath());
                imageNames.add(imageName);
            }
        } catch (IOException | RuntimeException e) {
            // Keep the original contract: report and return what succeeded.
            e.printStackTrace();
        }
        return imageNames;
    }

    /**
     * Runs OCR on an image and writes the raw response to a text file.
     *
     * <p>The recognized lines are also printed to the console. The text file
     * is written in UTF-8 and contains the unmodified response string; it is
     * empty when the OCR call fails. Nothing happens if the image file does
     * not exist.
     *
     * @param imageUrl
     *            path of the image to recognize
     * @param txtUrl
     *            path of the text file to create or overwrite
     */
    public static void pngToTxt(String imageUrl, String txtUrl) {
        File imageFile = new File(imageUrl);
        if (!imageFile.exists()) {
            return;
        }
        String result = "";
        try {
            long now = System.currentTimeMillis();
            String str = MyImgCheck.checkFile(imageUrl); // call the OCR service
            if (str != null) {
                // Keep the raw response before parsing, so it is still saved
                // when the payload has an unexpected shape.
                result = str;
                printRecognizedLines(str);
            }
            System.out.println("耗时：" + (System.currentTimeMillis() - now) / 1000 + "s");
        } catch (URISyntaxException | IOException e) {
            e.printStackTrace();
        } catch (RuntimeException e) {
            // Malformed JSON or unexpected structure: report it, still write
            // whatever was received below.
            e.printStackTrace();
        }

        System.out.println(result);
        try (FileOutputStream fos = new FileOutputStream(txtUrl);
             OutputStreamWriter osw = new OutputStreamWriter(fos, "UTF-8")) {
            osw.write(result);
        } catch (IOException e) {
            System.err.println(e.getMessage());
        }
    }

    /**
     * Ensures a directory exists.
     *
     * @param strPath
     *            directory path
     * @return {@code true} if the path already existed; {@code false} if it
     *         did not (in which case creation of the directory, including
     *         missing parents, is attempted)
     */
    public static boolean isFileExist(String strPath) {
        File file = new File(strPath);
        if (file.exists()) {
            return true;
        }
        file.mkdirs();
        return false;
    }

    /** Returns {@code name} without its last extension, or unchanged if it has none. */
    private static String stripExtension(String name) {
        int dot = name.lastIndexOf('.');
        return dot >= 0 ? name.substring(0, dot) : name;
    }

    /**
     * Prints each entry of the response's {@code words_result} list; a
     * response without such a list (presumably an error payload) is reported
     * on stderr instead of failing.
     */
    @SuppressWarnings("unchecked") // the JSON structure is only known at runtime
    private static void printRecognizedLines(String json) {
        Map<String, Object> mapObj = JSONObject.parseObject(json, Map.class);
        if (mapObj == null) {
            return;
        }
        Object words = mapObj.get("words_result");
        if (!(words instanceof List)) {
            System.err.println("OCR response contains no words_result list: " + json);
            return;
        }
        List<Map<String, Object>> lines = (List<Map<String, Object>>) words;
        for (int i = 0; i < lines.size(); i++) {
            System.out.println("第"+i+"行："+lines.get(i).get("words"));
        }
    }
}

运行结果：

解析的文字：

生成的图片和保存的txt:

乔妹的船长

关注

5
点赞
踩
12

收藏

觉得还不错? 一键收藏
2
评论
java解析pdf格式文件获取文本内容

思路：先将pdf按照页数分割成图片，在将分割的图片做图片识别，提取文字，最后将提取到的文字解析或者保存到txt文件。图片识别我使用的是百度开发者中心提供的图片识别接口，我在上一篇文章中有详细说明，这里直接跳过，文本只调用的百度开发者中的通用文字接口，对于pdf中的表格，图像，印章无法做到识别；但是在百度开发者中心已经提供了相关接口，可自行参考接口说明，百度开发中心的相关使用和介绍也上上一篇...
复制链接

扫一扫