如何利用pdfbox将pdf解析为txt

最新推荐文章于 2021-07-14 14:38:40 发布

Java潘老师

最新推荐文章于 2021-07-14 14:38:40 发布

阅读量1k

点赞数

分类专栏：插件

本文链接：https://blog.csdn.net/mixika99/article/details/77936317

版权

插件专栏收录该内容

6 篇文章

订阅专栏

利用apache的pdfbox将pdf解析为txt文件，需要的最基本包如下：

pdfbox-2.0.7.jar（示例代码使用的是 org.apache.pdfbox 2.0.x 的 API，需与 fontbox 版本保持一致；0.7.3 版本的包名为 org.pdfbox，无法编译下面的代码）

fontbox-2.0.7.jar

commons-logging-1.2.jar

点我下载：点击打开下载链接

下面以 D 盘 pdf 目录（d:/pdf）下的 PDF 文件为例：

package com;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.sql.SQLException;

import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Converts the PDF files of a directory to text files with PDFBox and scans
 * the extracted text for order records.
 *
 * <p>For every non-".txt" file in the directory, the PDF text is written to
 * {@code <file>.txt} and then read back line by line; matched records are
 * printed to standard output.
 */
public class CsrPdfRead {

    /** Directory scanned when no directory is given on the command line. */
    private static final String DEFAULT_PDF_DIR = "d:/pdf";

    /**
     * Charset used both to write the extracted text and to read it back.
     * Writing and reading must agree, otherwise non-ASCII text is corrupted.
     */
    private static final String TEXT_ENCODING = "GBK";

    /** Suffix appended to a PDF path to form the path of its text dump. */
    private static final String TXT_SUFFIX = ".txt";

    /**
     * Regex that splits an "Order ID" line at a closing parenthesis followed
     * by optional whitespace and an upper-case 'P'.
     */
    private static final String ORDER_ID_SPLIT_REGEX = "\\u0029\\s*P";

    /**
     * Number of characters skipped after the last '(' of an "Order ID" line.
     * NOTE(review): presumably the length of a "(Order ID:" style prefix;
     * confirm against real input.
     */
    private static final int ORDER_ID_PREFIX_LENGTH = 10;

    /** Marker that ends a 15-character code (11 characters before it plus the marker). */
    private static final String CODE_SUFFIX = "-001";

    /** Number of characters taken in front of {@link #CODE_SUFFIX}. */
    private static final int CODE_PREFIX_LENGTH = 11;

    /**
     * Processes every file of a directory: each regular file that is not a
     * ".txt" file is handed to {@link #getOrderText(String)}; sub-directories
     * are only reported.
     *
     * @param args optional; {@code args[0]} is the directory to scan,
     *             defaulting to {@code d:/pdf} when absent
     * @throws InterruptedException declared for compatibility with existing callers
     * @throws SQLException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws InterruptedException, SQLException {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_DIR;
        // listFiles() returns null when the path is not a readable directory.
        File[] entries = new File(path).listFiles();
        if (entries == null) {
            System.out.println("找不到指定的目录：" + path);
            return;
        }
        System.out.println("该目录下对象个数：" + entries.length);
        for (File entry : entries) {
            if (entry.isFile()) {
                String filepath = path + "/" + entry.getName();
                System.out.println(filepath);
                // Text dumps produced by earlier runs live in the same directory; skip them.
                if (!isTxtFile(filepath)) {
                    System.out.println(getOrderText(filepath));
                }
            } else if (entry.isDirectory()) {
                System.out.println("文件夹：" + entry);
            }
        }
    }

    /**
     * Converts the given PDF to text and scans the text for order records.
     *
     * <p>A record is built from an "Order ID" line and a line containing a
     * code that ends in "-001"; when a line containing "end of line" is
     * reached and exactly two parts were collected, the record is printed to
     * standard output.
     *
     * @param filepath path of the PDF file
     * @return currently always the empty string; the records are only printed
     */
    public static String getOrderText(String filepath) {
        String newfilepath = getTextFromPDF(filepath);
        if (newfilepath == null) {
            return "";
        }
        File file = new File(newfilepath);
        if (!file.isFile()) {
            System.out.println("找不到指定的文件");
            return "";
        }
        // try-with-resources closes the stream and reader even when parsing fails.
        try (FileInputStream in = new FileInputStream(file);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in, TEXT_ENCODING))) {
            StringBuilder record = new StringBuilder();
            int matchedParts = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue;
                }
                if (line.contains("Order ID")) {
                    String[] parts = line.split(ORDER_ID_SPLIT_REGEX);
                    if (parts.length > 0) {
                        String head = parts[0];
                        int start = head.lastIndexOf("(") + ORDER_ID_PREFIX_LENGTH;
                        // Skip lines that are too short instead of aborting the whole file.
                        if (start <= head.length()) {
                            record.append(head.substring(start));
                            matchedParts++;
                        }
                    }
                }
                int suffixAt = line.lastIndexOf(CODE_SUFFIX);
                // suffixAt is -1 when the marker is absent, which also fails this check.
                if (suffixAt - CODE_PREFIX_LENGTH >= 0) {
                    record.append(' ')
                            .append(line, suffixAt - CODE_PREFIX_LENGTH, suffixAt + CODE_SUFFIX.length());
                    matchedParts++;
                }
                // NOTE(review): the collected parts are only reset when exactly two were
                // found; any other count carries over to the following lines. Kept as in
                // the original because the intended behaviour is unclear.
                if (line.contains("end of line") && matchedParts == 2) {
                    System.out.println(record.toString());
                    record.setLength(0);
                    matchedParts = 0;
                }
            }
        } catch (Exception e) {
            // Best effort: report the failure and continue with the next file.
            System.out.println("读取文件内容出错");
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Extracts the text of a PDF file and writes it to {@code pdfFilePath + ".txt"},
     * overwriting an existing file.
     *
     * @param pdfFilePath path of the PDF file
     * @return the path of the written text file, or {@code null} when the path
     *         is {@code null}, already names a ".txt" file, or the conversion failed
     */
    public static String getTextFromPDF(String pdfFilePath) {
        if (pdfFilePath == null || isTxtFile(pdfFilePath)) {
            return null;
        }
        String txtPath = pdfFilePath + TXT_SUFFIX;
        try (FileInputStream is = new FileInputStream(pdfFilePath)) {
            PDFParser parser = new PDFParser(new RandomAccessBuffer(is));
            parser.parse();
            try (PDDocument document = parser.getPDDocument()) {
                System.out.println(pdfFilePath);
                String text = new PDFTextStripper().getText(document);
                // Write with the same charset getOrderText reads with, not the platform default.
                try (FileOutputStream out = new FileOutputStream(txtPath, false);
                        OutputStreamWriter writer = new OutputStreamWriter(out, TEXT_ENCODING)) {
                    writer.write(text);
                }
            }
            return txtPath;
        } catch (Exception e) {
            // Broad catch on purpose: any failure on one file must not stop the batch.
            // Returning null keeps the caller from parsing a stale or partial text file.
            e.printStackTrace();
            return null;
        }
    }

    /** Returns whether the path ends in ".txt", ignoring case. */
    private static boolean isTxtFile(String path) {
        return path.toUpperCase().endsWith(".TXT");
    }
}

结果：