使用jpedal解析PDF到XML

最新推荐文章于 2022-06-16 20:07:49 发布

tangxiucai2

最新推荐文章于 2022-06-16 20:07:49 发布

阅读量3.5k

点赞数 1

本文链接：https://blog.csdn.net/tangxiucai2/article/details/9884081

版权

本文详细介绍了如何使用jpedal库将PDF文档解析转换成XML格式，涵盖了转换过程的关键步骤和技术要点。

摘要由CSDN通过智能技术生成


接口类：

package org.aoe.software.pdf;

import java.io.InputStream;

/**
 *Convent pdf to xml.
 *PDF转XML的格式定义

<pdf id="00000001" fileName="temp0001.pdf">
  <page pageIndex="1">
    <text>
      <tr colX="x1:x2" colY="y1:y2">ssssssssss</tr>
    </text>
    <table  colX="x1:x2:x3" colY="y1:y2:y3:y4">
      <tr>
         <td colX="x1:x2" colY="y1:y2" colspan="2" rowspan="2">TTTT</td>
      </tr>
    </table>
  </page>
</pdf>


说明：
id:表示PDF文件的唯一ID标识名，可以为空,是由调用者传入的参数
fileName:表示PDF的文件名称，不可为空（去除文件中所包含的路径），

page:表示页面信息
pageIndex：表示PDF文件的具体页码信息
text:表示PDF内容中的段落信息
table:表示PDF内容中的表格信息

tr:表示行信息
td:表示表格中的单远格信息

冒号分隔每组值
colX:
   矩形的左下角X坐标

colY：
   矩形的右上角y坐标

其中：td 中的colX,colY表示单元格中数据内容的坐标

根据表头的colX 属性描述，计算出cols:表示这个表格总的有多少列
根据表头的colY 属性描述，计算出rows:表示这个表格总的有多少行

colspan:表示列合并(表明具体的由哪些列合并在一起),如果>1个示从当前列合并后的总列数，等于2表示要合并右边的一列单元格组成新的单元格，其它数据以此类推
rowspan:表示行合并(表明具体的由哪些行合并在一起)，如果>1个示从当前行合并后的总行数，等于2表示要合并下边的一行单元格组成新的单元格，其它数据以此类推



 */
public class PDFToXml {
	private static final String XML_HEAD = "<?xml version=\"1.0\" encoding=\"GBK\"?>";
	private static final String NEW_LINE = "\r\n";
	
	/*调用者传入一个本地的文件名（包含路径），fileID可空，返回生成好的XML格式的字符串，
	 * 如果生成失败，返回字符为空值，即：""
	*/
	public static String ConvertToXML(String fileName, String fileID){
		StringBuffer sb = new StringBuffer();
		String fileShortName = fileName;
		fileShortName = fileShortName.replace("\\", "/");
		if(fileShortName.indexOf("/") != -1)
			fileShortName = fileShortName.substring(fileShortName.lastIndexOf("/") + 1);
		sb.append(XML_HEAD).append(NEW_LINE);
		sb.append("<pdf id=\""+ (fileID == null ? "" : fileID)+"\" fileName=\""+fileShortName+"\">").append(NEW_LINE);
		//sb.append(ConvertUtils.parse(fileName)).append(NEW_LINE);
		sb.append(ExtractRawStream.generateXMLFile(fileName, "tmp.xml", fileID)).append(NEW_LINE);
		sb.append("</pdf>").append(NEW_LINE);
		return sb.toString();
	}
	
	/*调用者传入一个本地的文件名（包含路径）,fileID可空，
	 * 把生成好的XML格式的数据按指定的文件路径进行保存，如果生成或保存失败，返回false
	*/
	public static boolean ConvertToXML(String fileName, String fileID, String savePath){
		return FileUtils.save(ConvertToXML(fileName, fileID), savePath);
	}

	/*调用者传入PDF的文件流,当前文件流的名称与文件ID，fileID不能为空，
	 * 返回生成的XML格式的字符串，如果生成失败，返回字符为空值，即：""
	 */
	public static String ConvertToXML(InputStream stream, String fileName,String fileID){
		StringBuffer sb = new StringBuffer();
		String fileShortName = fileName;
		fileShortName = fileShortName.replace("\\", "/");
		if(fileShortName.indexOf("/") != -1)
			fileShortName = fileShortName.substring(fileShortName.lastIndexOf("/") + 1);
		sb.append(XML_HEAD).append(NEW_LINE);
		sb.append("<pdf id=\""+ (fileID == null ? "" : fileID)+"\" fileName=\""+fileShortName+"\">").append(NEW_LINE);
		//sb.append(ConvertUtils.parse(stream)).append(NEW_LINE);
		sb.append(ExtractRawStream.generateXMLFile(stream, fileName, fileID)).append(NEW_LINE);
		sb.append("</pdf>").append(NEW_LINE);
		return null;
	}
	
	/*
	 * 调用者传入PDF的文件流,当前文件流的名称与文件ID，fileID不能为空,
	 * 把生成好的XML格式的数据按指定的文件路径进行保存，如果生成或保存失败，返回false
	 */	
	public static boolean ConvertToXML(InputStream stream,String fileName,String fileID, String savePath){
		return FileUtils.save(ConvertToXML(stream, fileName, fileID), savePath);
	}
	
	
	/
	
	public static void main(String[] args) {
		System.out.println(ConvertToXML("r:/a.pdf", "1111", "r:/zzz.xml"));
		//System.out.println(ConvertToXML("r:/b.pdf", "1111", "r:/b.xml"));
	}
}

package org.aoe.software.pdf;

import java.io.InputStream;
import java.util.Map;

import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.objects.PdfPageData;

/**
 * Converts the text of a PDF into the {@code <page>} fragments of the XML format, using
 * JPedal's table extraction.
 *
 * <p>All methods share one static {@link PdfDecoder} and are not synchronized, so calls
 * must not overlap.
 */
public class ConvertUtils {
	private static final String NEW_LINE = "\r\n";
	private static PdfDecoder decodePdf = new PdfDecoder(false);
	// Optional fixed extraction rectangle. defX1 == -1 means "use the page's media box";
	// nothing in this class assigns another value.
	private static int defX1 = -1, defX2, defY1, defY2;

	/**
	 * Parses the PDF stored at {@code pdfFilepath}.
	 *
	 * @param pdfFilepath path of the PDF file
	 * @return the {@code <page>} fragments, or {@code ""} when the file cannot be opened
	 */
	public static String parse(String pdfFilepath) {
		try {
			decodePdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
			PdfDecoder.init(true);
			decodePdf.openPdfFile(pdfFilepath);
		} catch (Exception e) {
			e.printStackTrace();
			// Nothing was opened, so there is nothing to extract.
			decodePdf.closePdfFile();
			return "";
		}
		return parseContent(decodePdf);
	}

	/**
	 * Parses a PDF supplied as a stream.
	 *
	 * @param is PDF content
	 * @return the {@code <page>} fragments, or {@code ""} when the stream cannot be opened
	 */
	public static String parse(InputStream is){
		try {
			decodePdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
			PdfDecoder.init(true);
			decodePdf.openPdfFileFromInputStream(is, false);
		} catch (Exception e) {
			e.printStackTrace();
			// Nothing was opened, so there is nothing to extract.
			decodePdf.closePdfFile();
			return "";
		}
		return parseContent(decodePdf);
	}

	/**
	 * Extracts every page of the already opened document and closes the decoder afterwards.
	 *
	 * @param pdfDecoder decoder with an open document
	 * @return the XML fragments produced so far; empty when extraction is not permitted
	 */
	private static String parseContent(PdfDecoder pdfDecoder){
		StringBuilder sb = new StringBuilder();
		try {
			if (!pdfDecoder.isExtractionAllowed()) {
				System.out.println("Text extraction not allowed");
			} else if (pdfDecoder.isEncrypted() && !pdfDecoder.isPasswordSupplied()) {
				System.out.println("Encrypted settings");
				System.out.println("Please look at Viewer for code sample to handle such files");
				System.out.println("Or get support/consultancy");
			} else {
				appendPages(pdfDecoder, sb);
				pdfDecoder.flushObjectValues(true); // flush any text data read
			}
		} finally {
			// Close exactly once, after all other use of the decoder.
			pdfDecoder.closePdfFile();
		}
		return sb.toString();
	}

	/**
	 * Appends one {@code <page>} element per page. A failure other than a
	 * {@link PdfException} during extraction stops the loop, but the page element that was
	 * started is still closed.
	 */
	private static void appendPages(PdfDecoder pdfDecoder, StringBuilder sb){
		int lastPage = pdfDecoder.getPageCount();
		try {
			for (int page = 1; page <= lastPage; page++) {
				sb.append("<page pageIndex=\""+ page +"\">").append(NEW_LINE);
				try {
					pdfDecoder.decodePage(page);
					PdfGroupingAlgorithms currentGrouping = pdfDecoder.getGroupingObject();
					PdfPageData currentPageData = pdfDecoder.getPdfPageData();

					// Co-ordinates are x1,y1 (top left hand corner), x2,y2 (bottom right).
					int x1, y1, x2, y2;
					if (defX1 == -1) {
						x1 = currentPageData.getMediaBoxX(page);
						x2 = currentPageData.getMediaBoxWidth(page) + x1;
						y2 = currentPageData.getMediaBoxY(page);
						y1 = currentPageData.getMediaBoxHeight(page) + y2;
					} else {
						x1 = defX1;
						y1 = defY1;
						x2 = defX2;
						y2 = defY2;
					}

					try {
						sb.append(extractRegion(currentGrouping, x1, y1, x2, y2, page));
					} catch (PdfException e) {
						// Skip this page's content but keep the document open for the next pages.
						e.printStackTrace();
					}

					// remove data once written out
					pdfDecoder.flushObjectValues(false);
				} finally {
					sb.append("</page>").append(NEW_LINE);
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Extracts the given rectangle of a page and renders it either as a {@code <table>}
	 * or as a {@code <text>} element.
	 */
	private static String extractRegion(PdfGroupingAlgorithms currentGrouping,
			int x1, int y1, int x2, int y2, int page) throws PdfException {
		Map<?, ?> tableContent = currentGrouping.extractTextAsTable(
				x1, y1, x2, y2, page, false, // csv
				false, false, false, 0);

		// get the text from the Map object
		String tableText = (String) tableContent.get("content");
		if (tableText == null) {
			tableText = "";
		}

		// Drop markup that is not part of the target format.
		tableText = ignoreTag("<TABLE>", tableText);
		tableText = ignoreTag("</TABLE>", tableText);
		tableText = ignoreTag(" nowrap", tableText);
		// Must run before the space removal below: the pattern itself contains spaces.
		tableText = ignoreTag("<SpaceCount space=\"\\d+\" />", tableText);
		// NOTE(review): this literal may originally have been an HTML entity such as &nbsp;
		// as written it removes every space character - confirm against the original source.
		tableText = ignoreTag(" ", tableText);
		tableText = ignoreTag("<td></td>", tableText);
		tableText = ignoreTag("<tr></tr>", tableText);

		StringBuilder out = new StringBuilder();
		if (isTable(tableText)) {
			int rows = getCount(tableText, "<tr>");
			int cols = getCount(tableText, "<td>");
			// NOTE(review): colX receives the row count and colY the total cell count, which
			// does not match the coordinate lists of the documented format - confirm intent.
			out.append("<table  colX=\""+rows+"\" colY=\""+cols+"\">").append(tableText).append("</table>").append(NEW_LINE);
		} else {
			tableText = ignoreTag("<tr>", tableText);
			tableText = ignoreTag("</tr>", tableText);
			tableText = ignoreTag("<td>", tableText);
			tableText = ignoreTag("</td>", tableText);
			out.append("<text>").append(NEW_LINE);
			out.append("<tr colX=\""+ x1 +":"+ x2 +"\" colY=\""+ y1 +":"+ y2 +"\">"+ tableText +"</tr>").append(NEW_LINE);
			out.append("</text>").append(NEW_LINE);
		}
		return out.toString();
	}

	/**
	 * Removes every match of {@code tag} from {@code origin}.
	 *
	 * @param tag    a regular expression (it is passed to {@link String#replaceAll})
	 * @param origin text to clean
	 */
	private static String ignoreTag(String tag, String origin){
		return origin.replaceAll(tag, "");
	}

	/** Counts the non-overlapping occurrences of {@code tag} in {@code table}. */
	private static int getCount(String table, String tag){
		if (tag.length() == 0) {
			return 0; // an empty tag would never advance the search position
		}
		int count = 0;
		int index = 0;
		while((index = table.indexOf(tag, index)) != -1){
			count++;
			index += tag.length();
		}
		return count;
	}

	/**
	 * Decides whether the extracted markup is a table: it is one as soon as any row
	 * contains more than one {@code <td>}.
	 */
	private static boolean isTable(String tableText){
		final String rowEnd = "</tr>";
		int from = 0;
		while (true) {
			int open = tableText.indexOf("<tr>", from);
			if (open == -1) {
				return false;
			}
			int close = tableText.indexOf(rowEnd, open);
			// An unterminated last row runs to the end of the text.
			String row = close == -1 ? tableText.substring(open) : tableText.substring(open, close);
			if (row.indexOf("<td>") != row.lastIndexOf("<td>")) {
				return true;
			}
			if (close == -1) {
				return false;
			}
			// Continue after this row so that every row is inspected exactly once.
			from = close + rowEnd.length();
		}
	}
}

package org.aoe.software.pdf;
 
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableCellFormat;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;

import org.aoe.software.pdf.po.Page;
import org.aoe.software.pdf.po.Table;
import org.aoe.software.pdf.po.TableTd;
import org.aoe.software.pdf.po.TableTr;
import org.aoe.software.pdf.po.Text;
import org.aoe.software.pdf.po.TextTr;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.exception.PdfSecurityException;
import org.jpedal.fonts.FontMappings;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.Strip;

public class ExtractRawStream {
 
	// DX20130502: page number to decode.
	// NOTE(review): where this is read is outside the visible part of the file.
	 public int decode_pageno;
	 
    /** When true, decodeFile prints its progress and diagnostic messages to System.out. */
     public static boolean outputMessages=true;
 
    /** Word count - used for testing. */
     private int wordsExtracted=0;
 
    /** File separator of the current OS, read from the "file.separator" system property. */
     String separator = System.getProperty("file.separator");
 
    /** The decoder object which decodes the PDF; decodeFile assigns a fresh instance. */
     PdfDecoder decodePdf = null;
 
    /** Input selector used by decodeFile: true opens by file name, false opens byteArray. */
     private boolean isFile=true;
 
    /** PDF content handed to openPdfArray when isFile is false. */
     private byte[] byteArray=null;
 
    /** Used in our regression tests to limit to first 10 pages. */
     public static boolean isTest=false;
     
     // NOTE(review): the three fields below are filled and read outside the visible part
     // of the file; presumably rectangles and text lines found on a page - confirm.
     private List<Rect> relist = new ArrayList<Rect>();
     
     private List<TextLine> textlist = new ArrayList<TextLine>();
     
     private Rect page_rect = new Rect();
     
     // Static; decodeFile declares a parameter with the same name, which shadows this field there.
     private static String file_name = "";

	/**
      * routine to decode a file
      */
     private void decodeFile(String file_name) {
        //PdfDecoder returns a PdfException if there is a problem
         try {
             decodePdf = new PdfDecoder(true);
 
            //incase fonts not embedded
             FontMappings.setFontReplacements();
 
            decodePdf.setExtractionMode(PdfDecoder.TEXT); //extract just text
             PdfDecoder.init(true);
             //make sure widths in data CRITICAL if we want to split lines correctly!!
 
              decodePdf.useTextExtraction();
 
            //always reset to use unaltered co-ords - allow use of rotated or unrotated
             // co-ordinates on pages with rotation (used to be in PdfDecoder)
             PdfGroupingAlgorithms.useUnrotatedCoords=false;
 
            /**
              * open the file (and read metadata including pages in  file)
              */
             if(outputMessages)
                 System.out.println("Opening file :" + file_name);
 
            if(isFile)
                 decodePdf.openPdfFile(file_name);
             else
                 decodePdf.openPdfArray(byteArray);
         } catch (PdfSecurityException e) {
             System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
         } catch (PdfException e) {
             System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
 
        } catch (Exception e) {
             System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
             e.printStackTrace();
         }
 
        /**
          * extract data from pdf (if allowed).
          */
         if(!decodePdf.isExtractionAllowed()){
             if(outputMessages)
                 System.out.println("Text extraction not allowed");
         }else if (decodePdf.isEncrypted() && !decodePdf.isPasswordSupplied()) {
             if(outputMessages){
                 System.out.println("Encrypted settings");
                 System.out.println("Please look at Viewer for code sample to handle such files");
             }
         } else{
            /**
              * extract data from pdf