接口类:
package org.aoe.software.pdf;
import java.io.InputStream;
/**
*Convert PDF to XML.
*PDF转XML的格式定义
<pdf id="00000001" fileName="temp0001.pdf">
<page pageIndex="1">
<text>
<tr colX="x1:x2" colY="y1:y2">ssssssssss</tr>
</text>
<table colX="x1:x2:x3" colY="y1:y2:y3:y4">
<tr>
<td colX="x1:x2" colY="y1:y2" colspan="2" rowspan="2">TTTT</td>
</tr>
</table>
</page>
</pdf>
说明:
id:表示PDF文件的唯一ID标识名,可以为空,是由调用者传入的参数
fileName:表示PDF的文件名称,不可为空(去除文件中所包含的路径),
page:表示页面信息
pageIndex:表示PDF文件的具体页码信息
text:表示PDF内容中的段落信息
table:表示PDF内容中的表格信息
tr:表示行信息
td:表示表格中的单元格信息
冒号分隔每组值
colX:
矩形的左下角X坐标
colY:
矩形的右上角y坐标
其中:td 中的colX,colY表示单元格中数据内容的坐标
根据表头的colX 属性描述,计算出cols:表示这个表格总的有多少列
根据表头的colY 属性描述,计算出rows:表示这个表格总的有多少行
colspan:表示列合并(表明具体的由哪些列合并在一起),如果>1表示从当前列合并后的总列数,等于2表示要合并右边的一列单元格组成新的单元格,其它数据以此类推
rowspan:表示行合并(表明具体的由哪些行合并在一起),如果>1表示从当前行合并后的总行数,等于2表示要合并下边的一行单元格组成新的单元格,其它数据以此类推
*/
public class PDFToXml {
private static final String XML_HEAD = "<?xml version=\"1.0\" encoding=\"GBK\"?>";
private static final String NEW_LINE = "\r\n";
/*调用者传入一个本地的文件名(包含路径),fileID可空,返回生成好的XML格式的字符串,
* 如果生成失败,返回字符为空值,即:""
*/
public static String ConvertToXML(String fileName, String fileID){
StringBuffer sb = new StringBuffer();
String fileShortName = fileName;
fileShortName = fileShortName.replace("\\", "/");
if(fileShortName.indexOf("/") != -1)
fileShortName = fileShortName.substring(fileShortName.lastIndexOf("/") + 1);
sb.append(XML_HEAD).append(NEW_LINE);
sb.append("<pdf id=\""+ (fileID == null ? "" : fileID)+"\" fileName=\""+fileShortName+"\">").append(NEW_LINE);
//sb.append(ConvertUtils.parse(fileName)).append(NEW_LINE);
sb.append(ExtractRawStream.generateXMLFile(fileName, "tmp.xml", fileID)).append(NEW_LINE);
sb.append("</pdf>").append(NEW_LINE);
return sb.toString();
}
/*调用者传入一个本地的文件名(包含路径),fileID可空,
* 把生成好的XML格式的数据按指定的文件路径进行保存,如果生成或保存失败,返回false
*/
public static boolean ConvertToXML(String fileName, String fileID, String savePath){
return FileUtils.save(ConvertToXML(fileName, fileID), savePath);
}
/*调用者传入PDF的文件流,当前文件流的名称与文件ID,fileID不能为空,
* 返回生成的XML格式的字符串,如果生成失败,返回字符为空值,即:""
*/
public static String ConvertToXML(InputStream stream, String fileName,String fileID){
StringBuffer sb = new StringBuffer();
String fileShortName = fileName;
fileShortName = fileShortName.replace("\\", "/");
if(fileShortName.indexOf("/") != -1)
fileShortName = fileShortName.substring(fileShortName.lastIndexOf("/") + 1);
sb.append(XML_HEAD).append(NEW_LINE);
sb.append("<pdf id=\""+ (fileID == null ? "" : fileID)+"\" fileName=\""+fileShortName+"\">").append(NEW_LINE);
//sb.append(ConvertUtils.parse(stream)).append(NEW_LINE);
sb.append(ExtractRawStream.generateXMLFile(stream, fileName, fileID)).append(NEW_LINE);
sb.append("</pdf>").append(NEW_LINE);
return null;
}
/*
* 调用者传入PDF的文件流,当前文件流的名称与文件ID,fileID不能为空,
* 把生成好的XML格式的数据按指定的文件路径进行保存,如果生成或保存失败,返回false
*/
public static boolean ConvertToXML(InputStream stream,String fileName,String fileID, String savePath){
return FileUtils.save(ConvertToXML(stream, fileName, fileID), savePath);
}
/
public static void main(String[] args) {
System.out.println(ConvertToXML("r:/a.pdf", "1111", "r:/zzz.xml"));
//System.out.println(ConvertToXML("r:/b.pdf", "1111", "r:/b.xml"));
}
}
package org.aoe.software.pdf;
import java.io.InputStream;
import java.util.Map;
import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.objects.PdfPageData;
/**
 * Decodes a PDF with JPedal and renders every page as a {@code <page>} XML
 * fragment containing either a {@code <table>} or a {@code <text>} element.
 *
 * <p>All calls share one static {@link PdfDecoder} instance, so concurrent
 * calls would operate on the same decoder.
 */
public class ConvertUtils {
    private static final String NEW_LINE = "\r\n";
    private static final String ROW_OPEN = "<tr>";
    private static final String ROW_CLOSE = "</tr>";
    private static final String CELL_OPEN = "<td>";

    private static PdfDecoder decodePdf = new PdfDecoder(false);

    // Optional fixed extraction rectangle. While defX1 is -1 the page media box is used.
    private static int defX1 = -1, defX2, defY1, defY2;

    /**
     * Opens the PDF at {@code pdfFilepath} and returns its pages as XML fragments.
     *
     * <p>NOTE(review): a failure while opening is only printed; extraction is
     * still attempted on the shared decoder afterwards, as in the original flow.
     *
     * @param pdfFilepath path of the PDF file to open
     * @return concatenated {@code <page>} fragments produced for the document
     */
    public static String parse(String pdfFilepath) {
        try {
            decodePdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
            PdfDecoder.init(true);
            decodePdf.openPdfFile(pdfFilepath);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return parseContent(decodePdf);
    }

    /**
     * Opens the PDF supplied by {@code is} and returns its pages as XML fragments.
     *
     * <p>NOTE(review): a failure while opening is only printed; extraction is
     * still attempted on the shared decoder afterwards, as in the original flow.
     *
     * @param is stream holding the PDF content
     * @return concatenated {@code <page>} fragments produced for the document
     */
    public static String parse(InputStream is) {
        try {
            decodePdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
            PdfDecoder.init(true);
            decodePdf.openPdfFileFromInputStream(is, false);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return parseContent(decodePdf);
    }

    /**
     * Walks every page of the opened document, extracts its text as a table
     * and appends one {@code <page>} element per page. The decoder is closed
     * before returning.
     *
     * @param pdfDecoder decoder holding the opened document
     * @return the generated fragments; empty when extraction is not permitted
     */
    private static String parseContent(PdfDecoder pdfDecoder) {
        StringBuilder sb = new StringBuilder();
        if (!pdfDecoder.isExtractionAllowed()) {
            System.out.println("Text extraction not allowed");
        } else if (pdfDecoder.isEncrypted() && !pdfDecoder.isPasswordSupplied()) {
            System.out.println("Encrypted settings");
            System.out.println("Please look at Viewer for code sample to handle such files");
            System.out.println("Or get support/consultancy");
        } else {
            // page range
            int start = 1, end = pdfDecoder.getPageCount();
            try {
                for (int page = start; page < end + 1; page++) {
                    sb.append("<page pageIndex=\"" + page + "\">").append(NEW_LINE);
                    pdfDecoder.decodePage(page);
                    PdfGroupingAlgorithms currentGrouping = pdfDecoder.getGroupingObject();
                    PdfPageData currentPageData = pdfDecoder.getPdfPageData();
                    int x1, y1, x2, y2;
                    if (defX1 == -1) {
                        // No fixed rectangle configured: cover the whole media box.
                        x1 = currentPageData.getMediaBoxX(page);
                        x2 = currentPageData.getMediaBoxWidth(page) + x1;
                        y2 = currentPageData.getMediaBoxY(page);
                        y1 = currentPageData.getMediaBoxHeight(page) + y2;
                    } else {
                        x1 = defX1;
                        y1 = defY1;
                        x2 = defX2;
                        y2 = defY2;
                    }
                    /**
                     * Co-ordinates are x1,y1 (top left hand corner),
                     * x2,y2(bottom right)
                     */
                    try {
                        Map tableContent = currentGrouping.extractTextAsTable(
                                x1, y1, x2, y2, page, false, // csv
                                false, false, false, 0);
                        // get the text from the Map object
                        String tableText = (String) tableContent.get("content");
                        // Drop markup we do not care about.
                        tableText = ignoreTag("<TABLE>", tableText);
                        tableText = ignoreTag("</TABLE>", tableText);
                        tableText = ignoreTag(" nowrap", tableText);
                        // NOTE(review): strips the literal whitespace character between the
                        // quotes; possibly this was meant to be "&nbsp;" - confirm.
                        tableText = ignoreTag(" ", tableText);
                        tableText = ignoreTag("<SpaceCount space=\"\\d+\" />", tableText);
                        tableText = ignoreTag("<td></td>", tableText);
                        tableText = ignoreTag("<tr></tr>", tableText);
                        boolean isTable = isTable(tableText);
                        if (isTable) {
                            // NOTE(review): colX/colY carry the <tr> count and the total <td>
                            // count here, not coordinates - confirm against the XML format.
                            int rows = getCount(tableText, ROW_OPEN);
                            int cols = getCount(tableText, CELL_OPEN);
                            sb.append("<table colX=\"" + rows + "\" colY=\"" + cols + "\">").append(tableText).append("</table>").append(NEW_LINE);
                        } else {
                            tableText = ignoreTag("<tr>", tableText);
                            tableText = ignoreTag("</tr>", tableText);
                            tableText = ignoreTag("<td>", tableText);
                            tableText = ignoreTag("</td>", tableText);
                            sb.append("<text>").append(NEW_LINE);
                            sb.append("<tr colX=\"" + x1 + ":" + x2 + "\" colY=\"" + y1 + ":" + y2 + "\">" + tableText + "</tr>").append(NEW_LINE);
                            sb.append("</text>").append(NEW_LINE);
                        }
                    } catch (PdfException e) {
                        pdfDecoder.closePdfFile();
                        e.printStackTrace();
                    }
                    // remove data once written out
                    pdfDecoder.flushObjectValues(false);
                    sb.append("</page>").append(NEW_LINE);
                }
            } catch (Exception e) {
                pdfDecoder.closePdfFile();
                e.printStackTrace();
            }
            pdfDecoder.flushObjectValues(true); // flush any text data read
        }
        pdfDecoder.closePdfFile();
        return sb.toString();
    }

    /**
     * Removes every match of {@code tag} from {@code origin}.
     *
     * @param tag    pattern to remove; interpreted as a regular expression by {@code replaceAll}
     * @param origin text to clean
     * @return {@code origin} without the matches
     */
    private static String ignoreTag(String tag, String origin) {
        return origin.replaceAll(tag, "");
    }

    /**
     * Counts the non-overlapping literal occurrences of {@code tag} in {@code table}.
     *
     * @param table text to scan
     * @param tag   literal token to count; must not be empty
     * @return number of occurrences
     */
    private static int getCount(String table, String tag) {
        int count = 0;
        int index = 0;
        while ((index = table.indexOf(tag, index)) != -1) {
            count++;
            index += tag.length();
        }
        return count;
    }

    /**
     * Decides whether the extracted markup should be emitted as a table:
     * true when any row contains more than one {@code <td>} cell.
     *
     * <p>Every row is inspected, including the last one and the only row of a
     * single-row extract. The scan resumes after each closing {@code </tr>},
     * and an unterminated final row is read to the end of the text instead of
     * failing.
     *
     * @param tableText cleaned markup returned by the table extraction
     * @return true if at least one row holds two or more cells
     */
    private static boolean isTable(String tableText) {
        int from = 0;
        while (true) {
            int open = tableText.indexOf(ROW_OPEN, from);
            if (open == -1) {
                return false;
            }
            int close = tableText.indexOf(ROW_CLOSE, open);
            String row = close == -1 ? tableText.substring(open) : tableText.substring(open, close);
            if (row.indexOf(CELL_OPEN) != row.lastIndexOf(CELL_OPEN)) {
                return true;
            }
            if (close == -1) {
                return false;
            }
            from = close + ROW_CLOSE.length();
        }
    }
}
package org.aoe.software.pdf;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableCellFormat;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import org.aoe.software.pdf.po.Page;
import org.aoe.software.pdf.po.Table;
import org.aoe.software.pdf.po.TableTd;
import org.aoe.software.pdf.po.TableTr;
import org.aoe.software.pdf.po.Text;
import org.aoe.software.pdf.po.TextTr;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.exception.PdfSecurityException;
import org.jpedal.fonts.FontMappings;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.Strip;
public class ExtractRawStream {
//DX20130502 decode page no
// NOTE(review): presumably the page number being decoded; its use is not visible in this part of the file - confirm.
public int decode_pageno;
/**flag to show if we print messages; decodeFile only prints its progress messages when this is true*/
public static boolean outputMessages=true;
/**word count - used for testing*/
private int wordsExtracted=0;
/**correct separator for OS */
String separator = System.getProperty("file.separator");
/**the decoder object which decodes the pdf and returns a data object; created inside decodeFile*/
PdfDecoder decodePdf = null;
/**flag to show if file or byte array; when true decodeFile opens the file by name, otherwise it opens byteArray*/
private boolean isFile=true;
/**byte array; PDF content opened by decodeFile when isFile is false*/
private byte[] byteArray=null;
/**used in our regression tests to limit to first 10 pages*/
public static boolean isTest=false;
// NOTE(review): presumably the rectangles collected while decoding a page - confirm against the code that fills it.
private List<Rect> relist = new ArrayList<Rect>();
// NOTE(review): presumably the text lines collected while decoding a page - confirm against the code that fills it.
private List<TextLine> textlist = new ArrayList<TextLine>();
// NOTE(review): presumably the bounding rectangle of the current page - confirm.
private Rect page_rect = new Rect();
// Static file name; note that decodeFile declares a parameter with the same name, which shadows this field there.
private static String file_name = "";
/**
* routine to decode a file
*/
private void decodeFile(String file_name) {
//PdfDecoder returns a PdfException if there is a problem
try {
decodePdf = new PdfDecoder(true);
//incase fonts not embedded
FontMappings.setFontReplacements();
decodePdf.setExtractionMode(PdfDecoder.TEXT); //extract just text
PdfDecoder.init(true);
//make sure widths in data CRITICAL if we want to split lines correctly!!
decodePdf.useTextExtraction();
//always reset to use unaltered co-ords - allow use of rotated or unrotated
// co-ordinates on pages with rotation (used to be in PdfDecoder)
PdfGroupingAlgorithms.useUnrotatedCoords=false;
/**
* open the file (and read metadata including pages in file)
*/
if(outputMessages)
System.out.println("Opening file :" + file_name);
if(isFile)
decodePdf.openPdfFile(file_name);
else
decodePdf.openPdfArray(byteArray);
} catch (PdfSecurityException e) {
System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
} catch (PdfException e) {
System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
} catch (Exception e) {
System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
e.printStackTrace();
}
/**
* extract data from pdf (if allowed).
*/
if(!decodePdf.isExtractionAllowed()){
if(outputMessages)
System.out.println("Text extraction not allowed");
}else if (decodePdf.isEncrypted() && !decodePdf.isPasswordSupplied()) {
if(outputMessages){
System.out.println("Encrypted settings");
System.out.println("Please look at Viewer for code sample to handle such files");
}
} else{
/**
* extract data from pdf