接口类:
package org.aoe.software.pdf;
import java.io.InputStream;
/**
*Convert PDF to XML.
*PDF转XML的格式定义
<pdf id="00000001" fileName="temp0001.pdf">
<page pageIndex="1">
<text>
<tr colX="x1:x2" colY="y1:y2">ssssssssss</tr>
</text>
<table colX="x1:x2:x3" colY="y1:y2:y3:y4">
<tr>
<td colX="x1:x2" colY="y1:y2" colspan="2" rowspan="2">TTTT</td>
</tr>
</table>
</page>
</pdf>
说明:
id:表示PDF文件的唯一ID标识名,可以为空,是由调用者传入的参数
fileName:表示PDF的文件名称,不可为空(去除文件中所包含的路径),
page:表示页面信息
pageIndex:表示PDF文件的具体页码信息
text:表示PDF内容中的段落信息
table:表示PDF内容中的表格信息
tr:表示行信息
td:表示表格中的单元格信息
冒号分隔每组值
colX:
矩形的左下角X坐标
colY:
矩形的右上角y坐标
其中:td 中的colX,colY表示单元格中数据内容的坐标
根据表头的colX 属性描述,计算出cols:表示这个表格总的有多少列
根据表头的colY 属性描述,计算出rows:表示这个表格总的有多少行
colspan:表示列合并(表明具体的由哪些列合并在一起),如果>1,表示从当前列合并后的总列数,等于2表示要合并右边的一列单元格组成新的单元格,其它数据以此类推
rowspan:表示行合并(表明具体的由哪些行合并在一起),如果>1,表示从当前行合并后的总行数,等于2表示要合并下边的一行单元格组成新的单元格,其它数据以此类推
*/
public class PDFToXml {
private static final String XML_HEAD = "<?xml version=\"1.0\" encoding=\"GBK\"?>";
private static final String NEW_LINE = "\r\n";
/*调用者传入一个本地的文件名(包含路径),fileID可空,返回生成好的XML格式的字符串,
* 如果生成失败,返回字符为空值,即:""
*/
public static String ConvertToXML(String fileName, String fileID){
StringBuffer sb = new StringBuffer();
String fileShortName = fileName;
fileShortName = fileShortName.replace("\\", "/");
if(fileShortName.indexOf("/") != -1)
fileShortName = fileShortName.substring(fileShortName.lastIndexOf("/") + 1);
sb.append(XML_HEAD).append(NEW_LINE);
sb.append("<pdf id=\""+ (fileID == null ? "" : fileID)+"\" fileName=\""+fileShortName+"\">").append(NEW_LINE);
//sb.append(ConvertUtils.parse(fileName)).append(NEW_LINE);
sb.append(ExtractRawStream.generateXMLFile(fileName, "tmp.xml", fileID)).append(NEW_LINE);
sb.append("</pdf>").append(NEW_LINE);
return sb.toString();
}
/*调用者传入一个本地的文件名(包含路径),fileID可空,
* 把生成好的XML格式的数据按指定的文件路径进行保存,如果生成或保存失败,返回false
*/
public static boolean ConvertToXML(String fileName, String fileID, String savePath){
return FileUtils.save(ConvertToXML(fileName, fileID), savePath);
}
/*调用者传入PDF的文件流,当前文件流的名称与文件ID,fileID不能为空,
* 返回生成的XML格式的字符串,如果生成失败,返回字符为空值,即:""
*/
public static String ConvertToXML(InputStream stream, String fileName,String fileID){
StringBuffer sb = new StringBuffer();
String fileShortName = fileName;
fileShortName = fileShortName.replace("\\", "/");
if(fileShortName.indexOf("/") != -1)
fileShortName = fileShortName.substring(fileShortName.lastIndexOf("/") + 1);
sb.append(XML_HEAD).append(NEW_LINE);
sb.append("<pdf id=\""+ (fileID == null ? "" : fileID)+"\" fileName=\""+fileShortName+"\">").append(NEW_LINE);
//sb.append(ConvertUtils.parse(stream)).append(NEW_LINE);
sb.append(ExtractRawStream.generateXMLFile(stream, fileName, fileID)).append(NEW_LINE);
sb.append("</pdf>").append(NEW_LINE);
return null;
}
/*
* 调用者传入PDF的文件流,当前文件流的名称与文件ID,fileID不能为空,
* 把生成好的XML格式的数据按指定的文件路径进行保存,如果生成或保存失败,返回false
*/
public static boolean ConvertToXML(InputStream stream,String fileName,String fileID, String savePath){
return FileUtils.save(ConvertToXML(stream, fileName, fileID), savePath);
}
/
public static void main(String[] args) {
System.out.println(ConvertToXML("r:/a.pdf", "1111", "r:/zzz.xml"));
//System.out.println(ConvertToXML("r:/b.pdf", "1111", "r:/b.xml"));
}
}
package org.aoe.software.pdf;
import java.io.InputStream;
import java.util.Map;
import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.objects.PdfPageData;
/**
 * Converts a PDF into {@code <page>} fragments using JPedal's table extraction.
 * Each page becomes either a {@code <table>} element or a single {@code <text>} paragraph.
 *
 * <p>All calls share one static {@link PdfDecoder}, so the class must not be used
 * from several threads at once.
 */
public class ConvertUtils {
    private static final String NEW_LINE = "\r\n";
    private static final String ROW_OPEN = "<tr>";
    private static final String ROW_CLOSE = "</tr>";
    private static final String CELL_OPEN = "<td>";

    private static PdfDecoder decodePdf = new PdfDecoder(false);
    // Optional fixed extraction rectangle; defX1 == -1 means "use the page media box".
    // Nothing in this class assigns these, so the media box is always used.
    private static int defX1 = -1, defX2, defY1, defY2;

    /**
     * Parses the PDF at the given path.
     *
     * @param pdfFilepath path of the PDF file
     * @return the concatenated {@code <page>} fragments, or "" when the file cannot be opened
     */
    public static String parse(String pdfFilepath) {
        try {
            decodePdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
            PdfDecoder.init(true);
            decodePdf.openPdfFile(pdfFilepath);
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
        return parseContent(decodePdf);
    }

    /**
     * Parses a PDF supplied as a stream.
     *
     * @param is PDF content
     * @return the concatenated {@code <page>} fragments, or "" when the stream cannot be opened
     */
    public static String parse(InputStream is) {
        try {
            decodePdf.setExtractionMode(PdfDecoder.TEXT); // extract just text
            PdfDecoder.init(true);
            decodePdf.openPdfFileFromInputStream(is, false);
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        }
        return parseContent(decodePdf);
    }

    /**
     * Walks every page of the opened document and renders it as XML.
     * A page that fails to decode is reported and emitted as an empty, still
     * well-formed {@code <page>} element; the decoder is closed exactly once.
     */
    private static String parseContent(PdfDecoder pdfDecoder) {
        StringBuffer sb = new StringBuffer();
        try {
            if (!pdfDecoder.isExtractionAllowed()) {
                System.out.println("Text extraction not allowed");
            } else if (pdfDecoder.isEncrypted() && !pdfDecoder.isPasswordSupplied()) {
                System.out.println("Encrypted settings");
                System.out.println("Please look at Viewer for code sample to handle such files");
                System.out.println("Or get support/consultancy");
            } else {
                int end = pdfDecoder.getPageCount();
                for (int page = 1; page <= end; page++) {
                    sb.append("<page pageIndex=\"" + page + "\">").append(NEW_LINE);
                    try {
                        pdfDecoder.decodePage(page);
                        sb.append(renderPage(pdfDecoder, page));
                    } catch (Exception e) {
                        e.printStackTrace();
                    }
                    // remove data once written out
                    pdfDecoder.flushObjectValues(false);
                    sb.append("</page>").append(NEW_LINE);
                }
                pdfDecoder.flushObjectValues(true); // flush any text data read
            }
        } finally {
            pdfDecoder.closePdfFile();
        }
        return sb.toString();
    }

    /** Extracts the content of one decoded page and formats it as a table or a text block. */
    private static String renderPage(PdfDecoder pdfDecoder, int page) throws Exception {
        PdfGroupingAlgorithms currentGrouping = pdfDecoder.getGroupingObject();
        PdfPageData currentPageData = pdfDecoder.getPdfPageData();
        // Co-ordinates are x1,y1 (top left hand corner), x2,y2 (bottom right)
        int x1, y1, x2, y2;
        if (defX1 == -1) {
            x1 = currentPageData.getMediaBoxX(page);
            x2 = currentPageData.getMediaBoxWidth(page) + x1;
            y2 = currentPageData.getMediaBoxY(page);
            y1 = currentPageData.getMediaBoxHeight(page) + y2;
        } else {
            x1 = defX1;
            y1 = defY1;
            x2 = defX2;
            y2 = defY2;
        }
        Map<?, ?> tableContent = currentGrouping.extractTextAsTable(
                x1, y1, x2, y2, page, false, // csv
                false, false, false, 0);
        String tableText = (String) tableContent.get("content");
        if (tableText == null) {
            return "";
        }
        // Strip markup this converter does not care about.
        tableText = ignoreTag("<TABLE>", tableText);
        tableText = ignoreTag("</TABLE>", tableText);
        tableText = ignoreTag(" nowrap", tableText);
        tableText = ignoreTag("<SpaceCount space=\"\\d+\" />", tableText);
        tableText = ignoreTag("<td></td>", tableText);
        tableText = ignoreTag("<tr></tr>", tableText);

        StringBuffer sb = new StringBuffer();
        if (isTable(tableText)) {
            int rows = getCount(tableText, ROW_OPEN);
            int cols = getCount(tableText, CELL_OPEN);
            // NOTE(review): colX receives the row count and colY the total cell count,
            // not coordinate lists as in the documented format - kept as-is, confirm intent.
            sb.append("<table colX=\"" + rows + "\" colY=\"" + cols + "\">")
              .append(tableText).append("</table>").append(NEW_LINE);
        } else {
            tableText = ignoreTag("<tr>", tableText);
            tableText = ignoreTag("</tr>", tableText);
            tableText = ignoreTag("<td>", tableText);
            tableText = ignoreTag("</td>", tableText);
            sb.append("<text>").append(NEW_LINE);
            sb.append("<tr colX=\"" + x1 + ":" + x2 + "\" colY=\"" + y1 + ":" + y2 + "\">"
                    + tableText + "</tr>").append(NEW_LINE);
            sb.append("</text>").append(NEW_LINE);
        }
        return sb.toString();
    }

    /** Removes every match of the regular expression {@code tag} from {@code origin}. */
    private static String ignoreTag(String tag, String origin) {
        return origin.replaceAll(tag, "");
    }

    /** Counts non-overlapping occurrences of {@code tag} in {@code table}. */
    private static int getCount(String table, String tag) {
        if (tag.length() == 0) {
            return 0; // an empty needle would never advance the search position
        }
        int count = 0;
        int index = 0;
        while ((index = table.indexOf(tag, index)) != -1) {
            count++;
            index += tag.length();
        }
        return count;
    }

    /**
     * Decides whether the extracted markup is a real table: true as soon as any
     * row contains more than one {@code <td>}. Every row is examined (the earlier
     * loop skipped the last row, so a single multi-column row was never a table)
     * and the scan position moves past each {@code </tr>} so no row is visited twice.
     */
    private static boolean isTable(String tableText) {
        int from = 0;
        while (true) {
            int rowStart = tableText.indexOf(ROW_OPEN, from);
            if (rowStart == -1) {
                return false;
            }
            int rowEnd = tableText.indexOf(ROW_CLOSE, rowStart);
            String row = (rowEnd == -1)
                    ? tableText.substring(rowStart)
                    : tableText.substring(rowStart, rowEnd);
            if (row.indexOf(CELL_OPEN) != row.lastIndexOf(CELL_OPEN)) {
                return true;
            }
            if (rowEnd == -1) {
                return false; // unterminated last row, nothing further to scan
            }
            from = rowEnd + ROW_CLOSE.length();
        }
    }
}
package org.aoe.software.pdf;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import jxl.Workbook;
import jxl.write.Label;
import jxl.write.WritableCellFormat;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import org.aoe.software.pdf.po.Page;
import org.aoe.software.pdf.po.Table;
import org.aoe.software.pdf.po.TableTd;
import org.aoe.software.pdf.po.TableTr;
import org.aoe.software.pdf.po.Text;
import org.aoe.software.pdf.po.TextTr;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.jpedal.PdfDecoder;
import org.jpedal.exception.PdfException;
import org.jpedal.exception.PdfSecurityException;
import org.jpedal.fonts.FontMappings;
import org.jpedal.grouping.PdfGroupingAlgorithms;
import org.jpedal.objects.PdfPageData;
import org.jpedal.utils.Strip;
public class ExtractRawStream {
//DX20130502 page number that decodeFile() will decode; set through setDecode_pageno()/parseFilePage()
public int decode_pageno;
/**flag to show if we print messages*/
public static boolean outputMessages=true;
/**word count - used for testing (returned by getWordsExtractedCount(); not updated in the visible part of this class)*/
private int wordsExtracted=0;
/**correct separator for OS */
String separator = System.getProperty("file.separator");
/**the decoder object which decodes the pdf and returns a data object; created and released inside decodeFile()*/
PdfDecoder decodePdf = null;
/**flag to show if file or byte array: true opens the path passed to decodeFile(), false opens byteArray*/
private boolean isFile=true;
/**byte array holding the PDF content when isFile is false*/
private byte[] byteArray=null;
/**used in our regression tests to limit to first 10 pages*/
public static boolean isTest=false;
// Rectangles collected from the page's "re" drawing commands by decodeFile().
private List<Rect> relist = new ArrayList<Rect>();
// Words (with their rectangles) collected from the page by decodeFile().
private List<TextLine> textlist = new ArrayList<TextLine>();
// Media box of the page being decoded; its height is used to flip Y coordinates.
private Rect page_rect = new Rect();
// NOTE(review): shadowed by the file_name parameters of decodeFile()/initXMLFile(); no visible reader.
private static String file_name = "";
/**
 * Decodes page {@code decode_pageno} of a PDF and fills {@code relist} and {@code textlist}.
 *
 * <p>Steps: create and configure a fresh decoder, open the document (from
 * {@code file_name} when {@code isFile} is true, otherwise from {@code byteArray}),
 * scan the raw content stream for rectangle ("re") commands, then extract the
 * page's words with their coordinates. Y values are flipped against the page
 * height stored in {@code page_rect}. Failures are printed and swallowed;
 * the method never throws a checked exception.
 *
 * @param file_name path of the PDF (also used in messages); shadows the static field of the same name
 */
private void decodeFile(String file_name) {
//PdfDecoder returns a PdfException if there is a problem
try {
decodePdf = new PdfDecoder(true);
//incase fonts not embedded
FontMappings.setFontReplacements();
decodePdf.setExtractionMode(PdfDecoder.TEXT); //extract just text
PdfDecoder.init(true);
//make sure widths in data CRITICAL if we want to split lines correctly!!
decodePdf.useTextExtraction();
//always reset to use unaltered co-ords - allow use of rotated or unrotated
// co-ordinates on pages with rotation (used to be in PdfDecoder)
PdfGroupingAlgorithms.useUnrotatedCoords=false;
// open the file (and read metadata including pages in file)
if(outputMessages)
System.out.println("Opening file :" + file_name);
if(isFile)
decodePdf.openPdfFile(file_name);
else
decodePdf.openPdfArray(byteArray);
} catch (PdfSecurityException e) {
System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
} catch (PdfException e) {
System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
} catch (Exception e) {
System.err.println("Exception " + e+" in pdf code for wordlist"+file_name);
e.printStackTrace();
}
// NOTE(review): execution continues here even when opening failed above.
// extract data from pdf (if allowed).
if(!decodePdf.isExtractionAllowed()){
if(outputMessages)
System.out.println("Text extraction not allowed");
}else if (decodePdf.isEncrypted() && !decodePdf.isPasswordSupplied()) {
if(outputMessages){
System.out.println("Encrypted settings");
System.out.println("Please look at Viewer for code sample to handle such files");
}
} else{
// extract data from pdf
try {
//decode the page
decodePdf.decodePage(decode_pageno);
// Copy the raw content stream into a character buffer, one byte per char.
StringBuffer sb = new StringBuffer();
for (int i=0;i<decodePdf.stream_data.length;i++) {
sb.append((char)decodePdf.stream_data[i]);
}
/**use whole page size for demo - get data from PageData object*/
PdfPageData currentPageData = decodePdf.getPdfPageData();
int x1 = currentPageData.getMediaBoxX(decode_pageno);
int x2 = currentPageData.getMediaBoxWidth(decode_pageno)+x1;
// NOTE(review): y2 is read with getMediaBoxX, not getMediaBoxY - confirm whether this is intended.
int y2 = currentPageData.getMediaBoxX(decode_pageno);
int y1 = currentPageData.getMediaBoxHeight(decode_pageno)-y2;
// Remember the page rectangle; its height is the reference for flipping Y below.
page_rect.setX(x1);
page_rect.setWidth(x2 - x1);
page_rect.setY(y2);
page_rect.setHeight(y1-y2);
// Walk the stream one LF-terminated line at a time.
StringBuffer line = new StringBuffer();
for (int j=0;j<sb.length();j++) {
line.append(sb.charAt(j));
if (sb.charAt(j) == 10) {
// Keep only lines of exactly five space-separated tokens ending in "re" (+LF or CR LF),
// i.e. "x y w h re"; each becomes a Rect with Y measured from the top of the page.
if ((line.toString().split(" ").length == 5)
&& ((line.toString().split(" ")[4].equals("re" + (char)10))
||(line.toString().split(" ")[4].equals("re" + (char)13 + (char)10)))) {
String[] command = line.toString().split(" ");
Rect rect = new Rect();
rect.setX(Double.parseDouble(command[0]));
rect.setY(page_rect.getHeight() - Double.parseDouble(command[1]) - Double.parseDouble(command[3]));
rect.setWidth(Double.parseDouble(command[2]));
rect.setHeight(Double.parseDouble(command[3]));
relist.add(rect);
}
line.setLength(0);
}
}
// Post-process the collected rectangles, then order them by Y.
StraightLines.processReCommands(relist, decode_pageno);
StraightLines.sortByYMinAsc(relist);
/** create a grouping object to apply grouping to data*/
PdfGroupingAlgorithms currentGrouping =decodePdf.getGroupingObject();
List<?> words =null;
try{
// Extract the words of the whole page; the last argument is an empty punctuation list.
words =currentGrouping.extractTextAsWordlist(
x1,
y1,
x2,
y2,
decode_pageno,
true,"");
} catch (PdfException e) {
decodePdf.closePdfFile();
System.err.println("Exception= "+ e+" in "+file_name);
}
//DX20130614 nothing extracted: close and leave (decodePdf is not reset to null on this path)
if (words == null) {
decodePdf.closePdfFile();
return;
}
// The list is consumed in groups of five entries: the word, then x1, y1, x2, y2.
Iterator<?> wordIterator=words.iterator();
while(wordIterator.hasNext()){
String currentWord=(String) wordIterator.next();
/**remove the XML formatting if present - not needed for pure text*/
currentWord=Strip.convertToText(currentWord, decodePdf.isXMLExtraction());
/**
* these co-ordinates are absolute from the bottom of the page (MediaBox)
* If you are extracting image (which may use crop, use need to modify as below
*/
double wx1 = Double.parseDouble((String) wordIterator.next());
double wy1 = Double.parseDouble((String) wordIterator.next());
double wx2 = Double.parseDouble((String) wordIterator.next());
double wy2 = Double.parseDouble((String) wordIterator.next());
// Store the word; Y is flipped to top-based, and "width" holds the raw wx2 value (not wx2 - wx1).
TextLine text = new TextLine();
text.getRect().setX(wx1);
text.getRect().setY(page_rect.getHeight() - wy1);
text.getRect().setWidth(wx2);
text.getRect().setHeight(wy1 - wy2);
text.setText(currentWord);
textlist.add(text);
}
System.out.println("Page " + decode_pageno + " extracted!");
} catch (Exception e) {
decodePdf.closePdfFile();
System.err.println("Exception "+ e+" in "+file_name);
e.printStackTrace();
}
/**
* flush data structures - not strictly required but included
* as example
*/
decodePdf.flushObjectValues(true); //flush any text data read
/**tell user*/
if(outputMessages)
System.out.println("Text read");
}
/**close the pdf file*/
decodePdf.closePdfFile();
decodePdf=null;
}
//
/**
 * Manual smoke test: converts a hard-coded PDF and saves the resulting text
 * through {@code FileUtils.save}. Command-line arguments are ignored.
 */
public static void main(String[] args) {
    String xml = generateXMLFile("r:/a.pdf", "R:/out.xml", "00000001");
    FileUtils.save(xml, "r:/z.xml");
}
/**
 * Returns the current value of the word counter kept by this instance
 * (used in some tests).
 *
 * @return the stored word count
 */
public int getWordsExtractedCount() {
    return this.wordsExtracted;
}
/**
 * Decodes one page of a PDF and returns the rectangles found on it.
 *
 * @param filename path of the PDF to decode
 * @param pageno   page number to decode
 * @return this instance's rectangle list, after it has been passed to
 *         {@code StraightLines.sortByXMax}
 */
public List<Rect> parseFilePage(String filename, int pageno) {
    this.setDecode_pageno(pageno);
    this.decodeFile(filename);
    StraightLines.sortByXMax(this.relist);
    return this.relist;
}
/**
 * @return the page number currently selected for decoding
 */
public int getDecode_pageno() {
    return this.decode_pageno;
}
/**
 * Selects the page that the next decode will process.
 *
 * @param decode_pageno page number to decode
 */
public void setDecode_pageno(int decode_pageno) {
    this.decode_pageno = decode_pageno;
}
// Threshold used by generateExcelTables(): a rectangle must be wider/taller than this to count
// as a horizontal/vertical line, and the same value is the tolerance when testing line crossings.
private static double MINIMUN_LINE_LENGTH = 2;
/**
 * Reconstructs the tables of one page from its ruling lines and assigns the page's text to them.
 *
 * <p>Algorithm: collect all horizontal lines - every two adjacent horizontal lines bound a row.
 * For each row, find the valid vertical lines (those crossing both the top and the bottom
 * horizontal line) - every two adjacent vertical lines bound a column. Finally build the cells.
 *
 * <p>The method fills two structures in parallel: dom4j elements appended under
 * {@code pageElement}, and a {@code Page} object whose {@code toString()} is the return value.
 * It also re-sorts {@code textlist} in place.
 *
 * @param pageElement dom4j element of the page; table/text/br children are added to it
 * @param page_no     page number stored on the returned page object
 * @param lines       rectangles found on the page (line candidates)
 * @param textlist    words of the page with their rectangles
 * @return {@code page.toString()} for the assembled page
 */
public static String generateExcelTables(Element pageElement, int page_no, List<Rect> lines, List<TextLine> textlist) {
List<Rect> column_lines = new ArrayList<Rect>();
List<Rect> horizontal_lines = new ArrayList<Rect>();
List<Rect> vertical_lines = new ArrayList<Rect>();
// Horizontal candidates: drop short lines (width must exceed the threshold) and require X > 0.
for (int i=0;i<lines.size();i++) {
if ((lines.get(i).getWidth() > MINIMUN_LINE_LENGTH)
&& (lines.get(i).getX() > 0)){
horizontal_lines.add(lines.get(i));
}
}
StraightLines.sortByYMinAsc(horizontal_lines);
// Vertical candidates: height must exceed the threshold and Y > 0.
for (int i=0;i<lines.size();i++) {
if ((lines.get(i).getHeight() > MINIMUN_LINE_LENGTH)
&& (lines.get(i).getY() > 0)) {
vertical_lines.add(lines.get(i));
}
}
StraightLines.sortByYMax(vertical_lines);
// For every pair of adjacent horizontal lines, collect the distinct X positions of the
// vertical lines that span both of them (within the tolerance): these are the column borders.
for (int i=0;i<horizontal_lines.size()-1;i++) {
Rect topline, bottomline;
topline = horizontal_lines.get(i);
bottomline = horizontal_lines.get(i+1);
for (int j=0;j<vertical_lines.size();j++) {
// find the vertical lines crossing this pair of horizontal lines
if (((vertical_lines.get(j).getY() - MINIMUN_LINE_LENGTH < topline.getY())
&& (vertical_lines.get(j).getY() + vertical_lines.get(j).getHeight() + MINIMUN_LINE_LENGTH > topline.getY()))
&& ((vertical_lines.get(j).getY() - MINIMUN_LINE_LENGTH < bottomline.getY())
&& (vertical_lines.get(j).getY() + vertical_lines.get(j).getHeight() + MINIMUN_LINE_LENGTH > bottomline.getY()))) {
// add the vertical line unless one with the same X is already recorded
boolean bFind = false;
for (int k=0;k<column_lines.size();k++) {
if (column_lines.get(k).getX() == vertical_lines.get(j).getX()) {
bFind = true;
break;
}
}
if (!bFind) column_lines.add(vertical_lines.get(j));
}
}
}
StraightLines.sortByXMin(column_lines);
List<Rect> mergedhlines;
mergedhlines = StraightLines.mergeHorizontalLines(horizontal_lines);
StraightLines.sortByYMinAsc(mergedhlines);
StraightLines.sortByXMin(vertical_lines);
// XML table element (a first one is always created, even if the page has no table)
Element tableElement = pageElement.addElement("table");
// colX attribute value: the column border X positions joined with ':'
String str_colX = "";
for (int j=0;j<column_lines.size();j++) {
if (j==0) {
str_colX = (int)column_lines.get(j).getX() + "";
} else {
str_colX = str_colX + ":" + (int)column_lines.get(j).getX();
}
}
// colY attribute value: row border Y positions, accumulated while rows are produced
String str_colY = "";
TextLine.sortByYMinAsc(textlist);
int first_column_rowspan = 0; // remaining row-merge count of the first column
// Each pair of adjacent merged horizontal lines is one candidate row.
for (int i=0;i<mergedhlines.size()-1;i++) {
Rect topline, bottomline;
topline = mergedhlines.get(i);
bottomline = mergedhlines.get(i+1);
double leftline = 0;
leftline = StraightLines.getNextVerticalLine(topline, bottomline, vertical_lines, 0);
if (leftline == 0) {
// No vertical line crosses this band: it is a gap between tables. If the current
// table already has rows, finish it (write colX/colY) and start a new table element.
if (tableElement.nodeCount() > 0) {
first_column_rowspan = 0; //DX20130704 reset
tableElement.addAttribute("colX", str_colX);
tableElement.addAttribute("colY", str_colY);
tableElement = pageElement.addElement("table");
str_colY = "";
}
continue; // no crossing line
} else {
if (str_colY.equals("")) {
str_colY = (int)mergedhlines.get(i).getY() + ":" + (int)mergedhlines.get(i+1).getY();
} else {
str_colY = str_colY + ":" + (int)mergedhlines.get(i+1).getY();
}
// Last band of the page: write the attributes now.
// NOTE(review): nodeCount() is checked before this band's <tr> is added, so a table whose
// only row is the last band never gets colX/colY - confirm whether that is intended.
if ((tableElement.nodeCount() > 0) && (i == (mergedhlines.size()-2))) {
tableElement.addAttribute("colX", str_colX);
tableElement.addAttribute("colY", str_colY);
}
}
Element rowElement = tableElement.addElement("tr");
// Walk the row left to right, from one crossing vertical line to the next.
do {
double nextline = 0;
boolean bFind = false;
for (int j=0;j<column_lines.size();j++) {
if (column_lines.get(j).getX() == leftline) {
bFind = true;
// find the next crossing line
nextline = StraightLines.getNextVerticalLine(topline, bottomline, vertical_lines, leftline);
if (nextline==0) {
break; // no further crossing line
}
for (int m=j+1;m<column_lines.size();m++) {
if (column_lines.get(m).getX() == nextline) {
Element cellElement = null;
// Cell bounds. Note that "width" holds the right edge X and "height" the bottom edge Y,
// which is how they are compared against text positions below.
Rect cell_rect = new Rect();
cell_rect.setX(leftline);
cell_rect.setWidth(nextline);
cell_rect.setY(topline.getY());
cell_rect.setHeight(bottomline.getY());
if (leftline == StraightLines.getNextVerticalLine(topline, bottomline, vertical_lines, 0)) { // first column
// A first-column cell merged from a previous row is still pending: skip this row's cell.
if (first_column_rowspan > 1) {
first_column_rowspan--;
continue;
} else {
first_column_rowspan = 0;
}
cellElement = rowElement.addElement("td");
// colspan = number of column borders skipped between the two lines
cellElement.addAttribute("colspan", (m-j) + "");
// Row merge: the bottom line starts more than 10 units to the right of the top line,
// so it does not pass under the first column; extend over following lines likewise.
if ((bottomline.getX()-topline.getX())>10) {
first_column_rowspan = 2;
for (int p=i+2;p<mergedhlines.size();p++) {
if ((mergedhlines.get(p).getX()-topline.getX())>10) {
cell_rect.setHeight(mergedhlines.get(p).getY());
first_column_rowspan++;
} else {
break;
}
}
cellElement.addAttribute("rowspan", (first_column_rowspan) + "");
}
} else { // not the first column
cellElement = rowElement.addElement("td");
cellElement.addAttribute("colspan", (m-j) + "");
}
Element textElement = cellElement.addElement("text");
// Merge every text line whose top-left corner lies inside the cell into the cell's <text>.
for (int n=0;n<textlist.size();n++) {
TextLine textline = textlist.get(n);
// NOTE(review): rowspan is computed here but not used afterwards.
int rowspan = 1;
if (cellElement.attribute("rowspan") != null) {
rowspan = Integer.parseInt(cellElement.attribute("rowspan").getStringValue());
}
if ((textline.getRect().getX() >= cell_rect.getX())
&& ((textline.getRect().getX()) < cell_rect.getWidth())
&& ((textline.getRect().getY()) >= cell_rect.getY())
&& ((textline.getRect().getY()) < cell_rect.getHeight())) {
textElement = cellElement.element("text");
mergeElement(textElement, textline);
}
}
}
}
}
}
leftline = nextline;
} while (leftline != 0);
}
// Bounding rectangle of every table element that received a non-empty colX;
// entries stay null for children without one.
Rect[] tables = new Rect[pageElement.elements().size()];
int table_indexes[] = new int[pageElement.elements().size()]; // insertion index of each table in the page content
int last_table_index = -1; // index of the last table with coordinates
for (int i=0;i<pageElement.elements().size();i++) {
Element node = (Element)pageElement.elements().get(i);
if ((node.attribute("colX") == null) || (node.attributeValue("colX").equals(""))) continue;
tables[i] = new Rect();
table_indexes[i] = i;
String cols_X[] = node.attributeValue("colX").split(":");
String cols_Y[] = node.attributeValue("colY").split(":");
tables[i].setX(Double.parseDouble(cols_X[0]));
tables[i].setY(Double.parseDouble(cols_Y[0]));
tables[i].setWidth((Double.parseDouble(cols_X[cols_X.length-1])-tables[i].getX()));
tables[i].setHeight((Double.parseDouble(cols_Y[cols_Y.length-1])-tables[i].getY()));
last_table_index = i;
}
Page page = new Page();
page.setCurrentNum(page_no);
// Post-processing of the table data: copy tables into the Page object and drop empty rows.
for (int i=0;i<pageElement.elements().size();i++) {
Element table = (Element)pageElement.elements().get(i);
Table tab = new Table();
tab.setColX(table.attributeValue("colX"));
tab.setColY(table.attributeValue("colY"));
page.addTable(tab);
List<Integer> empty_row_index_list = new ArrayList<Integer>();
for (int j=0;j<table.elements().size();j++) {
Element tr = (Element)table.elements().get(j);
TableTr myTr = new TableTr();
tab.addTr(myTr);
// A row counts as empty when it has cells but none of them has text;
// a row without any cell is not flagged.
boolean b_empty_row = tr.elements().size()>0?true:false;
for (int k=0;k<tr.elements().size();k++) {
Element td = (Element)tr.elements().get(k);
if (!td.getStringValue().equals("")) {
b_empty_row = false;
// Cell coordinates come from the attributes stored on the cell's <text> child.
Element text = td.element("text");
TableTd myTd = new TableTd();
int x = Integer.parseInt(text.attributeValue("x"));
int y = Integer.parseInt(text.attributeValue("y"));
int w = Integer.parseInt(text.attributeValue("width"));
int h = Integer.parseInt(text.attributeValue("height"));
myTd.setColX(x+":"+(x+w));
myTd.setColY(y+":"+(y+h));
myTd.setColspan(td.attributeValue("colspan"));
myTd.setRowspan(td.attributeValue("rowspan"));
myTd.setContent(td.getStringValue());
myTr.addTd(myTd);
continue;
}
}
if (b_empty_row) {
empty_row_index_list.add(j);
}
}
// Remove empty rows from the element tree, highest index first so indices stay valid.
for (int l=empty_row_index_list.size();l>0;l--) {
table.elements().remove((int)((Integer)empty_row_index_list.get(l-1).intValue()));
}
}
// Text outside the tables
Rect lastrect = null;
for (TextLine textline:textlist) {
boolean inserted = false;
Rect rect = textline.getRect();
for (int i=0;i<tables.length;i++) {
if (tables[i] == null) continue;
// Candidate for "before table i": above its top edge, or vertically inside it but
// to the left or to the right of it.
if (rect.getY()<tables[i].getY()
|| ((rect.getY() > tables[i].getY()) && (rect.getY() < (tables[i].getY() + tables[i].getHeight())) && (rect.getX() < tables[i].getX()))
|| ((rect.getY() > tables[i].getY()) && (rect.getY() < (tables[i].getY() + tables[i].getHeight())) && (rect.getX() > (tables[i].getX() + tables[i].getWidth())))) {
// Outside-table text only if it is also below the previous table.
// NOTE(review): tables[i-1] is dereferenced without a null check.
if ((i==0) || (rect.getY() > (tables[i-1].getY() + tables[i-1].getHeight()))) {
int step = 1;
Element element = DocumentHelper.createElement("text");
element.addAttribute("x", ""+(int)rect.getX());
element.addAttribute("y", ""+(int)rect.getY());
element.addAttribute("width", ""+(int)rect.getWidth());
element.addAttribute("height", ""+(int)rect.getHeight());
element.setText(textline.getText());
// The first coordinate is written from the double value, the second from int arithmetic.
TextTr myTr = new TextTr();
myTr.setColX(rect.getX() +":" + ((int)rect.getX() + (int)rect.getWidth()));
myTr.setColY(rect.getY() +":" + ((int)rect.getY() + (int)rect.getHeight()));
myTr.setContent(textline.getText());
Text myTxt = new Text();
myTxt.addTr(myTr);
page.addText(myTxt);
// Insert the text element in front of table i in the page content.
pageElement.content().add(table_indexes[i], element);
if (lastrect == null) {
lastrect = rect;
}
else {
// New visual line (Y moved by more than half the previous height): put a <br> before it.
if (Math.abs(lastrect.getY() - rect.getY())>(lastrect.getHeight()/2)) {
lastrect = rect;
pageElement.content().add(pageElement.indexOf(element), DocumentHelper.createElement("br"));
step = 2;
}
}
// Table i and all following tables moved down by the number of inserted nodes.
for (int j=i;j<table_indexes.length;j++) {
table_indexes[j]+=step;
}
inserted = true;
break;
}
}
}
if (!inserted) {
// No tables on the page, or the text lies below the last table: append at the end.
if ((last_table_index < 0) ||
(rect.getY() > (tables[last_table_index].getY()+tables[last_table_index].getHeight()))) {
Element element = pageElement.addElement("text");
if (lastrect == null) {
lastrect = rect;
}
else {
if (Math.abs(lastrect.getY() - rect.getY())>(lastrect.getHeight()/2)) {
lastrect = rect;
}
}
element.addAttribute("x", ""+(int)rect.getX());
element.addAttribute("y", ""+(int)rect.getY());
element.addAttribute("width", ""+(int)rect.getWidth());
element.addAttribute("height", ""+(int)rect.getHeight());
element.setText(textline.getText());
TextTr myTr = new TextTr();
myTr.setColX(rect.getX() +":" + ((int)rect.getX() + (int)rect.getWidth()));
myTr.setColY(rect.getY() +":" + ((int)rect.getY() + (int)rect.getHeight()));
myTr.setContent(textline.getText());
Text myTxt = new Text();
myTxt.addTr(myTr);
page.addText(myTxt);
inserted = true;
}
}
}
return page.toString();
}
/**
 * Creates the workbook file {@code d:/output.xls} with one sheet named "Page".
 *
 * @return the new workbook, or {@code null} when creating it raised an
 *         {@link IOException} (the stack trace is printed)
 */
public static WritableWorkbook initOutputExcelFile() {
    WritableWorkbook created = null;
    try {
        File target = new File("d:/output.xls");
        created = Workbook.createWorkbook(target);
        // The sheet reference itself is not needed by the caller.
        created.createSheet("Page", 0);
    } catch (IOException ioFailure) {
        ioFailure.printStackTrace();
    }
    return created;
}
/**
 * Writes the workbook out and closes it. Failures are printed, not propagated.
 *
 * @param workbook workbook to flush and close
 */
public static void uninitOutputExcelFile(WritableWorkbook workbook) {
    try {
        workbook.write();
        workbook.close();
    } catch (WriteException writeFailure) {
        writeFailure.printStackTrace();
    } catch (IOException ioFailure) {
        ioFailure.printStackTrace();
    }
}
/**
 * Appends {@code text} to the cell at (column, row), gives the cell a thin border on
 * all four sides and merges it over the requested span. A {@link WriteException}
 * is printed, not propagated.
 *
 * @param sheet   sheet to write to
 * @param row     row index of the cell
 * @param column  column index of the cell
 * @param rowspan number of rows the merged cell covers
 * @param colspan number of columns the merged cell covers
 * @param text    text appended to the cell's current contents
 */
private static void SetSheetCell(WritableSheet sheet, int row, int column, int rowspan, int colspan, String text) {
    try {
        String existing = sheet.getCell(column, row).getContents();
        Label cell = new Label(column, row, existing + text);

        WritableCellFormat bordered = new WritableCellFormat();
        jxl.format.Border[] sides = {
            jxl.format.Border.LEFT,
            jxl.format.Border.RIGHT,
            jxl.format.Border.TOP,
            jxl.format.Border.BOTTOM
        };
        for (int s = 0; s < sides.length; s++) {
            bordered.setBorder(sides[s], jxl.format.BorderLineStyle.THIN);
        }
        cell.setCellFormat(bordered);
        sheet.addCell(cell);

        int lastColumn = column + colspan - 1;
        int lastRow = row + rowspan - 1;
        sheet.mergeCells(column, row, lastColumn, lastRow);
    } catch (WriteException writeFailure) {
        writeFailure.printStackTrace();
    }
}
/**
 * Convenience overload that converts a PDF file using an empty document id.
 *
 * @param filename     path of the PDF file
 * @param xml_filename passed through to the three-argument overload
 * @return the result of the three-argument overload
 */
public static String generateXMLFile(String filename, String xml_filename) {
    final String noId = "";
    return generateXMLFile(filename, xml_filename, noId);
}
/**
 * Converts every page of a PDF file and returns the concatenated, filtered page text.
 *
 * <p>Changes from the earlier version: a file that cannot be opened now yields ""
 * instead of terminating the JVM with {@code System.exit(0)}, and the decoder that
 * is opened only to read the page count is closed again.
 *
 * @param filename     path of the PDF file
 * @param xml_filename unused; kept for interface compatibility
 * @param id           document id stored on the root element
 * @return the filtered page output, or "" when the file cannot be opened
 */
public static String generateXMLFile(String filename, String xml_filename, String id) {
    StringBuffer sb = new StringBuffer();
    ExtractRawStream ers = new ExtractRawStream();
    Element rootElement = ers.initXMLFile(filename, id);

    // Open the document once just to learn how many pages it has.
    int pageCount;
    PdfDecoder decodePdf = new PdfDecoder(true);
    try {
        decodePdf.openPdfFile(filename);
        pageCount = decodePdf.getPageCount();
    } catch (PdfException e1) {
        e1.printStackTrace();
        return "";
    } finally {
        decodePdf.closePdfFile();
    }

    for (int page_no = 1; page_no <= pageCount; page_no++) {
        // Per-page state lives on the extractor instance; reset it before each page.
        ers.relist.clear();
        ers.textlist.clear();
        ers.page_rect.clear();
        Element pageElement = rootElement.addElement("page");
        pageElement.addAttribute("pageindex", page_no + "");
        ers.parseFilePage(filename, page_no);
        sb.append(ExtractRawStream.generateExcelTables(pageElement, page_no, ers.relist, ers.textlist));
    }
    return filte(sb);
}
/**
 * Converts every page of a PDF supplied as a stream and returns the concatenated,
 * filtered page text.
 *
 * <p>The stream is read fully into memory and each page is decoded from those bytes.
 * The earlier version used the stream only for the page count and then re-opened
 * {@code filename} as a file path for every page, which fails when the name does not
 * refer to a local file. It also called {@code System.exit(0)} on failure and never
 * closed the page-count decoder. The stream itself is not closed here.
 *
 * @param stream   PDF content
 * @param filename name associated with the stream (used for the root element and messages)
 * @param id       document id stored on the root element
 * @return the filtered page output, or "" when the stream cannot be read or opened
 */
public static String generateXMLFile(InputStream stream, String filename, String id) {
    // Buffer the whole document so it can be decoded once per page.
    byte[] pdfBytes;
    try {
        java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int read;
        while ((read = stream.read(chunk)) != -1) {
            buffer.write(chunk, 0, read);
        }
        pdfBytes = buffer.toByteArray();
    } catch (IOException e) {
        e.printStackTrace();
        return "";
    }

    StringBuffer sb = new StringBuffer();
    ExtractRawStream ers = new ExtractRawStream();
    // Make decodeFile() open the buffered bytes instead of treating filename as a path.
    ers.isFile = false;
    ers.byteArray = pdfBytes;
    Element rootElement = ers.initXMLFile(filename, id);

    // Open the document once just to learn how many pages it has.
    int pageCount;
    PdfDecoder decodePdf = new PdfDecoder(true);
    try {
        decodePdf.openPdfArray(pdfBytes);
        pageCount = decodePdf.getPageCount();
    } catch (Exception e1) {
        e1.printStackTrace();
        return "";
    } finally {
        decodePdf.closePdfFile();
    }

    for (int page_no = 1; page_no <= pageCount; page_no++) {
        // Per-page state lives on the extractor instance; reset it before each page.
        ers.relist.clear();
        ers.textlist.clear();
        ers.page_rect.clear();
        Element pageElement = rootElement.addElement("page");
        pageElement.addAttribute("pageindex", page_no + "");
        ers.parseFilePage(filename, page_no);
        sb.append(ExtractRawStream.generateExcelTables(pageElement, page_no, ers.relist, ers.textlist));
    }
    return filte(sb);
}
/**
 * Passes the accumulated output through {@code ignoreTag} once for each
 * placeholder pattern, in a fixed order, and returns the result.
 *
 * NOTE(review): the two attribute patterns run before the
 * {@code <table colX="null" colY="null"></table>} pattern - confirm the order is intended.
 *
 * @param sb accumulated page output
 * @return the output after all patterns have been applied
 */
private static String filte(StringBuffer sb) {
    final String[] unwanted = {
        "colX=\"null\"",
        "colY=\"null\"",
        "<table colX=\"null\" colY=\"null\"></table>",
        "<table></table>",
        "<table ></table>",
        " colspan=\"null\"",
        " rowspan=\"null\""
    };
    String cleaned = sb.toString();
    for (int i = 0; i < unwanted.length; i++) {
        cleaned = ignoreTag(unwanted[i], cleaned);
    }
    return cleaned;
}
/**
 * Creates a new dom4j document with a {@code pdf} root element carrying the
 * given id and the bare file name (directory part removed).
 *
 * @param file_name path or name of the PDF
 * @param id        document id
 * @return the root element of the new document
 */
private Element initXMLFile(String file_name, String id) {
    Element root = DocumentHelper.createDocument().addElement("pdf");
    root.addAttribute("id", id);
    String bareName = new File(file_name).getName();
    root.addAttribute("filename", bareName);
    return root;
}
/**
 * Makes sure the directory that would contain {@code filename} exists, creating it
 * (and any missing ancestors) when necessary. The result of the creation attempt is
 * not checked.
 *
 * @param filename path of a file whose parent directory is required
 */
private static void checkPath(String filename) {
    File parentDir = new File(filename).getParentFile();
    if (parentDir == null || parentDir.exists()) {
        // No parent component, or it is already there: nothing to create.
        return;
    }
    parentDir.mkdirs();
}
/**
 * Folds a text line into an XML element that tracks a bounding box through its
 * {@code x}, {@code y}, {@code width} and {@code height} attributes.
 *
 * <p>When the element has no usable {@code x} attribute yet, the box and text are taken
 * from the text line. Otherwise the stored box is combined with the line's rectangle
 * and the line's text is appended to the element's text.
 *
 * <p>NOTE(review): the width is derived as {@code rect.getWidth() - rect.getX()}, which
 * suggests the text line's {@code Rect.width} holds a right-edge coordinate - confirm.
 *
 * @param textElement element to update
 * @param textline    text line to merge in
 */
@SuppressWarnings("deprecation")
private static void mergeElement(Element textElement, TextLine textline) {
    String prevX = "";
    String prevY = "";
    String prevWidth = "";
    String prevHeight = "";
    if (textElement.attribute("x") != null) {
        prevX = textElement.attribute("x").getStringValue();
        prevY = textElement.attribute("y").getStringValue();
        prevWidth = textElement.attribute("width").getStringValue();
        prevHeight = textElement.attribute("height").getStringValue();
    }

    boolean noBoxYet = prevX == null || prevX.equals("null") || prevX.equals("");
    if (noBoxYet) {
        // First line for this element: adopt the line's box and text as they are.
        Rect box = textline.getRect();
        textElement.addAttribute("x", String.valueOf((int) box.getX()));
        textElement.addAttribute("y", String.valueOf((int) box.getY()));
        textElement.addAttribute("width", String.valueOf((int) (box.getWidth() - box.getX())));
        textElement.addAttribute("height", String.valueOf((int) box.getHeight()));
        textElement.setText(textline.getText());
        return;
    }

    double oldX = Double.parseDouble(prevX);
    Rect box = textline.getRect();
    double oldY = Double.parseDouble(prevY);
    double oldWidth = Double.parseDouble(prevWidth);
    double oldHeight = Double.parseDouble(prevHeight);

    Rect merged = new Rect();
    // Origin: the smaller of the stored and the new coordinate on each axis.
    merged.setX(oldX < box.getX() ? oldX : box.getX());
    merged.setY(oldY < box.getY() ? oldY : box.getY());

    // Width: switch to the line's width only when the stored right edge lies left of
    // the line's width value; otherwise keep the stored width.
    if (oldX + oldWidth < box.getWidth()) {
        merged.setWidth(box.getWidth() - box.getX());
    } else {
        merged.setWidth(oldWidth);
    }

    // Height: reach down to whichever bottom edge is lower, measured from the new Y.
    double oldBottom = oldY + oldHeight;
    double lineBottom = box.getY() + box.getHeight();
    merged.setHeight((oldBottom < lineBottom ? lineBottom : oldBottom) - merged.getY());

    textElement.addAttribute("x", String.valueOf((int) merged.getX()));
    textElement.addAttribute("y", String.valueOf((int) merged.getY()));
    textElement.addAttribute("width", String.valueOf((int) merged.getWidth()));
    textElement.addAttribute("height", String.valueOf((int) merged.getHeight()));
    textElement.setText(textElement.getText() + textline.getText());
}
/**
 * Copies a text line's position and content into a table cell.
 *
 * <p>{@code colX} is written as {@code "x1:x2"} and {@code colY} as {@code "y1:y2"},
 * where {@code x1}/{@code y1} are the truncated rectangle origin, {@code x2} is
 * {@code x1} plus the truncated {@code rect.getWidth() - rect.getX()}, and {@code y2}
 * is {@code y1} plus the truncated rectangle height. The second value of each pair is
 * computed arithmetically; the previous code concatenated the two numbers as text
 * (for example "10" and "5" produced "105").
 *
 * <p>NOTE(review): merging with a box already stored on the cell was never active
 * (the lookup of the previous box was commented out), so the cell is always
 * overwritten with the current line.
 *
 * @param td       cell to fill
 * @param textline text line providing the rectangle and the content
 */
private static void mergeElement(TableTd td, TextLine textline) {
    Rect box = textline.getRect();
    int x = (int) box.getX();
    int y = (int) box.getY();
    int width = (int) (box.getWidth() - box.getX());
    int height = (int) box.getHeight();
    td.setColX(x + ":" + (x + width));
    td.setColY(y + ":" + (y + height));
    td.setContent(textline.getText());
}
/**
 * Data extraction for tables without borders. Currently a stub: it groups the text
 * lines by X through {@code TextLine.groupByX} (which reorders {@code textlist} and may
 * overwrite X coordinates) and then walks the lines without producing any result.
 *
 * @param lines    line rectangles of the page (not used yet)
 * @param textlist text lines of the page
 */
public void noframe_table_parse(List<Rect> lines, List<TextLine> textlist) {
    TextLine.groupByX(textlist);
    Rect line = new Rect();
    for (int idx = 0; idx < textlist.size(); idx++) {
        Rect rect = textlist.get(idx).getRect();
    }
}
/**
 * Removes every occurrence of {@code tag} from {@code origin}.
 *
 * <p>The tag is matched literally. (A regex-based replacement would misinterpret tags
 * that happen to contain characters such as {@code (}, {@code .} or {@code $}.)
 *
 * @param tag    exact text to remove
 * @param origin text to clean
 * @return {@code origin} without any occurrence of {@code tag}
 */
private static String ignoreTag(String tag, String origin){
    return origin.replace(tag, "");
}
}
package org.aoe.software.pdf;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
/**
 * Small helper for writing text to disk.
 */
public class FileUtils {
    /**
     * Writes {@code data} to the file at {@code filepath}, replacing previous content.
     * The text is encoded with the platform default charset.
     *
     * <p>NOTE(review): the XML produced elsewhere in this project declares
     * {@code encoding="GBK"}; confirm the default charset matches on the target system.
     *
     * @param data     text to write
     * @param filepath destination file
     * @return {@code true} when the text was written and flushed, {@code false} when
     *         any exception occurred (including a {@code null} argument)
     */
    public static boolean save(String data, String filepath){
        boolean written = false;
        OutputStream sink = null;
        try {
            sink = new FileOutputStream(filepath);
            BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(sink));
            writer.write(data);
            writer.flush();
            written = true;
        } catch (Exception e) {
            written = false;
        } finally {
            if (sink != null) {
                try {
                    sink.close();
                } catch (IOException e) {
                    // A failed close does not change the reported outcome.
                    sink = null;
                }
            }
        }
        return written;
    }
}
package org.aoe.software.pdf;
public class Rect {
double x;
double y;
double width;
double height;
Rect() {
}
Rect(Rect rect) {
set(rect);
}
public void set(Rect rect) {
this.x = rect.getX();
this.y = rect.getY();
this.width = rect.getWidth();
this.height = rect.getHeight();
}
public double getX() {
return x;
}
public void setX(double x) {
this.x = x;
}
public double getY() {
return y;
}
public void setY(double y) {
this.y = y;
}
public double getWidth() {
return width;
}
public void setWidth(double width) {
this.width = width;
}
public double getHeight() {
return height;
}
public void setHeight(double height) {
this.height = height;
}
public void clear() {
x = 0;
y = 0;
width = 0;
height = 0;
}
}
package org.aoe.software.pdf;
import java.util.ArrayList;
import java.util.List;
/**
 * A piece of text together with the rectangle that locates it, plus helpers for
 * sorting and grouping lists of such lines by coordinate.
 */
public class TextLine {
    private String text;
    private Rect rect = new Rect();

    public String getText() {
        return text;
    }

    public void setText(String text) {
        this.text = text;
    }

    public Rect getRect() {
        return rect;
    }

    public void setRect(Rect rect) {
        this.rect = rect;
    }

    /**
     * Sorts the list in place by rectangle Y, smallest first (stable bubble sort).
     *
     * @param textlist lines to sort
     */
    public static void sortByYMinAsc(List<TextLine> textlist) {
        int i, j;
        boolean ischanged = false;
        for (j = textlist.size(); j > 0; j--) {
            ischanged = false;
            for (i = 0; i < j - 1; i++) {
                if (textlist.get(i).getRect().getY() > textlist.get(i + 1).getRect().getY()) {
                    TextLine temp = textlist.get(i);
                    textlist.set(i, textlist.get(i + 1));
                    textlist.set(i + 1, temp);
                    ischanged = true;
                }
            }
            // A pass without swaps means the list is already ordered.
            if (!ischanged) break;
        }
    }

    /** Maximum coordinate distance from a group's first line for a line to join that group. */
    public static double MINIMUN_LINE_LENGTH = 3;

    /**
     * Groups lines whose X coordinates lie close together.
     *
     * <p>The list is sorted in place by ascending X. A new group starts whenever a
     * line's X exceeds the X of the current group's first line by more than
     * {@link #MINIMUN_LINE_LENGTH}; otherwise the line joins the current group and its
     * X is snapped to the group's X.
     *
     * <p>The grouping needs ascending order. {@link #sortByXMinAsc(List)} sorts in
     * descending order despite its name, which made every difference non-positive and
     * collapsed all lines into a single group, so a dedicated ascending sort is used.
     *
     * @param lines lines to group; reordered and possibly modified
     * @return the groups in ascending X order, or {@code null} when {@code lines} is empty
     */
    public static List<List<TextLine>> groupByX(List<TextLine> lines) {
        if (lines.size() == 0) return null;
        sortByXAscending(lines);
        List<List<TextLine>> out = new ArrayList<List<TextLine>>();
        double lastx = lines.get(0).getRect().getX();
        List<TextLine> current = new ArrayList<TextLine>();
        for (TextLine line : lines) {
            if ((line.getRect().getX() - lastx) > MINIMUN_LINE_LENGTH) {
                // Far enough from the current group: close it and open a new one.
                out.add(current);
                current = new ArrayList<TextLine>();
                lastx = line.getRect().getX();
            } else {
                // Remove near-duplicate ("double") positions by snapping to the group X.
                line.getRect().setX(lastx);
            }
            current.add(line);
        }
        out.add(current);
        return out;
    }

    /**
     * Groups lines whose Y coordinates lie close together.
     *
     * <p>The list is sorted in place by ascending Y. A new group starts whenever a
     * line's Y exceeds the Y of the current group's first line by more than
     * {@link #MINIMUN_LINE_LENGTH}; otherwise the line joins the current group and its
     * Y is snapped to the group's Y.
     *
     * @param lines lines to group; reordered and possibly modified
     * @return the groups in ascending Y order, or {@code null} when {@code lines} is empty
     */
    public static List<List<TextLine>> groupByY(List<TextLine> lines) {
        if (lines.size() == 0) return null;
        TextLine.sortByYMinAsc(lines);
        List<List<TextLine>> out = new ArrayList<List<TextLine>>();
        double lasty = lines.get(0).getRect().getY();
        List<TextLine> current = new ArrayList<TextLine>();
        for (TextLine line : lines) {
            if ((line.getRect().getY() - lasty) > MINIMUN_LINE_LENGTH) {
                // Far enough from the current group: close it and open a new one.
                out.add(current);
                current = new ArrayList<TextLine>();
                lasty = line.getRect().getY();
            } else {
                // Remove near-duplicate ("double") positions by snapping to the group Y.
                line.getRect().setY(lasty);
            }
            current.add(line);
        }
        out.add(current);
        return out;
    }

    /**
     * Sorts the list in place by rectangle X (stable bubble sort).
     *
     * <p>Despite the name, the resulting order is DESCENDING (largest X first). The
     * behavior is kept unchanged for existing callers.
     *
     * @param in lines to sort
     */
    public static void sortByXMinAsc(List<TextLine> in) {
        int i, j;
        boolean ischanged = false;
        for (j = in.size(); j > 0; j--) {
            ischanged = false;
            for (i = 0; i < j - 1; i++) {
                if (in.get(i).getRect().getX() < in.get(i + 1).getRect().getX()) {
                    TextLine temp = in.get(i);
                    in.set(i, in.get(i + 1));
                    in.set(i + 1, temp);
                    ischanged = true;
                }
            }
            // A pass without swaps means the list is already ordered.
            if (!ischanged) break;
        }
    }

    /** Sorts the list in place by rectangle X, smallest first (stable). */
    private static void sortByXAscending(List<TextLine> lines) {
        java.util.Collections.sort(lines, new java.util.Comparator<TextLine>() {
            public int compare(TextLine a, TextLine b) {
                return Double.compare(a.getRect().getX(), b.getRect().getX());
            }
        });
    }
}
package org.aoe.software.pdf;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class StraightLines {
/**
 * Sorts the rectangles in place by X, smallest first (stable bubble sort).
 *
 * @param in rectangles to sort
 */
public static void sortByXMin(List<Rect> in) {
    for (int unsorted = in.size(); unsorted > 0; unsorted--) {
        boolean swapped = false;
        for (int k = 1; k < unsorted; k++) {
            Rect left = in.get(k - 1);
            Rect right = in.get(k);
            if (left.getX() > right.getX()) {
                in.set(k - 1, right);
                in.set(k, left);
                swapped = true;
            }
        }
        if (!swapped) {
            // Nothing moved in this pass: the list is ordered.
            return;
        }
    }
}
/**
 * Sorts the rectangles in place by X (stable bubble sort).
 *
 * <p>Despite the name, the resulting order is descending: largest X first.
 *
 * @param in rectangles to sort
 */
public static void sortByXMinAsc(List<Rect> in) {
    for (int unsorted = in.size(); unsorted > 0; unsorted--) {
        boolean swapped = false;
        for (int k = 1; k < unsorted; k++) {
            Rect left = in.get(k - 1);
            Rect right = in.get(k);
            if (left.getX() < right.getX()) {
                in.set(k - 1, right);
                in.set(k, left);
                swapped = true;
            }
        }
        if (!swapped) {
            // Nothing moved in this pass: the list is ordered.
            return;
        }
    }
}
/**
 * Sorts the rectangles in place by Y, largest first (stable bubble sort).
 *
 * @param in rectangles to sort
 */
public static void sortByYMin(List<Rect> in) {
    for (int unsorted = in.size(); unsorted > 0; unsorted--) {
        boolean swapped = false;
        for (int k = 1; k < unsorted; k++) {
            Rect upper = in.get(k - 1);
            Rect lower = in.get(k);
            if (upper.getY() < lower.getY()) {
                in.set(k - 1, lower);
                in.set(k, upper);
                swapped = true;
            }
        }
        if (!swapped) {
            // Nothing moved in this pass: the list is ordered.
            return;
        }
    }
}
/**
 * Sorts the rectangles in place by Y, smallest first (stable bubble sort).
 *
 * @param in rectangles to sort
 */
public static void sortByYMinAsc(List<Rect> in) {
    for (int unsorted = in.size(); unsorted > 0; unsorted--) {
        boolean swapped = false;
        for (int k = 1; k < unsorted; k++) {
            Rect upper = in.get(k - 1);
            Rect lower = in.get(k);
            if (upper.getY() > lower.getY()) {
                in.set(k - 1, lower);
                in.set(k, upper);
                swapped = true;
            }
        }
        if (!swapped) {
            // Nothing moved in this pass: the list is ordered.
            return;
        }
    }
}
/**
 * Sorts the rectangles in place by width, smallest first (stable bubble sort).
 *
 * @param in rectangles to sort
 */
public static void sortByXMax(List<Rect> in) {
    for (int unsorted = in.size(); unsorted > 0; unsorted--) {
        boolean swapped = false;
        for (int k = 1; k < unsorted; k++) {
            Rect first = in.get(k - 1);
            Rect second = in.get(k);
            if (first.getWidth() > second.getWidth()) {
                in.set(k - 1, second);
                in.set(k, first);
                swapped = true;
            }
        }
        if (!swapped) {
            // Nothing moved in this pass: the list is ordered.
            return;
        }
    }
}
/**
 * Sorts the rectangles in place by height, smallest first (stable bubble sort).
 *
 * @param in rectangles to sort
 */
public static void sortByYMax(List<Rect> in) {
    for (int unsorted = in.size(); unsorted > 0; unsorted--) {
        boolean swapped = false;
        for (int k = 1; k < unsorted; k++) {
            Rect first = in.get(k - 1);
            Rect second = in.get(k);
            if (first.getHeight() > second.getHeight()) {
                in.set(k - 1, second);
                in.set(k, first);
                swapped = true;
            }
        }
        if (!swapped) {
            // Nothing moved in this pass: the list is ordered.
            return;
        }
    }
}
/**
 * Merges horizontal line segments that share exactly the same Y into one segment per Y.
 *
 * <p>The input list is sorted in place by ascending Y. For each run of equal Y the
 * result segment spans from the smallest left edge to the largest right edge of the
 * run; Y and height come from the first segment of the run.
 *
 * <p>Two defects of the earlier implementation are corrected here: a Y value of 0 is
 * no longer mistaken for "no segment seen yet", and the merged width is measured from
 * the merged left edge, so the right edge no longer shrinks when a segment further to
 * the left is merged in.
 *
 * @param horizontal_lines segments to merge; may be {@code null}; reordered in place
 * @return the merged segments in ascending Y order (empty for {@code null} or empty
 *         input); a trailing segment of width 0 is not reported
 */
public static List<Rect> mergeHorizontalLines(List<Rect> horizontal_lines) {
    List<Rect> merged_lines = new ArrayList<Rect>();
    if (horizontal_lines == null) {
        return merged_lines;
    }
    // Ascending Y puts segments of the same row next to each other.
    StraightLines.sortByYMinAsc(horizontal_lines);
    Rect temp = null;
    for (Rect line : horizontal_lines) {
        if (temp == null) {
            temp = new Rect(line);
            continue;
        }
        if (line.getY() != temp.getY()) {
            // New row: emit the finished segment and start over with this one.
            merged_lines.add(temp);
            temp = new Rect(line);
            continue;
        }
        if (temp.getWidth() == 0) {
            // The accumulated segment is degenerate; replace it outright.
            temp.set(line);
            continue;
        }
        // Union of the two horizontal extents.
        double left = Math.min(line.getX(), temp.getX());
        double right = Math.max(line.getX() + line.getWidth(), temp.getX() + temp.getWidth());
        temp.setX(left);
        temp.setWidth(right - left);
    }
    if (temp != null && temp.getWidth() > 0) {
        merged_lines.add(temp);
    }
    return merged_lines;
}
/**
 * Merges line segments after sorting them with {@code sortByXMinAsc}.
 *
 * <p>Segments are processed in that order; consecutive segments with the same Y are
 * combined along the X axis, and a change of Y closes the current segment.
 *
 * <p>NOTE(review): although this is named for vertical lines, it groups by Y and
 * merges X extents exactly like the horizontal variant, and it treats a Y of 0 as
 * "nothing collected yet" - confirm this is intended before relying on it.
 *
 * @param vertical_lines segments to merge; may be {@code null}; reordered in place
 * @return the merged segments (empty for {@code null} input); a trailing segment of
 *         width 0 is not reported
 */
public static List<Rect> mergeVerticalLines(List<Rect> vertical_lines) {
    List<Rect> result = new ArrayList<Rect>();
    if (vertical_lines == null) {
        return result;
    }
    StraightLines.sortByXMinAsc(vertical_lines);
    double groupY = 0;
    Rect acc = new Rect();
    for (Rect line : vertical_lines) {
        if (groupY == 0) {
            // Sentinel value: (re)start collecting from this segment.
            groupY = line.getY();
            acc = new Rect(line);
            continue;
        }
        if (line.getY() != groupY) {
            // Y changed: emit what was collected and start a new segment.
            groupY = line.getY();
            result.add(acc);
            acc = new Rect(line);
            continue;
        }
        if (acc.getWidth() == 0) {
            acc.set(line);
            continue;
        }
        double leftMost = (line.getX() < acc.getX()) ? line.getX() : acc.getX();
        double lineRight = line.getX() + line.getWidth();
        double accRight = acc.getX() + acc.getWidth();
        // Width is measured from the accumulator's current X, before X is moved.
        if (lineRight > accRight) {
            acc.setWidth(lineRight - acc.getX());
        } else {
            acc.setWidth(accRight - acc.getX());
        }
        acc.setX(leftMost);
    }
    if (acc.getWidth() > 0) {
        result.add(acc);
    }
    return result;
}
// Threshold used throughout this class: rectangles wider/taller than this count as
// lines, and coordinates closer than this are treated as belonging together.
public static double MINIMUN_LINE_LENGTH = 3;
/**
 * Finds the X of the next vertical line, after {@code startx}, that reaches both the
 * top and the bottom line (within {@link #MINIMUN_LINE_LENGTH} tolerance).
 *
 * <p>The list of vertical lines is sorted in place by ascending X. When {@code startx}
 * is 0 the search begins at the first line. Otherwise it begins after the run of lines
 * at {@code startx} that starts with a line reaching both boundaries; if there is no
 * such line, nothing is searched.
 *
 * @param topline           upper boundary line
 * @param bottomline        lower boundary line
 * @param in_vertical_lines candidate vertical lines; reordered in place
 * @param startx            X of the line to continue from, or 0 to start at the beginning
 * @return the X of the next matching line, or 0 when none is found
 */
public static double getNextVerticalLine(Rect topline, Rect bottomline, List<Rect> in_vertical_lines, double startx) {
    List<Rect> candidates = in_vertical_lines;
    StraightLines.sortByXMin(candidates);

    int anchor;
    if (startx == 0) {
        anchor = -1;
    } else {
        anchor = 0;
        while (anchor < candidates.size()) {
            Rect probe = candidates.get(anchor);
            if (probe.getX() == startx && reachesBoth(probe, topline, bottomline)) {
                // Skip every remaining line that sits at the same X.
                while (anchor < candidates.size() && candidates.get(anchor).getX() == startx) {
                    anchor++;
                }
                anchor--;
                break;
            }
            anchor++;
        }
    }

    for (int i = anchor + 1; i < candidates.size(); i++) {
        Rect probe = candidates.get(i);
        if (reachesBoth(probe, topline, bottomline)) {
            return probe.getX();
        }
    }
    return 0;
}

/** Tells whether the vertical extent of {@code vertical}, widened by the tolerance, contains the Y of both lines. */
private static boolean reachesBoth(Rect vertical, Rect topline, Rect bottomline) {
    double from = vertical.getY() - MINIMUN_LINE_LENGTH;
    double to = vertical.getY() + vertical.getHeight() + MINIMUN_LINE_LENGTH;
    return from < topline.getY()
            && to > topline.getY()
            && from < bottomline.getY()
            && to > bottomline.getY();
}
/**
 * Normalizes the rectangles of a page into merged horizontal and vertical lines and
 * stores the result back into {@code relist}.
 *
 * <p>Steps: (1) every rectangle that is larger than {@code MINIMUN_LINE_LENGTH} in both
 * directions is replaced by four thin rectangles for its edges; (2) rectangles wider
 * than the threshold are sorted by ascending Y and neighbours whose Y differs by at
 * most the threshold are merged; (3) rectangles taller than the threshold with a
 * non-zero X are grouped by X and merged via {@code groupByX} and
 * {@code remergeVerticalLines}; (4) {@code relist} is replaced by the merged horizontal
 * lines followed by the merged vertical lines.
 *
 * @param relist  rectangles of the page; replaced in place by the merged lines
 * @param page_no page number (not used by this method)
 */
public static void processReCommands(List<Rect> relist, int page_no) {
List<Rect> temp = new ArrayList<Rect>();
//Rect line;
// Step 1: split real boxes into their four edges (thickness 0.1); keep everything else.
for (Rect line:relist) {
if ((line.getWidth()>MINIMUN_LINE_LENGTH) && (line.getHeight()>MINIMUN_LINE_LENGTH)) {
// Edge at the rectangle's own Y.
Rect topline = new Rect();
topline.setX(line.getX());
topline.setY(line.getY());
topline.setWidth(line.getWidth());
topline.setHeight(0.1);
// Edge at Y + height.
Rect bottomline = new Rect();
bottomline.setX(line.getX());
bottomline.setY(line.getY()+line.getHeight());
bottomline.setWidth(line.getWidth());
bottomline.setHeight(0.1);
// Edge at the rectangle's own X.
Rect leftline = new Rect();
leftline.setX(line.getX());
leftline.setY(line.getY());
leftline.setHeight(line.getHeight());
leftline.setWidth(0.1);
// Edge at X + width.
Rect rightline = new Rect();
rightline.setX(line.getX()+line.getWidth());
rightline.setY(line.getY());
rightline.setHeight(line.getHeight());
rightline.setWidth(0.1);
temp.add(topline);
temp.add(bottomline);
temp.add(leftline);
temp.add(rightline);
} else {
temp.add(line);
}
}
relist.clear();
relist.addAll(temp);
temp.clear();
List<Rect> horizontal_lines = new ArrayList<Rect>();
List<Rect> vertical_lines = new ArrayList<Rect>();
// A rectangle with zero width and zero height means "no line collected yet".
Rect lastline = new Rect();
// Step 2: collect the horizontal lines.
for (int i=0;i<relist.size();i++) {
if (relist.get(i).getWidth() > MINIMUN_LINE_LENGTH) {
horizontal_lines.add(relist.get(i));
}
}
StraightLines.sortByYMinAsc(horizontal_lines);
for (Rect line:horizontal_lines) {
if ((lastline.getHeight() == 0) && (lastline.getWidth() == 0)) {
lastline.set(line);
continue;
}
if (Math.abs((lastline.getY() - line.getY())) > MINIMUN_LINE_LENGTH) {
// Far enough apart vertically: the collected line is complete.
temp.add(lastline);
lastline = new Rect();
lastline.set(line);
} else {
// Same row: extend the collected line to cover this segment.
// NOTE(review): moving X to the left does not widen the line by the same amount,
// so the previous right edge can be lost - confirm whether that is intended.
if (lastline.getX() > line.getX()) {
lastline.setX(line.getX());
}
if ((lastline.getX() + lastline.getWidth()) < (line.getX() + line.getWidth())) {
lastline.setWidth((line.getX() + line.getWidth()) - lastline.getX());
}
}
}
// The last collected line is only kept when both its height and width are non-zero.
if ((lastline.getHeight() != 0) && (lastline.getWidth() != 0)) {
temp.add(lastline);
}
// Step 3: collect the vertical lines (rectangles at X == 0 are ignored).
for (int i=0;i<relist.size();i++) {
if (relist.get(i).getHeight() > MINIMUN_LINE_LENGTH) {
if (relist.get(i).getX() != 0) {
vertical_lines.add(relist.get(i));
}
}
}
List<List<Rect>> v_list = StraightLines.groupByX(vertical_lines);
List<Rect> v_lines = StraightLines.remergeVerticalLines(v_list);
temp.addAll(v_lines);
/*
StraightLines.sortByXMinAsc(vertical_lines);
lastline = new Rect();
for (Rect line:vertical_lines) {
if ((lastline.getHeight() == 0) && (lastline.getWidth() == 0)) {
lastline.set(line);
continue;
}
if (Math.abs((lastline.getX() - line.getX())) > MINIMUN_LINE_LENGTH*5) {
lastline.setY(lastline.getY() - 2);
lastline.setHeight(lastline.getHeight() + 2);
temp.add(lastline);
lastline = new Rect();
lastline.set(line);
} else {
if (lastline.getY() > line.getY()) {
lastline.setY(line.getY());
}
if ((lastline.getY() + lastline.getHeight()) < (line.getY() + line.getHeight())) {
lastline.setHeight((line.getY() + line.getHeight()) - lastline.getY());
}
}
}
if ((lastline.getHeight() != 0) && (lastline.getWidth() != 0)) {
temp.add(lastline);
}
*/
// Step 4: hand the merged lines back through the input list.
relist.clear();
relist.addAll(temp);
}
/**
 * Dumps the rectangles to a text file, one per line, in the form
 * {@code "x, y, width, height re"}.
 *
 * <p>I/O failures are reported on standard error and otherwise ignored. The writer is
 * always closed, including when writing fails.
 *
 * <p>NOTE(review): each record ends with character 10 followed by character 13
 * (LF then CR), the reverse of the usual CR LF - kept as is; confirm whether any
 * reader depends on it.
 *
 * @param filename file to write
 * @param lines    rectangles to dump
 */
public static void printLines(String filename, List<Rect> lines) {
    FileWriter out = null;
    try {
        out = new FileWriter(new File(filename));
        for (Rect line : lines) {
            out.write(line.getX() + ", " + line.getY() + ", " + line.getWidth() + ", " + line.getHeight() + " re" + (char)(10) + (char)(13));
        }
        out.flush();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (out != null) {
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Groups rectangles whose X coordinates lie close together.
 *
 * <p>The list is sorted in place by ascending X. A new group starts whenever a
 * rectangle's X exceeds the X of the current group's first rectangle by more than
 * {@link #MINIMUN_LINE_LENGTH}; otherwise the rectangle joins the current group and its
 * X is snapped to the group's X.
 *
 * @param lines rectangles to group; reordered and possibly modified
 * @return the groups in ascending X order, or {@code null} when {@code lines} is empty
 */
public static List<List<Rect>> groupByX(List<Rect> lines) {
    if (lines.size() == 0) {
        return null;
    }
    StraightLines.sortByXMin(lines);
    List<List<Rect>> groups = new ArrayList<List<Rect>>();
    List<Rect> bucket = new ArrayList<Rect>();
    double anchorX = lines.get(0).getX();
    for (Rect line : lines) {
        boolean farFromAnchor = (line.getX() - anchorX) > MINIMUN_LINE_LENGTH;
        if (farFromAnchor) {
            // Close the current group and open a new one at this rectangle's X.
            groups.add(bucket);
            bucket = new ArrayList<Rect>();
            anchorX = line.getX();
        } else {
            // Remove near-duplicate ("double") lines by snapping to the group X.
            line.setX(anchorX);
        }
        bucket.add(line);
    }
    groups.add(bucket);
    return groups;
}
/**
 * Merges, within each group, the rectangles that touch or overlap along the Y axis.
 *
 * <p>Each group is sorted in place by ascending Y. Rectangles with Y equal to 0 are
 * skipped. A rectangle that starts at or before the end of the running span extends
 * that span when it reaches further; one that starts beyond it closes the span and
 * starts a new one. The last span of every group is always added to the result, even
 * when the group contributed no rectangle (the span is then all zeros).
 *
 * @param lines_list groups of rectangles, for example from {@code groupByX}; may be {@code null}
 * @return the merged spans of all groups, in group order (empty for {@code null} input)
 */
public static List<Rect> remergeVerticalLines(List<List<Rect>> lines_list) {
    List<Rect> merged = new ArrayList<Rect>();
    if (lines_list == null) {
        return merged;
    }
    for (List<Rect> group : lines_list) {
        StraightLines.sortByYMinAsc(group);
        Rect span = new Rect();
        for (Rect line : group) {
            if (line.getY() == 0) {
                continue;
            }
            if (span.getY() == 0) {
                // A span with Y == 0 has not been started yet.
                span.set(line);
            }
            double spanEnd = span.getY() + span.getHeight();
            if (line.getY() > spanEnd) {
                // Gap between the span and this rectangle: start a new span.
                merged.add(span);
                span = new Rect(line);
                continue;
            }
            double lineEnd = line.getY() + line.getHeight();
            if (lineEnd > spanEnd) {
                span.setHeight(lineEnd - span.getY());
            }
        }
        merged.add(span);
    }
    return merged;
}
}
PO
package org.aoe.software.pdf.po;
import java.util.LinkedList;
import java.util.List;
/**
 * One page of the converted PDF.
 *
 * <p>A page holds text blocks and tables and renders them, in the order in which they
 * were added, as:
 * <pre>
 * &lt;page pageIndex="1"&gt;
 *   &lt;text&gt;
 *     &lt;tr colX="x1:x2" colY="y1:y2"&gt;ssssssssss&lt;/tr&gt;
 *   &lt;/text&gt;
 *   &lt;table colX="x1:x2:x3" colY="y1:y2:y3:y4"&gt;
 *     &lt;tr&gt;
 *       &lt;td colX="x1:x2" colY="y1:y2" colspan="2" rowspan="2"&gt;TTTT&lt;/td&gt;
 *     &lt;/tr&gt;
 *   &lt;/table&gt;
 * &lt;/page&gt;
 * </pre>
 */
public class Page {
    private int currentNum;
    private List<Text> textList = new LinkedList<Text>();
    private List<Table> tableList = new LinkedList<Table>();
    // Insertion order of the children: 0 marks a text block, 1 marks a table.
    private List<Integer> seqList = new LinkedList<Integer>();

    /**
     * Renders the page element with its children in insertion order.
     *
     * @return the {@code <page>} element as text
     */
    public String toString(){
        StringBuffer body = new StringBuffer();
        int nextText = 0;
        int nextTable = 0;
        for (Integer kind : seqList) {
            if (kind.intValue() == 0) {
                body.append(textList.get(nextText).toString());
                nextText++;
            } else {
                body.append(tableList.get(nextTable).toString());
                nextTable++;
            }
        }
        return "<page pageIndex=\"" + currentNum + "\">" + body.toString() + "</page>";
    }

    public int getCurrentNum() {
        return this.currentNum;
    }

    public void setCurrentNum(int currentNum) {
        this.currentNum = currentNum;
    }

    /** Appends a text block as the next child of the page. */
    public void addText(Text text){
        textList.add(text);
        seqList.add(0);
    }

    /** Appends a table as the next child of the page. */
    public void addTable(Table table){
        tableList.add(table);
        seqList.add(1);
    }
}
package org.aoe.software.pdf.po;
import java.util.LinkedList;
import java.util.List;
/**
 * A table: its column/row coordinate lists and its rows.
 */
public class Table {
    private String colX;
    private String colY;
    private List<TableTr> trList = new LinkedList<TableTr>();

    /**
     * Renders the table with {@code border="1"}, its {@code colX}/{@code colY}
     * attributes and all rows in insertion order. Unset attributes are rendered as the
     * text {@code null}.
     *
     * @return the {@code <table>} element as text
     */
    public String toString(){
        StringBuffer rows = new StringBuffer();
        for (TableTr row : trList) {
            rows.append(row.toString());
        }
        return "<table border=\"1\" colX=\"" + colX + "\" colY=\"" + colY + "\">"
                + rows.toString() + "</table>";
    }

    /** Appends a row to the table. */
    public void addTr(TableTr tr){
        trList.add(tr);
    }

    public String getColX() {
        return this.colX;
    }

    public void setColX(String colX) {
        this.colX = colX;
    }

    public String getColY() {
        return this.colY;
    }

    public void setColY(String colY) {
        this.colY = colY;
    }
}
package org.aoe.software.pdf.po;
/**
 * A single table cell.
 */
public class TableTd {
    private String colX;
    private String colY;
    private String colspan;
    private String rowspan;
    private String content;

    /**
     * Renders the cell with all four attributes followed by its content. Unset values
     * are rendered as the text {@code null}; the content is inserted as is.
     *
     * <p>NOTE(review): the content is not XML-escaped here - confirm that callers
     * supply already-escaped text.
     *
     * @return the {@code <td>} element as text
     */
    @Override
    public String toString(){
        return "<td colX=\"" + colX
                + "\" colY=\"" + colY
                + "\" colspan=\"" + colspan
                + "\" rowspan=\"" + rowspan
                + "\">" + content + "</td>";
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getColX() {
        return this.colX;
    }

    public void setColX(String colX) {
        this.colX = colX;
    }

    public String getColY() {
        return this.colY;
    }

    public void setColY(String colY) {
        this.colY = colY;
    }

    public String getColspan() {
        return this.colspan;
    }

    public void setColspan(String colspan) {
        this.colspan = colspan;
    }

    public String getRowspan() {
        return this.rowspan;
    }

    public void setRowspan(String rowspan) {
        this.rowspan = rowspan;
    }
}
package org.aoe.software.pdf.po;
import java.util.LinkedList;
import java.util.List;
/**
 * A table row: an ordered list of cells.
 */
public class TableTr {
    private List<TableTd> tdList = new LinkedList<TableTd>();

    /**
     * Renders the row with all its cells in insertion order.
     *
     * @return the {@code <tr>} element as text
     */
    public String toString(){
        StringBuffer row = new StringBuffer("<tr>");
        for (TableTd cell : tdList) {
            row.append(cell.toString());
        }
        return row.append("</tr>").toString();
    }

    /** Appends a cell to the row. */
    public void addTd(TableTd td){
        tdList.add(td);
    }
}
package org.aoe.software.pdf.po;
import java.util.LinkedList;
import java.util.List;
/**
 * A block of text: an ordered list of text rows.
 */
public class Text {
    private List<TextTr> trList = new LinkedList<TextTr>();

    /**
     * Renders the block with all its rows in insertion order.
     *
     * @return the {@code <text>} element as text
     */
    public String toString(){
        StringBuffer rows = new StringBuffer();
        for (TextTr row : trList) {
            rows.append(row.toString());
        }
        return "<text>" + rows.toString() + "</text>";
    }

    /** Appends a row to the block. */
    public void addTr(TextTr tr){
        trList.add(tr);
    }
}
package org.aoe.software.pdf.po;
/**
 * A single row of text with its coordinates.
 */
public class TextTr {
    private String colX;
    private String colY;
    private String content;

    /**
     * Renders the row with its {@code colX}/{@code colY} attributes followed by its
     * content. Unset values are rendered as the text {@code null}; the content is
     * inserted as is.
     *
     * <p>NOTE(review): the content is not XML-escaped here - confirm that callers
     * supply already-escaped text.
     *
     * @return the {@code <tr>} element as text
     */
    public String toString(){
        return "<tr colX=\"" + colX + "\" colY=\"" + colY + "\">" + content + "</tr>";
    }

    public String getColX() {
        return this.colX;
    }

    public void setColX(String colX) {
        this.colX = colX;
    }

    public String getColY() {
        return this.colY;
    }

    public void setColY(String colY) {
        this.colY = colY;
    }

    public String getContent() {
        return this.content;
    }

    public void setContent(String content) {
        this.content = content;
    }
}