PDF文件中，如何根据关键字，获取坐标信息

最新推荐文章于 2024-04-23 17:03:07 发布

897221242

最新推荐文章于 2024-04-23 17:03:07 发布

阅读量2.3k

点赞数

文章标签： pdf java apache

本文链接：https://blog.csdn.net/u013933709/article/details/131049391

版权

package com.dhcc.zhfc.elesign.util;

import org.apache.commons.lang.StringUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.hibernate.annotations.common.util.StringHelper;

import java.io.*;
import java.util.ArrayList;
import java.util.List;

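/**
* @ClassName PdfBoxKeyWordPosition
* @Description Locates a keyword in a PDF using PDFBox's PDFTextStripper and reports its coordinates together with the page number.
* @Author 86173
* @Date 2020/5/11 15:14
* @Version 1.0
*/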
public class PdfBoxKeyWordPosition extends PDFTextStripper {
// Characters of the keyword to search for, one array element per char.
private char[] key;
// Path of the PDF file on disk; getCoordinate() tries this source first.
private String pdfPath;
// Raw bytes of the PDF; getCoordinate() loads from these when no document was loaded from pdfPath.
private byte[] fileBytes;
// Matches collected over all pages; each entry is {x, y, pageNumber}.
private List<float[]> list = new ArrayList<float[]>();
// Matches found on the page currently being processed; filled by writeString().
private List<float[]> pagelist = new ArrayList<float[]>();
/**
 * Creates a stripper that searches a PDF for the given keyword.
 *
 * @param keyWords keyword to look for; must not be {@code null}
 * @param pdfPath  path of the PDF file; may be empty when {@code bin} is supplied
 * @param bin      content of the PDF as bytes; used when no document is loaded from {@code pdfPath}
 * @throws IOException if the {@link PDFTextStripper} super constructor fails
 */
public PdfBoxKeyWordPosition(String keyWords, String pdfPath,byte[] bin) throws IOException {
    super();
    // Split the keyword into single characters for the per-glyph comparison in writeString().
    this.key = keyWords.toCharArray();
    this.fileBytes = bin;
    this.pdfPath = pdfPath;
    super.setSortByPosition(true);
}
/**
 * @return the keyword characters being searched for (the internal array, not a copy)
 */
public char[] getKey() {
    return this.key;
}

/**
 * @param key the keyword characters to search for; the array is stored as-is
 */
public void setKey(char[] key) {
    this.key = key;
}

/**
 * @return the path of the PDF file this instance reads from
 */
public String getPdfPath() {
    return this.pdfPath;
}

/**
 * @param pdfPath the path of the PDF file to read from
 */
public void setPdfPath(String pdfPath) {
    this.pdfPath = pdfPath;
}
/**
 * Scans every page of the PDF and collects the positions at which the keyword was found.
 *
 * <p>The document is loaded from {@code pdfPath} when that is non-empty, otherwise from
 * {@code fileBytes}. Each page is processed on its own so that the matches recorded by
 * {@link #writeString} can be tagged with their page number.
 *
 * <p>Failures while loading or parsing are reported on {@code System.err} and the matches
 * gathered so far are returned (best-effort, as before).
 *
 * @return a list of {@code {x, y, pageNumber}} entries; empty when nothing was found or no
 *         PDF source is configured
 * @throws IOException if closing the document fails
 */
public List<float[]> getCoordinate() throws IOException {
    // Use a fresh result list per call: repeated calls must not accumulate duplicates,
    // and lists handed out by earlier calls must stay untouched.
    list = new ArrayList<float[]>();
    // Keep the loaded document in a local so a stale, already closed document left in the
    // inherited field by a previous call can never be reused.
    PDDocument doc = null;
    try {
        if (!StringHelper.isEmpty(pdfPath)) {
            doc = PDDocument.load(new File(pdfPath));
        } else if (fileBytes != null) {
            doc = PDDocument.load(fileBytes);
        }
        if (doc == null) {
            // Previously this case surfaced as a swallowed NullPointerException.
            System.err.println("PdfBoxKeyWordPosition: no PDF source configured (pdfPath is empty and fileBytes is null)");
            return list;
        }
        super.setSortByPosition(true);
        int pages = doc.getNumberOfPages();
        for (int i = 1; i <= pages; i++) {
            pagelist.clear();
            // Restrict the stripper to a single page so matches can be attributed to it.
            super.setStartPage(i);
            super.setEndPage(i);
            // The extracted text itself is not needed; writeText() is run only for the
            // writeString() callbacks that fill pagelist.
            Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
            super.writeText(doc, dummy);
            for (float[] li : pagelist) {
                li[2] = i;
            }
            list.addAll(pagelist);
        }
    } catch (Exception e) {
        // Best-effort: report the problem and return what has been collected so far.
        e.printStackTrace();
    } finally {
        // writeText() stores the document in the inherited field; drop that reference
        // because the document is closed below.
        document = null;
        if (doc != null) {
            doc.close();
        }
    }
    return list;
}

/**
 * Callback invoked by {@link PDFTextStripper} for each run of text; records the position
 * of the first occurrence of the keyword inside that run.
 *
 * <p>A match requires every character of the keyword, including the last one, to appear
 * consecutively in {@code textPositions}. Only the first match per run is recorded.
 *
 * @param string        the text of the run (unused)
 * @param textPositions the glyph positions making up the run
 * @throws IOException declared by the overridden method; not thrown here
 */
@Override
protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
    if (key == null || key.length == 0) {
        // Nothing to search for; avoids indexing into an empty keyword.
        return;
    }
    // Last start index at which the whole keyword still fits into this run.
    int lastStart = textPositions.size() - key.length;
    for (int i = 0; i <= lastStart; i++) {
        if (!matchesKeyAt(textPositions, i)) {
            continue;
        }
        TextPosition first = textPositions.get(i);
        float[] idx = new float[3];
        // Offsets are applied so that a stamp placed at the result covers the text.
        // X is shifted right by one font size; first.getX() alone is the raw position.
        idx[0] = first.getX() + first.getFontSize();
        // NOTE(review): the original comment names getPageHeight() - getY() as the plain
        // alternative, but the code uses getHeight(). Left unchanged because callers may
        // already compensate for it - confirm the intended formula.
        idx[1] = first.getHeight() - first.getY() - 4 * first.getFontSize();
        System.out.println("x=" + idx[0] + ",y=" + idx[1]);
        pagelist.add(idx);
        // Only the first occurrence within a run is reported.
        return;
    }
}

// Returns true when all keyword characters appear consecutively starting at index start.
private boolean matchesKeyAt(List<TextPosition> textPositions, int start) {
    for (int j = 0; j < key.length; j++) {
        String s = textPositions.get(start + j).getUnicode();
        if (s == null || !s.equals(String.valueOf(key[j]))) {
            return false;
        }
    }
    return true;
}

/**
 * Demo entry point: searches a PDF for a keyword and prints the pages and coordinates found.
 *
 * @param args optional; {@code args[0]} is the PDF path and {@code args[1]} the keyword.
 *             The original sample values are used for whichever is missing.
 * @throws IOException if the stripper cannot be created or the document cannot be closed
 */
public static void main(String[] args) throws IOException {
    String pdfPath = args.length > 0 ? args[0] : "C:\\Users\\pangq\\Desktop\\555.pdf";
    String keyWords = args.length > 1 ? args[1] : "纪海祥";
    byte[] bytes = File2byte(new File(pdfPath));
    if (bytes == null) {
        // File2byte reports the cause itself and signals failure with null.
        System.err.println("Cannot read PDF file: " + pdfPath);
        return;
    }
    // The path is passed as "" so that the document is loaded from the bytes.
    PdfBoxKeyWordPosition pdf = new PdfBoxKeyWordPosition(keyWords, "", bytes);
    List<float[]> list = pdf.getCoordinate();
    System.out.println("pages=" + convert2String(list));
    // Handles multi-page PDFs: one line per match, including its page number.
    for (float[] fs : list) {
        System.out.println("page=" + (int) fs[2] + ",x=" + fs[0] + ",y=" + fs[1]);
    }
}
/**
 * Reads a whole file into memory.
 *
 * @param tradeFile the file to read
 * @return the file content, or {@code null} when the file cannot be opened or read
 *         (the failure is printed via {@code printStackTrace()})
 */
public static byte[] File2byte(File tradeFile){
    byte[] result = null;
    FileInputStream in = null;
    ByteArrayOutputStream out = null;
    try {
        in = new FileInputStream(tradeFile);
        out = new ByteArrayOutputStream();
        byte[] chunk = new byte[1024];
        for (int read = in.read(chunk); read != -1; read = in.read(chunk)) {
            out.write(chunk, 0, read);
        }
        in.close();
        out.close();
        // Assigned only after both streams closed cleanly, so any failure yields null.
        result = out.toByteArray();
    } catch (IOException e) {
        // Also covers FileNotFoundException, which is handled the same way.
        e.printStackTrace();
    } finally {
        closeAndReport(in);
        closeAndReport(out);
    }
    return result;
}

// Closes the stream if it was opened, printing (not propagating) any close failure.
private static void closeAndReport(Closeable stream) {
    if (stream == null) {
        return;
    }
    try {
        stream.close();
    } catch (IOException io) {
        io.printStackTrace();
    }
}

/**
 * Extracts the distinct page numbers from a list of match entries.
 *
 * @param list entries whose third element ({@code [2]}) holds the page number; may be {@code null}
 * @return the page numbers in order of first appearance; empty for {@code null} or empty input
 */
public static List<Integer> convert(List<float[]> list){
    List<Integer> pages = new ArrayList<Integer>();
    if (list == null || list.isEmpty()) {
        return pages;
    }
    for (float[] entry : list) {
        Integer page = Integer.valueOf((int) entry[2]);
        // Keep only the first occurrence of each page number.
        if (pages.indexOf(page) < 0) {
            pages.add(page);
        }
    }
    return pages;
}
/**
 * Renders the distinct page numbers of the given match entries as a comma-separated string.
 *
 * @param list entries whose third element holds the page number; may be {@code null}
 * @return the page numbers joined with {@code ","}
 */
public static String convert2String(List<float[]> list){
    return StringUtils.join(convert(list).iterator(), ",");
}

/**
 * Returns the number of pages of a PDF given as bytes.
 *
 * @param bystes the PDF content; may be {@code null}
 * @return the page count, or {@code 0} when the input is {@code null} or cannot be parsed
 */
public static int getPdfNubers(byte[] bystes){
    if (bystes == null) {
        return 0;
    }
    PDDocument pdfReader = null;
    try {
        pdfReader = PDDocument.load(bystes);
        return pdfReader.getNumberOfPages();
    } catch (IOException e) {
        // Unreadable input counts as zero pages, as before.
        return 0;
    } finally {
        // The document was never closed before, leaking its resources.
        if (pdfReader != null) {
            try {
                pdfReader.close();
            } catch (IOException ignored) {
                // The page count is already determined; a close failure does not change it.
            }
        }
    }
}

897221242

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
PDF文件中，如何根据关键字，获取坐标信息

本文通过继承 PDFBox 的 PDFTextStripper，根据关键字（示例：String keyWords = "纪海祥"）获取其在 PDF 中的坐标信息。类中保存关键字字符数组、PDF 文件路径以及当前页信息集合，并支持多页 PDF 的处理。为了使章盖在字体上，需要对坐标进行一些调整：X 坐标在这里加上了字体的长度，也可以直接使用 idx[0] = textPositions.get(i).getX()；Y 坐标在这里减去了字体的长度，也可以直接使用 idx[1] = textPositions.get(i).getPageHeight()-textPositions.get(i).getY()。
复制链接

扫一扫