Java 中根据关键字查找其在 PDF 文件中的位置

首先,需要准备两个 jar 包(代码依赖 iText 5,即下面 import 中的 com.itextpdf.*)。

这两个jar包如果不好找或找不到可以去我的资源下载哦。

https://download.csdn.net/download/qq_43560721/11628645

然后是主要代码,粘上去就可以用,亲测有用。

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import com.itextpdf.awt.geom.Rectangle2D.Float;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.ContentByteUtils;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.PdfContentStreamProcessor;
import com.itextpdf.text.pdf.parser.RenderListener;
import com.itextpdf.text.pdf.parser.TextRenderInfo;

public class PdfKeywordFinder {

	/**
	 * Demo entry point: loads a PDF from a hard-coded path, searches it for a
	 * hard-coded keyword and prints every match to standard output.
	 *
	 * @param args command-line arguments (unused)
	 * @throws IOException if the file cannot be opened or read completely, or if
	 *                     extracting the PDF content fails
	 */
	public static void main(String[] args) throws IOException {
		File pdfFile = new File("E:/b.pdf");
		long fileLength = pdfFile.length();
		if (fileLength > Integer.MAX_VALUE) {
			throw new IOException("PDF file is too large to load into memory: " + pdfFile);
		}
		byte[] pdfData = new byte[(int) fileLength];

		// A single read() call may return fewer bytes than requested, so keep
		// reading until the buffer is full. try-with-resources closes the stream
		// on every path.
		try (FileInputStream inputStream = new FileInputStream(pdfFile)) {
			int offset = 0;
			while (offset < pdfData.length) {
				int count = inputStream.read(pdfData, offset, pdfData.length - offset);
				if (count < 0) {
					throw new IOException("Unexpected end of file while reading " + pdfFile);
				}
				offset += count;
			}
		}

		String keyword = "甲方:";

		// findKeywordPostions always returns a list (possibly empty), never null.
		List<float[]> positions = findKeywordPostions(pdfData, keyword);

		System.out.println("total:" + positions.size());
		for (float[] position : positions) {
			System.out.print("pageNum: " + (int) position[0]);
			System.out.print("\tx: " + position[1]);
			System.out.println("\ty: " + position[2]);
		}
	}

	/**
	 * Searches every page of a PDF document for a keyword and collects the
	 * position of the first character of each occurrence.
	 *
	 * @param pdfData raw bytes of the PDF document
	 * @param keyword text to search for
	 * @return one {@code float[]} per occurrence, in page order:
	 *         {@code [0]} page number, {@code [1]} x, {@code [2]} y
	 *         (x and y are the values recorded by the render listener)
	 * @throws IOException if extracting the page contents fails
	 */
	public static List<float[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException {
		List<float[]> matches = new ArrayList<>();
		for (PdfPageContentPositions page : getPdfContentPostionsList(pdfData)) {
			List<float[]> pageMatches = findPositions(keyword, page);
			// Only pages that actually contain the keyword contribute entries.
			if (pageMatches != null && !pageMatches.isEmpty()) {
				matches.addAll(pageMatches);
			}
		}
		return matches;
	}

	/**
	 * Renders the PDF page by page and returns, for each page, its extracted
	 * text together with one position entry per extracted character.
	 *
	 * @param pdfData raw bytes of the PDF document
	 * @return one {@link PdfPageContentPositions} per page, in page order
	 * @throws IOException if the document cannot be opened or a page's content
	 *                     stream cannot be processed
	 */
	private static List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException {
		PdfReader reader = new PdfReader(pdfData);
		// try/finally guarantees the reader is released on every exit path,
		// including unchecked exceptions thrown while processing a page.
		try {
			List<PdfPageContentPositions> result = new ArrayList<>();

			int pages = reader.getNumberOfPages();
			for (int pageNum = 1; pageNum <= pages; pageNum++) {
				float width = reader.getPageSize(pageNum).getWidth();
				float height = reader.getPageSize(pageNum).getHeight();

				PdfRenderListener pdfRenderListener = new PdfRenderListener(pageNum, width, height);

				// Render this page's content stream through the listener.
				PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener);
				PdfDictionary pageDic = reader.getPageN(pageNum);
				PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
				processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);

				String content = pdfRenderListener.getContent();
				List<CharPosition> charPositions = pdfRenderListener.getcharPositions();

				// Flatten each CharPosition into {pageNum, x, y} so that the list
				// index corresponds to the character index in the page text.
				List<float[]> positionsList = new ArrayList<>();
				for (CharPosition charPosition : charPositions) {
					float[] positions = new float[] { charPosition.getPageNum(), charPosition.getX(),
							charPosition.getY() };
					positionsList.add(positions);
				}

				PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions();
				pdfPageContentPositions.setContent(content);
				pdfPageContentPositions.setPostions(positionsList);

				result.add(pdfPageContentPositions);
			}
			return result;
		} finally {
			reader.close();
		}
	}

	/**
	 * Finds every occurrence of {@code keyword} in one page's text and returns
	 * the position entry of the first character of each occurrence.
	 * Overlapping occurrences are reported, because the search resumes one
	 * character after each hit.
	 *
	 * @param keyword                 text to search for; {@code null} or empty
	 *                                yields no matches
	 * @param pdfPageContentPositions page text plus the per-character positions
	 * @return the matching position entries, in order of appearance; never
	 *         {@code null}
	 */
	private static List<float[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) {

		List<float[]> result = new ArrayList<>();

		String content = pdfPageContentPositions.getContent();
		List<float[]> charPositions = pdfPageContentPositions.getPositions();

		// An empty keyword "matches" at every index (indexOf("", i) == i), which
		// would report every character as a hit; treat it, and missing data, as
		// "nothing to find".
		if (keyword == null || keyword.isEmpty() || content == null || charPositions == null) {
			return result;
		}

		for (int pos = 0; pos < content.length();) {
			int positionIndex = content.indexOf(keyword, pos);
			if (positionIndex == -1) {
				break;
			}
			// The character index in the text doubles as the index into the
			// position list.
			float[] postions = charPositions.get(positionIndex);
			result.add(postions);
			pos = positionIndex + 1;
		}
		return result;
	}

	/**
	 * Holder for one page's extracted text and the position entries that go
	 * with it.
	 */
	private static class PdfPageContentPositions {
		/** Extracted text of the page. */
		private String pageText;
		/** Position entries; each is laid out as {pageNum, x, y}. */
		private List<float[]> charPositions;

		/** Stores the extracted page text. */
		public void setContent(String content) {
			pageText = content;
		}

		/** @return the extracted page text, or {@code null} if none was set */
		public String getContent() {
			return pageText;
		}

		/** Stores the position entries. */
		public void setPostions(List<float[]> positions) {
			charPositions = positions;
		}

		/** @return the position entries, or {@code null} if none were set */
		public List<float[]> getPositions() {
			return charPositions;
		}
	}

	/**
	 * Render listener that accumulates a page's text one character at a time
	 * and records a position for each appended character, so that index
	 * {@code i} of the text corresponds to index {@code i} of the position list.
	 */
	private static class PdfRenderListener implements RenderListener {
		private final int pageNum;
		private final float pageWidth;
		private final float pageHeight;
		private final StringBuilder contentBuilder = new StringBuilder();
		private final List<CharPosition> charPositions = new ArrayList<>();

		/**
		 * @param pageNum    page number stored in every recorded position
		 * @param pageWidth  page width used to normalise x coordinates
		 * @param pageHeight page height used to normalise y coordinates
		 */
		public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) {
			this.pageNum = pageNum;
			this.pageWidth = pageWidth;
			this.pageHeight = pageHeight;
		}

		@Override
		public void beginTextBlock() {
			// Nothing to do.
		}

		/**
		 * Records one character and one position for each character-level render
		 * info of the given text chunk.
		 */
		@Override
		public void renderText(TextRenderInfo renderInfo) {
			List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();
			for (TextRenderInfo textRenderInfo : characterRenderInfos) {
				String word = textRenderInfo.getText();
				if (word == null || word.isEmpty()) {
					// No character would be appended for this glyph; recording a
					// position anyway would shift every later text index off its
					// position entry.
					continue;
				}
				if (word.length() > 1) {
					// Keep exactly one character per position entry.
					word = word.substring(word.length() - 1, word.length());
				}
				Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange();
				double x = rectangle.getMinX();
				double y = rectangle.getMaxY();

				// Normalise to fractions of the page size, rounded to four decimal
				// places. y is stored as 1 - y / pageHeight (presumably so that it
				// is measured from the top edge of the page).
				float xPercent = Math.round(x / pageWidth * 10000) / 10000f;
				float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f;

				CharPosition charPosition = new CharPosition(pageNum, xPercent, yPercent);
				charPositions.add(charPosition);
				contentBuilder.append(word);
			}
		}

		@Override
		public void endTextBlock() {
			// Nothing to do.
		}

		@Override
		public void renderImage(ImageRenderInfo renderInfo) {
			// Images are ignored; only text is collected.
		}

		/** @return the text accumulated so far */
		public String getContent() {
			return contentBuilder.toString();
		}

		/** @return the positions recorded so far, one per character of the text */
		public List<CharPosition> getcharPositions() {
			return charPositions;
		}
	}

	/**
	 * Immutable record of where a single character sits: the page number and
	 * the x / y values supplied at construction.
	 */
	private static class CharPosition {
		private final int pageNum;
		private final float x;
		private final float y;

		/**
		 * @param pageNum page number of the character
		 * @param x       x value of the character
		 * @param y       y value of the character
		 */
		public CharPosition(int pageNum, float x, float y) {
			this.pageNum = pageNum;
			this.x = x;
			this.y = y;
		}

		/** @return the page number */
		public int getPageNum() {
			return pageNum;
		}

		/** @return the x value */
		public float getX() {
			return x;
		}

		/** @return the y value */
		public float getY() {
			return y;
		}

		/** Formats the position as {@code [pageNum=..,x=..,y=..]}. */
		@Override
		public String toString() {
			return "[pageNum=" + pageNum + ",x=" + x + ",y=" + y + "]";
		}
	}
}

控制台输出。

解释说明:文档中只有一处出现“甲方:”,位于第 2 页;x = 0.1915、y = 0.386 均为比例值,分别表示该处距页面左边缘的距离占页面宽度的比例,以及距页面上边缘的距离占页面高度的比例。

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值