java 分页读取文件大小_java 读取pdf (可分页读取)

最新推荐文章于 2024-07-10 15:34:17 发布

邦成为寄卖连锁

最新推荐文章于 2024-07-10 15:34:17 发布

阅读量125

点赞数

文章标签： java 分页读取文件大小

本文链接：https://blog.csdn.net/weixin_42562079/article/details/114569376

版权

需要 PDFBox 和 log4j 的 jar 包（下面的示例使用旧版 PDFBox 的 org.pdfbox 包名）。

举个例子：

import org.pdfbox.pdfparser.*;

import org.pdfbox.util.PDFTextStripper;

import java.io.*;

/**

* 测试pdfbox

* @author kingfish

* @version 1.0

public class TestPdf {

public static void main(String[] args) throws Exception{

FileInputStream fis = new FileInputStream("c://intro.pdf");

PDFParser p = new PDFParser(fis);

p.parse();

PDFTextStripper ts = new PDFTextStripper();

String s = ts.getText(p.getPDDocument());

System.out.println(s);

fis.close();

}

--------------------------------------------------------------------------------

import java.io.*; import java.util.*; import com.etymon.pj.*; import com.etymon.pj.object.*; import com.etymon.pj.exception.*; /** * This is a wrapper for the Pj PDF parser */ public class PjWrapper { Pdf pdf; PjCatalog catalog; PjPagesNode rootPage; public PjWrapper(String PdfFileName,String TextFileName)throws IOException, PjException { pdf = new Pdf(PdfFileName); // hopefully the catalog can never be a reference... catalog = (PjCatalog) pdf.getObject(pdf.getCatalog()); // root node of pages tree is specified by a reference in the catalog rootPage = (PjPagesNode) pdf.resolve(catalog.getPages()); } public static void main (String [] args) throws IOException, PjException { /*PjWrapper testWrapper = new PjWrapper(args[0]); LinkedList textList = testWrapper.getAllText();*/ } /** * Returns as much text as we can extract from the PDF. * This currently includes: * * NOTE: Pj does not support LZW, so some text in some PDF's may not * be indexable */ public LinkedList getAllText() throws PjException { LinkedList stringList = new LinkedList(); Iterator streamIter = getAllContentsStreams().iterator(); PjStream stream; String streamData; String streamText; boolean moreData; int textStart, textEnd; //System.out.println("Going through streams..."); while(streamIter.hasNext()) { //System.out.println("Getting next stream"); stream = (PjStream) streamIter.next(); //System.out.println("Adding text from stream with filter: " +getFilterString(stream); stream = stream.flateDecompress(); //System.out.println("Adding text from stream with filter afterdecompress: " + getFilterString(stream)); streamData = new String(stream.getBuffer()); streamText = new String(); moreData = true; textStart = textEnd = 0; while(moreData) { if ((textStart = streamData.indexOf('(', textEnd + 1)) < 0) { moreData = false; break; } if ((textEnd = streamData.indexOf(')', textStart + 1)) < 0) { moreData = false; break; } try { streamText += PjString.decodePdf(streamData.substring(textStart,textEnd + 1)); } 
catch (Exception e) { System.out.println("malformed string: " + streamData.substring(textStart, textEnd + 1)); } } //if(streamText.equals("inserted text")) System.out.println(streamText); if (streamText.length() > 0) stringList.add(streamText); } return stringList; } public static String getFilterString(PjStream stream) throws PjException { String filterString = new String(); PjObject filter; //System.out.println("getting filter from dictionary"); if ((filter = stream.getStreamDictionary().getFilter()) == null) { //System.out.println("Got null filter"); return ""; } //System.out.println("got it"); // filter should either be a name or an array of names if (filter instanceof PjName) { //System.out.println("getting filter string from simple name"); filterString = ((PjName) filter).getString(); } else { //System.out.println("getting filter string from array of names"); Iterator nameIter; Vector nameVector; if ((nameVector = ((PjArray) filter).getVector()) == null) { //System.out.println("got null vector for list of names"); return ""; } nameIter = nameVector.iterator(); while (nameIter.hasNext()) { filterString += ((PjName) nameIter.next()).getString(); if (nameIter.hasNext()) filterString += " "; } } //System.out.println("got filter string"); return filterString; } /** * Performs a post-order traversal of the pages tree * from the root node and gets all of the contents streams * @returns a list of all the contents of all the pages */ public LinkedList getAllContentsStreams() throws InvalidPdfObjectException { return getContentsStreams(getAllPages()); } /** * Get contents streams from the list of PjPage objects * @returns a list of all the contents of the pages */ public LinkedList getContentsStreams(LinkedList pages) throws InvalidPdfObjectException { LinkedList streams = new LinkedList(); Iterator pageIter = pages.iterator(); PjObject contents; while(pageIter.hasNext()) { contents = pdf.resolve(((PjPage)pageIter.next()).getContents()); // should only be a stream or 
an array of streams (or refs to streams) if (contents instanceof PjStream) streams.add(contents); else{ Iterator streamsIter = ((PjArray)contents).getVector().iterator(); while(streamsIter.hasNext()) streams.add(pdf.resolve((PjObject)streamsIter.next())); } } return streams ; } /** * Performs a post-order traversal of the pages tree * from the root node. * @returns a list of all the PjPage objects */ public LinkedList getAllPages() throws InvalidPdfObjectException { LinkedList pages = new LinkedList(); getPages(rootPage, pages); return pages; } /** * Performs a post-order traversal of the pages tree * from the node passed to it. * @returns a list of all the PjPage objects under node */ public void getPages(PjObject node, LinkedList pages) throws InvalidPdfObjectException { PjPagesNode pageNode = null; // let's hope pdf's don't have pointers to pointers if (node instanceof PjReference) pageNode = (PjPagesNode) pdf.resolve(node); else pageNode = (PjPagesNode) node; if (pageNode instanceof PjPage) { pages.add(pageNode); return; } // kids better be an array and not a reference to one Iterator kidIterator = ((PjArray) ((PjPages) pageNode).getKids()).getVector().iterator(); while(kidIterator.hasNext()) { getPages((PjObject) kidIterator.next(), pages); } } public Pdf getPdf() { return pdf; } }

邦成为寄卖连锁

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java 分页读取文件大小_java 读取pdf (可分页读取)

需要pdfbox和log4j的包举个例子：import org.pdfbox.pdfparser.*;import org.pdfbox.util.PDFTextStripper;import java.io.*;/*** 测试pdfbox* @author kingfish* @version 1.0*/public class TestPdf {public static void main(...
复制链接

扫一扫