java pdf 分页_java 读取pdf (可分页读取)

需要把 PDFBox 和 log4j 的 jar 包加入 classpath(下面第一个示例使用的是旧版 PDFBox 的 org.pdfbox 包名)。

举个例子:

import org.pdfbox.pdfparser.*;

import org.pdfbox.util.PDFTextStripper;

import java.io.*;

/**
 * Minimal PDFBox smoke test: parses a PDF file and prints its extracted text.
 *
 * @author kingfish
 * @version 1.0
 */
public class TestPdf {

    /**
     * Parses the hard-coded file {@code c://intro.pdf} and writes the text
     * returned by {@code PDFTextStripper.getText} to standard output.
     *
     * @param args ignored
     * @throws Exception if the file cannot be opened, parsed or stripped
     */
    public static void main(String[] args) throws Exception {
        FileInputStream fis = new FileInputStream("c://intro.pdf");
        try {
            PDFParser p = new PDFParser(fis);
            p.parse();

            PDFTextStripper ts = new PDFTextStripper();
            // NOTE(review): the document returned by getPDDocument() is never
            // closed here; presumably it should be closed as well - confirm
            // against the PDFBox version in use.
            String s = ts.getText(p.getPDDocument());
            System.out.println(s);
        } finally {
            // Release the file handle even when parsing or extraction throws.
            fis.close();
        }
    }
}

--------------------------------------------------------------------------------

import java.io.*;

import java.util.*;

import com.etymon.pj.*;

import com.etymon.pj.object.*;

import com.etymon.pj.exception.*;

/**

* This is a wrapper for the Pj PDF parser

*/

public class PjWrapper {

Pdf pdf;

PjCatalog catalog;

PjPagesNode rootPage;

public PjWrapper(String PdfFileName,String TextFileName)throws

IOException, PjException {

pdf = new Pdf(PdfFileName);

// hopefully the catalog can never be a reference...

catalog = (PjCatalog) pdf.getObject(pdf.getCatalog());

// root node of pages tree is specified by a reference in the catalog

rootPage = (PjPagesNode) pdf.resolve(catalog.getPages());

}

public static void main (String [] args) throws IOException, PjException

{

/*PjWrapper testWrapper = new PjWrapper(args[0]);

LinkedList textList = testWrapper.getAllText();*/

}

/**

* Returns as much text as we can extract from the PDF.

* This currently includes:

*

* NOTE: Pj does not support LZW, so some text in some PDF's may not

* be indexable

*/

public LinkedList getAllText() throws PjException {

LinkedList stringList = new LinkedList();

Iterator streamIter = getAllContentsStreams().iterator();

PjStream stream;

String streamData;

String streamText;

boolean moreData;

int textStart, textEnd;

//System.out.println("Going through streams...");

while(streamIter.hasNext()) {

//System.out.println("Getting next stream");

stream = (PjStream) streamIter.next();

//System.out.println("Adding text from stream with filter: "

+getFilterString(stream);

stream = stream.flateDecompress();

//System.out.println("Adding text from stream with filter

afterdecompress: " + getFilterString(stream));

streamData = new String(stream.getBuffer());

streamText = new String();

moreData = true;

textStart = textEnd = 0;

while(moreData) {

if ((textStart = streamData.indexOf('(', textEnd + 1)) < 0) {

moreData = false;

break;

}

if ((textEnd = streamData.indexOf(')', textStart + 1)) < 0) {

moreData = false;

break;

}

try {

streamText +=

PjString.decodePdf(streamData.substring(textStart,textEnd + 1));

} catch (Exception e) {

System.out.println("malformed string: " +

streamData.substring(textStart, textEnd + 1));

}

}

//if(streamText.equals("inserted text"))

System.out.println(streamText);

if (streamText.length() > 0)

stringList.add(streamText);

}

return stringList;

}

public static String getFilterString(PjStream stream) throws PjException

{

String filterString = new String();

PjObject filter;

//System.out.println("getting filter from dictionary");

if ((filter = stream.getStreamDictionary().getFilter()) == null) {

//System.out.println("Got null filter");

return "";

}

//System.out.println("got it");

// filter should either be a name or an array of names

if (filter instanceof PjName) {

//System.out.println("getting filter string from simple name");

filterString = ((PjName) filter).getString();

} else {

//System.out.println("getting filter string from array of names");

Iterator nameIter;

Vector nameVector;

if ((nameVector = ((PjArray) filter).getVector()) == null) {

//System.out.println("got null vector for list of names");

return "";

}

nameIter = nameVector.iterator();

while (nameIter.hasNext()) {

filterString += ((PjName) nameIter.next()).getString();

if (nameIter.hasNext())

filterString += " ";

}

}

//System.out.println("got filter string");

return filterString;

}

/**

* Performs a post-order traversal of the pages tree

* from the root node and gets all of the contents streams

* @returns a list of all the contents of all the pages

*/

public LinkedList getAllContentsStreams() throws

InvalidPdfObjectException {

return getContentsStreams(getAllPages());

}

/**

* Get contents streams from the list of PjPage objects

* @returns a list of all the contents of the pages

*/

public LinkedList getContentsStreams(LinkedList pages) throws

InvalidPdfObjectException {

LinkedList streams = new LinkedList();

Iterator pageIter = pages.iterator();

PjObject contents;

while(pageIter.hasNext()) {

contents = pdf.resolve(((PjPage)pageIter.next()).getContents());

// should only be a stream or an array of streams (or refs to

streams)

if (contents instanceof PjStream)

streams.add(contents);

else{

Iterator streamsIter = ((PjArray)contents).getVector().iterator();

while(streamsIter.hasNext())

streams.add(pdf.resolve((PjObject)streamsIter.next()));

}

}

return streams ;

}

/**

* Performs a post-order traversal of the pages tree

* from the root node.

* @returns a list of all the PjPage objects

*/

public LinkedList getAllPages() throws InvalidPdfObjectException {

LinkedList pages = new LinkedList();

getPages(rootPage, pages);

return pages;

}

/**

* Performs a post-order traversal of the pages tree

* from the node passed to it.

* @returns a list of all the PjPage objects under node

*/

public void getPages(PjObject node, LinkedList pages) throws

InvalidPdfObjectException {

PjPagesNode pageNode = null;

// let's hope pdf's don't have pointers to pointers

if (node instanceof PjReference)

pageNode = (PjPagesNode) pdf.resolve(node);

else

pageNode = (PjPagesNode) node;

if (pageNode instanceof PjPage) {

pages.add(pageNode);

return;

}

// kids better be an array and not a reference to one

Iterator kidIterator = ((PjArray) ((PjPages)

pageNode).getKids()).getVector().iterator();

while(kidIterator.hasNext()) {

getPages((PjObject) kidIterator.next(), pages);

}

}

public Pdf getPdf() {

return pdf;

}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值