java提取pdf题目_java读取doc,pdf问题。

最新推荐文章于 2023-10-24 19:10:33 发布

李禾子呀

最新推荐文章于 2023-10-24 19:10:33 发布

阅读量152

点赞数

文章标签： java提取pdf题目

本文链接：https://blog.csdn.net/weixin_42183486/article/details/114552160

版权

展开全部

PDFBox是一个开源的、用于操作pdf文件的Java库。使用前需要把PDFBox-0.7.3.jar加入classpath，同时把FontBox-1.0.jar也加入classpath，否则运行时会报错。

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import org.pdfbox.pdfparser.PDFParser;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.util.PDFTextStripper;

public class PdfReader {

/**

* simply reader all the text from a pdf file.

* You have to deal with the format of the output text by yourself.

* 2008-2-25

* @param pdfFilePath file path

* @return all text in the pdf file

public static String getTextFromPDF(String pdfFilePath)

{

String result = null;

FileInputStream is = null;

PDDocument document = null;

try {

is = new FileInputStream(pdfFilePath);

PDFParser parser = new PDFParser(is);

parser.parse();

document = parser.getPDDocument();

PDFTextStripper stripper = new PDFTextStripper();

result = stripper.getText(document);

} catch (FileNotFoundException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} finally {

if (is != null) {

try {

is.close();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

if (document != null) {

try {

document.close();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

return result;

}

public static void main(String[] args)

{

String str=PdfReader.getTextFromPDF("C:\\Read.pdf");

System.out.println(str);

}

代码2：

import java.io.File;

import java.io.FileOutputStream;

import java.io.OutputStreamWriter;

import java.io.Writer;

import java.net.MalformedURLException;

import java.net.URL;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.util.PDFTextStripper;

public class PDFReader {

public void readFdf(String file) throws Exception {

boolean sort = false;

String pdfFile = file;

String textFile = null;

String encoding = "UTF-8";

int startPage = 1;

int endPage = Integer.MAX_VALUE;

Writer output = null;

PDDocument document = null;

try {

// 首先当作一个URL来装载文件，如果得到异常再从本地文件系统//去装载文件

URL url = new URL(pdfFile);

//注意参数已不是以前版本中的URL.而是File。

document = PDDocument.load(pdfFile);

// 获取PDF的文件名

String fileName = url.getFile();

// 以原来PDF的名称来命名新产生的txt文件

if (fileName.length() > 4) {

File outputFile = new File(fileName.substring(0, fileName

.length() - 4)

+ ".txt");

textFile = outputFile.getName();

}

} catch (MalformedURLException e) {

// 如果作为URL装载得到异常则从文件系统装载

//注意参数已不是以前版本中的URL.而是File。

document = PDDocument.load(pdfFile);

if (pdfFile.length() > 4) {

textFile = pdfFile.substring(0, pdfFile.length() - 4)

+ ".txt";

}

output = new OutputStreamWriter(new FileOutputStream(textFile),

encoding);

PDFTextStripper stripper = null;

stripper = new PDFTextStripper();

// 设置是否排序

stripper.setSortByPosition(sort);

// 设置起始页

stripper.setStartPage(startPage);

// 设置结束页

stripper.setEndPage(endPage);

// 调用PDFTextStripper的writeText提取并输出文本

stripper.writeText(document, output);

} finally {

if (output != null) {

// 关闭输出流

output.close();

}

if (document != null) {

// 关闭PDF Document

document.close();

}

/**

* @param args

public static void main(String[] args) {

// TODO Auto-generated method stub

PDFReader pdfReader = new PDFReader();

try {

// 取得E盘下的SpringGuide.pdf的内容

pdfReader.readFdf("C:\\Read.pdf");

} catch (Exception e) {

e.printStackTrace();

}

2、抽取支持中文的pdf文件－xpdf

xpdf是一个开源项目，我们可以调用它的本地程序（pdftotext）来抽取中文pdf文件。下载地址：

http://www.java-cn.com/technology/tech_downs/1880_004.zip

补丁包：

http://www.java-cn.com/technology/tech_downs/1880_005.zip

按照readme放好中文的patch，就可以开始写调用本地方法的java程序了。

下面是一个如何调用的例子：

import java.io.*;

/**
 * Title: pdf extraction
 *
 * <p>Description: email:chris@matrix.org.cn
 *
 * <p>Company: Matrix.org.cn
 *
 * <p>Runs the external xpdf {@code pdftotext} program and prints the text it
 * writes to standard output.
 *
 * @author chris
 * @version 1.0,who use this example pls remain the declare
 */
public class PdfWin {

    public PdfWin() {
    }

    /**
     * Converts a sample PDF to text with {@code pdftotext} and prints the size
     * of every chunk read followed by the complete extracted text.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the process cannot be started or its output cannot
     *         be read
     */
    public static void main(String args[]) throws Exception {
        // Backslashes must be escaped inside Java string literals.
        String pathToXpdf = "C:\\Program Files\\xpdf\\pdftotext.exe";
        String filename = "c:\\a.pdf";
        // The trailing "-" is passed as the output-file argument; the text is
        // then read from the process's standard output below.
        String[] cmd = new String[] { pathToXpdf, "-enc", "UTF-8", "-q", filename, "-" };
        Process p = Runtime.getRuntime().exec(cmd);

        StringWriter out = new StringWriter();
        char[] buf = new char[10000];
        InputStreamReader reader =
                new InputStreamReader(new BufferedInputStream(p.getInputStream()), "UTF-8");
        try {
            int len;
            while ((len = reader.read(buf)) >= 0) {
                // Keep only the characters actually read; the rest of buf
                // still holds stale data from earlier iterations.
                out.write(buf, 0, len);
                System.out.println("the length is" + len);
            }
        } finally {
            reader.close();
        }
        // Wait for the child process to exit before finishing.
        p.waitFor();

        String ts = out.toString();
        System.out.println("the str is" + ts);
    }
}

李禾子呀

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫