lucene如何解析pdf文档

最新推荐文章于 2022-03-28 11:54:45 发布

wzh20099

最新推荐文章于 2022-03-28 11:54:45 发布

阅读量294

点赞数

分类专栏： lucene 文章标签： lucene Adobe DWR D语言

本文链接：https://blog.csdn.net/wzh20099/article/details/83728939

版权

lucene 专栏收录该内容

6 篇文章 0 订阅

订阅专栏

XPDF使用文档
XPDF版本 3.0.2
日期 2008-11-26
文档版本 V1.0

1、概述
读取PDF文件中的文本内容，可以使用开源项目xpdf。下载地址：http://www.foolabs.com/xpdf/download.html。
注意使用：xpdf-3.02pl2-win32.zip以及xpdf-chinese-simplified.tar.gz（支持中文）。

2、安装
将xpdf-3.02pl2-win32.zip解压缩到D盘xpdf目录下，我们将以d:\xpdf作为xpdf的工作路径。
将xpdf-chinese-simplified.tar.gz解压缩到xpdf根目录下的xpdf-chinese-simplified目录中。
为了启用中文简体语言包，您必须将xpdf目录下的sample-xpdfrc文件另存为xpdfrc文件。
注意：此文件为配置文件，名称必须是xpdfrc。如果使用别的名字，即使在调用pdftotext.exe时传入“-cfg xpdfrc2”来指定配置文件，pdftotext.exe似乎也不会使用该配置文件。所以为了避免误解，请您将配置文件直接命名为xpdfrc。
并在这个xpdfrc文件最后加上以下配置，注意Map文件的路径一定要正确。

#----- begin Chinese Simplified support package (2004-jul-27)
cidToUnicode     Adobe-GB1  D:/xpdf/xpdf-chinese-simplified/Adobe-GB1.cidToUnicode
unicodeMap ISO-2022-CN     D:/xpdf/xpdf-chinese-simplified/ISO-2022-CN.unicodeMap
unicodeMap EUC-CN       D:/xpdf/xpdf-chinese-simplified/EUC-CN.unicodeMap
unicodeMap GBK      D:/xpdf/xpdf-chinese-simplified/GBK.unicodeMap
cMapDir      Adobe-GB1  D:/xpdf/xpdf-chinese-simplified/Cmap
toUnicodeDir                 D:/xpdf/xpdf-chinese-simplified/Cmap
#displayCIDFontTT   Adobe-GB1  /usr/..../gkai00mp.ttf
#----- end Chinese Simplified support package

另外，配置文件中原先没有加上一个“textPageBreaks”控制。为了避免这个分页符号，我们需要在xpdfrc文件“text output control”下面加上这么一段话：

# If set to "yes", text extraction will  insert  page
# breaks  (form feed characters) between pages.  This
# defaults to "yes".
textPageBreaks      no

设置textPageBreaks为no的意思是：在PDF文档的两页之间不加入分页符号。之所以这样，是因为这个符号有时候会引起SAX解析XML上的困难。
配置文件中原先把textEncoding注释掉了，这样默认的字符集是Latin1。为了正确输出中文，我们必须启用它，并设置为GBK：

#textEncoding		UTF-8
textEncoding		GBK

3、命令行调用
D:\xpdf\xpdf-3.02pl2-win32>pdftotext.exe -cfg xpdfrc d:\dwr中文文档(pdf).pdf

4、JAVA调用示范
pdftotext.exe的运行参数中，“-enc UTF-8”指定输出文本的编码，“-q”表示不输出提示和错误信息，最后的“-”表示把文本写到标准输出，这样Java程序就可以直接从进程的输入流中读取内容。

// Path of the pdftotext executable; machine-specific, adjust to the local xpdf installation.
private String executeStr = "D:\\xpdf\\xpdf-3.02pl2-win32\\pdftotext.exe";

/**
 * Runs pdftotext on {@code file} and returns the text it writes to standard output.
 *
 * The lines of the output are joined with a single space. If the process cannot be
 * started or its output cannot be read, the error is reported on stderr and whatever
 * was read so far (possibly an empty string) is returned.
 *
 * @return the extracted text, never {@code null}
 */
public String getContent() {
	// "-enc UTF-8": output encoding, "-q": quiet, trailing "-": write text to stdout.
	String[] cmd = new String[] { executeStr, "-enc", "UTF-8", "-q", file.getAbsolutePath(), "-" };

	// Created up front so the result is never null, even when exec() fails.
	StringBuffer sb = new StringBuffer();
	Process p = null;
	BufferedReader br = null;

	try {
		p = Runtime.getRuntime().exec(cmd);
		// Decode with the same charset that was requested through "-enc".
		br = new BufferedReader(new InputStreamReader(p.getInputStream(), "UTF-8"));

		String line = br.readLine();
		while (line != null) {
			// Echo each line to the console, as in the original demonstration.
			System.out.println(line);
			sb.append(line);
			sb.append(" ");
			line = br.readLine();
		}
	} catch (IOException e) {
		e.printStackTrace();
	} finally {
		// br is still null when exec() itself failed, so guard the close.
		if (br != null) {
			try {
				br.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		// Make sure the child process does not outlive this call.
		if (p != null) {
			p.destroy();
		}
	}

	content = sb.toString();

	return content;
}

一个应用的demo


package com.cs;

public interface Parsable {

	public String getTitle() ;
	public String getContent()  ;
	public String getSummary()  ;
}

package com.cs;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;

public class PdfParser implements Parsable {
	private File file ;
	private String content ;//内容

	/*
	 * pdf解读需配置
	 */
	private String executeStr = "E:\\EclipseStudyWorkspace\\LuceneParse\\xpdf\\xpdf-3.02pl2-win32\\pdftotext.exe" ;

	public PdfParser(File file){
		this.file = file ;
	}

	public String getContent(){

		if (content != null){
			return content ;
		}

		String[] cmd = new String[]{executeStr,"-enc","UTF-8","-q",file.getAbsolutePath(),"-"} ;
		Process p = null ;


		BufferedReader br = null ;
		StringBuffer sb = new StringBuffer() ;
		try {
			p = Runtime.getRuntime().exec(cmd) ;

			br = new BufferedReader(new InputStreamReader(p.getInputStream(),"UTF-8")) ;

			String str = null ;
			while((str = br.readLine() ) != null ){
				sb.append(str).append("\n") ;
			}

		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} finally{
			if (br != null){
				try {
					br.close() ;
				} catch (IOException e) {
					// TODO Auto-generated catch block
					e.printStackTrace();
				}
			}
		}
		content = sb.toString() ;

		return content ;
	}

	public String getSummary() {
		String summary ;
		if (content == null ) {
			getContent() ;
		}

		if (content.length() > 200) {
			summary = content.substring(0, 200) ;
		}else {
			summary = content ;
		}

		return summary;
	}

	public String getTitle(){
		return file.getName() ;
	}

	public static void main(String[] args){
		PdfParser parser = new PdfParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\123.pdf")) ;
		System.out.println("pdf content : "+parser.getContent()) ;
	}
}