利用pdfbox和poi抽取pdf、doc以及docx格式的内容

石头木V2
于 2024-07-22 08:55:35 发布
阅读量183
点赞数 3
文章标签： pdf
本文链接：https://blog.csdn.net/qq_20182781/article/details/140599791
版权
使用pdfbox1.5.0抽取pdf格式文档内容，使用poi3.7抽取doc及docx文档内容：
/**
 * Created by yan.shi on 2017/9/25.
 */
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;

/**
 * Extracts plain text from PDF, DOC and DOCX documents.
 * PDF files are parsed with PDFBox; DOC and DOCX files are parsed with Apache POI.
 */
public class ExtractText {

    /**
     * Command-line entry point: extracts and prints the text of one document.
     *
     * @param args optional; {@code args[0]} is the path of the document to read.
     *             When absent, the original placeholder path is used.
     */
    public static void main(String[] args) {
        ExtractText text = new ExtractText();
        String filePath = (args != null && args.length > 0) ? args[0] : "文件";
        String content = text.getText(filePath);
        if (null != content) {
            System.out.println("content: " + content);
        }
    }

    /** Creates an extractor; the instance holds no state. */
    public ExtractText() {
    }

    /**
     * Creates an extractor. The argument is accepted for compatibility but is not stored;
     * the path must still be passed to {@link #getText(String)}.
     *
     * @param filePath ignored
     */
    public ExtractText(String filePath) {
    }

    /**
     * Reads a document and returns its text, choosing the parser from the file extension.
     * Only the pdf, doc and docx extensions (case-insensitive) are supported.
     *
     * @param filePath path of the document to read
     * @return the extracted text, or {@code null} when the path is {@code null}, the
     *         extension is not supported, extraction failed, or the document has no text
     */
    public String getText(String filePath) {
        if (filePath == null) {
            return null;
        }
        File file = new File(filePath);
        String fileName = file.getName();
        // A name without a dot has no extension; without this guard a file literally
        // named "pdf" would be treated as a PDF document.
        int dot = fileName.lastIndexOf('.');
        String postfix = dot < 0 ? "" : fileName.substring(dot + 1);
        String content;
        if (postfix.equalsIgnoreCase("pdf")) {
            content = getPDFText(file);
        } else if (postfix.equalsIgnoreCase("doc")) {
            content = getDocText(file);
        } else if (postfix.equalsIgnoreCase("docx")) {
            content = getDocxText(filePath);
        } else {
            System.out.println("输入的文件格式不支持！");
            return null;
        }
        if (null != content && !"".equals(content)) {
            return content;
        }
        return null;
    }

    /**
     * Extracts the text of a PDF file with PDFBox.
     *
     * @param file the PDF file to read
     * @return the extracted text, or {@code null} if an I/O error occurred
     */
    private String getPDFText(File file) {
        FileInputStream fileinput = null;
        PDDocument pdfdocument = null;
        String text = null;
        try {
            fileinput = new FileInputStream(file);
            PDFParser parser = new PDFParser(fileinput);
            parser.parse();
            pdfdocument = parser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            text = stripper.getText(pdfdocument);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The parsed document owns resources of its own and must be closed
            // separately from the input stream.
            if (pdfdocument != null) {
                try {
                    pdfdocument.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            closeStream(fileinput);
        }
        return text;
    }

    /**
     * Extracts the text of a legacy Word (.doc) file with Apache POI.
     * Each paragraph is trimmed, blank paragraphs are dropped, and the remaining
     * paragraphs are joined with a newline.
     *
     * @param file the .doc file to read
     * @return the extracted text (possibly empty), or {@code null} if an I/O error occurred
     */
    private String getDocText(File file) {
        FileInputStream fileinput = null;
        String text = null;
        try {
            fileinput = new FileInputStream(file);
            WordExtractor we = new WordExtractor(fileinput);
            text = joinNonBlankParagraphs(we.getParagraphText());
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeStream(fileinput);
        }
        return text;
    }

    /**
     * Extracts the text of a Word 2007+ (.docx) file with Apache POI.
     *
     * @param file path of the .docx file to read
     * @return the extracted text, or {@code null} if the file could not be opened or parsed
     */
    private String getDocxText(String file) {
        String text = null;
        OPCPackage opcPackage = null;
        try {
            opcPackage = POIXMLDocument.openPackage(file);
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            text = extractor.getText();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (XmlException e) {
            e.printStackTrace();
        } catch (OpenXML4JException e) {
            e.printStackTrace();
        } finally {
            if (opcPackage != null) {
                // revert() releases the package without saving; close() would write
                // the package back to the source file, which a read-only extraction
                // must not do.
                opcPackage.revert();
            }
        }
        return text;
    }

    /** Trims every paragraph, skips null or blank ones, and joins the rest with '\n'. */
    private static String joinNonBlankParagraphs(String[] paragraphs) {
        StringBuilder joined = new StringBuilder();
        if (paragraphs == null) {
            return joined.toString();
        }
        for (String paragraph : paragraphs) {
            if (paragraph == null) {
                continue;
            }
            String trimmed = paragraph.trim();
            if (trimmed.length() == 0) {
                continue;
            }
            if (joined.length() > 0) {
                joined.append('\n');
            }
            joined.append(trimmed);
        }
        return joined.toString();
    }

    /** Closes the stream if it was opened, reporting (not propagating) a close failure. */
    private static void closeStream(FileInputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

}