Java读取word,PDF,Excel,Ppt等

只能抽取文本

1.导入依赖

  <!-- Apache POI modules used to read Word, PowerPoint and Excel files.
       All POI artifacts below are pinned to the same version (3.8). -->
  <!-- NOTE(review): poi-ooxml presumably pulls in poi, poi-ooxml-schemas,
       xmlbeans and dom4j transitively; confirm whether the explicit
       entries are still required before removing any of them. -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>3.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml-schemas</artifactId>
            <version>3.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.xmlbeans</groupId>
            <artifactId>xmlbeans</artifactId>
            <version>2.6.0</version>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
            <version>1.6.1</version>
        </dependency>
        <!-- Apache PDFBox, used to read PDF files. -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.2</version>
        </dependency>

2.具体代码

/**
 * Demo that extracts the plain text of every supported document found in a folder.
 *
 * <p>Supported extensions (matched case-insensitively): {@code .doc}, {@code .docx},
 * {@code .rtf}, {@code .ppt}, {@code .pptx}, {@code .xls}, {@code .xlsx} and {@code .pdf}.
 * Only text is extracted; images and other embedded objects are ignored.
 */
public class TestDemo {

    /** Folder scanned when no path is passed on the command line. */
    private static final String DEFAULT_DIR = "E://merge//";

    /**
     * Scans a folder and prints the name and extracted text of every supported file.
     *
     * @param args optional; {@code args[0]} overrides the default folder
     * @throws IOException          if a file cannot be read
     * @throws OpenXML4JException   if a .docx package cannot be opened
     * @throws XmlException         if a .docx document contains invalid XML
     * @throws BadLocationException if the text of an .rtf document cannot be read
     */
    public static void main(String[] args) throws IOException, OpenXML4JException, XmlException, BadLocationException {
        String dir = (args != null && args.length > 0) ? args[0] : DEFAULT_DIR;
        File file = new File(dir);
        File[] files = file.listFiles();
        // listFiles() returns null when the path is missing or is not a directory.
        if (null == files || files.length == 0) {
            System.out.println("文件夹是空的");
            return;
        }
        for (File file1 : files) {
            // Sub-directories cannot be opened as streams, so only regular files are handled.
            if (!file1.isFile()) {
                continue;
            }
            String content = extractText(file1);
            if (content != null) {
                System.out.println(file1.getName());
                System.out.println(content);
            }
        }
    }

    /**
     * Picks the extractor that matches the file extension and runs it.
     * Streams opened here are always closed, even when the extractor throws.
     *
     * @return the extracted text, or {@code null} when the extension is not supported
     */
    private static String extractText(File source) throws IOException, OpenXML4JException, XmlException, BadLocationException {
        String name = source.getName();
        if (hasExtension(name, ".docx")) {
            return readDoc2(source.getPath());
        }
        if (hasExtension(name, ".pdf")) {
            return readPdf(source);
        }
        if (hasExtension(name, ".doc")) {
            try (InputStream fin = new FileInputStream(source)) {
                return readDoc1(fin);
            }
        }
        if (hasExtension(name, ".rtf")) {
            try (InputStream fin = new FileInputStream(source)) {
                return readDoc3(fin);
            }
        }
        if (hasExtension(name, ".ppt")) {
            try (InputStream fin = new FileInputStream(source)) {
                return readPpt1(fin);
            }
        }
        if (hasExtension(name, ".pptx")) {
            try (InputStream fin = new FileInputStream(source)) {
                return readPpt2(fin);
            }
        }
        if (hasExtension(name, ".xls")) {
            try (InputStream fin = new FileInputStream(source)) {
                return readExcel1(fin);
            }
        }
        if (hasExtension(name, ".xlsx")) {
            try (InputStream fin = new FileInputStream(source)) {
                return readExcel2(fin);
            }
        }
        return null;
    }

    /** Returns whether {@code name} ends with {@code ext}, ignoring case. */
    private static boolean hasExtension(String name, String ext) {
        int offset = name.length() - ext.length();
        return offset >= 0 && name.regionMatches(true, offset, ext, 0, ext.length());
    }

    /**
     * Extracts the text of a Word document in the binary {@code .doc} format.
     *
     * @param is stream positioned at the start of the document; the caller closes it
     * @return the document text
     * @throws IOException if the stream cannot be read
     */
    public static String readDoc1(InputStream is) throws IOException {
        WordExtractor extractor = new WordExtractor(is);
        return extractor.getText();
    }

    /**
     * Extracts the text of a Word document in the {@code .docx} format.
     *
     * @param path path of the .docx file
     * @return the document text
     * @throws OpenXML4JException if the package cannot be opened
     * @throws XmlException       if the document XML is invalid
     * @throws IOException        if the file cannot be read
     */
    public static String readDoc2(String path) throws OpenXML4JException, XmlException, IOException {
        OPCPackage opcPackage = POIXMLDocument.openPackage(path);
        try {
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            return extractor.getText();
        } finally {
            // revert() releases the underlying file without saving; close() could
            // write the package back to the source file, which a reader must not do.
            opcPackage.revert();
        }
    }

    /**
     * Extracts the text of an {@code .rtf} document.
     *
     * @param is stream positioned at the start of the document; the caller closes it
     * @return the document text
     * @throws IOException          if the stream cannot be read
     * @throws BadLocationException if the document text cannot be retrieved
     */
    public static String readDoc3(InputStream is) throws IOException, BadLocationException {
        DefaultStyledDocument document = new DefaultStyledDocument();
        new RTFEditorKit().read(is, document, 0);
        String raw = document.getText(0, document.getLength());
        // The text is turned back into ISO8859-1 bytes and decoded again as GBK.
        // NOTE(review): presumably a workaround for Chinese text being mis-decoded by
        // RTFEditorKit; confirm the behaviour for documents that are not GBK-encoded.
        return new String(raw.getBytes("ISO8859-1"), "GBK");
    }

    /**
     * Extracts the text of a PowerPoint presentation in the binary {@code .ppt} format.
     *
     * @param is stream positioned at the start of the presentation; the caller closes it
     * @return the presentation text
     * @throws IOException if the stream cannot be read
     */
    public static String readPpt1(InputStream is) throws IOException {
        PowerPointExtractor extractor = new PowerPointExtractor(is);
        return extractor.getText();
    }

    /**
     * Extracts the text of a PowerPoint presentation in the {@code .pptx} format.
     * The text runs of every shape that has a text body are concatenated, slide by
     * slide, without any separator.
     *
     * @param is stream positioned at the start of the presentation; the caller closes it
     * @return the concatenated text runs
     * @throws IOException if the stream cannot be read
     */
    public static String readPpt2(InputStream is) throws IOException {
        StringBuilder buffer = new StringBuilder();
        XMLSlideShow xmlSlideShow = new XMLSlideShow(is);
        for (XSLFSlide slide : xmlSlideShow.getSlides()) {
            CTSlide rawSlide = slide.getXmlObject();
            CTGroupShape gs = rawSlide.getCSld().getSpTree();
            for (CTShape shape : gs.getSpArray()) {
                CTTextBody tb = shape.getTxBody();
                // Shapes without a text body carry no text.
                if (null == tb) {
                    continue;
                }
                for (CTTextParagraph textParagraph : tb.getPArray()) {
                    for (CTRegularTextRun textRun : textParagraph.getRArray()) {
                        buffer.append(textRun.getT());
                    }
                }
            }
        }
        return buffer.toString();
    }

    /**
     * Extracts the text of an Excel workbook in the binary {@code .xls} format.
     * Sheet names are included and formula cells contribute their results.
     *
     * @param is stream positioned at the start of the workbook; the caller closes it
     * @return the workbook text
     * @throws IOException if the stream cannot be read
     */
    public static String readExcel1(InputStream is) throws IOException {
        HSSFWorkbook wb = new HSSFWorkbook(new POIFSFileSystem(is));
        ExcelExtractor extractor = new ExcelExtractor(wb);
        extractor.setFormulasNotResults(false);
        extractor.setIncludeSheetNames(true);
        return extractor.getText();
    }

    /**
     * Extracts the text of an Excel workbook in the {@code .xlsx} format.
     * Every sheet is visited (not only the first one, so no data is silently dropped);
     * each non-null cell is appended followed by a tab character.
     *
     * @param is stream positioned at the start of the workbook; the caller closes it
     * @return the cell values, each followed by a tab
     * @throws IOException if the stream cannot be read
     */
    public static String readExcel2(InputStream is) throws IOException {
        StringBuilder buffer = new StringBuilder();
        XSSFWorkbook wb = new XSSFWorkbook(is);
        for (int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets(); sheetIndex++) {
            Sheet sheet = wb.getSheetAt(sheetIndex);
            int firstRowIndex = sheet.getFirstRowNum();
            int lastRowIndex = sheet.getLastRowNum();
            for (int row = firstRowIndex; row <= lastRowIndex; row++) {
                Row row1 = sheet.getRow(row);
                // Rows that were never written are returned as null.
                if (row1 == null) {
                    continue;
                }
                int firstCellIndex = row1.getFirstCellNum();
                int lastCellIndex = row1.getLastCellNum();
                // Walk the columns; the last cell index is exclusive.
                for (int cIndex = firstCellIndex; cIndex < lastCellIndex; cIndex++) {
                    Cell cell = row1.getCell(cIndex);
                    if (cell != null) {
                        buffer.append(cell.toString()).append("\t");
                    }
                }
            }
        }
        return buffer.toString();
    }

    /**
     * Extracts the text of a PDF file.
     *
     * @param file the PDF file
     * @return the document text
     * @throws IOException if the file cannot be read or parsed
     */
    public static String readPdf(File file) throws IOException {
        // The document holds the file open until it is closed.
        try (PDDocument pd = PDDocument.load(file)) {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(pd);
        }
    }
}

以上方法只能抽取文件中的文本内容,图片等不会被读取(下图是对 ppt 和 excel 的抽取结果,只能得到文本)。具体的实现要结合实际需求。为了方便查看,示例中的异常都是直接向上抛出(消极处理);在实际开发中应当捕获并妥善处理这些异常(积极处理)。
另外:我是新手,有任何问题都可以提出,谢谢。

  • 3
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值