Java读取Word、PDF、Excel、PPT等

最新推荐文章于 2024-04-23 22:41:04 发布

欣欣向荣888

最新推荐文章于 2024-04-23 22:41:04 发布

阅读量353

点赞数 3

文章标签： excel ppt java

本文链接：https://blog.csdn.net/tammytt/article/details/109047683

版权

只能抽取文本。

1.导入依赖

  <!-- Read Word, PPT and Excel files (all Apache POI artifacts below are pinned to the same version, 3.8) -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
            <version>3.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml-schemas</artifactId>
            <version>3.8</version>
        </dependency>
        <dependency>
            <groupId>org.apache.xmlbeans</groupId>
            <artifactId>xmlbeans</artifactId>
            <version>2.6.0</version>
        </dependency>
        <dependency>
            <groupId>dom4j</groupId>
            <artifactId>dom4j</artifactId>
            <version>1.6.1</version>
        </dependency>
        <!-- Read PDF files -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.2</version>
        </dependency>

2.具体代码

/**
 * Demo that walks a folder and extracts the plain text of every supported document
 * (.doc, .docx, .rtf, .ppt, .pptx, .xls, .xlsx, .pdf).
 *
 * <p>Only text is extracted; images and other embedded objects are ignored.
 */
public class TestDemo {
    /**
     * Scans the hard-coded folder {@code E://merge//} and runs the matching extractor on
     * every regular file found there.
     *
     * @param args unused
     * @throws IOException          if a file cannot be read
     * @throws OpenXML4JException   if an OOXML package cannot be opened
     * @throws XmlException         if an OOXML part cannot be parsed
     * @throws BadLocationException if the RTF document text cannot be retrieved
     */
    public static void main(String[] args) throws IOException, OpenXML4JException, XmlException, BadLocationException {
        File file = new File("E://merge//");
        File[] files = file.listFiles();
        // listFiles() yields null when the path does not exist or is not a directory
        if (null == files || files.length == 0) {
            System.out.println("文件夹是空的");
            return;
        }
        for (File file1 : files) {
            // Sub-directories cannot be opened as streams, so only regular files are processed
            if (!file1.isFile()) {
                continue;
            }
            // The demo only exercises the extractors; plug in real handling of the text here
            extractText(file1);
        }
    }

    /**
     * Picks the extractor that matches the file extension (case-insensitively) and returns
     * the extracted text. Any stream opened here is closed even when the extractor throws.
     *
     * @param source the file to read
     * @return the extracted text, or {@code null} when the extension is not supported
     */
    private static String extractText(File source) throws IOException, OpenXML4JException, XmlException, BadLocationException {
        // Fully qualified so that no additional import is required
        String name = source.getName().toLowerCase(java.util.Locale.ROOT);
        if (name.endsWith(".doc")) {
            try (FileInputStream fin = new FileInputStream(source)) {
                return readDoc1(fin);
            }
        } else if (name.endsWith(".docx")) {
            return readDoc2(source.getPath());
        } else if (name.endsWith(".rtf")) {
            try (FileInputStream fin = new FileInputStream(source)) {
                return readDoc3(fin);
            }
        } else if (name.endsWith(".ppt")) {
            try (FileInputStream fin = new FileInputStream(source)) {
                return readPpt1(fin);
            }
        } else if (name.endsWith(".pptx")) {
            try (FileInputStream fin = new FileInputStream(source)) {
                return readPpt2(fin);
            }
        } else if (name.endsWith(".xls")) {
            try (FileInputStream fin = new FileInputStream(source)) {
                return readExcel1(fin);
            }
        } else if (name.endsWith(".xlsx")) {
            try (FileInputStream fin = new FileInputStream(source)) {
                return readExcel2(fin);
            }
        } else if (name.endsWith(".pdf")) {
            return readPdf(source);
        }
        return null;
    }

    /**
     * Extracts the text of a legacy Word document (.doc).
     *
     * @param is stream positioned at the start of the document; the caller closes it
     * @return the text reported by {@code WordExtractor}
     * @throws IOException if the stream cannot be read
     */
    public static String readDoc1(InputStream is) throws IOException {
        WordExtractor extractor = new WordExtractor(is);
        return extractor.getText();
    }

    /**
     * Extracts the text of an OOXML Word document (.docx).
     *
     * @param path file-system path of the document
     * @return the text reported by {@code XWPFWordExtractor}
     * @throws OpenXML4JException if the package cannot be opened
     * @throws XmlException       if the document XML cannot be parsed
     * @throws IOException        if the file cannot be read
     */
    public static String readDoc2(String path) throws OpenXML4JException, XmlException, IOException {
        OPCPackage opcPackage = POIXMLDocument.openPackage(path);
        try {
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            return extractor.getText();
        } finally {
            // revert() releases the package WITHOUT writing anything back to the file;
            // close() would save the package, which is not wanted for read-only extraction.
            opcPackage.revert();
        }
    }

    /**
     * Extracts the text of an RTF document (.rtf) using Swing's {@code RTFEditorKit}.
     *
     * @param is stream positioned at the start of the document; the caller closes it
     * @return the document text after re-decoding it as GBK
     * @throws IOException          if the stream cannot be read
     * @throws BadLocationException if the document text cannot be retrieved
     */
    public static String readDoc3(InputStream is) throws IOException, BadLocationException {
        DefaultStyledDocument document = new DefaultStyledDocument();
        new RTFEditorKit().read(is, document, 0);
        String raw = document.getText(0, document.getLength());
        // The characters are turned back into ISO8859-1 bytes and re-decoded as GBK.
        // NOTE(review): presumably needed to recover Chinese text from GBK-encoded RTF files;
        // confirm before using this with documents in other encodings.
        return new String(raw.getBytes("ISO8859-1"), "GBK");
    }

    /**
     * Extracts the text of a legacy PowerPoint presentation (.ppt).
     *
     * @param is stream positioned at the start of the presentation; the caller closes it
     * @return the text reported by {@code PowerPointExtractor}
     * @throws IOException if the stream cannot be read
     */
    public static String readPpt1(InputStream is) throws IOException {
        PowerPointExtractor extractor = new PowerPointExtractor(is);
        return extractor.getText();
    }

    /**
     * Extracts the text of an OOXML PowerPoint presentation (.pptx).
     *
     * <p>Only the regular text runs of shapes directly inside each slide's shape tree are
     * collected; they are concatenated without any separator.
     *
     * @param is stream positioned at the start of the presentation; the caller closes it
     * @return the concatenated run text of all slides
     * @throws IOException if the stream cannot be read
     */
    public static String readPpt2(InputStream is) throws IOException {
        StringBuilder buffer = new StringBuilder();
        XMLSlideShow xmlSlideShow = new XMLSlideShow(is);
        for (XSLFSlide slide : xmlSlideShow.getSlides()) {
            CTSlide rawSlide = slide.getXmlObject();
            CTGroupShape gs = rawSlide.getCSld().getSpTree();
            for (CTShape shape : gs.getSpArray()) {
                CTTextBody tb = shape.getTxBody();
                // Shapes without a text body contribute nothing
                if (null == tb) {
                    continue;
                }
                for (CTTextParagraph textParagraph : tb.getPArray()) {
                    for (CTRegularTextRun textRun : textParagraph.getRArray()) {
                        buffer.append(textRun.getT());
                    }
                }
            }
        }
        return buffer.toString();
    }

    /**
     * Extracts the text of a legacy Excel workbook (.xls), including sheet names and using
     * formula results rather than the formulas themselves.
     *
     * @param is stream positioned at the start of the workbook; the caller closes it
     * @return the text reported by {@code ExcelExtractor}
     * @throws IOException if the stream cannot be read
     */
    public static String readExcel1(InputStream is) throws IOException {
        HSSFWorkbook wb = new HSSFWorkbook(new POIFSFileSystem(is));
        ExcelExtractor extractor = new ExcelExtractor(wb);
        extractor.setFormulasNotResults(false);
        extractor.setIncludeSheetNames(true);
        return extractor.getText();
    }

    /**
     * Extracts the text of an OOXML Excel workbook (.xlsx).
     *
     * <p>Only the first sheet (index 0) is read. Every non-null cell is appended as
     * {@code cell.toString()} followed by a tab; no separator is written between rows.
     *
     * @param is stream positioned at the start of the workbook; the caller closes it
     * @return the tab-separated cell text of the first sheet
     * @throws IOException if the stream cannot be read
     */
    public static String readExcel2(InputStream is) throws IOException {
        StringBuilder buffer = new StringBuilder();
        XSSFWorkbook wb = new XSSFWorkbook(is);
        Sheet sheet = wb.getSheetAt(0);
        int firstRowIndex = sheet.getFirstRowNum();
        int lastRowIndex = sheet.getLastRowNum();
        // getLastRowNum() is an inclusive index, hence <=
        for (int rowIndex = firstRowIndex; rowIndex <= lastRowIndex; rowIndex++) {
            Row row = sheet.getRow(rowIndex);
            if (row == null) {
                continue;
            }
            int firstCellIndex = row.getFirstCellNum();
            int lastCellIndex = row.getLastCellNum();
            // Iterate over the columns; the upper bound is exclusive here
            for (int cIndex = firstCellIndex; cIndex < lastCellIndex; cIndex++) {
                Cell cell = row.getCell(cIndex);
                if (cell != null) {
                    buffer.append(cell.toString()).append("\t");
                }
            }
        }
        return buffer.toString();
    }

    /**
     * Extracts the text of a PDF file.
     *
     * @param file the PDF file to read
     * @return the text reported by {@code PDFTextStripper}
     * @throws IOException if the file cannot be loaded or parsed
     */
    public static String readPdf(File file) throws IOException {
        // PDDocument holds file resources and must be closed once the text has been extracted
        try (PDDocument pd = PDDocument.load(file)) {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(pd);
        }
    }
}

只能抽取文件中的文本内容，图片等不会被加载。下图是对 ppt 和 excel 的抽取结果，只能抽取到文本；具体的实现要结合需求。我这里为了方便查看做的是消极处理，在实际开发中还是要积极处理。
另外：我是新手，有任何问题都可以提出，谢谢！

欣欣向荣888

关注

3
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Java读取word，PDF，Excel,Ppt等

只能抽取文本。1.导入依赖  <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.8</version> &lt
复制链接

扫一扫