Java 读取 Office 文档(Word / Excel)及 PDF

Maven 依赖(jar 包):

<!-- Dependencies for reading doc/docx/pdf/excel files - start -->
<!-- NOTE(review): the utility class imports com.alibaba.fastjson, which is not declared in this list; presumably it is resolved transitively (e.g. via easyexcel). Verify before relying on it. -->
		<!-- EasyExcel: used to read .xls/.xlsx sheets -->
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>easyexcel</artifactId>
			<version>2.1.4</version>
		</dependency>
		<!-- Apache POI OOXML: provides the XWPF classes used for .docx -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml</artifactId>
			<version>3.17</version>
		</dependency>
		<!-- NOTE(review): poi-ooxml-schemas (here) and ooxml-schemas (below) are both declared; they look like overlapping schema artifacts. Confirm whether both are really needed. -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml-schemas</artifactId>
			<version>3.17</version>
		</dependency>
		<!-- Apache POI scratchpad: provides the HWPF classes used for legacy .doc -->
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-scratchpad</artifactId>
			<version>3.17</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>ooxml-schemas</artifactId>
			<version>1.3</version>
		</dependency>
		<!-- Apache PDFBox: used to extract text from .pdf -->
		<dependency>
			<groupId>org.apache.pdfbox</groupId>
			<artifactId>pdfbox</artifactId>
			<version>2.0.15</version>
		</dependency>
		<!-- Dependencies for reading doc/docx/pdf/excel files - end -->

工具类:


import com.alibaba.excel.EasyExcel;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;

/**
 * Extracts text from Word (.doc / .docx), PDF (.pdf) and Excel (.xls / .xlsx) files.
 */
public class ReadFile {

    /**
     * Reads the textual content of a file, choosing the reader by the file's
     * extension (compared case-insensitively).
     *
     * <ul>
     *   <li>{@code .doc} / {@code .docx}: the extracted text split on {@code "\n"};
     *       any {@code "\r"} characters are left in place.</li>
     *   <li>{@code .pdf}: the extracted text with every {@code "\r"} removed,
     *       then split on {@code "\n"}.</li>
     *   <li>{@code .xls} / {@code .xlsx}: a single element containing the JSON
     *       serialization of the rows of the sheet selected by {@code sheet()}.</li>
     * </ul>
     *
     * <p>This method is best-effort: any exception raised while reading is printed
     * to standard error and an empty list is returned instead of propagating it.
     *
     * @param file the file to read; may be {@code null}
     * @return a new mutable list with the extracted content; empty when
     *         {@code file} is {@code null}, its name has no extension, the
     *         extension is not supported, or reading fails
     */
    public static List<String> readDoc(File file) {
        List<String> list = new ArrayList<>();
        if (file == null) {
            return list;
        }

        String fileName = file.getName();
        int dotIndex = fileName.lastIndexOf('.');
        if (dotIndex < 0) {
            // No extension: nothing tells us which reader to use.
            return list;
        }
        String extension = fileName.substring(dotIndex).toLowerCase();

        try {
            // Exact matching only: a substring test would route extensions such
            // as ".x" or ".xl" to the Excel reader.
            switch (extension) {
                case ".doc":
                    list.addAll(readLegacyWord(file));
                    break;
                case ".docx":
                    list.addAll(readOoxmlWord(file));
                    break;
                case ".pdf":
                    list.addAll(readPdf(file));
                    break;
                case ".xls":
                case ".xlsx":
                    list.add(readExcelAsJson(file));
                    break;
                default:
                    // Unsupported extension: return the empty list.
                    break;
            }
        } catch (Exception e) {
            // Deliberate best-effort: report the failure and return what we have (nothing).
            e.printStackTrace();
        }
        return list;
    }

    /** Extracts the text of a legacy binary .doc file, split on "\n". */
    private static List<String> readLegacyWord(File file) throws IOException {
        // Resources are closed in reverse order: extractor first, then the stream.
        try (InputStream in = new FileInputStream(file);
             WordExtractor extractor = new WordExtractor(in)) {
            return Arrays.asList(extractor.getText().split("\n"));
        }
    }

    /** Extracts the text of a .docx file, split on "\n". */
    private static List<String> readOoxmlWord(File file) throws IOException {
        // The document is closed through the extractor, as in the original
        // implementation, which only ever closed the extractor.
        try (InputStream in = new FileInputStream(file);
             XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in))) {
            return Arrays.asList(extractor.getText().split("\n"));
        }
    }

    /** Extracts the text of a PDF file with "\r" removed, split on "\n". */
    private static List<String> readPdf(File file) throws IOException {
        // try-with-resources guarantees the document is released even when
        // text extraction fails part-way.
        try (PDDocument document = PDDocument.load(file)) {
            String text = new PDFTextStripper().getText(document);
            return Arrays.asList(text.replace("\r", "").split("\n"));
        }
    }

    /** Reads an Excel sheet synchronously and returns its rows as one JSON string. */
    private static String readExcelAsJson(File file) {
        List<Map> excelList = EasyExcel.read(file.getPath()).head(Map.class).sheet().doReadSync();
        // The JSON round-trip is kept so the emitted string is produced exactly
        // as before (serialize -> parse into maps -> serialize again).
        List<Map> excels = JSONArray.parseArray(JSONObject.toJSONString(excelList), Map.class);
        return JSON.toJSONString(excels);
    }
}

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值