读取文件文本内容工具类：支持 Word 2003/2007+、Excel 2003/2007+、txt

最新推荐文章于 2021-01-28 10:43:52 发布

程序猿哒哒哒

最新推荐文章于 2021-01-28 10:43:52 发布

阅读量174

点赞数

分类专栏：工具类　文章标签：文件内容读取工具类 word excel text

工具类专栏收录该内容

7 篇文章 0 订阅

订阅专栏

POM依赖（下面的代码还用到了 commons-lang3 和 slf4j，需另行引入对应依赖）：

<dependency>
	<groupId>org.apache.poi</groupId>
	<artifactId>poi-scratchpad</artifactId>
	<version>3.17</version>
</dependency>
<dependency>
	<groupId>org.apache.poi</groupId>
	<artifactId>poi-ooxml</artifactId>
	<version>3.17</version>
</dependency>
<dependency>
	<groupId>org.apache.poi</groupId>
	<artifactId>poi</artifactId>
	<version>3.17</version>
</dependency>

package com.binmma.utils;/**
 * Created by binmma on 2019/2/28 0028.
 */
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;

/**
 * Extracts plain text from Word (doc / docx), Excel (xls / xlsx) and txt content.
 *
 * <p>Every method is best-effort: when content cannot be opened or parsed, the failure is
 * logged together with its cause and an empty string is returned.
 *
 * @author binmma
 */
public class ReadFileUtils {
    private static final Logger LOGGER = LoggerFactory.getLogger(ReadFileUtils.class);
    private static final int BUFFER_SIZE = 1024 * 4;
    /** Charset assumed for txt content when no BOM and no UTF-8 sequence is found. */
    private static final String DEFAULT_CHARSET = "GBK";

    /**
     * Reads the whole stream and extracts its text. The stream is closed afterwards.
     *
     * @param inputStream content to read; {@code null} is treated as empty content
     * @param suffix      file extension without the dot (doc, docx, xls, xlsx, txt), any case
     * @return the extracted text, or an empty string if it could not be extracted
     */
    public static String readFile(InputStream inputStream, String suffix) {
        return parse(input2byte(inputStream), suffix);
    }

    /**
     * Extracts the text of a file; the format is chosen from the file name's extension.
     *
     * @param file the file to read
     * @return the extracted text, or an empty string if the file could not be opened or parsed
     * @throws FileNotFoundException if {@code file} is {@code null} or does not exist
     */
    public static String readFile(File file) throws FileNotFoundException {
        if (file == null || !file.exists()) {
            throw new FileNotFoundException("文件不存在");
        }
        return readAndParse(file, file.getAbsolutePath());
    }

    /**
     * Extracts the text of the file at the given path; the format is chosen from the file
     * name's extension.
     *
     * @param filePath path of the file to read
     * @return the extracted text, or an empty string if the file could not be opened or parsed
     * @throws IllegalArgumentException if {@code filePath} is blank
     */
    public static String readFile(String filePath) {
        if (StringUtils.isBlank(filePath)) {
            throw new IllegalArgumentException("文件路径为空");
        }
        return readAndParse(new File(filePath), filePath);
    }

    /**
     * Opens {@code file}, reads it fully and dispatches to the parser for its extension.
     *
     * @param file        the file to open
     * @param displayPath the path to show in the log message if opening fails
     * @return the extracted text, or an empty string if the file could not be opened or parsed
     */
    private static String readAndParse(File file, String displayPath) {
        // Take the extension from the file name only, so a dot in a directory name
        // cannot be mistaken for the extension separator.
        String fileName = file.getName();
        String suffix = fileName.substring(fileName.lastIndexOf('.') + 1);
        InputStream inputStream;
        try {
            inputStream = new FileInputStream(file);
        } catch (FileNotFoundException | SecurityException e) {
            LOGGER.error("文件【{}】读取失败", displayPath, e);
            // Nothing to parse: stop here instead of handing a null stream downstream.
            return "";
        }
        // input2byte closes the stream once it has been drained.
        return parse(input2byte(inputStream), suffix);
    }

    /**
     * Picks the parser that matches the extension and returns the extracted text.
     *
     * @param buffer the raw file content
     * @param suffix file extension without the dot (doc, docx, xls, xlsx, txt), any case
     * @return the extracted text, or an empty string when the type is unsupported, the
     *         buffer is {@code null}, or parsing fails
     */
    public static String parse(byte[] buffer, String suffix) {
        if (buffer == null) {
            LOGGER.error("文档内容为空，无法解析");
            return "";
        }
        if (StringUtils.equalsIgnoreCase(suffix, "doc")) {
            return getTextFromWord(buffer);
        }
        if (StringUtils.equalsIgnoreCase(suffix, "docx")) {
            return getTextFromWord2007(buffer);
        }
        if (StringUtils.equalsIgnoreCase(suffix, "xls")) {
            return getTextFromExcel(buffer);
        }
        if (StringUtils.equalsIgnoreCase(suffix, "xlsx")) {
            return getTextFromExcel2007(buffer);
        }
        if (StringUtils.equalsIgnoreCase(suffix, "txt")) {
            return getTextFromTxt(buffer);
        }
        LOGGER.error("不支持解析的文档类型");
        return "";
    }

    /**
     * Extracts the text of a Word 97-2003 (.doc) document.
     *
     * @param file the raw document bytes
     * @return the document text, or an empty string if parsing fails
     */
    private static String getTextFromWord(byte[] file) {
        String text = "";
        // try-with-resources closes the extractor even when getText() throws.
        try (WordExtractor extractor = new WordExtractor(new ByteArrayInputStream(file))) {
            text = extractor.getText();
        } catch (IOException | RuntimeException e) {
            LOGGER.error("Word97-2003 doc文本解析失败", e);
        }
        return text;
    }

    /**
     * Extracts the text of a Word 2007+ (.docx) document.
     *
     * @param file the raw document bytes
     * @return the document text, or an empty string if parsing fails
     */
    private static String getTextFromWord2007(byte[] file) {
        String text = "";
        try (XWPFWordExtractor extractor =
                     new XWPFWordExtractor(new XWPFDocument(new ByteArrayInputStream(file)))) {
            text = extractor.getText();
        } catch (IOException | RuntimeException e) {
            LOGGER.error("Word2007+ docx文本解析失败", e);
        }
        return text;
    }

    /**
     * Extracts the text of an Excel 97-2003 (.xls) workbook. Formula results (not the
     * formulas themselves) are emitted and sheet names are left out.
     *
     * @param file the raw workbook bytes
     * @return the workbook text, or an empty string if parsing fails
     */
    private static String getTextFromExcel(byte[] file) {
        String text = "";
        try (ExcelExtractor extractor = new ExcelExtractor(
                new HSSFWorkbook(new POIFSFileSystem(new ByteArrayInputStream(file))))) {
            extractor.setFormulasNotResults(false);
            extractor.setIncludeSheetNames(false);
            text = extractor.getText();
        } catch (IOException | RuntimeException e) {
            LOGGER.error("Excel97-2003 xls文本解析失败", e);
        }
        return text;
    }

    /**
     * Extracts the text of an Excel 2007+ (.xlsx) workbook. Sheet names are left out.
     *
     * @param file the raw workbook bytes
     * @return the workbook text, or an empty string if parsing fails
     */
    private static String getTextFromExcel2007(byte[] file) {
        String text = "";
        try (XSSFExcelExtractor extractor =
                     new XSSFExcelExtractor(new XSSFWorkbook(new ByteArrayInputStream(file)))) {
            extractor.setIncludeSheetNames(false);
            text = extractor.getText();
        } catch (IOException | RuntimeException e) {
            LOGGER.error("Excel2007+ xlsx文本解析失败", e);
        }
        return text;
    }

    /**
     * Decodes txt content using the charset guessed by {@link #detectCharset(byte[])}.
     *
     * @param file the raw text bytes
     * @return the decoded text, or an empty string if the charset is not available
     */
    private static String getTextFromTxt(byte[] file) {
        try {
            return new String(file, detectCharset(file));
        } catch (UnsupportedEncodingException e) {
            LOGGER.error("txt 文本解析失败", e);
            return "";
        }
    }

    /**
     * Guesses the charset of txt content.
     *
     * <p>A UTF-16LE, UTF-16BE or UTF-8 byte-order mark decides immediately. Otherwise the
     * bytes are scanned: ASCII bytes and well-formed two-byte sequences are skipped (the
     * latter can also occur in GB encodings), the first well-formed three-byte UTF-8
     * sequence selects UTF-8, and any other non-ASCII byte ends the scan with GBK.
     * This is a heuristic and can misclassify.
     *
     * @param file the raw text bytes
     * @return one of "UTF-16LE", "UTF-16BE", "UTF-8" or "GBK"
     */
    private static String detectCharset(byte[] file) {
        int length = file.length;
        if (length >= 2) {
            int b0 = file[0] & 0xFF;
            int b1 = file[1] & 0xFF;
            if (b0 == 0xFF && b1 == 0xFE) {
                return "UTF-16LE";
            }
            if (b0 == 0xFE && b1 == 0xFF) {
                return "UTF-16BE";
            }
            if (length >= 3 && b0 == 0xEF && b1 == 0xBB && (file[2] & 0xFF) == 0xBF) {
                return "UTF-8";
            }
        }

        int pos = 0;
        while (pos < length) {
            int lead = file[pos++] & 0xFF;
            if (lead < 0x80) {
                continue; // plain ASCII tells us nothing
            }
            if (lead >= 0xC0 && lead <= 0xDF) {
                // Two-byte lead: a valid pair may also be GB-encoded, so keep scanning.
                if (pos < length && isContinuation(file[pos++])) {
                    continue;
                }
                break;
            }
            if (lead >= 0xE0 && lead <= 0xEF) {
                // Three-byte lead followed by two continuation bytes: treat as UTF-8.
                if (pos + 1 < length
                        && isContinuation(file[pos])
                        && isContinuation(file[pos + 1])) {
                    return "UTF-8";
                }
                break;
            }
            // A stray continuation byte (0x80-0xBF) or a lead >= 0xF0: fall back to GBK.
            break;
        }
        return DEFAULT_CHARSET;
    }

    /** Returns whether {@code b} lies in the UTF-8 continuation range 0x80-0xBF. */
    private static boolean isContinuation(byte b) {
        return (b & 0xC0) == 0x80;
    }

    /**
     * Drains a stream into a byte array and closes the stream.
     *
     * @param inStream the stream to read; {@code null} yields an empty array
     * @return all bytes read; if reading fails part-way, the bytes read so far
     */
    public static byte[] input2byte(InputStream inStream) {
        if (inStream == null) {
            return new byte[0];
        }
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] buff = new byte[BUFFER_SIZE];
        try {
            int count;
            // Fill the whole buffer on each read rather than a small slice of it.
            while ((count = inStream.read(buff)) != -1) {
                collected.write(buff, 0, count);
            }
        } catch (IOException e) {
            LOGGER.error("input2byte出错", e);
        } finally {
            try {
                inStream.close();
            } catch (IOException e) {
                LOGGER.error("文件关闭失败", e);
            }
        }
        return collected.toByteArray();
    }
}

原文：

程序猿哒哒哒

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
读取文件文本内容工具类：支持 Word 2003/2007+、Excel 2003/2007+、txt

POM依赖：<dependency> <groupId>org.apache.poi</groupId> <artifactId>poi-scratchpad</artifactId> <version>3.17</version></dependency>……
复制链接

扫一扫