POM依赖:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.17</version>
</dependency>
package com.binmma.utils;/**
* Created by binmma on 2019/2/28 0028.
*/
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * File reading utility: extracts plain text from Word (doc/docx), Excel (xls/xlsx)
 * and txt content.
 *
 * <p>All readers are best-effort: when the content cannot be opened or parsed the
 * failure is logged (with its cause) and an empty string is returned.
 *
 * @author binmma (2019/2/28)
 */
public class ReadFileUtils {
    private static final Logger LOGGER = LoggerFactory.getLogger(ReadFileUtils.class);
    private static final int BUFFER_SIZE = 1024 * 4;
    /** Charset assumed for txt content that has no BOM and is not valid UTF-8. */
    private static final String FALLBACK_CHARSET = "GBK";
    /** Character a byte-order mark decodes to; it is not part of the text itself. */
    private static final char BYTE_ORDER_MARK = '\uFEFF';

    /**
     * Reads the text content of a stream. Supports word, Excel and txt.
     *
     * @param inputStream stream to read; it is fully consumed and closed
     * @param suffix file format ("doc", "docx", "xls", "xlsx" or "txt", case-insensitive)
     * @return the extracted text, or an empty string if nothing could be extracted
     */
    public static String readFile(InputStream inputStream, String suffix) {
        return parse(input2byte(inputStream), suffix);
    }

    /**
     * Reads the text content of a file. Supports word, Excel and txt; the format is
     * taken from the file name extension.
     *
     * @param file file to read
     * @return the extracted text, or an empty string if the file could not be opened or parsed
     * @throws FileNotFoundException if {@code file} is null or does not exist
     */
    public static String readFile(File file) throws FileNotFoundException {
        if (file == null || !file.exists()) {
            throw new FileNotFoundException("文件不存在");
        }
        InputStream inputStream;
        try {
            inputStream = new FileInputStream(file);
        } catch (FileNotFoundException | SecurityException e) {
            LOGGER.error("文件【{}】读取失败", file.getAbsolutePath(), e);
            // Nothing was opened, so there is nothing to parse.
            return "";
        }
        return parse(input2byte(inputStream), extractSuffix(file.getName()));
    }

    /**
     * Reads the text content of the file at the given path. Supports word, Excel and
     * txt; the format is taken from the file name extension.
     *
     * @param filePath path of the file to read
     * @return the extracted text, or an empty string if the file could not be opened or parsed
     * @throws IllegalArgumentException if {@code filePath} is blank
     */
    public static String readFile(String filePath) {
        if (StringUtils.isBlank(filePath)) {
            throw new IllegalArgumentException("文件路径为空");
        }
        InputStream inputStream;
        try {
            inputStream = new FileInputStream(filePath);
        } catch (FileNotFoundException | SecurityException e) {
            LOGGER.error("文件【{}】读取失败", filePath, e);
            return "";
        }
        // Use only the last path element so a dot in a directory name is not
        // mistaken for the start of the extension.
        return parse(input2byte(inputStream), extractSuffix(new File(filePath).getName()));
    }

    /**
     * Picks the parser matching the document type and returns the extracted text.
     *
     * @param buffer raw document bytes
     * @param suffix file format ("doc", "docx", "xls", "xlsx" or "txt", case-insensitive)
     * @return the extracted text, or an empty string for null content, an unsupported
     *         type or a parse failure
     */
    public static String parse(byte[] buffer, String suffix) {
        if (buffer == null) {
            LOGGER.error("文档内容为空");
            return "";
        }
        String text = "";
        if (StringUtils.equalsIgnoreCase(suffix, "doc")) {
            text = getTextFromWord(buffer);
        } else if (StringUtils.equalsIgnoreCase(suffix, "docx")) {
            text = getTextFromWord2007(buffer);
        } else if (StringUtils.equalsIgnoreCase(suffix, "xls")) {
            text = getTextFromExcel(buffer);
        } else if (StringUtils.equalsIgnoreCase(suffix, "xlsx")) {
            text = getTextFromExcel2007(buffer);
        } else if (StringUtils.equalsIgnoreCase(suffix, "txt")) {
            text = getTextFromTxt(buffer);
        } else {
            LOGGER.error("不支持解析的文档类型");
        }
        return text;
    }

    /**
     * Returns everything after the last dot of a file name, or the whole name when
     * it contains no dot.
     */
    private static String extractSuffix(String fileName) {
        return fileName.substring(fileName.lastIndexOf('.') + 1);
    }

    /**
     * Reads the whole text of a Word 97-2003 document (doc).
     *
     * @param file raw document bytes
     * @return the document text, or an empty string if parsing fails
     */
    private static String getTextFromWord(byte[] file) {
        String text = "";
        // Word 2003: pictures are not read.
        // try-with-resources closes the extractor even when extraction fails.
        try (WordExtractor extractor = new WordExtractor(new ByteArrayInputStream(file))) {
            text = extractor.getText();
        } catch (Exception e) {
            LOGGER.error("Word97-2003 doc文本解析失败", e);
        }
        return text;
    }

    /**
     * Reads the whole text of a Word 2007+ document (docx).
     *
     * @param file raw document bytes
     * @return the document text, or an empty string if parsing fails
     */
    private static String getTextFromWord2007(byte[] file) {
        String text = "";
        try (XWPFWordExtractor extractor =
                new XWPFWordExtractor(new XWPFDocument(new ByteArrayInputStream(file)))) {
            text = extractor.getText();
        } catch (Exception e) {
            LOGGER.error("Word2007+ docx文本解析失败", e);
        }
        return text;
    }

    /**
     * Reads the whole text of an Excel 97-2003 workbook (xls). Formula results (not the
     * formulas themselves) are extracted and sheet names are omitted.
     *
     * @param file raw workbook bytes
     * @return the workbook text, or an empty string if parsing fails
     */
    private static String getTextFromExcel(byte[] file) {
        String text = "";
        try (ExcelExtractor extractor = new ExcelExtractor(
                new HSSFWorkbook(new POIFSFileSystem(new ByteArrayInputStream(file))))) {
            extractor.setFormulasNotResults(false);
            extractor.setIncludeSheetNames(false);
            text = extractor.getText();
        } catch (Exception e) {
            LOGGER.error("Excel97-2003 xls文本解析失败", e);
        }
        return text;
    }

    /**
     * Reads the whole text of an Excel 2007+ workbook (xlsx). Sheet names are omitted.
     *
     * @param file raw workbook bytes
     * @return the workbook text, or an empty string if parsing fails
     */
    private static String getTextFromExcel2007(byte[] file) {
        String text = "";
        try (XSSFExcelExtractor extractor =
                new XSSFExcelExtractor(new XSSFWorkbook(new ByteArrayInputStream(file)))) {
            extractor.setIncludeSheetNames(false);
            text = extractor.getText();
        } catch (Exception e) {
            LOGGER.error("Excel2007+ xlsx文本解析失败", e);
        }
        return text;
    }

    /**
     * Decodes txt content using the charset detected by {@link #detectCharset(byte[])}
     * and drops a leading byte-order mark from the result.
     *
     * @param file raw text bytes
     * @return the decoded text, or an empty string if the detected charset is not
     *         available on this JVM
     */
    private static String getTextFromTxt(byte[] file) {
        try {
            String decoded = new String(file, detectCharset(file));
            if (!decoded.isEmpty() && decoded.charAt(0) == BYTE_ORDER_MARK) {
                return decoded.substring(1);
            }
            return decoded;
        } catch (UnsupportedEncodingException e) {
            LOGGER.error("txt 文本解析失败", e);
            return "";
        }
    }

    /**
     * Detects the encoding of txt content.
     *
     * <p>A UTF-8 or UTF-16 byte-order mark decides the charset directly. Without a BOM
     * the content is treated as UTF-8 when every byte sequence in it is well-formed
     * UTF-8 (this includes pure ASCII), and as GBK otherwise.
     *
     * @param file raw text bytes
     * @return the charset name: "UTF-8", "UTF-16LE", "UTF-16BE" or "GBK"
     */
    private static String detectCharset(byte[] file) {
        if (file.length >= 3
                && file[0] == (byte) 0xEF
                && file[1] == (byte) 0xBB
                && file[2] == (byte) 0xBF) {
            return "UTF-8";
        }
        if (file.length >= 2) {
            if (file[0] == (byte) 0xFF && file[1] == (byte) 0xFE) {
                return "UTF-16LE";
            }
            if (file[0] == (byte) 0xFE && file[1] == (byte) 0xFF) {
                return "UTF-16BE";
            }
        }
        return isWellFormedUtf8(file) ? "UTF-8" : FALLBACK_CHARSET;
    }

    /** Returns whether the whole byte array decodes as UTF-8 without any malformed input. */
    private static boolean isWellFormedUtf8(byte[] data) {
        // REPORT makes the decoder throw instead of silently substituting U+FFFD.
        CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
                .onMalformedInput(CodingErrorAction.REPORT)
                .onUnmappableCharacter(CodingErrorAction.REPORT);
        try {
            decoder.decode(ByteBuffer.wrap(data));
            return true;
        } catch (CharacterCodingException e) {
            // Expected outcome for non-UTF-8 content; the caller falls back to GBK.
            return false;
        }
    }

    /**
     * Reads a stream fully into a byte array and closes it.
     *
     * <p>If reading fails part-way, the error is logged and the bytes read so far are
     * returned.
     *
     * @param inStream stream to drain; may be null
     * @return the bytes read; an empty array when {@code inStream} is null
     */
    public static byte[] input2byte(InputStream inStream) {
        if (inStream == null) {
            LOGGER.error("输入流为空");
            return new byte[0];
        }
        ByteArrayOutputStream swapStream = new ByteArrayOutputStream();
        byte[] buff = new byte[BUFFER_SIZE];
        try {
            int rc;
            // Request the full buffer on every read rather than a fixed 100 bytes.
            while ((rc = inStream.read(buff, 0, buff.length)) > 0) {
                swapStream.write(buff, 0, rc);
            }
        } catch (IOException e) {
            LOGGER.error("input2byte出错", e);
        } finally {
            try {
                inStream.close();
            } catch (IOException e) {
                LOGGER.error("文件关闭失败", e);
            }
        }
        return swapStream.toByteArray();
    }
}