一、引用所需要的jar包
<!-- Apache POI OOXML module: supplies the org.apache.poi.xwpf (.docx) and org.apache.poi.xssf (.xlsx) packages -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.0.1</version>
</dependency>
<!-- Apache POI scratchpad module: supplies org.apache.poi.hwpf for legacy .doc files -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.0.1</version>
</dependency>
<!-- XML schema classes used by poi-ooxml. NOTE(review): presumably already pulled in transitively by poi-ooxml; confirm whether this explicit entry is needed -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.0.1</version>
</dependency>
<!-- Apache POI core module: supplies org.apache.poi.hssf (.xls) and the shared org.apache.poi.ss.usermodel API -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.0.1</version>
</dependency>
<!-- Apache PDFBox: supplies PDDocument and PDFTextStripper for PDF text extraction -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.23</version>
</dependency>
二、进行导包
import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.List;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
三、读取文件内容
1、对于doc文件读取文件内容
/**
 * Extracts the text of a legacy Word ({@code .doc}) file.
 *
 * @param file the .doc file to read (a {@link File}, not a path string)
 * @return the text of the document's range; if opening or parsing fails, the value
 *         collected so far, which starts out as an empty string
 */
public String searchContent(File file) {
    String content = "";
    try (FileInputStream input = new FileInputStream(file);
            HWPFDocument wordDoc = new HWPFDocument(input)) {
        Range wholeDocument = wordDoc.getRange();
        content = wholeDocument.text();
    } catch (Exception e) {
        // Best effort: failures are swallowed and the caller gets whatever was read.
    }
    return content;
}
2、docx文件读取文件内容
/**
 * Extracts the text of a Word ({@code .docx}) file by concatenating its paragraphs.
 *
 * <p>Paragraph texts are appended back to back with no separator between them. Only
 * the paragraphs returned by {@code XWPFDocument.getParagraphs()} are read. This
 * method only reads text; it performs no searching or matching.
 *
 * @param file the .docx file to read
 * @return the concatenated paragraph text; if reading fails part-way, whatever was
 *         collected before the failure (possibly an empty string)
 */
public String docxSearchContent(File file) {
    // StringBuilder avoids re-copying the accumulated text on every paragraph.
    StringBuilder content = new StringBuilder();
    // try-with-resources closes both the stream and the document even when parsing throws.
    try (FileInputStream fis = new FileInputStream(file);
            XWPFDocument document = new XWPFDocument(fis)) {
        for (XWPFParagraph paragraph : document.getParagraphs()) {
            content.append(paragraph.getText());
        }
    } catch (Exception e) {
        // Best effort: failures are swallowed and the caller gets the text read so far.
    }
    return content.toString();
}
3、pdf文件内容读取
/**
 * Extracts all text from a PDF file.
 *
 * @param file the PDF file to read
 * @return the text produced by {@code PDFTextStripper} with position sorting enabled;
 *         if loading or extraction fails, an empty string
 */
private String pdfSearch(File file) {
    String info = "";
    // try-with-resources closes the document even when text extraction throws.
    try (PDDocument document = PDDocument.load(file)) {
        PDFTextStripper stripper = new PDFTextStripper();
        // Sort text by position so the output follows the visual layout.
        stripper.setSortByPosition(true);
        info = stripper.getText(document);
    } catch (Exception e) {
        // Best effort: failures are swallowed and the caller gets whatever was extracted.
    }
    return info;
}
4、xlsx文件内容读取
/**
 * Reads the first sheet of an {@code .xlsx} workbook as rows of strings.
 *
 * <p>Only the rows and cells yielded by the sheet and row iterators are visited.
 * NOTE(review): the iterators presumably skip cells that were never defined, so a
 * value's index in its row list may not equal its column index; confirm if callers
 * rely on column positions.
 *
 * @param file the .xlsx file to read
 * @return one list per visited row, each holding that row's cell values as strings;
 *         if reading fails, the rows collected before the failure (possibly empty)
 */
private List<List<String>> xlsxSearch(File file) {
    List<List<String>> listAll = new ArrayList<>();
    // try-with-resources closes the stream and the workbook even when reading throws.
    try (FileInputStream fis = new FileInputStream(file);
            Workbook workbook = new XSSFWorkbook(fis)) {
        // Only the first sheet is read.
        Sheet sheet = workbook.getSheetAt(0);
        for (Row row : sheet) {
            List<String> list = new ArrayList<>();
            for (Cell cell : row) {
                // Convert the in-memory cell to a string cell so every value is read
                // uniformly as text; the workbook is never written back to disk.
                cell.setCellType(CellType.STRING);
                list.add(cell.toString());
            }
            listAll.add(list);
        }
    } catch (Exception e) {
        // Pass the file for the placeholder and the exception last so the stack trace is kept.
        log.error("xlsxSearch failed, file=[{}]", file, e);
    }
    return listAll;
}
5、xls文件内容读取
/**
 * Reads the first sheet of a legacy {@code .xls} workbook as rows of strings.
 *
 * <p>Only the rows and cells yielded by the sheet and row iterators are visited.
 * NOTE(review): the iterators presumably skip cells that were never defined, so a
 * value's index in its row list may not equal its column index; confirm if callers
 * rely on column positions.
 *
 * @param file the .xls file to read
 * @return one list per visited row, each holding that row's cell values as strings;
 *         if reading fails, the rows collected before the failure (possibly empty)
 */
private List<List<String>> xlsSearch(File file) {
    List<List<String>> listAll = new ArrayList<>();
    // try-with-resources closes the stream and the workbook even when reading throws.
    try (FileInputStream fis = new FileInputStream(file);
            HSSFWorkbook workbook = new HSSFWorkbook(fis)) {
        // Only the first sheet is read.
        HSSFSheet sheet = workbook.getSheetAt(0);
        for (Row row : sheet) {
            List<String> list = new ArrayList<>();
            for (Cell cell : row) {
                // Convert the in-memory cell to a string cell so every value is read
                // uniformly as text; the workbook is never written back to disk.
                cell.setCellType(CellType.STRING);
                list.add(cell.toString());
            }
            listAll.add(list);
        }
    } catch (Exception e) {
        // Best effort: failures are swallowed and the caller gets the rows read so far.
    }
    return listAll;
}