数据中有excel文件的形式,如果我们不用python做,则需要通过poi插件,将excel中的内容读取出来,可以放在内存中直接处理,也可以将文件的格式转换成普通文本类型,例如txt,csv,或者无后缀名文件。可以用文本编辑工具,如editplus等打开查看,做相应的后续操作。
这里是以\001作为分隔符,防止空格的干扰,方便后续操作。本来打算以\005作为行分隔符,即保留原文中的样式,又不会受到\n的干扰。但是Spark ,MapReduce大都是以\n作为行分隔符读取,为了方便后续操作,所以我们清理了每一个单元格中的\n,然后以\n作为行分隔符。
//获取每一个单元格中的内容,如果为空将单元格内容替换为null。
if (row.getCell(z) == null || row.getCell(z).toString().equals("")) {
cell = "null";
} else {
cell = row.getCell(z).toString();
}
//将读取到的所有单元格的内容都拼接在一起,形成字符串,并清理格式去掉回车符。
if (z == cellNum - 1) {
cells = cells + cell.replaceAll("\n", "。") + "\n";
} else {
cells = cells + cell.replaceAll("\n", "。") + "\001";
}
完整代码如下:
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
public class ExcelReaderUtils {
private Workbook wb;
private Sheet sheet;
private Row row;
private Logger logger = LoggerFactory.getLogger(this.getClass());
public ExcelReaderUtils(String filePath) {
if (filePath == null) {
return;
}
String type = filePath.substring(filePath.lastIndexOf("."));
try {
FileInputStream fis = new FileInputStream(filePath);
if (".xls".equals(type)) {
wb = new HSSFWorkbook(fis);
} else if (".xlsx".equals(type)) {
wb = new XSSFWorkbook(fis);
} else {
wb = null;
}
} catch (FileNotFoundException e) {
logger.error("文件未找到", e);
} catch (IOException e) {
logger.error("IO异常", e);
}
}
public List<List<String>> readExcel() {
List<List<String>> list = new ArrayList<>();
int sheetNum = wb.getActiveSheetIndex();
for (int x = 0; x <= sheetNum; x++) {
sheet = wb.getSheetAt(x);
int rowNum = sheet.getLastRowNum();
List<String> context = new ArrayList<>();
for (int y = 0; y < rowNum; y++) {
row = sheet.getRow(y);
//判断是否为空行
if (row != null) {
int cellNum = row.getLastCellNum();
String cells = "";
String cell = "";
for (int z = 0; z < cellNum; z++) {
//获取每一个单元格中的内容,如果为空将单元格内容替换为null。
if (row.getCell(z) == null || row.getCell(z).toString().equals("")) {
cell = "null";
} else {
cell = row.getCell(z).toString();
}
//将读取到的所有单元格的内容都拼接在一起,形成字符串,并清理格式去掉回车符。
if (z == cellNum - 1) {
cells = cells + cell.replaceAll("\n", "。") + "\n";
} else {
cells = cells + cell.replaceAll("\n", "。") + "\001";
}
}
context.add(cells);
}
}
list.add(context);
}
return list;
}
public void writeFile(List<List<String>> contextList, String filePath){
writeFile(contextList,filePath,null);
}
public void writeFile(List<List<String>> contextList, String filePath, String fileType) {
//判断文件路径是否存在,如果不存在则创建
File path = new File(filePath);
if (!path.exists()) {
System.out.println("路径不存在,正在创建...");
path.mkdirs();
System.out.println("创建完成!");
}
//判断输入的文件生成类型
String type = "";
if (fileType != null && fileType != "") {
type = "." + fileType;
}
String sheetName = null;
File file = null;
FileWriter fw = null;
for (int i=0; i<contextList.size();i++){
List<String> context = contextList.get(i);
sheetName = wb.getSheetAt(i).getSheetName();
file = new File(path.getAbsolutePath()+"/"+sheetName+type);
try {
file.createNewFile();
fw = new FileWriter(file);
for (int j=0; j<context.size();j++){
fw.write(context.get(j));
fw.flush();
}
} catch (IOException e) {
System.out.println("生成输出文件出错!");
e.printStackTrace();
}
System.out.println("第"+i+"个文件:"+sheetName+type+",写入完成");
}
}
public static void main(String[] args) {
String path = "C:/Users/Administrator/Desktop/数据库表结构/轨道交通数据库结构与业务表.xlsx";
ExcelReaderUtils reader = new ExcelReaderUtils(path);
List<List<String>> list = reader.readExcel();
System.out.println("---------------------------开始读取Excel文件-----------------------------");
for (int i = 0; i < list.size(); i++) {
List<String> context = list.get(i);
for (int j = 0; j < context.size(); j++) {
System.out.println(context.get(j));
}
System.out.println("------------------------------------------------------------------------");
}
System.out.println("-------------------------------读取结束---------------------------------");
String outPath = "F:/ExcelOut";
reader.writeFile(list,outPath);
}
}
pom文件依赖如下:
<dependencies>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-examples</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-excelant</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
<dependency>
<groupId>stax</groupId>
<artifactId>stax-api</artifactId>
<version>1.0</version>
</dependency>
<dependency>
<groupId>org.apache.xmlbeans</groupId>
<artifactId>xmlbeans</artifactId>
<version>2.6.0</version>
</dependency>
</dependencies>