网络爬虫在采集少量数据时,可以使用Excel进行存储。 Java中主要有两款操作Excel的工具JXL(主要处理xls格式的Excel)和POI(可处理xls、xlsx、word等格式的文件)。本小节利用POI封装了Excel读取和写入方法,供读者参考。
使用Excel存储数据之前,需要在Maven工程的pom.xml文件中添加Excel的dependency。
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.15</version>
</dependency>
如程序5-2所示,将采集到的URL存入excel表格。
//程序5-2
public class ExcelStorage {
public static void main(String[] args) throws IOException {
//需要写入到excel的内容
//这里选择使用map数据结构来存储excel所对应的一行数据
List<Map<String,String>> excelMapList = new LinkedList<>();
//通过Jsoup创建和url的连接
Connection connection = Jsoup.connect("https://searchcustomerexperience.techtarget.com/info/news");
//获取网页的Document对象
Document document = connection.get();
Elements aList = document.getElementsByTag("a");
if(aList.size()>0){
for(int i = 0;i<aList.size();i++){
Element a = aList.get(i);
if(a != null){
String url = a.attr("href");
if(url.contains("http")) {
Map<String,String> map = new HashMap<>();
map.put("f1",url);
excelMapList.add(map);
}
}
}
}
//设置excel表头
List<String> headList = new ArrayList<>();
headList.add("url");
//创建含有表头的excel
XSSFWorkbook workbook = ExcelUtils.getWorkbook("Sheet1", headList);
//将excelMapList内容写入Excel
String excelPath = "/Users/steven/Documents/代码/project/spider/src/main/java/com/topicBet/url.xlsx";
ExcelUtils.writeExcelContent(workbook, "Sheet1", excelPath, excelMapList);
}
}
@Slf4j
public class ExcelUtils {
/**
* 读取excel文件内容,文件需要含有表头,如果没有表头请以f1,f2,f3......填充
*
* @param filePath 文件路径
* @param sheetAt 表格
* @return List<Map<String,String>>
*/
public static List<Map<String, String>> readExcelContent(String filePath, Integer sheetAt) {
List<Map<String, String>> mapList = new ArrayList<>();
FileInputStream fileInputStream = null;
try {
fileInputStream = new FileInputStream(filePath);
// 延迟解析比率
ZipSecureFile.setMinInflateRatio(-1.0d);
XSSFWorkbook workbook = new XSSFWorkbook(fileInputStream);
XSSFSheet sheet = workbook.getSheetAt(sheetAt);
int rows = sheet.getPhysicalNumberOfRows();
for (int i = 1; i <= rows; i++) {
//此行不为空行
if (sheet.getRow(i) != null) {
Map<String, String> map = new HashMap<>();
int columns = sheet.getRow(i).getPhysicalNumberOfCells();
for (int j = 0; j < columns; j++) {
map.put("f" + (j + 1), getString(sheet.getRow(i).getCell(j)).trim());
}
mapList.add(map);
}
}
} catch (Exception e) {
log.error("读取文件出现异常");
}
return mapList;
}
/**
* 把单元格的内容转为字符串
*
* @param xssfCell 单元格
* @return String
*/
public static String getString(XSSFCell xssfCell) {
if (xssfCell == null) {
return "";
}
if (xssfCell.getCellTypeEnum() == CellType.NUMERIC) {
DecimalFormat df = new DecimalFormat("0");
return df.format(xssfCell.getNumericCellValue());
} else if (xssfCell.getCellTypeEnum() == CellType.BOOLEAN) {
return String.valueOf(xssfCell.getBooleanCellValue());
} else {
return xssfCell.getStringCellValue();
}
}
/**
* 创建含有表头的excel
*
* @param sheetName 表格名称
* @param headList 表头字符串列表
* @return XSSFWorkbook
*/
public static XSSFWorkbook getWorkbook(String sheetName, List<String> headList) {
XSSFWorkbook workbook = new XSSFWorkbook();
XSSFSheet sheet = workbook.createSheet(sheetName);
XSSFRow titleRow = sheet.createRow(0);
for (int i = 0; i < headList.size(); i++) {
XSSFCell cell = titleRow.createCell(i);
cell.setCellValue(headList.get(i));
}
return workbook;
}
/**
* 将需要输出的内容填充到excel并保存到本地
*
* @param workbook 表格
* @param sheetName 表格名称
* @param path 输出到本地的路径
* @param excelMapList 需要输出的内容
*/
public static void writeExcelContent(XSSFWorkbook workbook, String sheetName, String path, List<Map<String, String>> excelMapList) {
// 延迟解析比率
ZipSecureFile.setMinInflateRatio(-1.0d);
XSSFSheet sheet = workbook.getSheet(sheetName);
for (int i = 1; i <= excelMapList.size(); i++) {
XSSFRow row = sheet.createRow(i);
Map<String, String> map = excelMapList.get(i - 1);
for (int j = 0; j < map.size(); j++) {
XSSFCell cell = row.createCell(j);
cell.setCellValue(map.get("f" + (j + 1)));
}
}
//用输出流写到excel
FileOutputStream outputStream = null;
try {
outputStream = new FileOutputStream(path);
workbook.write(outputStream);
outputStream.flush();
outputStream.close();
} catch (Exception e) {
log.error("写入excel发生异常");
}
}
}