pom中引入:
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.16</version>
</dependency>
/**
 * Downloads the content at {@code url} line by line and collects every
 * substring that matches the regular expression {@code str}.
 *
 * <p>Each line that is read is also echoed to standard output.
 *
 * @param url     the resource to read
 * @param enCoded name of the charset used to decode the response (e.g. {@code "utf-8"})
 * @param str     regular expression; every match found on any line is collected
 * @return all matches in the order they were encountered; empty if nothing matched
 * @throws IOException if the resource cannot be opened or read, or the charset
 *                     name is not supported
 */
public static List<String> getStringByWeb(URL url, String enCoded, String str) throws IOException{
    List<String> list = new ArrayList<String>();
    // try-with-resources: the stream is declared separately so that it is closed
    // even when the reader cannot be created (e.g. unsupported charset name).
    try (java.io.InputStream in = url.openStream();
         BufferedReader bufr = new BufferedReader(new InputStreamReader(in, enCoded))) {
        Pattern p = Pattern.compile(str);
        String line;
        while ((line = bufr.readLine()) != null) {
            System.out.println(line);
            Matcher m = p.matcher(line);
            // A single line may contain several matches; collect all of them.
            while (m.find()) {
                list.add(m.group());
            }
        }
    }
    return list;
}
/**
 * Replaces the data rows of the first sheet of an existing Excel file with
 * the given values, one value per row in the first column, and saves the
 * file in place.
 *
 * <p>Row 0 is treated as the header row and is left untouched; existing rows
 * from index 1 onwards are removed and the new data is written starting at
 * row index 1.
 *
 * <p>Failures are reported by printing the stack trace; they are not
 * propagated to the caller. The success message is printed only when the
 * workbook was actually written.
 *
 * @param dataList      values to write, one per row
 * @param cloumnCount   retained for interface compatibility; each record
 *                      occupies only the first cell of its row, so the value
 *                      is not used
 * @param finalXlsxPath path of the existing {@code .xls}/{@code .xlsx} file to update
 */
public static void writeExcel(List<String> dataList, int cloumnCount,String finalXlsxPath){
    File finalXlsxFile = new File(finalXlsxPath);
    // The workbook is loaded fully by getWorkbok, so the same path can be
    // reopened for writing below. A null resource is permitted here and is
    // simply not closed.
    try (Workbook workBook = getWorkbok(finalXlsxFile)) {
        if (workBook == null) {
            // getWorkbok yields null when the extension is neither xls nor xlsx.
            throw new IllegalArgumentException("Unsupported Excel file type: " + finalXlsxPath);
        }
        // A sheet corresponds to one worksheet tab.
        Sheet sheet = workBook.getSheetAt(0);

        // Remove the existing data rows, keeping the header row (index 0).
        int lastRowNum = sheet.getLastRowNum(); // row indices are 0-based
        for (int i = 1; i <= lastRowNum; i++) {
            Row row = sheet.getRow(i);
            // Rows that were never created are returned as null; skip them.
            if (row != null) {
                sheet.removeRow(row);
            }
        }

        // Write the new data below the header row.
        for (int j = 0; j < dataList.size(); j++) {
            Row row = sheet.createRow(j + 1);
            row.createCell(0).setCellValue(dataList.get(j));
        }

        // Changes made on the sheet only take effect once the workbook is
        // written out; a single write after all modifications is sufficient.
        try (OutputStream out = new FileOutputStream(finalXlsxPath)) {
            workBook.write(out);
        }
        System.out.println("数据导出成功");
    } catch (Exception e) {
        // Best effort: report the failure without propagating it.
        e.printStackTrace();
    }
}
/**
 * Opens an Excel file as a POI {@code Workbook}, choosing the implementation
 * from the file name extension.
 *
 * @param file the Excel file to read
 * @return an {@code HSSFWorkbook} when the name ends with {@code EXCEL_XLS},
 *         an {@code XSSFWorkbook} when it ends with {@code EXCEL_XLSX},
 *         or {@code null} when the extension matches neither
 * @throws IOException if the file cannot be opened or parsed
 */
public static Workbook getWorkbok(File file) throws IOException{
    Workbook wb = null;
    // The POI constructors consume the stream while building the workbook,
    // so the stream can (and must) be closed before returning.
    try (FileInputStream in = new FileInputStream(file)) {
        String fileName = file.getName();
        if (fileName.endsWith(EXCEL_XLS)) { // Excel 2003
            wb = new HSSFWorkbook(in);
        } else if (fileName.endsWith(EXCEL_XLSX)) { // Excel 2007/2010
            wb = new XSSFWorkbook(in);
        }
    }
    return wb;
}
main方法测试(我这里爬取的是链接):
/**
 * Demo entry point: extracts every link found in the page at the configured
 * URL and writes the links into the first column of an Excel file.
 *
 * @param args unused
 * @throws IOException if the URL is malformed or the page cannot be read
 */
public static void main(String[] args) throws IOException {
    URL url = new URL("http://xxxx");
    // Charset used to decode the page
    String enCoded = "utf-8";
    // Regular expression describing a link
    String str = "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]";
    // Collect every link that appears in the page
    List<String> list = getStringByWeb(url,enCoded,str);
    // Each link occupies a single column.
    writeExcel(list, 1, "C:/xxx.xlsx");
}