Jsoup
本次爬取使用到了Jsoup,不知道的朋友可以学一下官方文档
准备工作:
-
创建一个普通maven工程
-
pom依赖
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.47</version>
</dependency>
分析页面,与上篇博客一样
https://www.bilibili.com/anime/index/#season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1
拿到url
(改变page参数,json数据发生改变,所以确定该url为爬取入口)
https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1&season_type=1&pagesize=20&type=1
开始爬取
package com.scitc;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Crawls Bilibili's anime-index JSON API page by page, extracts every
 * "cover" image URL from the response, and downloads each image to disk.
 *
 * <p>Fixes over the original: the query parameter {@code &copyright} had been
 * HTML-entity-decoded into {@code ©right} (corrupting the request); a failed
 * page fetch left {@code document == null} and then NPE'd on
 * {@code document.text()}; streams were leaked on exception paths; and the
 * "download complete" message was printed before the download ran.
 */
public class App {
    public static void main(String[] args) {
        int imageNum = 1;   // running count of images downloaded so far
        int pageSize = 149; // total number of pages to crawl
        for (int page = 1; page <= pageSize; page++) {
            // NOTE: "&copyright=-1" — must not be entity-decoded into "©right=-1".
            String url = "https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page="
                    + page + "&season_type=1&pagesize=20&type=1";
            Document document;
            try {
                // ignoreContentType(true): the endpoint serves JSON, not HTML,
                // and Jsoup would otherwise reject the response.
                document = Jsoup.connect(url).timeout(3000).ignoreContentType(true).get();
            } catch (IOException e) {
                e.printStackTrace();
                System.err.println("解析url失败");
                continue; // skip this page instead of NPE-ing on a null document
            }
            // Parse the JSON body and pull out the list of cover-image URLs.
            JSONObject jsonObject = JSON.parseObject(document.text());
            List<String> coverList = JSONObjectToCoverList(jsonObject);
            for (String cover : coverList) {
                // Build the destination file path for this cover image.
                String imagePath = createImagePath(cover);
                InputStream inputStream = createUrlConnection(cover);
                if (inputStream == null) {
                    continue; // connection failed; skip this image
                }
                inputStreamToFile(inputStream, imagePath);
                // Report AFTER the bytes are actually written.
                System.err.println("第 " + imageNum++ + " 张图片下载完成");
            }
            System.err.println("-------第 " + page + " 页图片爬取完成");
        }
    }

    /**
     * Opens a connection to {@code cover} and returns its input stream.
     *
     * @param cover absolute image URL
     * @return the open stream, or {@code null} if the connection failed
     *         (callers must check for null)
     */
    public static InputStream createUrlConnection(String cover) {
        InputStream inputStream = null;
        try {
            URL imgUrl = new URL(cover);
            URLConnection urlConnection = imgUrl.openConnection();
            urlConnection.setConnectTimeout(10 * 1000);
            urlConnection.setReadTimeout(10 * 1000); // avoid hanging forever on a stalled read
            inputStream = urlConnection.getInputStream();
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("URL创建失败");
        }
        return inputStream;
    }

    /**
     * Copies {@code inputStream} to the file at {@code imagePath}.
     * Both streams are closed even on failure (try-with-resources).
     *
     * @param inputStream source stream (closed by this method)
     * @param imagePath   destination file path
     */
    public static void inputStreamToFile(InputStream inputStream, String imagePath) {
        try (BufferedInputStream bis = new BufferedInputStream(inputStream);
             BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(imagePath))) {
            byte[] bytes = new byte[1024];
            int len;
            while ((len = bis.read(bytes)) != -1) {
                bos.write(bytes, 0, len);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.err.println("inputStream转换异常");
        }
    }

    /**
     * Derives the local file path for a cover URL: the segment after the last
     * '/' becomes the file name under the fixed download directory, which is
     * created on first use.
     *
     * @param cover absolute image URL
     * @return full path of the destination file
     */
    private static String createImagePath(String cover) {
        System.out.println("开始创建图片路径");
        // File name = last path segment of the URL.
        String imgName = cover.substring(cover.lastIndexOf("/") + 1);
        String path = "G://BiliBili//images";
        File dir = new File(path);
        if (!dir.exists()) {
            dir.mkdirs();
        }
        String fileName = dir + File.separator + imgName;
        System.out.println("图片路径:" + fileName);
        return fileName;
    }

    /**
     * Extracts every {@code data.list[i].cover} URL from the API response.
     *
     * @param jsonObject parsed API response; must contain {@code data.list}
     * @return list of cover-image URLs, in response order
     */
    public static List<String> JSONObjectToCoverList(JSONObject jsonObject) {
        List<String> coverList = new ArrayList<>();
        JSONArray jsonArray = jsonObject.getJSONObject("data").getJSONArray("list");
        for (int i = 0; i < jsonArray.size(); i++) {
            // Typed accessor instead of a raw (Map) cast.
            coverList.add(jsonArray.getJSONObject(i).getString("cover"));
        }
        return coverList;
    }
}
效果
完成
B站上面 的壁纸站也可以采用相同的方法
https://h.bilibili.com/d