看了標哥(代码忘烦恼)的爬虫博客,这两篇博客写得简单易懂。自己也跟着学了一下,下面的代码几乎就是在其基础上改动的。
- Java爬虫
- java爬取前程无忧(51job),这篇文章看博客代码不完整,建议看 github源码
准备工作:
- 创建一个普通maven工程
- pom依赖
<dependencies>
    <!-- HTTP client used to call the Bilibili JSON API -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.9</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.4.11</version>
    </dependency>
    <!-- MySQL JDBC driver (com.mysql.cj.jdbc.Driver) -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.15</version>
    </dependency>
    <!-- Persistence layer -->
    <dependency>
        <groupId>org.mybatis</groupId>
        <artifactId>mybatis</artifactId>
        <version>3.5.2</version>
    </dependency>
    <!-- jsoup was declared twice with identical coordinates; one declaration is enough -->
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.12.1</version>
    </dependency>
    <!-- JSON parsing. NOTE(review): old fastjson releases have published
         deserialization vulnerabilities; consider upgrading before using this
         outside of a local demo. -->
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
</dependencies>
- 项目结构
分析页面
https://www.bilibili.com/anime/index/#season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1
我们从这找到了相关的json数据
访问获取到的url:
(改变page参数后,返回的json数据随之改变,所以确定该url为爬取入口)
https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1&season_type=1&pagesize=20&type=1
开始爬取
- 创建数据库
-- Table holding one row per crawled anime entry; its columns mirror the
-- fields of the BiliBili entity class.
CREATE TABLE `bilibili` (
-- Surrogate primary key, assigned by MySQL on insert.
`id` int(11) NOT NULL AUTO_INCREMENT,
-- Anime title.
`title` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
-- Cover image value taken from the JSON `cover` field.
`cover` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
-- Follower count text; mapped to the entity property `order`
-- (presumably named `orders` because ORDER is a reserved word in SQL).
`orders` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
-- Episode count text; mapped to the entity property `indexShow`.
`index_show` varchar(1024) CHARACTER SET utf8 COLLATE utf8_general_ci DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8;
- 根据json字段创建BiliBili实体类
package com.scitc.model;
/**
 * Plain data holder for one anime entry of the Bilibili index listing.
 *
 * <p>Instances are filled from the JSON response and stored in the
 * {@code bilibili} table.
 */
public class BiliBili {

    /** Database primary key. */
    private Integer id;

    /** Anime title. */
    private String title;

    /** Cover image. */
    private String cover;

    /** Number of followers. */
    private String order;

    /** Total number of episodes. */
    private String indexShow;

    /** Creates an entry with every field unset. */
    public BiliBili() {
    }

    /**
     * Renders all fields in the form
     * {@code BiliBili{id=..., title='...', cover='...', order='...', indexShow='...'}}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("BiliBili{");
        text.append("id=").append(this.id);
        text.append(", title='").append(this.title).append('\'');
        text.append(", cover='").append(this.cover).append('\'');
        text.append(", order='").append(this.order).append('\'');
        text.append(", indexShow='").append(this.indexShow).append('\'');
        text.append('}');
        return text.toString();
    }

    /** @return the database primary key, or {@code null} if not set */
    public Integer getId() {
        return this.id;
    }

    /** @param id the database primary key */
    public void setId(Integer id) {
        this.id = id;
    }

    /** @return the anime title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the anime title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the cover image */
    public String getCover() {
        return this.cover;
    }

    /** @param cover the cover image */
    public void setCover(String cover) {
        this.cover = cover;
    }

    /** @return the number of followers */
    public String getOrder() {
        return this.order;
    }

    /** @param order the number of followers */
    public void setOrder(String order) {
        this.order = order;
    }

    /** @return the total number of episodes */
    public String getIndexShow() {
        return this.indexShow;
    }

    /** @param indexShow the total number of episodes */
    public void setIndexShow(String indexShow) {
        this.indexShow = indexShow;
    }
}
- BiliBiliMapper
/**
 * MyBatis mapper for the {@code bilibili} table. The SQL for both methods is
 * defined in BiliBiliMapper.xml under the statement ids of the same name.
 */
public interface BiliBiliMapper {
/**
 * Inserts one entry. The mapper XML enables generated keys, so the new
 * primary key is written back into the entity's {@code id} property.
 *
 * @param biliBili the entry to store
 * @return the row count reported for the INSERT statement
 */
int insert(BiliBili biliBili);
/**
 * Loads every row of the {@code bilibili} table.
 *
 * @return all stored entries
 */
List<BiliBili> biliBiliList();
}
- BiliBiliMapper.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL statements backing the com.scitc.mapper.BiliBiliMapper interface. -->
<mapper namespace="com.scitc.mapper.BiliBiliMapper">
<!-- Column-to-property mapping; needed because `orders` and `index_show`
     do not match the property names `order` and `indexShow`. -->
<resultMap id="biliResultMapper" type="com.scitc.model.BiliBili">
<id column="id" property="id" jdbcType="INTEGER"/>
<result column="title" property="title" jdbcType="VARCHAR"/>
<result column="cover" property="cover" jdbcType="VARCHAR"/>
<result column="orders" property="order" jdbcType="VARCHAR"/>
<result column="index_show" property="indexShow" jdbcType="VARCHAR"/>
</resultMap>
<!-- Inserts one entry; the generated `id` is written back to the entity. -->
<insert id="insert" parameterType="com.scitc.model.BiliBili" keyProperty="id" keyColumn="id" useGeneratedKeys="true">
INSERT INTO bilibili(title,cover,orders,index_show) VALUES (#{title},#{cover},#{order},#{indexShow})
</insert>
<!-- Returns every stored entry, mapped through biliResultMapper. -->
<select id="biliBiliList" resultMap="biliResultMapper">
SELECT * FROM bilibili
</select>
</mapper>
- jdbc.properties
# Connection settings referenced through ${jdbc.*} placeholders in mybatis-config.xml.
# The URL targets the `user` database on a local MySQL server.
jdbc.url=jdbc:mysql://localhost:3306/user?useUnicode=true&characterEncoding=utf-8&serverTimezone=GMT%2B8
jdbc.driver=com.mysql.cj.jdbc.Driver
# NOTE(review): plain-text demo credentials; replace them for any non-local setup.
jdbc.user=root
jdbc.password=123456
- mybatis-config.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-config.dtd">
<!-- MyBatis runtime configuration: one JDBC environment plus the mapper XML. -->
<configuration>
<!-- Supplies the values for the ${jdbc.*} placeholders used below. -->
<properties resource="jdbc.properties"/>
<environments default="development">
<environment id="development">
<!-- Transactions are managed through the JDBC connection (explicit commit). -->
<transactionManager type="JDBC"></transactionManager>
<!-- Pooled data source built from the jdbc.properties values. -->
<dataSource type="POOLED">
<property name="driver" value="${jdbc.driver}"/>
<property name="url" value="${jdbc.url}"/>
<property name="username" value="${jdbc.user}"/>
<property name="password" value="${jdbc.password}"/>
</dataSource>
</environment>
</environments>
<!-- Mapper file referenced as a resource by name only, with no directory prefix. -->
<mappers>
<mapper resource="BiliBiliMapper.xml"/>
</mappers>
</configuration>
- HTTPUtils
/**
 * Helper for issuing HTTP GET requests through Apache HttpClient.
 */
public class HTTPUtils {

    /**
     * Executes a GET request against {@code url}.
     *
     * <p>If the request cannot be executed, the failure is logged and a
     * synthetic {@code 503} response without an entity is returned. Callers
     * that check for status 200 therefore skip the page instead of treating
     * a failed request as a success.
     *
     * @param client the HTTP client used to execute the request
     * @param url    the absolute URL to fetch
     * @return the server response, or a synthetic 503 response when the
     *         request failed; never {@code null}
     */
    public static HttpResponse getHtml(HttpClient client, String url) {
        HttpGet getMethod = new HttpGet(url);
        try {
            return client.execute(getMethod);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("建立客户端出现异常");
            // A failed request must not be reported as 200 OK.
            return new BasicHttpResponse(
                    HttpVersion.HTTP_1_1, HttpStatus.SC_SERVICE_UNAVAILABLE, "Service Unavailable");
        }
    }
}
- BiliBiliParse
/**
 * Parses the JSON returned by the Bilibili index API and stores the contained
 * entries through MyBatis.
 */
public class BiliBiliParse {

    /** Name of the MyBatis configuration resource. */
    private static final String MYBATIS_CONFIG = "mybatis-config.xml";

    /** Built on first use and reused, because building it re-parses all configuration. */
    private static SqlSessionFactory sqlSessionFactory;

    /**
     * Extracts {@code data.list} from the JSON document and inserts every
     * element into the database.
     *
     * <p>All rows of one call are committed together. If an insert fails, the
     * session is closed without a commit, so none of that call's rows are
     * stored. Nothing is written when the document has no {@code data.list}.
     *
     * @param entity the JSON response body
     * @throws IllegalStateException if the MyBatis configuration cannot be loaded
     */
    public static void creteDate(String entity) {
        JSONObject jsonObject = JSONObject.parseObject(entity);
        JSONObject data = jsonObject == null ? null : jsonObject.getJSONObject("data");
        List<BiliBili> biliBilis =
                data == null ? null : JSON.parseArray(data.getString("list"), BiliBili.class);
        if (biliBilis == null || biliBilis.isEmpty()) {
            System.err.println("响应中没有可入库的数据");
            return;
        }

        // try-with-resources returns the pooled connection even when an insert throws.
        try (SqlSession sqlSession = sessionFactory().openSession()) {
            BiliBiliMapper biliMapper = sqlSession.getMapper(BiliBiliMapper.class);
            for (BiliBili biliBili : biliBilis) {
                biliMapper.insert(biliBili);
            }
            sqlSession.commit();
        }
    }

    /** Returns the shared session factory, building it from the configuration on first use. */
    private static synchronized SqlSessionFactory sessionFactory() {
        if (sqlSessionFactory == null) {
            try (InputStream config = Resources.getResourceAsStream(MYBATIS_CONFIG)) {
                sqlSessionFactory = new SqlSessionFactoryBuilder().build(config);
            } catch (IOException e) {
                throw new IllegalStateException("Cannot load " + MYBATIS_CONFIG, e);
            }
        }
        return sqlSessionFactory;
    }
}
- URLHandle
/**
 * Fetches one API page and hands its body to the parser.
 */
public class URLHandle {

    /**
     * Requests {@code url}, and when the server answers with status 200,
     * decodes the body as UTF-8 and passes it to
     * {@link BiliBiliParse#creteDate(String)}.
     *
     * <p>The response entity is always consumed before returning so that the
     * underlying connection is released, including for non-200 responses.
     *
     * @param client the HTTP client used for the request
     * @param url    the API URL of the page to fetch
     */
    public static void urlParser(HttpClient client, String url) {
        HttpResponse response = HTTPUtils.getHtml(client, url);
        try {
            int statusCode = response.getStatusLine().getStatusCode();
            System.out.println("响应状态码" + statusCode);
            if (statusCode != 200) {
                return;
            }
            // A response without a body cannot be parsed; toString would reject null.
            if (response.getEntity() == null) {
                System.err.println("解析entity失败");
                return;
            }
            String entity = EntityUtils.toString(response.getEntity(), "utf-8");
            System.out.println("开始解析...");
            BiliBiliParse.creteDate(entity);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("解析entity失败");
        } finally {
            // Null-safe and harmless after a full read; releases the connection otherwise.
            EntityUtils.consumeQuietly(response.getEntity());
        }
    }
}
- App
/**
 * Entry point: walks through every page of the Bilibili anime index API and
 * stores the results.
 */
public class App {

    /** Number of index pages to crawl. */
    private static final int TOTAL_PAGES = 149;

    /** API URL up to and including {@code page=}; the page number is appended. */
    private static final String URL_PREFIX =
            "https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1"
            + "&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=";

    /** Remaining query parameters that follow the page number. */
    private static final String URL_SUFFIX = "&season_type=1&pagesize=20&type=1";

    /**
     * Crawls pages 1 to {@link #TOTAL_PAGES}. A fresh client is built for each
     * page and closed afterwards so that its connections are released.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        for (int page = 1; page <= TOTAL_PAGES; page++) {
            String url = URL_PREFIX + page + URL_SUFFIX;
            System.err.println("开始爬取第:" + page + " 页的数据");
            System.out.println("正在生成客户端...");
            HttpClient client = HttpClientBuilder.create().build();
            System.out.println("客户端生成完毕.");
            try {
                System.out.println("开始响应客户端...");
                URLHandle.urlParser(client, url);
            } finally {
                closeQuietly(client);
            }
        }
        System.out.println("全部爬取完成");
    }

    /** Closes the client if it is closeable; a failure to close is only logged. */
    private static void closeQuietly(HttpClient client) {
        // Fully-qualified java.io names: this class has no visible import for them.
        if (client instanceof java.io.Closeable) {
            try {
                ((java.io.Closeable) client).close();
            } catch (java.io.IOException e) {
                e.printStackTrace();
            }
        }
    }
}
运行效果
查看数据库
虽然我这个没有什么用,不过还是挺好玩的。