Java 爬取BiliBili追番排行榜

看了標哥(代码忘烦恼)的爬虫博客,两篇博客写得简单易懂,自己也跟着学了一下,本文的代码几乎就是在其基础上进行的改动。

  1. Java爬虫
  2. Java爬取前程无忧(51job)。这篇文章博客里贴出的代码不完整,建议直接看 GitHub 源码

准备工作:

  1. 创建一个普通maven工程
    在这里插入图片描述
  2. pom依赖
<dependencies>
    <!-- Apache HttpClient: executes the HTTP GET requests against the Bilibili API. -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.9</version>
    </dependency>

    <!-- Apache HttpCore: HTTP primitives (HttpResponse, HttpStatus, EntityUtils) used with HttpClient. -->
    <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpcore</artifactId>
      <version>4.4.11</version>
    </dependency>

    <!-- MySQL JDBC driver (com.mysql.cj.jdbc.Driver). -->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>8.0.15</version>
    </dependency>

    <!-- MyBatis: maps the BiliBili entity to the bilibili table. -->
    <dependency>
      <groupId>org.mybatis</groupId>
      <artifactId>mybatis</artifactId>
      <version>3.5.2</version>
    </dependency>

    <!-- jsoup: declared once only; the original listed this dependency twice. -->
    <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.12.1</version>
    </dependency>

    <!-- fastjson: parses the JSON returned by the API.
         NOTE(review): fastjson 1.2.47 is affected by publicly reported autoType
         deserialization vulnerabilities; consider upgrading to the latest 1.2.x release. -->
    <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.47</version>
    </dependency>
</dependencies>
  3. 项目结构
    在这里插入图片描述

分析页面

https://www.bilibili.com/anime/index/#season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1

我们从这找到了相关的json数据
在这里插入图片描述

在这里插入图片描述
在这里插入图片描述
访问获取到的url:
在这里插入图片描述
(改变 page 参数后,返回的 json 数据随之改变,所以确定该 url 为爬取入口)

https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page=1&season_type=1&pagesize=20&type=1

开始爬取

  1. 创建数据库
-- One row per crawled series; the text columns inherit the table's utf8 / utf8_general_ci settings.
CREATE TABLE `bilibili` (
  `id`         INT(11)       NOT NULL AUTO_INCREMENT,  -- surrogate primary key
  `title`      VARCHAR(1024) NULL,                     -- series title
  `cover`      VARCHAR(1024) NULL,                     -- cover image
  `orders`     VARCHAR(1024) NULL,                     -- number of followers (entity field: order)
  `index_show` VARCHAR(1024) NULL,                     -- total number of episodes (entity field: indexShow)
  PRIMARY KEY (`id`)
) ENGINE = InnoDB
  DEFAULT CHARSET = utf8
  COLLATE = utf8_general_ci;
  2. 根据json字段创建BiliBili实体类
package com.scitc.model;

/**
 * Plain data holder for one series entry; each field has a getter and a setter.
 */
public class BiliBili {

    /** Row identifier. */
    private Integer id;

    /** Series title. */
    private String title;

    /** Cover image. */
    private String cover;

    /** Number of followers. */
    private String order;

    /** Total number of episodes. */
    private String indexShow;

    /** Creates an instance with every field unset ({@code null}). */
    public BiliBili() {
    }

    // ---- id ----

    public Integer getId() {
        return this.id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    // ---- title ----

    public String getTitle() {
        return this.title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    // ---- cover ----

    public String getCover() {
        return this.cover;
    }

    public void setCover(String cover) {
        this.cover = cover;
    }

    // ---- order ----

    public String getOrder() {
        return this.order;
    }

    public void setOrder(String order) {
        this.order = order;
    }

    // ---- indexShow ----

    public String getIndexShow() {
        return this.indexShow;
    }

    public void setIndexShow(String indexShow) {
        this.indexShow = indexShow;
    }

    /**
     * Renders all fields as {@code BiliBili{id=..., title='...', ...}}; unset fields print as {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("BiliBili{");
        text.append("id=").append(id);
        text.append(", title='").append(title).append('\'');
        text.append(", cover='").append(cover).append('\'');
        text.append(", order='").append(order).append('\'');
        text.append(", indexShow='").append(indexShow).append('\'');
        text.append('}');
        return text.toString();
    }
}
  3. BiliBiliMapper
/**
 * MyBatis mapper for {@link BiliBili} rows; the SQL statements are declared in BiliBiliMapper.xml.
 */
public interface BiliBiliMapper {

    /**
     * Runs the {@code biliBiliList} statement.
     *
     * @return the rows mapped to {@link BiliBili} objects
     */
    List<BiliBili> biliBiliList();

    /**
     * Runs the {@code insert} statement for the given entity.
     *
     * @param biliBili the entity whose title, cover, order and indexShow are stored
     * @return the value MyBatis reports for the insert (the affected row count)
     */
    int insert(BiliBili biliBili);
}
  4. BiliBiliMapper.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper
        PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL statements bound to the com.scitc.mapper.BiliBiliMapper interface. -->
<mapper namespace="com.scitc.mapper.BiliBiliMapper">

    <!-- Column-to-property mapping; note orders -> order and index_show -> indexShow. -->
    <resultMap type="com.scitc.model.BiliBili" id="biliResultMapper">
        <id property="id" column="id" jdbcType="INTEGER"/>
        <result property="title" column="title" jdbcType="VARCHAR"/>
        <result property="cover" column="cover" jdbcType="VARCHAR"/>
        <result property="order" column="orders" jdbcType="VARCHAR"/>
        <result property="indexShow" column="index_show" jdbcType="VARCHAR"/>
    </resultMap>

    <!-- Inserts one entity; the generated id column is written back to the id property. -->
    <insert id="insert" parameterType="com.scitc.model.BiliBili"
            useGeneratedKeys="true" keyProperty="id" keyColumn="id">
        INSERT INTO bilibili(title,cover,orders,index_show) VALUES (#{title},#{cover},#{order},#{indexShow})
    </insert>

    <!-- Reads every row of the bilibili table through biliResultMapper. -->
    <select id="biliBiliList" resultMap="biliResultMapper">
        SELECT  * FROM bilibili
    </select>
</mapper>
  5. jdbc.properties
# JDBC settings referenced as ${jdbc.*} placeholders from mybatis-config.xml.

# Driver class
jdbc.driver=com.mysql.cj.jdbc.Driver

# Connection URL: local server, schema "user", UTF-8 encoding, server time zone GMT+8 (%2B is an encoded '+')
jdbc.url=jdbc:mysql://localhost:3306/user?useUnicode=true&characterEncoding=utf-8&serverTimezone=GMT%2B8

# Credentials
jdbc.user=root
jdbc.password=123456
  6. mybatis-config.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
    <!-- Supplies the ${jdbc.*} placeholders used below. -->
    <properties resource="jdbc.properties"/>

    <environments default="development">
        <environment id="development">
            <!-- JDBC-managed transactions over a pooled data source. -->
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <property name="driver" value="${jdbc.driver}"/>
                <property name="url" value="${jdbc.url}"/>
                <property name="username" value="${jdbc.user}"/>
                <property name="password" value="${jdbc.password}"/>
            </dataSource>
        </environment>
    </environments>

    <!-- Mapper files, resolved from the classpath. -->
    <mappers>
        <mapper resource="BiliBiliMapper.xml"/>
    </mappers>
</configuration>
  7. HTTPUtils
/**
 * Helper for issuing HTTP GET requests.
 */
public class HTTPUtils {
    /**
     * Executes a GET request for {@code url} with the given client.
     *
     * <p>When the request fails with an {@code IOException}, the error is printed and a
     * placeholder response with status 500 and no entity is returned. The original code
     * returned a placeholder with status 200 in that case, which made a failed request
     * look like a success to callers that check the status code.
     *
     * @param client the client used to execute the request
     * @param url    the address to fetch
     * @return the server's response, or a status-500 placeholder without entity if the request failed
     */
    public static HttpResponse getHtml(HttpClient client, String url){
        HttpGet getMethod = new HttpGet(url);
        try {
            return client.execute(getMethod);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("建立客户端出现异常");
            // Never report success for a request that did not complete.
            return new BasicHttpResponse(HttpVersion.HTTP_1_1,
                    HttpStatus.SC_INTERNAL_SERVER_ERROR, "Request failed");
        }
    }
}

  8. BiliBiliParse
/**
 * Parses one JSON page and stores its entries through MyBatis.
 */
public class BiliBiliParse {
    /** MyBatis configuration file, resolved from the classpath. */
    private static final String MYBATIS_CONFIG = "mybatis-config.xml";

    /** Built on first use and reused afterwards, so the configuration is read only once. */
    private static SqlSessionFactory sqlSessionFactory;

    /**
     * Reads the array under {@code data.list} of the given JSON text, converts it to
     * {@link BiliBili} objects and inserts them.
     *
     * <p>All rows of one call are committed together; the session is always closed, so an
     * insert failure leaves none of this call's rows committed. If the JSON has no
     * {@code data.list} array (or it is empty), a message is printed and nothing is stored.
     *
     * @param entity the JSON text to parse
     * @throws IllegalStateException if the MyBatis configuration cannot be read
     */
    public static void creteDate(String entity){
        JSONObject jsonObject = JSONObject.parseObject(entity);
        JSONObject data = jsonObject == null ? null : jsonObject.getJSONObject("data");
        String jsonStr = data == null ? null : data.getString("list");
        List<BiliBili> biliBilis = jsonStr == null ? null : JSON.parseArray(jsonStr, BiliBili.class);
        if (biliBilis == null || biliBilis.isEmpty()) {
            System.err.println("响应中没有可解析的列表数据,跳过");
            return;
        }

        // try-with-resources returns the connection to the pool even when an insert fails.
        try (SqlSession sqlSession = getSqlSessionFactory().openSession()) {
            BiliBiliMapper biliMapper = sqlSession.getMapper(BiliBiliMapper.class);
            for (BiliBili biliBili : biliBilis) {
                biliMapper.insert(biliBili);
            }
            sqlSession.commit();
        }
    }

    /** Returns the shared session factory, building it from the configuration file on first use. */
    private static synchronized SqlSessionFactory getSqlSessionFactory() {
        if (sqlSessionFactory == null) {
            try (InputStream config = Resources.getResourceAsStream(MYBATIS_CONFIG)) {
                sqlSessionFactory = new SqlSessionFactoryBuilder().build(config);
            } catch (IOException e) {
                throw new IllegalStateException("Cannot read MyBatis configuration: " + MYBATIS_CONFIG, e);
            }
        }
        return sqlSessionFactory;
    }
}
  9. URLHandle
/**
 * Fetches one URL and forwards the response body to the parser.
 */
public class URLHandle {
    /**
     * Requests {@code url}, prints the status code and, for a 200 response that carries a
     * body, passes the body (decoded as UTF-8) to {@code BiliBiliParse.creteDate}.
     *
     * <p>Any other response is skipped after its entity has been discarded, so the
     * underlying connection is released instead of being left open.
     *
     * @param client the client used for the request
     * @param url    the address to fetch
     */
    public static void urlParser(HttpClient client, String url){
        HttpResponse response = HTTPUtils.getHtml(client,url);
        int statusCode = response.getStatusLine().getStatusCode();
        System.out.println("响应状态码" + statusCode);

        // A null entity would make EntityUtils.toString throw an IllegalArgumentException.
        if (statusCode != 200 || response.getEntity() == null) {
            EntityUtils.consumeQuietly(response.getEntity());
            System.err.println("未获取到有效响应,跳过: " + url);
            return;
        }

        try {
            String entity = EntityUtils.toString(response.getEntity(),"utf-8");
            System.out.println("开始解析...");

            BiliBiliParse.creteDate(entity);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("解析entity失败");
        }
    }
}

  10. App
public class App 
{
    public static void main( String[] args )
    {
        System.out.println("正在生成客户端...");
        HttpClient client = null;
        System.out.println("客户端生成完毕.");
        
        int pageSize = 149;//总页数

        for(int page=1;page<=pageSize;page++){
            String url = "https://api.bilibili.com/pgc/season/index/result?season_version=-1&area=-1&is_finish=-1&copyright=-1&season_status=-1&season_month=-1&year=-1&style_id=-1&order=3&st=1&sort=0&page="
                    + page + "&season_type=1&pagesize=20&type=1";

            System.err.println("开始爬取第:" + page + " 页的数据");

            System.out.println("正在生成客户端...");
            client = HttpClientBuilder.create().build();
            System.out.println("客户端生成完毕.");

            //开始解析
            System.out.println("开始响应客户端...");

            URLHandle.urlParser(client, url);
        }
        System.out.println("全部爬取完成");
    }
}

运行效果

在这里插入图片描述
查看数据库

在这里插入图片描述

虽然我这个没有什么用,不过还是挺好玩的。

项目地址

github

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值