【爬虫】哔哩哔哩排行榜


一、思路

利用 requests 模块发送 HTTP 请求,然后用 XPath 从响应内容中提取所需的数据,最后将数据保存到 MySQL 数据库。

1、数据提取

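利用 XPath 提取网页中的数据(通过 lxml 的 etree 解析 HTML)。注意:下面这段示例提取的是商品列表的字段(商品链接、标题、价格、店铺名及三项评分),与本文的哔哩哔哩排行榜页面并不对应;排行榜实际使用的解析逻辑见"三、源代码"中的 Rank.parse。
(此处原有配图,未能保留)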

def parse(text):
    """Extract one record per item block from an HTML page and pass them to ``save``.

    Every ``div[@class="item"]`` under ``div[@id="J_waterfallWrapper"]`` yields a
    dict with the keys ``commodity_url``, ``commodity_title``, ``price``,
    ``shop_name``, ``truth``, ``service`` and ``speed``. A field whose XPath
    matches nothing becomes the empty string.

    Args:
        text: HTML source of the page.

    Returns:
        None. The collected list of dicts is handed to ``save(result)``.
    """
    info = './a/div[@class="info"]'
    rating_path = './span[@class="morethan"]/b/text()'

    html = etree.HTML(text)
    result = []
    for div in html.xpath('//div[@id="J_waterfallWrapper"]/div[@class="item"]'):
        li_list = div.xpath(info + '/div[@class="moreInfo"]/div[@class="dsr-info"]/ul/li')
        # An item may carry fewer than three rating entries; pad with '' so a
        # single incomplete item does not raise IndexError and abort the page,
        # matching how every other missing field already degrades to ''.
        ratings = [''.join(li.xpath(rating_path)) for li in li_list[:3]]
        ratings += [''] * (3 - len(ratings))
        truth, service, speed = ratings

        result.append({
            'commodity_url': ''.join(div.xpath('./a/@href')),
            'commodity_title': ''.join(div.xpath(info + '/span/@title')),
            'price': ''.join(div.xpath(info + '/p[@class="price"]/span/strong/text()')),
            'shop_name': ''.join(div.xpath(info + '/p[@class="shopName"]/span[1]/text()')),
            'truth': truth,
            'service': service,
            'speed': speed,
        })
    save(result)

2、数据保存

将数据保存到MySQL数据库中

    def save(self):
        """Insert every dict in ``self.result`` into ``video_rank`` and close the connection.

        All rows are written in one transaction: it is committed after the last
        row, or rolled back if any statement fails. The cursor and
        ``self.connect`` are closed in both cases, so the instance cannot be
        used for further database work afterwards.

        Raises:
            Exception: whatever the database driver raised, re-raised after
                the rollback.
        """
        sql = "insert into video_rank (id,number,title,video_url,watch_number,bullet_number,up_name,score) values (null,%s,%s,%s,%s,%s,%s,%s)"
        try:
            cursor = self.connect.cursor()
            try:
                for item in self.result:
                    cursor.execute(sql, (item['number'], item['title'], item['video_url'], item['watch_number'],
                                         item['bullet_number'], item['up_name'], item['score']))
                    print(item)
                self.connect.commit()
            except Exception:
                # Do not leave a half-written batch pending on the connection.
                self.connect.rollback()
                raise
            finally:
                cursor.close()
        finally:
            self.connect.close()

二、结果

在这里插入图片描述

三、源代码

import pymysql
import requests
from lxml import etree


class Rank:
    """Fetch the bilibili "all" ranking page, parse its entries and store them in MySQL.

    Typical use is ``Rank().run()``, which performs one fetch/parse/save pass.

    Attributes:
        result: list of dicts, one per ranking entry, filled by ``parse``.
        url: address of the ranking page.
        headers: HTTP request headers (a desktop browser user-agent).
        timeout: seconds to wait for the HTTP response.
        connect: open pymysql connection; closed by ``save`` or by a failed ``run``.
    """

    # XPath prefixes shared by the per-entry field lookups in ``parse``.
    _INFO = './div[@class="content"]/div[@class="info"]'
    _DETAIL = _INFO + '/div[@class="detail"]'

    def __init__(self, timeout=10):
        """Set up request parameters and open the database connection.

        Args:
            timeout: seconds passed to ``requests.get`` so a stalled server
                cannot block the crawler indefinitely.
        """
        self.result = []
        self.url = "https://www.bilibili.com/v/popular/rank/all"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.111 Safari/537.36 Edg/86.0.622.58"}
        self.timeout = timeout
        # NOTE(review): credentials are hard-coded; consider loading them from configuration.
        self.connect = pymysql.connect(host="127.0.0.1", user="root", password="root", database="bilibili",
                                       charset="utf8mb4")

    def run(self):
        """Download the ranking page, parse it and save the rows.

        Raises:
            Exception: a request failure, a non-success HTTP status or a
                parse error. The database connection is closed before the
                error propagates.
        """
        try:
            response = requests.get(url=self.url, headers=self.headers, timeout=self.timeout)
            # Fail loudly on an error status instead of parsing an error page into zero rows.
            response.raise_for_status()
            self.parse(response.text)
        except Exception:
            # save() normally closes the connection; when it is never reached,
            # release the connection here so it does not leak.
            self.connect.close()
            raise
        self.save()

    @staticmethod
    def _text(node, path):
        """Return all strings matched by XPath ``path`` relative to ``node``, concatenated."""
        return ''.join(node.xpath(path))

    def parse(self, text):
        """Append one dict per ``ul.rank-list > li`` entry of ``text`` to ``self.result``.

        Each dict has the keys ``number``, ``title``, ``video_url``,
        ``watch_number``, ``bullet_number``, ``up_name`` and ``score``. All
        values are strings; a field whose XPath matches nothing is ``''``.

        Args:
            text: HTML source of the ranking page.
        """
        html = etree.HTML(text)
        for li in html.xpath('//ul[@class="rank-list"]/li'):
            self.result.append({
                'number': self._text(li, './div[@class="num"]/text()'),
                'title': self._text(li, self._INFO + '/a/text()'),
                # Removes every "//" in the href, so the stored value has no
                # scheme separator (presumably the href is protocol-relative,
                # e.g. "//host/path" -- confirm against the live page).
                'video_url': self._text(li, self._INFO + '/a/@href').replace("//", ""),
                'watch_number': self._text(li, self._DETAIL + '/span[1]/text()').strip(),
                'bullet_number': self._text(li, self._DETAIL + '/span[2]/text()').strip(),
                'up_name': self._text(li, self._DETAIL + '/a/span/text()').strip(),
                'score': self._text(li, self._INFO + '/div[@class="pts"]/div/text()'),
            })

    def save(self):
        """Insert every dict in ``self.result`` into ``video_rank`` and close the connection.

        All rows are written in one transaction: it is committed after the last
        row, or rolled back if any statement fails. The cursor and
        ``self.connect`` are closed in both cases.

        Raises:
            Exception: whatever the database driver raised, re-raised after
                the rollback.
        """
        sql = "insert into video_rank (id,number,title,video_url,watch_number,bullet_number,up_name,score) values (null,%s,%s,%s,%s,%s,%s,%s)"
        try:
            cursor = self.connect.cursor()
            try:
                for item in self.result:
                    cursor.execute(sql, (item['number'], item['title'], item['video_url'], item['watch_number'],
                                         item['bullet_number'], item['up_name'], item['score']))
                    print(item)
                self.connect.commit()
            except Exception:
                # Do not leave a half-written batch pending on the connection.
                self.connect.rollback()
                raise
            finally:
                cursor.close()
        finally:
            self.connect.close()


if __name__ == '__main__':
    # Script entry point: build the crawler and run one fetch/parse/save pass.
    crawler = Rank()
    crawler.run()

  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值