项目说明
项目时间:2020.03.31
目标网址:https://www.bilibili.com/ranking/all/0/0/3
今天刚学了 pymongo 模块,爬个 B 站练练手吧!
爬取的是这三个榜单:all(全站榜)、origin(原创榜)、rookie(新人榜)。
另外两个榜单的页面结构不太一样,所以没有一起爬;稍微改一下提取信息的部分,也是可以爬取的。
项目源码:
`getheaders()` 是我自己定义的一个方法(从 MySipderInit 模块导入),用来生成请求头,可以换成自己的 headers。
# encoding: utf-8
"""
@author: @wen
@contact:
@time: 2020/3/31 0031 20:48
@file: 爬取b站视频信息存入mongodb.py
@desc:
"""
import requests
from MySipderInit import getheaders
from lxml import etree
import pymongo
def gethtml(url):
    """Download *url* and return the page parsed into an lxml element tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The root element produced by ``etree.HTML`` when the server answers
        with status 200. ``None`` when the request fails, the status code is
        anything else, or the body cannot be parsed.
    """
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, headers=getheaders(), timeout=10)
    except requests.RequestException as ex:
        print(ex)
        return None

    if response.status_code != 200:
        # Report the rejection instead of silently falling through to None.
        print('unexpected status %s for %s' % (response.status_code, url))
        return None

    try:
        return etree.HTML(response.text)
    except (etree.LxmlError, ValueError) as ex:
        # Keep the best-effort contract: a page that cannot be parsed is
        # reported and treated like a failed download.
        print(ex)
        return None
def _first_match(node, path, strip=False):
    """Return the first XPath match of *path* under *node*, or None if absent."""
    matches = node.xpath(path)
    if not matches:
        return None
    value = matches[0]
    return value.strip() if strip else value


def get_video_info(html):
    """Extract the ranked videos from a parsed ranking page.

    Args:
        html: Parsed page as returned by ``gethtml``; may be ``None`` when the
            download failed.

    Returns:
        A list with one dict per ``<li>`` found under
        ``<ul class="rank-list">``. Each dict has the keys ``rank``,
        ``score``, ``title``, ``author``, ``playnum``, ``commentnum`` and
        ``videolink``. A field whose node is missing from the page is stored
        as ``None`` instead of aborting the whole page. An empty list is
        returned when *html* is ``None``.
    """
    if html is None:
        return []

    video_info_list = []
    for li in html.xpath(r'//ul[@class="rank-list"]/li'):  # one <li> per video
        video_info_list.append({
            'rank': _first_match(li, r'./div/text()'),
            'score': _first_match(li, r'.//div[@class="pts"]/div/text()'),
            'title': _first_match(li, r'./div/div[2]/a/text()'),
            # The three "detail" fields are stripped of surrounding whitespace.
            'author': _first_match(
                li, r'.//div[@class="detail"]/a/span/text()', strip=True),
            'playnum': _first_match(
                li, r'.//div[@class="detail"]/span/text()', strip=True),
            # Taken from the second <span> of the "detail" block.
            'commentnum': _first_match(
                li, r'.//div[@class="detail"]/span[2]/text()', strip=True),
            'videolink': _first_match(li, r'.//div[@class="info"]/a/@href'),
        })
    return video_info_list
def send_to_mongo(cat, video_info_list):
    """Store one board's videos in the MongoDB collection named *cat*.

    Args:
        cat: Board name; used as the collection name inside the module-level
            ``db`` handle.
        video_info_list: List of per-video dicts to insert.

    Returns:
        The result object returned by ``insert_many``, or ``None`` when
        *video_info_list* is empty and nothing was written.
    """
    if not video_info_list:
        # insert_many rejects an empty document list, so bail out first.
        print('skip %s: nothing to insert' % cat)
        return None

    col = db[cat]  # the board name doubles as the collection name
    res = col.insert_many(video_info_list)  # single bulk write
    print(res)
    return res
def main():
    """Scrape every board listed in ``cat_list`` and save it to MongoDB.

    For each board name the page URL is built from ``base_url``, downloaded
    with ``gethtml``, parsed with ``get_video_info`` and handed to
    ``send_to_mongo`` together with the board name. A board whose page could
    not be fetched, or that produced no entries, is skipped so that one
    failure does not abort the remaining boards.
    """
    for cat in cat_list:
        html = gethtml(base_url % cat)
        if html is None:
            # gethtml gives back None when the download did not succeed.
            print('skip %s: page could not be fetched' % cat)
            continue

        video_info_list = get_video_info(html)
        if not video_info_list:
            # Nothing parsed: do not hand an empty batch to the database layer.
            print('skip %s: no videos found on the page' % cat)
            continue

        send_to_mongo(cat, video_info_list)
if __name__ == '__main__':
    # These names are module-level globals read by main() and send_to_mongo().
    cat_list = ['all', 'origin', 'rookie']  # the three ranking boards to scrape
    base_url = r'https://www.bilibili.com/ranking/%s/0/0/7'  # %s is the board name
    client = pymongo.MongoClient()  # no arguments: pymongo's default connection
    db = client['bilibili']  # database that receives one collection per board
    try:
        main()
    finally:
        # Release the connection even when scraping raises.
        client.close()