# Reference (参考):
# https://blog.csdn.net/pythonxiaopeng/article/details/109030653
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import urllib
# Bilibili web-page scraping demo.
# Browser-like request headers (copied from a real Firefox session). Per the
# note further down, sending them helps get the complete page content.
headers = {
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
# 'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
# NOTE(review): hard-coded session cookie copied from a browser — presumably
# stale by now; confirm whether the request still works with/without it.
'Cookie':'_uuid=84585DA10-F1D10-110B7-EA55-BEC5F83E5E3A43059infoc; buvid3=D0B7DB1F-0B48-3D1C-46AC-A3117851FBA544076infoc; b_nut=1648005344; buvid4=A9F9005C-F46B-582A-AAF0-E7E9B49E8A2E44076-022032311-sluv7aFmF1IMuO0pwmTq5A%3D%3D; buvid_fp=ce46adfeb07afc695b34e45bbf0d02fb; bsource=search_baidu; sid=ke5ibdce; b_lsid=CC1097123_180267BB6B2; innersign=0',
'Host':'www.bilibili.com',
'Sec-Fetch-Dest':'document',
'Sec-Fetch-Mode':'navigate',
'Sec-Fetch-Site':'none',
'Sec-Fetch-User':'?1',
'Upgrade-Insecure-Requests':'1',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0'
}
# Popular-ranking page, all categories.
url = 'https://www.bilibili.com/v/popular/rank/all'
# Send the browser-like headers defined above: without them the server
# sometimes returns an incomplete page (the original author's note said
# exactly this, yet the live call omitted headers — fixed here).
page = requests.get(url, headers=headers)
# Fail fast on HTTP errors instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
# The page source contains one <li class="rank-item"> element per video.
# For each item collect name, play count, danmaku (comment) count, uploader
# and link into a dict; CSV headers come from these (Chinese) keys.
all_products = []
for product in soup.select('li.rank-item'):
    # Query each selector once per item instead of re-running
    # product.select('span.data-box') three times.
    info_link = product.select('div.info > a')[0]
    data_boxes = product.select('span.data-box')  # [plays, danmaku, uploader]
    all_products.append({
        "视频名": info_link.text.strip(),
        "播放量": data_boxes[0].text,
        "弹幕量": data_boxes[1].text,
        "up主": data_boxes[2].text,
        # Named `link` locally in spirit: the original assigned to `url`,
        # shadowing the module-level request URL above — avoided here.
        "视频链接": info_link.attrs['href']
    })
print(all_products)

# A leftover debug `sys.exit(0)` used to abort the script here; `sys` was
# never imported, so it actually raised NameError — and either way the CSV
# below could never be written. Removed.
if all_products:
    keys = all_products[0].keys()
    # utf-8-sig writes a BOM so Excel renders the Chinese headers correctly
    # (the same encoding the earlier csv.DictWriter version used);
    # index=False also matches that version, which wrote no row-number column.
    pd.DataFrame(all_products, columns=keys).to_csv(
        'B站视频热榜TOP100.csv', encoding='utf-8-sig', index=False)
else:
    # Guard: if the page layout changed and nothing was parsed,
    # all_products[0] would raise IndexError.
    print('No rank items parsed; CSV not written.')