Python: A Step-by-Step Guide to Web Scraping, Using Python to Fetch and Save Web Page Data

Reference

https://blog.csdn.net/pythonxiaopeng/article/details/109030653

import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
# import urllib.request  # only needed by the commented-out BeautifulSoup variant below


# Bilibili web page scraping demo

headers = {
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    # 'Accept-Encoding':'gzip, deflate, br',
    'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Cache-Control':'max-age=0',
    'Connection':'keep-alive',
    'Cookie':'_uuid=84585DA10-F1D10-110B7-EA55-BEC5F83E5E3A43059infoc; buvid3=D0B7DB1F-0B48-3D1C-46AC-A3117851FBA544076infoc; b_nut=1648005344; buvid4=A9F9005C-F46B-582A-AAF0-E7E9B49E8A2E44076-022032311-sluv7aFmF1IMuO0pwmTq5A%3D%3D; buvid_fp=ce46adfeb07afc695b34e45bbf0d02fb; bsource=search_baidu; sid=ke5ibdce; b_lsid=CC1097123_180267BB6B2; innersign=0',
    'Host':'www.bilibili.com',
    'Sec-Fetch-Dest':'document',
    'Sec-Fetch-Mode':'navigate',
    'Sec-Fetch-Site':'none',
    'Sec-Fetch-User':'?1',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:99.0) Gecko/20100101 Firefox/99.0'
}
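# Note: the Cookie above is tied to one browser session and will eventually expire;
# in practice the User-Agent field is often the one that matters, so trim the rest as needed.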

# url = 'https://www.bilibili.com/ranking?spm_id_from=333.851.b_7072696d61727950616765546162.3'
url = 'https://www.bilibili.com/v/popular/rank/all'
page = requests.get(url)
# Add request headers, otherwise the full page content is sometimes not returned
# page = requests.get(url,headers=headers)
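# A more defensive variant (sketch): send the headers, set a timeout,
# and raise an exception on non-2xx responses instead of parsing an error page.
# page = requests.get(url, headers=headers, timeout=10)
# page.raise_for_status()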

# soup = BeautifulSoup(urllib.request.urlopen(page.content),"html.parser",from_encoding="iso-8859-1")
# soup = BeautifulSoup(open(page.content),"html.parser",from_encoding="iso-8859-1")
# soup = BeautifulSoup(page.content,"html.parser", from_encoding="iso-8859-1")

soup = BeautifulSoup(page.content, 'html.parser')
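# page.content is the raw bytes of the response, so BeautifulSoup detects the encoding
# itself; forcing from_encoding="iso-8859-1" (as in the commented lines above) would
# garble the Chinese text, which is why the plain call is used here.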



# print(page)
# print(page.text)
# print(soup)

# The page source contains li.rank-item tags
all_products = []
products = soup.select('li.rank-item')
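# Sanity check: select() returns an empty list if the layout changed or the ranking
# is now rendered client-side, which would otherwise produce an empty CSV.
if not products:
    raise SystemExit('No li.rank-item elements found; check the page structure')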

# print(products)

for product in products:
    # rank = product.select('div.num')[0].text
    name = product.select('div.info > a')[0].text.strip()
    play = product.select('span.data-box')[0].text
    comment = product.select('span.data-box')[1].text
    up = product.select('span.data-box')[2].text
    url = product.select('div.info > a')[0].attrs['href']

    all_products.append({
        # "视频排名":rank,       # video rank
        "视频名": name,          # video title
        "播放量": play,          # play count
        "弹幕量": comment,       # danmaku (bullet-comment) count
        "up主": up,              # uploader
        "视频链接": url          # video link
    })

print(all_products)
# import sys; sys.exit(0)  # uncomment to stop here while debugging the scraped fields

keys = all_products[0].keys()

# with open('B站视频热榜TOP100.csv', 'w', newline='', encoding='utf-8-sig') as output_file:
#     dict_writer = csv.DictWriter(output_file, keys)
#     dict_writer.writeheader()
#     dict_writer.writerows(all_products)

# Write the data with pandas (utf-8-sig so Excel displays the Chinese text correctly; index=False drops the row-index column)
pd.DataFrame(all_products, columns=keys).to_csv('B站视频热榜TOP100.csv', index=False, encoding='utf-8-sig')
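
To confirm the export, the file can be read back with pandas; a minimal check, assuming the CSV was written to the current working directory:

df = pd.read_csv('B站视频热榜TOP100.csv', encoding='utf-8-sig')
print(df.head())              # first few rows: 视频名 / 播放量 / 弹幕量 / up主 / 视频链接
print(len(df), 'rows scraped')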