Bilibili crawler
Crawls Bilibili video comment (danmaku) data, hehe.
Step 1: collect video identifiers. On Bilibili, the cid is the unique key for a video's content (each part/分P has its own cid), and it is what the danmaku endpoint is keyed on. The script below actually collects each popular video's short link (short_link_v2); the cid itself is extracted from the video page in the next step.
import os
import requests
import time
import random

def crawl_bilibili_popular(start_page=1, total_pages=1000, ps=20):
    # Request headers
    headers = {
        'Accept': '*/*',
        'Cookie': 'your_cookie_here',  # replace with your own Cookie
        'Origin': 'https://www.bilibili.com',
        'Referer': 'https://www.bilibili.com/v/popular/all/?spm_id_from=333.1007.0.0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36'
    }
    # If a record file exists, resume from the page it stores
    record_file = 'crawl_record.txt'
    start_page = load_record(record_file) if os.path.exists(record_file) else start_page
    # Initialize the set of links that are already saved
    existing_links = load_existing_links('bilibili_cid.txt')
    # Crawl each page in turn
    for page in range(start_page, total_pages + 1):
        # Short links collected from the current page
        short_links = []
        # Build the request URL
        url = f'https://api.bilibili.com/x/web-interface/popular?ps={ps}&pn={page}&web_location=333.934'
        # Sleep for a random interval before each GET request
        time.sleep(random.uniform(1, 3))
        # Send the GET request
        response = requests.get(url, headers=headers)
        print(f"Fetching page {page}...")
        # On success, extract the short links
        if response.status_code == 200:
            data = response.json()
            short_links = [item['short_link_v2'] for item in data['data']['list']]
            print(f"Page {page} fetched, {len(short_links)} short links on this page.")
        else:
            print(f"Request for page {page} failed, status code:", response.status_code)
            # Skip to the next page on failure
            continue
        # Keep only links that are not yet in the set
        new_links = set()
        for link in short_links:
            if link not in existing_links:
                existing_links.add(link)
                new_links.add(link)
        # Append the new links to the file
        if new_links:
            save_links_to_file('bilibili_cid.txt', new_links)
        # Record progress; a resumed run re-fetches this page, and the
        # dedup above keeps the output file free of duplicates
        update_record(record_file, page)

def load_record(record_file):
    # Read the page number stored in the record file
    with open(record_file, 'r', encoding='utf-8') as f:
        return int(f.read().strip())

def update_record(record_file, page):
    # Write the current page number to the record file
    with open(record_file, 'w', encoding='utf-8') as f:
        f.write(str(page))

def load_existing_links(file_path):
    # Load the set of links that have already been saved
    existing_links = set()
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            existing_links = set(line.strip() for line in f)
    return existing_links

def save_links_to_file(file_path, links):
    # Append new links to the file
    with open(file_path, 'a', encoding='utf-8') as f:
        for link in links:
            f.write(link + '\n')

if __name__ == "__main__":
    # Crawl up to 1000 pages of the popular list
    crawl_bilibili_popular(start_page=1, total_pages=1000)
    print("All short links have been saved to bilibili_cid.txt.")
Step 2: crawl the video comments (danmaku).
import requests
import re
from bs4 import BeautifulSoup
import operator
import traceback
import os
import pandas as pd
from lxml import etree
from time import sleep

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36',
    'Referer': 'https://www.bilibili.com/'
}
timeout = 5

def getHTML(url):
    try:
        response = requests.get(url=url, headers=headers, timeout=timeout)
        # Adopt the encoding detected from the page content
        response.encoding = response.apparent_encoding
        return response.text
        # The next line would be equivalent to the two lines above:
        # return response.text.encode(response.encoding).decode('utf-8')
    except Exception:
        print(f"request url: {url} error...")
        print(traceback.format_exc())
        return None

def parsePage(page):
    try:
        print("parsing...")
        html_ = etree.HTML(page)
        meta_title = html_.xpath('//meta[@name="title"]/@content')[0]
        if meta_title == '视频去哪了呢?_哔哩哔哩_bilibili':
            print('video 404 not found')
            return {}, None
        syntax = [':', '=']
        flag = 0
        keys = re.findall(r'"cid":[\d]*', page)
        if not keys:
            keys = re.findall(r'cid=[\d]*', page)
            flag = 1
        comments, title = {}, None
        # The first cid match on the page is usually not the video's own cid,
        # so take the second match when there is one
        keys = [keys[1]] if len(keys) > 1 else keys[:1]
        for index, item in enumerate(keys):
            key = item.split(syntax[flag])[1]
            print(f'{index + 1}/{len(keys)}: {key}')
            comment_url = f'https://comment.bilibili.com/{key}.xml'  # danmaku XML endpoint
            comment_text = getHTML(comment_url)
            bs4 = BeautifulSoup(comment_text, "html.parser")
            if not title:
                title = BeautifulSoup(page, "html.parser").find('h1').get_text().strip()
            for comment in bs4.find_all('d'):
                time = float(comment.attrs['p'].split(',')[0])
                time = timeFormatter(time)
                # NOTE: danmaku posted at exactly the same timestamp overwrite each other
                comments[time] = comment.string
        sorted_comments = sorted(comments.items(), key=operator.itemgetter(0))  # sort by timestamp
        comments = dict(sorted_comments)
        print("parse finish")
        return comments, title
    except Exception:
        print("parse error")
        print(traceback.format_exc())
        # Return an empty result so callers can unpack safely
        return {}, None

def validateTitle(title):
    re_str = r"[\/\\\:\*\?\"\<\>\|]"  # characters illegal in file names: / \ : * ? " < > |
    new_title = re.sub(re_str, "_", title)  # replace them with underscores
    return new_title

def timeFormatter(param):
    minute = int(param) // 60
    second = float(param) - minute * 60
    # Fixed-width mm:ss.ss so that string sorting matches numeric order
    return f'{minute:02d}:{second:05.2f}'

def main():
    bvs = ['BV1pq421A7UU', 'BV1CC4y1a7ee', 'BV1hx411e7KP', 'BV16F411B7Ek']
    for bv in bvs:
        url = f"https://www.bilibili.com/video/{bv}"
        save_folder = "BarRage"
        if not os.path.exists(save_folder):
            os.mkdir(save_folder)
        comments, title = parsePage(getHTML(url))
        if len(comments) == 0:
            continue
        title = validateTitle(title)
        df = pd.DataFrame({'time': list(comments.keys()), 'danmaku': list(comments.values())})
        df.drop_duplicates(subset=['time', 'danmaku'], keep='first', inplace=True)
        df.to_csv(f"{save_folder}/{title}.csv", index=False, encoding='utf-8-sig')
        print(f'Saved {df.shape[0]} danmaku to {save_folder}/{title}.csv\n\n')
        sleep(10)

if __name__ == '__main__':
    main()
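The parser above only uses the first field of each <d> element's p attribute (the appearance time in seconds, fed to timeFormatter). For reference, here is a sketch of the commonly cited, unofficial layout of that attribute; Bilibili does not document this format, so the field names are assumptions.

from dataclasses import dataclass

@dataclass
class Danmaku:
    appear_time: float   # seconds into the video; this is what parsePage uses
    mode: int            # commonly: 1-3 scrolling, 4 bottom, 5 top
    font_size: int
    color: int           # decimal RGB value
    send_timestamp: int  # Unix time when the danmaku was posted
    text: str

def parse_d_element(p_attr, text):
    # Split the comma-separated p attribute into its assumed fields
    fields = p_attr.split(',')
    return Danmaku(
        appear_time=float(fields[0]),
        mode=int(fields[1]),
        font_size=int(fields[2]),
        color=int(fields[3]),
        send_timestamp=int(fields[4]),
        text=text,
    )

# Example with a typical p attribute value:
d = parse_d_element("12.5,1,25,16777215,1700000000,0,abc123,0", "前排!")
print(d.appear_time, hex(d.color))  # 12.5 0xffffff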
Improved version that reads the links to crawl from the saved text file. It still has a small problem or two; suggestions and fixes are welcome, hehe. (One possible improvement is sketched after the script.)
def read_cid_from_file(filename, n=0):
    # Read the saved short links, skipping the first n lines
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            cid_array = file.readlines()
        # Strip the trailing newline from each line
        cid_array = [cid.strip() for cid in cid_array]
        return cid_array[n:]
    except FileNotFoundError:
        print(f"File '{filename}' does not exist.")
        return []
import requests
import re
from bs4 import BeautifulSoup
import operator
import traceback
import os
import pandas as pd
from lxml import etree
from time import sleep

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36',
    'Referer': 'https://www.bilibili.com/'
}
timeout = 5

def getHTML(url):
    try:
        response = requests.get(url=url, headers=headers, timeout=timeout)
        # Adopt the encoding detected from the page content
        response.encoding = response.apparent_encoding
        return response.text
    except Exception:
        print(f"request url: {url} error...")
        print(traceback.format_exc())
        return None

def parsePage(page):
    try:
        print("parsing...")
        html_ = etree.HTML(page)
        meta_title = html_.xpath('//meta[@name="title"]/@content')[0]
        if meta_title == '视频去哪了呢?_哔哩哔哩_bilibili':
            print('video 404 not found')
            return {}, None
        syntax = [':', '=']
        flag = 0
        keys = re.findall(r'"cid":[\d]*', page)
        if not keys:
            keys = re.findall(r'cid=[\d]*', page)
            flag = 1
        comments, title = {}, None
        # The first cid match on the page is usually not the video's own cid,
        # so take the second match when there is one
        keys = [keys[1]] if len(keys) > 1 else keys[:1]
        for index, item in enumerate(keys):
            key = item.split(syntax[flag])[1]
            print(f'{index + 1}/{len(keys)}: {key}')
            comment_url = f'https://comment.bilibili.com/{key}.xml'  # danmaku XML endpoint
            comment_text = getHTML(comment_url)
            bs4 = BeautifulSoup(comment_text, "html.parser")
            if not title:
                title = BeautifulSoup(page, "html.parser").find('h1').get_text().strip()
            for comment in bs4.find_all('d'):
                time = float(comment.attrs['p'].split(',')[0])
                time = timeFormatter(time)
                comments[time] = comment.string
        sorted_comments = sorted(comments.items(), key=operator.itemgetter(0))  # sort by timestamp
        comments = dict(sorted_comments)
        print("parse finish")
        return comments, title
    except Exception:
        print("parse error")
        print(traceback.format_exc())
        # Return an empty result so callers can unpack safely
        return {}, None

def validateTitle(title):
    re_str = r"[\/\\\:\*\?\"\<\>\|]"  # characters illegal in file names: / \ : * ? " < > |
    new_title = re.sub(re_str, "_", title)  # replace them with underscores
    return new_title

def timeFormatter(param):
    minute = int(param) // 60
    second = float(param) - minute * 60
    # Fixed-width mm:ss.ss so that string sorting matches numeric order
    return f'{minute:02d}:{second:05.2f}'

def main(bvs):
    for bv in bvs:
        try:
            # Each line read from the file is already a full short link
            url = bv
            save_folder = "BarRage"
            if not os.path.exists(save_folder):
                os.mkdir(save_folder)
            comments, title = parsePage(getHTML(url))
            if len(comments) == 0:
                continue
            # Sanitize the title BEFORE building the CSV path; otherwise the
            # existence check and the saved file name can disagree
            title = validateTitle(title)
            csv_filename = f"{save_folder}/{title}.csv"
            if os.path.exists(csv_filename):
                print(f"CSV file '{csv_filename}' already exists, skipping...")
                continue
            df = pd.DataFrame({'time': list(comments.keys()), 'danmaku': list(comments.values())})
            df.drop_duplicates(subset=['time', 'danmaku'], keep='first', inplace=True)
            df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
            print(f'Saved {df.shape[0]} danmaku to {csv_filename}\n\n')
            sleep(5)
        except Exception:
            print("Failed to parse this link, skipping.")
            continue

if __name__ == '__main__':
    # Usage example
    filename = 'bilibili_cid.txt'
    n = 1
    cids = read_cid_from_file(filename, n)
    main(cids)
    # print(cids)
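One possible fix for the small problems mentioned above: mirror the crawl_record.txt idea from the first script and record which links have already been processed, so an interrupted run can resume without re-parsing every video. This is only a sketch; done_links.txt and both helpers are hypothetical names, not part of the original scripts.

import os

DONE_FILE = 'done_links.txt'  # hypothetical progress file

def load_done_links(path=DONE_FILE):
    # Return the set of links that were already processed in earlier runs
    if not os.path.exists(path):
        return set()
    with open(path, 'r', encoding='utf-8') as f:
        return set(line.strip() for line in f)

def mark_done(link, path=DONE_FILE):
    # Append a successfully processed link so later runs can skip it
    with open(path, 'a', encoding='utf-8') as f:
        f.write(link + '\n')

# Inside main(): skip bv if it is in load_done_links(), and call
# mark_done(bv) right after the CSV is written successfully.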