It's well known that some big-name Chinese sites have notoriously strict anti-scraping measures. These are my notes from wrestling with Bilibili's crawler defenses:
1. Preparation: cookies. Reference: bilibili获取cookie_bilibili cookie-CSDN博客. The browser developer tools can be opened with Shift+Ctrl+I.
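Before plugging the copied cookie into a crawler, it helps to check that it actually carries a logged-in session. Below is a minimal sketch; it assumes Bilibili's nav endpoint (https://api.bilibili.com/x/web-interface/nav) still reports an isLogin flag, and the helper name cookie_is_logged_in is my own:

import requests

def cookie_is_logged_in(cookie: str) -> bool:
    """Rough check that the pasted cookie string belongs to a logged-in session."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Referer": "https://www.bilibili.com/",
        "Cookie": cookie,
    }
    resp = requests.get("https://api.bilibili.com/x/web-interface/nav",
                        headers=headers, timeout=10)
    resp.raise_for_status()
    data = resp.json()
    # code == 0 means the API call itself succeeded; isLogin tells us
    # whether the cookie carries a valid session (assumed response shape).
    return data.get("code") == 0 and data.get("data", {}).get("isLogin", False)

# Example: cookie_is_logged_in(open("bilibili_cookie.txt", encoding="utf-8").read().strip())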
2. Accessing the pages normally with request headers and a cookie:
import requests
from bs4 import BeautifulSoup
import time
import json

# Simple mapping from area (分区) name to its Bilibili channel URL
AREA_URLS = {
    "动画": "https://www.bilibili.com/v/douga",
    "音乐": "https://www.bilibili.com/v/music",
    "游戏": "https://www.bilibili.com/v/game",
    # extend as needed
}

def get_bilibili_urls(area_name, pages=20):
    if area_name not in AREA_URLS:
        print(f"Unsupported area: {area_name}")
        return []
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1"
    }
    area_url = AREA_URLS[area_name]
    urls = set()
    for page in range(1, pages + 1):
        url = f"{area_url}?page={page}"
        print(f"Fetching page {page}: {url}")
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()  # raise if the HTTP request failed
            soup = BeautifulSoup(resp.text, "html.parser")
            # select every link that points at a /video/BV... page
            video_links = soup.select("a[href*='/video/BV']")
            for link in video_links:
                href = link['href']
                if href.startswith('/video/BV'):
                    full_url = "https://www.bilibili.com" + href.split('?')[0]
                    urls.add(full_url)
            print(f"Found {len(video_links)} video links on this page")
            time.sleep(2)  # be polite: pause between requests
        except requests.RequestException as e:
            print(f"Request failed: {e}")
            continue
    return list(urls)

if __name__ == "__main__":
    area = input("Enter the area name (e.g. 动画): ")
    pages = int(input("Enter the number of pages to fetch (e.g. 20): "))
    urls = get_bilibili_urls(area, pages)
    print(f"Collected {len(urls)} video URLs")
    # save to a JSON file
    with open("video_urls.json", "w", encoding="utf-8") as f:
        json.dump(urls, f, ensure_ascii=False, indent=2)
    print("Saved to video_urls.json")
Running this, no videos are scraped: Bilibili's channel pages are rendered client-side by JavaScript, so the static HTML that requests receives contains no /video/BV links for BeautifulSoup to find.
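You can confirm this instead of guessing: fetch one channel page and check whether any BV link appears in the raw HTML. A minimal sketch (the URL is just the 动画 channel used as an example):

import requests

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
resp = requests.get("https://www.bilibili.com/v/douga", headers=headers, timeout=10)
html = resp.text
# A server-rendered page would already contain /video/BV... links;
# a JavaScript-rendered shell typically does not.
print("status:", resp.status_code, "length:", len(html))
print("contains /video/BV ?", "/video/BV" in html)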
Adding a Referer header and switching to the API:
import requests
import time
import json
import os

# Mapping from area (分区) name to Bilibili area ID
AREA_IDS = {
    "动画": 1,
    "音乐": 3,
    "游戏": 4,
    # extend as needed
}

def load_cookie():
    """Load the cookie from a local file."""
    if os.path.exists('bilibili_cookie.txt'):
        with open('bilibili_cookie.txt', 'r', encoding='utf-8') as f:
            return f.read().strip()
    return None

def save_cookie(cookie):
    """Save the cookie to a local file."""
    with open('bilibili_cookie.txt', 'w', encoding='utf-8') as f:
        f.write(cookie)

def get_bilibili_urls(area_name, pages=20):
    if area_name not in AREA_IDS:
        print(f"Unsupported area: {area_name}")
        return []
    # get the cookie
    cookie = load_cookie()
    if not cookie:
        print("No cookie found. Paste your Bilibili cookie (copied from the browser developer tools):")
        cookie = input().strip()
        save_cookie(cookie)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Cookie": cookie,
        "Referer": "https://www.bilibili.com/",
        "Origin": "https://www.bilibili.com"
    }
    # note: the popular API below takes no area parameter, so area_id is currently unused
    area_id = AREA_IDS[area_name]
    urls = set()
    for page in range(1, pages + 1):
        # use Bilibili's popular-videos API instead of scraping HTML
        api_url = f"https://api.bilibili.com/x/web-interface/popular?ps=30&pn={page}"
        print(f"Fetching page {page}: {api_url}")
        try:
            resp = requests.get(api_url, headers=headers, timeout=10)
            resp.raise_for_status()
            data = resp.json()
            # save the raw API response for debugging
            with open(f"api_response_{page}.json", "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"Saved API response to api_response_{page}.json")
            if data['code'] == 0 and 'data' in data and 'list' in data['data']:
                for item in data['data']['list']:
                    if 'bvid' in item:
                        url = f"https://www.bilibili.com/video/{item['bvid']}"
                        urls.add(url)
                print(f"Found {len(data['data']['list'])} videos on this page")
            else:
                print(f"API returned an error: {data.get('message', 'unknown error')}")
                print(f"Full response: {json.dumps(data, ensure_ascii=False, indent=2)}")
            time.sleep(2)  # be polite: pause between requests
        except requests.RequestException as e:
            print(f"Request failed: {e}")
            continue
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            continue
    return list(urls)

if __name__ == "__main__":
    area = input("Enter the area name (e.g. 动画): ")
    pages = int(input("Enter the number of pages to fetch (e.g. 20): "))
    urls = get_bilibili_urls(area, pages)
    print(f"Collected {len(urls)} video URLs")
    # save to a JSON file
    with open("video_urls.json", "w", encoding="utf-8") as f:
        json.dump(urls, f, ensure_ascii=False, indent=2)
    print("Saved to video_urls.json")