Scraping approach:
- Manually search for the profile page of the person to scrape, open it, and find the data API with the browser's dev tools.
- Use the copied curl command to auto-generate the request code.
- Run the generated request code in the editor to get the JSON back.
- Parse the JSON for basic fields such as the poster's id and the id of each post, pushing each post id onto a queue as it is parsed; paging is just a loop over (1, total_num).
- Use the parsed content (including the image and video links) to download the images and videos.
- For each post id in the queue, call the comments API to fetch one page (20 comments), parse the response into a new JSON structure, and also parse a field called max_id, which is sent as a parameter to request the next page of comments, until the post has no comments left (a large share of these requests are wasted and return no data); see the sketch after this list.
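A minimal sketch of that max_id paging loop (simplified from the full script further down; the endpoint and parameters are the ones used there, while the helper name fetch_comments and the "max_id == 0 means last page" stop condition are assumptions of this sketch):

import requests

def fetch_comments(article_id, uid, headers, cookies):
    # Page through one post's comments until a page comes back empty.
    url = "https://weibo.com/ajax/statuses/buildComments"
    max_id = 0
    comments = []
    while True:
        params = {"id": article_id, "uid": uid, "max_id": max_id,
                  "count": "20", "flow": "0", "is_reload": "1",
                  "is_show_bulletin": "2", "is_mix": "0"}
        data = requests.get(url, headers=headers, cookies=cookies,
                            params=params, timeout=30).json()
        page = data.get("data", [])
        if not page:
            # no comments on this page: the post is exhausted
            # (or this was one of the wasted requests mentioned above)
            return comments
        comments.extend(page)
        # max_id from the response is the cursor for the next page;
        # 0 is assumed here to mean there is no next page
        max_id = data.get("max_id", 0)
        if max_id == 0:
            return comments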
No login is needed. The cookies were parsed out of the curl command, and the requests return data with or without them. I originally assumed login was required, but testing the API showed that scraping works without logging in: the login wall is only enforced on the frontend, and Weibo's backend API does not check it.
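A quick way to verify that, assuming the same mymblog endpoint as in the script below (works_without_login is just a throwaway check, not part of the scraper):

import requests

def works_without_login(uid, headers):
    # Request one page of posts with headers only and no cookies;
    # if data["list"] still comes back, the backend is not enforcing login.
    url = "https://weibo.com/ajax/statuses/mymblog"
    params = {"uid": uid, "page": 1, "feature": "0"}
    resp = requests.get(url, headers=headers, params=params, timeout=30)
    try:
        return bool(resp.json()["data"]["list"])
    except (ValueError, KeyError, TypeError):
        return False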
Drawbacks:
The code is not very rigorous: too many threads are started, the JSON handling is sloppy, and comment scraping takes a long time; on a weak machine this can crash the IDE. When I have time I will optimize it with a thread pool.
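One possible shape for that optimization (a sketch only, not wired into the script below, which still starts one thread per post): replace the bare Thread(...).start() calls with a bounded ThreadPoolExecutor so only a fixed number of downloads run at once.

from concurrent.futures import ThreadPoolExecutor

def download_all(items, download_pic, download_video, max_workers=8):
    # Submit every download to a pool capped at max_workers threads
    # instead of spawning an unbounded number of threads.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for item in items:
            if item["pic_lis"] != "not found":
                pool.submit(download_pic, item["pic_lis"])
            if item["pic_largest"] != "not found":
                pool.submit(download_pic, item["pic_largest"])
            if item["video_url"] != "not found":
                pool.submit(download_video, item["video_url"])
    # leaving the with-block waits for all submitted downloads to finish

The full script as it stands today follows.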
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""
@des : Weibo post / image / video / comment scraper
"""
from Project.util import pathways_util
import requests
import os
import time
import json
from threading import Thread
from tqdm import tqdm
import queue
def req_get_data(url, headers=None, cookies=None, params=None, retry=3):
    """
    GET request with retries
    :param url:
    :param headers:
    :param cookies:
    :param params:
    :param retry: number of retries
    :return: requests.Response, or None if every attempt failed
    """
    while retry > 0:
        try:
            response = requests.get(
                url=url,
                headers=headers,
                cookies=cookies,
                params=params)
            time.sleep(1)
            response.encoding = "utf-8"
            return response
        except Exception as e:
            retry -= 1
            if retry == 0:
                print(e)
def download_pic(lis):
    """
    Download images
    :param lis: list of image URLs
    :return:
    """
    parent_path = R"D:\gitlab\baidu\Project\test\KS\pic"
    for url in lis:
        li = url.split("/")
        path1 = li[-2]
        pic_name = li[-1]
        save_path = os.path.join(parent_path, path1)
        pathways_util.create_path(save_path)
        pic_content = req_get_data(url).content
        pic_file = os.path.join(save_path, pic_name)
        with open(pic_file, "wb") as fp:
            fp.write(pic_content)
def download_video(url):
    """
    Download a video
    :param url: video URL
    :return:
    """
    save_path = R"D:\gitlab\baidu\Project\test\KS\video"
    _li = url.split("?")[0]
    li = _li.split("/")
    video_name = li[-1]
    pathways_util.create_path(save_path)
    video_content = req_get_data(url).content
    video_file = os.path.join(save_path, video_name)
    with open(video_file, "wb") as fp:
        fp.write(video_content)
def download(lis):
    """
    Download the images and videos of a page of posts
    :param lis: list of parsed post dicts
    :return:
    """
    # one thread per image list / video
    for i in lis:
        if i["pic_lis"] != "not found":
            Thread(target=download_pic, args=(i["pic_lis"],)).start()
        if i["pic_largest"] != "not found":
            Thread(target=download_pic, args=(i["pic_largest"],)).start()
        if i["video_url"] != "not found":
            Thread(target=download_video, args=(i["video_url"],)).start()
def run(total_page, uid):
    """
    Scrape the posts of one user, page by page
    :param total_page: number of pages to fetch
    :param uid: Weibo user id
    :return:
    """
    # Weibo post list API
    url = "https://weibo.com/ajax/statuses/mymblog"
    for num in tqdm(range(1, total_page + 1)):
        params = {
            "uid": uid,
            "page": "{}".format(num),
            "feature": "0"
        }
        json_response = req_get_data(url, headers, cookies, params).json()
        lis = []
        # parse the response, keeping only the fields we need
        for i in json_response["data"]["list"]:
            item = {}
            try:
                weibo_id = i["id"]
                # push the post id onto the queue for the comment scraper
                id_queue.put(weibo_id)
            except (KeyError, TypeError):
                weibo_id = "unknown"
            try:
                user_id = i["user"]["id"]
            except (KeyError, TypeError):
                user_id = "unknown"
            try:
                created_time = i["created_at"]
            except (KeyError, TypeError):
                created_time = "unknown"
            try:
                weibo_text = i["text_raw"]
            except (KeyError, TypeError):
                weibo_text = "not found"
            try:
                title = i["page_info"]["content1"]
            except (KeyError, TypeError):
                title = "not found"
            try:
                _title = i["page_info"]["content2"]
            except (KeyError, TypeError):
                _title = "not found"
            try:
                video_url = i["page_info"]["media_info"]["stream_url_hd"]
            except (KeyError, TypeError):
                video_url = "not found"
            try:
                pic_lis = [
                    "https://wx4.sinaimg.cn/orj360/{}.jpg".format(pid) for pid in i["pic_ids"]]
                if pic_lis == []:
                    pic_lis = "not found"
            except (KeyError, TypeError):
                pic_lis = "not found"
            try:
                pic_largest = [info["largest"]["url"]
                               for info in i["retweeted_status"]["pic_infos"].values()]
                if pic_largest == []:
                    pic_largest = "not found"
            except (KeyError, TypeError):
                pic_largest = "not found"
            item["id"] = weibo_id
            item["user_id"] = user_id
            item["created_time"] = created_time
            item["weibo_text"] = weibo_text
            item["title"] = title
            item["_title"] = _title
            item["video_url"] = video_url
            item["pic_largest"] = pic_largest
            item["pic_lis"] = pic_lis
            lis.append(item)
        save_file = os.path.join(save_json_path, str(num) + ".json")
        with open(save_file, "w", encoding="utf-8") as fp:
            # write real JSON rather than the str() of a Python list
            json.dump(lis, fp, ensure_ascii=False)
        download(lis)
def run2(article_id, uid, _path, max_id=0):
    """
    Scrape the comments of one post
    :param article_id: post id taken from the queue
    :param uid: Weibo user id
    :param _path: directory the comment json files are saved to
    :param max_id: paging cursor returned by the previous request
    :return:
    """
    url = "https://weibo.com/ajax/statuses/buildComments"
    file = os.path.join(_path, "{}.json".format(article_id))
    lis = []
    params = {
        "flow": "0",
        "is_reload": "1",
        "id": "{}".format(article_id),
        "is_show_bulletin": "2",
        "is_mix": "0",
        "max_id": "{}".format(max_id),
        "count": "20",
        "uid": uid
    }
    try:
        response = req_get_data(url, headers, cookies, params)
        res = response.json()["data"]
        for i in res:
            item = {}
            item["id"] = i["id"]
            item["text_raw"] = i["text_raw"]
            item["description"] = i["user"]["description"]
            item["floor_number"] = i["floor_number"]
            item["location"] = i["user"]["location"]
            item["user_id"] = i["user"]["id"]
            item["username"] = i["user"]["screen_name"]
            item["profile_image_url"] = i["user"]["profile_image_url"]
            item["avatar_hd"] = i["user"]["avatar_hd"]
            lis.append(item)
        if lis != []:
            with open(file, "a", encoding="utf-8") as fp:
                fp.write(json.dumps(lis, ensure_ascii=False) + '\n')
        else:
            # an empty page means this post has no comments left
            print("finished fetching comments of {}".format(article_id))
            return
        try:
            max_id = response.json()["max_id"]
            # fetch the next page of comments in a new thread
            Thread(target=run2, args=(article_id, uid, _path, max_id)).start()
        except (KeyError, TypeError):
            print("finished fetching comments of {}".format(article_id))
    except Exception:
        print("bad response for {}".format(article_id))
if __name__ == '__main__':
    uid = 1646239802
    total_page = 29
    # queue of post ids waiting for comment scraping
    id_queue = queue.Queue()
    headers = {
        "authority": "weibo.com",
        "accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "cache-control": "no-cache",
        "client-version": "v2.36.2",
        "dnt": "1",
        "pragma": "no-cache",
        "referer": "https://weibo.com/liyuchun",
        "sec-ch-ua": "\"Google Chrome\";v=\"105\", \"Not)A;Brand\";v=\"8\", \"Chromium\";v=\"105\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"Windows\"",
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "server-version": "v2022.10.09.1",
        # "traceparent": "00-9f3b2454c0810fc61a10ab18d3a3b855-057303b29348c864-00",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36",
        "x-requested-with": "XMLHttpRequest",
        "x-xsrf-token": "YCUWnOZu2_lCAgOQ-03rUhnb"
    }
    cookies = {
        "SUB": "_2AkMUSRrgf8NxqwJRmP8dymrnZYt3zwDEieKiFes7JRMxHRl-yT9jql4PtRB6P8k0D2oZkvenD_TU_lupH_VyIu2HKt2W",
        "SUBP": "0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWBTlk4q_ADB2gk335lJAyd",
        "SINAGLOBAL": "9008534476428.42.1665208272507",
        "XSRF-TOKEN": "YCUWnOZu2_lCAgOQ-03rUhnb",
        "_s_tentry": "weibo.com",
        "Apache": "3530993333926.089.1665457228837",
        "ULV": "1665457228985:2:2:1:3530993333926.089.1665457228837:1665208272561",
        "WBPSESS": "5fStQf4aE0d6e7rh9d-P6kT2L24ujmwJnUOkWzQKG-MQU8L534-LA9HTDBuvw3r9XO9hYndcirt_F-6AFGpc8XLzuH3spEY8m-xEoVtr8Wh4pEuH7EV06_mDtpV4V9GqKTzoOa8POo0fTtP5kMxMa50keg7bDIDCdqYI7iyEPCY="
    }
    save_json_path = R"D:\gitlab\baidu\Project\test\KS\json"
    pathways_util.create_path(save_json_path)
    # scrape all posts (this also fills id_queue)
    run(total_page, uid)
    _path = R"D:\gitlab\baidu\Project\test\KS\评论"
    pathways_util.create_path(_path)
    # scrape the comments of every post collected above
    while not id_queue.empty():
        article_id = id_queue.get()
        run2(article_id, uid, _path)
        # Thread(target=run2, args=(article_id, uid, _path)).start()