This is implemented mainly with requests. Both comments and the Weibo posts themselves can be collected, as long as you find the corresponding URL; replies, however, cannot be expanded automatically, so you have to fetch those one by one. The comments in the code are already quite detailed; if anything is unclear, leave a message in the comment section~ Hope this helps anyone who needs it~
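Before the full script, here is a minimal sketch for confirming the endpoint is worth scraping: request a single page of comments and look at the JSON keys. The id/uid values are the same ones used throughout this post; without a logged-in Cookie the endpoint may return little or nothing, so treat this as a structure check rather than a guaranteed call.

import requests

probe_url = ("https://weibo.com/ajax/statuses/buildComments"
             "?is_reload=1&id=4749428051149654&is_show_bulletin=2"
             "&is_mix=0&count=10&uid=1655444627")
resp = requests.get(probe_url, headers={"User-Agent": "Mozilla/5.0"})
payload = resp.json()
print(list(payload))      # expect "data" (the comments) and "max_id" (the paging cursor) among the keys
print(payload["max_id"])  # feeds the next page's url; comes back 0 once the comments run out

The full script: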
import random
import requests
import openpyxl
UserAgent = [  # pool of desktop User-Agent strings to pick from at random
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
]
all_data = []  # one [content, location, gender, created_at] row per comment is buffered here
headers = {
    "User-Agent": random.choice(UserAgent),
    "Cookie": 'SINAGLOBAL=9928012592056.682.1625326517841; UOR=,,www.baidu.com; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh69CYQBUvvY0rSOCF4uhYT5JpX5KMhUgL.FoMceo20SKqfShM2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMES0zRe0nNShqN; WBPSESS=W7_rJKAFWvGJejM4CNtZ-aS9P5e991U5EV4nmaDEVejXT4E96MltkYNZyuBkE79WAfEPzEseo5UW9RcAm3-iRSbrOUx-JYTMdGoPZSQZWm4ywHYHoq3kIKbdrEUw-pZS; ULV=1626573687933:6:6:2:1332489454319.9338.1626573687926:1626572505494; ALF=1658113202; SSOLoginState=1626577202; SCF=AndSA-4E21jxInJ20GUebgJkGwu8TiCzbkBfYxyGLmLevU_zn7XXxaUg7uUJuFGYqqmZPTMUcIgJmCUfygDcwOM.; SUB=_2A25N9-liDeRhGeFI6VMS9SjJzzuIHXVuhV2qrDV8PUNbmtB-LVHekW9NfVO6im1kzbWZjDVNyC2wTdRsby4OvW7o; XSRF-TOKEN=Vf0qhRls1U_BPxvwss4zwDL9'
}  # pick a request header at random and attach the logged-in cookie
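
# Optional sketch: the dict above locks in one User-Agent for all 120 requests.
# If you would rather rotate the UA on every request, a small (hypothetical)
# helper like this can build the headers per call; it is not used below.
def fresh_headers(cookie=headers["Cookie"]):
    return {"User-Agent": random.choice(UserAgent), "Cookie": cookie}
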
for i in range(0, 120):
    url = "https://weibo.com/ajax/statuses/buildComments?is_reload=1&id=4749428051149654&is_show_bulletin=2&is_mix=0&count=10&uid=1655444627"  # url of the first page
    try:
        url = url_next  # url_next is built at the bottom of the loop from the previous page's max_id; on the first pass it does not exist yet, which raises NameError
    except NameError:
        pass  # first iteration: keep the first-page url, whose format differs from all later pages
    print("Crawling page " + str(i))
    response = requests.get(url=url, headers=headers)
    data_dict = response.json()  # the response body is JSON, so parse it straight into a dict
    max_id = data_dict["max_id"]  # cursor used to build the next page's url; it comes back 0 once the comments run out
    # parse the data out of the dict
    for data in data_dict["data"]:
        content = data["text_raw"]           # comment text
        location = data["user"]["location"]  # commenter's region
        gender = data["user"]["gender"]
        created_at = data["created_at"]
        # other profile fields can be pulled out of data["user"] in the same way
        all_data.append([content, location, gender, created_at])  # buffer the row in memory
    # write everything collected so far into a spreadsheet
    file = openpyxl.Workbook()
    sheet = file.active
    sheet.title = "Vaccination Weibo comments"
    sheet['A1'] = "Content"
    sheet['B1'] = "Province"
    sheet['C1'] = "Gender"
    sheet['D1'] = "Comment time"
    for row in all_data:
        sheet.append(row)
    file.save("#“针心针意”!疫苗监督员来喊你打加强针了!#.xlsx")  # named after the topic's hashtag; rewritten every page, so partial progress survives an interruption
    url_next = "https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4749428051149654&is_show_bulletin=2&is_mix=0&max_id=" + str(max_id) + "&count=20&uid=1655444627"  # page n's url carries page n-1's max_id, which is why url_next has to be built after the request, at the bottom of the loop
print('Export complete')
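
The script above always requests 120 pages and rewrites the workbook on every one of them. As a hedged variant (same endpoint, same fields, same openpyxl export, just restructured), you could stop as soon as max_id comes back 0, which in practice appears to mean the comments have run out, pause briefly between requests, and save the file once at the end. It still needs the headers dict with a valid Cookie from above, and the output name comments.xlsx is just a placeholder:

import time
import openpyxl
import requests

BASE_URL = ("https://weibo.com/ajax/statuses/buildComments"
            "?flow=0&is_reload=1&id=4749428051149654&is_show_bulletin=2"
            "&is_mix=0&count=20&uid=1655444627")

def crawl_comments(headers, max_pages=120, delay=1.0):
    rows = []
    max_id = None  # None = first page, which is requested without a max_id parameter
    for page in range(max_pages):
        url = BASE_URL if max_id is None else BASE_URL + "&max_id=" + str(max_id)
        data_dict = requests.get(url, headers=headers).json()
        for data in data_dict.get("data", []):
            rows.append([data["text_raw"], data["user"]["location"],
                         data["user"]["gender"], data["created_at"]])
        max_id = data_dict.get("max_id", 0)
        if max_id == 0:    # no cursor left: stop instead of requesting empty pages
            break
        time.sleep(delay)  # be gentle with the endpoint between pages
    return rows

rows = crawl_comments(headers)
file = openpyxl.Workbook()
sheet = file.active
sheet.append(["Content", "Province", "Gender", "Comment time"])
for row in rows:
    sheet.append(row)
file.save("comments.xlsx")  # placeholder name; swap in the topic title if you prefer

Saving once at the end keeps the loop cheap; the trade-off is that an interruption loses the unsaved rows, which is why the original rewrites the file on every page.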