Python crawler

The crawler is built mainly on requests. Both the comments and the Weibo post content can be collected as long as you find the corresponding URL, but replies cannot be expanded automatically, so you have to work through them one by one. The comments in the code are already quite detailed; if anything is unclear, leave a message in the comment section. Hope this helps anyone who needs it!
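
Before running the full script, a quick single-request check of the endpoint can confirm that the URL and the JSON layout are what the script expects. The sketch below is my own illustration (not part of the original code); it hits the same buildComments URL used later, omits the Cookie header (without a logged-in cookie Weibo may refuse or truncate the response), and prints the fields the script relies on. The max_id value it prints is what drives the pagination.

import requests

# Minimal sanity check: one request to the comment endpoint used by the full script below.
check_url = ("https://weibo.com/ajax/statuses/buildComments?is_reload=1"
             "&id=4749428051149654&is_show_bulletin=2&is_mix=0&count=10&uid=1655444627")
resp = requests.get(check_url, headers={"User-Agent": "Mozilla/5.0"})
page = resp.json()
print(page.keys())                      # expect keys such as 'data' and 'max_id'
print(page["max_id"])                   # this value goes into the next page's URL
for item in page["data"][:3]:           # peek at the first few comment records
    print(item["text_raw"], item["user"]["location"], item["user"]["gender"])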

import json
import requests
from lxml import etree
import openpyxl
import random
UserAgent=[
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E) QQBrowser/6.9.11079.201',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'
]
all_data=[]
headers = {"User-Agent": random.choice(UserAgent),
           "Cookie":'SINAGLOBAL=9928012592056.682.1625326517841; UOR=,,www.baidu.com; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh69CYQBUvvY0rSOCF4uhYT5JpX5KMhUgL.FoMceo20SKqfShM2dJLoIp7LxKML1KBLBKnLxKqL1hnLBoMES0zRe0nNShqN; WBPSESS=W7_rJKAFWvGJejM4CNtZ-aS9P5e991U5EV4nmaDEVejXT4E96MltkYNZyuBkE79WAfEPzEseo5UW9RcAm3-iRSbrOUx-JYTMdGoPZSQZWm4ywHYHoq3kIKbdrEUw-pZS; ULV=1626573687933:6:6:2:1332489454319.9338.1626573687926:1626572505494; ALF=1658113202; SSOLoginState=1626577202; SCF=AndSA-4E21jxInJ20GUebgJkGwu8TiCzbkBfYxyGLmLevU_zn7XXxaUg7uUJuFGYqqmZPTMUcIgJmCUfygDcwOM.; SUB=_2A25N9-liDeRhGeFI6VMS9SjJzzuIHXVuhV2qrDV8PUNbmtB-LVHekW9NfVO6im1kzbWZjDVNyC2wTdRsby4OvW7o; XSRF-TOKEN=Vf0qhRls1U_BPxvwss4zwDL9'

           }#随机获取请求头 添加cookie
# First-page URL of the comment API; later pages only differ in the max_id parameter
url = "https://weibo.com/ajax/statuses/buildComments?is_reload=1&id=4749428051149654&is_show_bulletin=2&is_mix=0&count=10&uid=1655444627"

# Create the workbook and write the header row once, before the loop
file = openpyxl.Workbook()
sheet = file.active
sheet.title = "疫苗接种微博"
sheet['A1'] = "内容"        # content
sheet['B1'] = "省份"        # province
sheet['C1'] = "性别"        # gender
sheet['D1'] = "评论时间"    # comment time

for i in range(0, 120):
    print("开始爬取第" + str(i) + "页")                 # "crawling page i"
    response = requests.get(url=url, headers=headers)
    data_dict = response.json()                          # the response body is JSON, so parse it into a dict
    max_id = data_dict["max_id"]                         # max_id of this page is needed to build the next page's URL

    # Parse this page's comment records (each one is a dict)
    for data in data_dict["data"]:
        content = data["text_raw"]
        location = data["user"]["location"]
        gender = data["user"]["gender"]
        time = data["created_at"]
        # other profile fields can be pulled out here in the same way
        all_data.append([content, location, gender, time])
        sheet.append([content, location, gender, time])  # write the row to the worksheet

    file.save("#“针心针意”!疫苗监督员来喊你打加强针了!#.xlsx")   # save after every page so progress is not lost

    if max_id == 0:                                      # the API reports max_id=0 when there are no further pages
        break
    # The next page's URL carries the max_id returned by the current page (page n uses page n-1's max_id)
    url = "https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4749428051149654&is_show_bulletin=2&is_mix=0&max_id=" + str(max_id) + "&count=20&uid=1655444627"
print('导入完成')                                         # "export finished"
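
The script above fires its requests back to back with one fixed header. If Weibo starts rejecting requests, a small wrapper like the hypothetical fetch_page below (my own sketch, not part of the original; the name, retry count and sleep intervals are assumptions) rotates the User-Agent on every call, pauses a random interval between requests, and retries a couple of times.

import time as time_module        # avoid clashing with the `time` variable used in the loop above
import random
import requests

def fetch_page(url, cookie, agents, retries=3):
    """Hypothetical helper: GET `url` with a rotated User-Agent plus a simple retry and back-off."""
    for attempt in range(retries):
        headers = {"User-Agent": random.choice(agents), "Cookie": cookie}
        resp = requests.get(url, headers=headers)
        if resp.status_code == 200:
            time_module.sleep(random.uniform(1, 3))   # pause 1-3 s so requests are not fired back to back
            return resp.json()
        time_module.sleep(5 * (attempt + 1))          # wait a little longer after each failed attempt
    raise RuntimeError("failed to fetch " + url)

In the main loop, data_dict = fetch_page(url, headers["Cookie"], UserAgent) could then replace the bare requests.get(...) call.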

