爬取微博全部一级评论（简单有效）

最新推荐文章于 2024-08-21 11:23:11 发布

稳在前

最新推荐文章于 2024-08-21 11:23:11 发布

阅读量3.2k

点赞数 2

分类专栏：不限　文章标签：xpath、爬虫、微博、反爬

本文链接：https://blog.csdn.net/qq_44767889/article/details/103301518

版权

不限专栏收录该内容

24 篇文章 0 订阅

订阅专栏

要爬取的内容如下图所示：
（在这里插入图片描述）

import  requests,json
from lxml import etree
import xlwt
# Shared spreadsheet state: get_furl() writes one row per comment into `sheet`
# and saves `wookbook` when the crawl finishes.
wookbook = xlwt.Workbook(encoding='utf-8')
sheet = wookbook.add_sheet('sheet', cell_overwrite_ok=True)

# Header row (row 0): nickname, time, comment text.
for _col, _title in enumerate(('昵称', '时间', '评论内容')):
    sheet.write(0, _col, _title)

# HTTP request headers passed as `headers=headers` to every requests.get()
# call in get_furl().
headers={
        # Browser identification string (Firefox 70 on 64-bit Windows 10).
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
        # Hard-coded cookie string captured from a browser session.
        # NOTE(review): presumably this expires and has to be replaced with a
        # fresh value before the script works again -- confirm. Avoid
        # committing a cookie that belongs to a logged-in account.
        'Cookie': 'UOR=cq.people.com.cn,widget.weibo.com,www.baidu.com; SINAGLOBAL=8820348664023.912.1574510643668; ULV=1574590616135:2:2:1:2609036048862.229.1574590616131:1574510643676; SUB=_2AkMqha9xf8NxqwJRmfwRzWvgbYlxygvEieKc2V6qJRMxHRl-yT9jqmI_tRB6AQWBniAs_6LTl13APpmM6HFRJNr8korC; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WhdwXUpZQU.RZS.ObOYcheW; login_sid_t=f7428b2bac48ff3272a6ca1a7f371494; cross_origin_proto=SSL; Ugrow-G0=589da022062e21d675f389ce54f2eae7; TC-V5-G0=799b73639653e51a6d82fb007f218b2f; WBStorage=42212210b087ca50|undefined; _s_tentry=www.baidu.com; Apache=2609036048862.229.1574590616131; wb_view_log=1366*7681; YF-Page-G0=aac25801fada32565f5c5e59c7bd227b|1574591169|1574591031; YF-V5-G0=2583080cfb7221db1341f7a137b6762e; TC-Page-G0=b32a5183aa64e96302acd8febeb88ce4|1574590842|1574590826'
}

def get_furl(weibo_id='4430885656881201', out_path='微博.xls', timeout=10):
    """Scrape the first-level comments of one Weibo post into the workbook.

    Starting at page 1 of the ``aj/v6/comment/big`` endpoint, the ``data.html``
    fragment of each JSON response is parsed with lxml. For every
    ``div.list_con`` element one row (nickname, time, comment text) is written
    to the module-level ``sheet``. The next page is taken from the
    ``action-data`` attribute of the ``comment_loading`` node, falling back to
    the last ``<a action-data=...>`` on the page. The crawl stops when no such
    attribute exists or when it points at a page that was already requested.

    The module-level ``wookbook`` is always saved before this function
    returns or raises, so rows collected before a failure are kept.

    Args:
        weibo_id: Value of the ``id`` query parameter for the first request.
        out_path: File the workbook is saved to. xlwt only produces the legacy
            ``.xls`` format, so the name should carry that extension.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        The lxml tree of the last page that was parsed, or ``None`` if no page
        could be parsed.

    Raises:
        requests.RequestException: A request failed or timed out.
        ValueError: A response body was not valid JSON.
        KeyError: A JSON response had no ``data`` / ``html`` entry.
    """
    endpoint = 'https://weibo.com/aj/v6/comment/big?ajwvr=6&'
    url1 = endpoint + 'id=' + str(weibo_id) + '&from=singleWeiBo&page=1'
    cnt = 1            # next spreadsheet row; row 0 holds the header
    html = None
    visited = set()    # pagination tokens already followed

    try:
        while True:
            txt = requests.get(url1, headers=headers, timeout=timeout).text
            page = etree.HTML(json.loads(txt)['data']['html'])
            if page is None:
                # Without a parsed tree there is nothing to extract or paginate.
                break
            html = page

            # Every comment of this page lives in its own div.list_con.
            for ul in html.xpath('//div[@class="list_con"]'):
                names = ul.xpath('./div[@class="WB_text"]/a/text()')
                texts = ul.xpath('./div[@class="WB_text"]/text()')
                times = ul.xpath('./div[contains(@class,"WB_func")]/div[contains(@class,"WB_from")]/text()')
                if not names or len(texts) < 2 or not times:
                    # Unexpected markup: skip this node instead of aborting
                    # the whole crawl.
                    print('skipped a comment node with unexpected layout')
                    continue

                user = names[0]
                # The second text node is "：<comment>"; drop everything up to
                # the first full-width colon.
                comment = texts[1].split('：', maxsplit=1)[-1]
                tim = times[0]

                print(user)
                sheet.write(cnt, 0, user)
                print(comment)
                sheet.write(cnt, 2, comment)
                print(tim)
                sheet.write(cnt, 1, tim)
                cnt += 1

            # Locate the query string of the next page.
            loading = html.xpath('//div[@node-type="comment_loading"]/@action-data')
            if loading:
                net_url = loading[0]
            else:
                links = html.xpath('//a/@action-data')
                if not links:
                    break
                net_url = links[-1]
                print(net_url)

            if net_url in visited:
                # The fallback link can keep pointing at the same page;
                # following it again would only duplicate rows forever.
                break
            visited.add(net_url)

            url1 = endpoint + net_url + '&from=singleWeiBo&__rnd=1574942088520'
            print(url1)

        print(cnt)
    finally:
        # Persist whatever was collected, on success and on failure alike.
        wookbook.save(out_path)

    return html
if __name__=='__main__':
    # Script entry point: crawl the comments and write the spreadsheet.
    # get_furl() issues its own requests, so no Session object is needed here.
    get_furl()