This post walks through scraping the comments of a specified post on the Weibo web interface.
First, log in to Weibo on the web and find a post you are interested in:
Open its comment page, right-click → Inspect, switch to the Network tab, and press Ctrl+R to reload the page.
From the captured request, copy this page's Cookie:
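As an aside, if you would rather hand requests the cookie via its `cookies=` argument instead of a raw `Cookie` header, the header value can be split into a dict like this (a minimal sketch; the cookie string shown is a shortened placeholder, not a working session):

```python
def cookie_to_dict(cookie_str):
    """Turn a raw 'k1=v1; k2=v2' Cookie header into a dict for requests."""
    return dict(
        pair.split('=', 1) for pair in cookie_str.split('; ') if '=' in pair
    )

# Shortened placeholder values, not a real Weibo cookie
cookies = cookie_to_dict('SINAGLOBAL=abc123; SSOLoginState=1592668599; SUB=_2Axyz')
print(cookies['SSOLoginState'])  # → 1592668599
```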
Implementation:
The scraper collects each comment's nickname, timestamp, and text.
import time
import json
import requests
from lxml import etree
import xlwt

# xlwt writes the legacy .xls format, so the file is saved with an .xls extension
workbook = xlwt.Workbook(encoding='utf-8')
sheet = workbook.add_sheet('sheet', cell_overwrite_ok=True)
sheet.write(0, 0, 'nick')
sheet.write(0, 1, 'time')
sheet.write(0, 2, 'content')

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0',
    'Cookie': 'SINAGLOBAL=5322597451823.386.1554213722659; Ugrow-G0=589da022062e21d675f389ce54f2eae7; login_sid_t=535c06faa28c0a73bbf2a70054bed5ac; cross_origin_proto=SSL; YF-V5-G0=bae6287b9457a76192e7de61c8d66c9d; WBStorage=42212210b087ca50|undefined; _s_tentry=passport.weibo.com; Apache=3011672908696.3213.1592668545629; ULV=1592668545635:44:6:1:3011672908696.3213.1592668545629:1591590712267; crossidccode=CODE-yf-1JMFR8-29rJK3-ng3qQtt3hYUdGQeb030fb; ALF=1624204599; SSOLoginState=1592668599; SCF=ApjScoaMbsXtNFObav_TZqQn86gd4_VisrebpOwKJO9-7nKNzPWApotfh41gp7QvIRfB-WzENTDQdqTziGo26tk.; SUB=_2A25z6kHoDeRhGeNJ61MZ8ijPwjmIHXVQnjQgrDV8PUNbmtANLRPtkW9NSBjGUQ-3h0MfrgBtUEtVUHAeybQTIcZ9; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhQl.su1CrnzMojsR4pBc225JpX5KzhUgL.Fo-Neh2Reoq01K-2dJLoIEnLxK-LBo5L12qLxKML1hqL122LxKqL1KnL1-qLxK-LB.2LBKU9C-_l; SUHB=0Jvg9O4IYZXCjE; wvr=6; UOR=www.psjia.com,widget.weibo.com,graph.qq.com; webim_unReadCount=%7B%22time%22%3A1592668759286%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A43%2C%22msgbox%22%3A0%7D; YF-Page-G0=580fe01acc9791e17cca20c5fa377d00|1592668778|1592668627'
}

def get_furl():
    url1 = 'https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4517608383498080&from=singleWeiBo&page=1'
    txt = requests.get(url1, headers=headers).text
    cnt = 1
    while True:
        time.sleep(2)  # pause between requests to avoid hammering the server
        html = json.loads(txt)['data']['html']
        html = etree.HTML(html)
        # every comment on this page lives in a div.list_con block
        uls = html.xpath('//div[@class="list_con"]')
        for ul in uls:
            user = ul.xpath('./div[@class="WB_text"]/a/text()')[0]
            comment = ul.xpath('./div[@class="WB_text"]/text()')[1]
            # drop the leading "username：" prefix (fullwidth colon)
            comment = comment.split('：', maxsplit=1)[-1]
            tim = ul.xpath('./div[contains(@class,"WB_func")]/div[contains(@class,"WB_from")]/text()')[0]
            print(user)
            sheet.write(cnt, 0, user)
            print(comment)
            sheet.write(cnt, 2, comment)
            print(tim)
            sheet.write(cnt, 1, tim)
            cnt += 1
        # the "load more" element carries the query string for the next page
        try:
            net_url = html.xpath('//div[@node-type="comment_loading"]/@action-data')[0]
        except IndexError:
            try:
                net_url = html.xpath('//a/@action-data')[-1]
            except IndexError:
                # no further pages: save the workbook and stop
                print(cnt)
                workbook.save('liziqi.xls')
                return html
        print(net_url)
        url1 = 'https://weibo.com/aj/v6/comment/big?ajwvr=6&' + net_url + '&from=singleWeiBo&__rnd=1592668779880'
        txt = requests.get(url1, headers=headers).text
        print(url1)

if __name__ == '__main__':
    data = get_furl()
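To see concretely what the XPath expressions in the scraper match, here is a minimal sketch run against a hand-written fragment shaped like the `list_con` blocks the comment endpoint returns (the markup and values are illustrative, not a captured response):

```python
from lxml import etree

# Illustrative fragment only; the real Weibo markup carries more attributes
sample = '''
<div class="list_con">
  <div class="WB_text">
    <a href="//weibo.com/u/123">SomeUser</a>：Nice video!
  </div>
  <div class="WB_func clearfix">
    <div class="WB_from S_txt2">June 20, 12:00</div>
  </div>
</div>
'''
html = etree.HTML(sample)
for con in html.xpath('//div[@class="list_con"]'):
    user = con.xpath('./div[@class="WB_text"]/a/text()')[0]
    # text() index 1 is the text node after the <a>; strip the fullwidth colon prefix
    comment = con.xpath('./div[@class="WB_text"]/text()')[1]
    comment = comment.split('：', maxsplit=1)[-1].strip()
    tim = con.xpath('./div[contains(@class,"WB_func")]/div[contains(@class,"WB_from")]/text()')[0]
    print(user, '|', tim.strip(), '|', comment)  # → SomeUser | June 20, 12:00 | Nice video!
```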
Screenshot of the results:
To scrape a different post, replace the Cookie in `headers`, the `id` value in the initial request URL, and the `__rnd` value in the follow-up URL. All of them are obtained the same way as described above.
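For reference, the follow-up URL is simply the base endpoint with the page's `action-data` query fragment spliced into the middle. The sketch below shows how the pieces combine (the `action-data` value is a made-up example, not one captured from Weibo):

```python
from urllib.parse import parse_qs

base = 'https://weibo.com/aj/v6/comment/big?ajwvr=6&'
# Hypothetical action-data fragment scraped from the "load more" element
net_url = 'id=4517608383498080&root_comment_max_id=123456&page=2'
url = base + net_url + '&from=singleWeiBo&__rnd=1592668779880'

# Inspecting the query string shows which parameters drive pagination
params = parse_qs(url.split('?', 1)[1])
print(params['page'][0], params['id'][0])  # → 2 4517608383498080
```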
A limitation of this code is that it cannot scrape replies to comments; it only captures the top-level comments shown on the page.