Memo 1: Scraping comments on popular Weibo posts

'''python3'''

'''2018/04/03'''

import requests
import os
import re
import csv
import time
import json


#headers adds request header info so the request looks like it comes from a browser; otherwise it will be throttled or blocked
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
Cookies = {"Cookie": "_T_WM=ac858e32012c2c4bfcf46782f5928d99; ALF=1525310818; SCF=AktDkPHfGtZ_G6P28yFN5QufvOsFbI5pFfURfdnppHMy3xq6GeZP_nVIBfgPk63bE8-IWesUjhW3dLT-uIR5Uwo.; SUB=_2A253xqp3DeRhGeVO41YZ8ijOwjyIHXVVSDY_rDV6PUJbktANLUHCkW1NTSfFYX53m_bQ56KWd8ygtVjCf2FsTQ5t; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFEZ5vWGfwYo6aYTg5NIEeO5JpX5K-hUgL.Foe71hBReoqE1K52dJLoIpeLxKqL1-BLBKnLxKqL1KnL128j; SUHB=0cSXDvABOpSMLa; SSOLoginState=1522719271; WEIBOCN_FROM=1110006030; M_WEIBOCN_PARAMS=oid%3D4224051549979655%26luicode%3D10000011%26lfid%3D1076033084826290%26fid%3D1005053084826290%26uicode%3D10000011"}
#if you run into problems you cannot otherwise solve, try refreshing the Cookies
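
# A minimal sketch (not part of the original script): to make refreshing the Cookie easier,
# it can be read from an environment variable instead of editing the hard-coded value above.
# The variable name WEIBO_COOKIE is an illustrative assumption.
_cookie_from_env = os.environ.get('WEIBO_COOKIE')
if _cookie_from_env:
    Cookies = {"Cookie": _cookie_from_env}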

#fetch user info; this also yields key parameters such as uid, fid and oid
def get_user_info(usr_id):
    
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value={usr_id}'.format(usr_id=usr_id)
    resp = requests.get(url, headers=headers, cookies=Cookies)
    jsondata = resp.json()
    #print(jsondata)
    user = jsondata.get('data').get('userInfo')
    nickname = user.get('screen_name')
    mblog_num = user.get('statuses_count')
    verified = user.get('verified')
    verified_reason = user.get('verified_reason')
    gender = user.get('gender')
    urank = user.get('urank')  #user level
    mbrank = user.get('mbrank')
    followers_count = user.get('followers_count')
    follow_count = user.get('follow_count')
    uid = user.get('toolbar_menus')[0].get('params').get('uid')
    try:
        fid = user.get('toolbar_menus')[1].get('actionlog').get('fid')
        #note: get('menu_list')[1] is the second item of the list, not the first
        oid = user.get('toolbar_menus')[2].get('params').get('menu_list')[1].get('actionlog').get('oid')
        cardid = user.get('toolbar_menus')[1].get('actionlog').get('cardid')
    except:
        fid = ''
        oid = ''
        cardid = ''
    containerid = jsondata.get('data').get('tabsInfo').get('tabs')[0].get('containerid')
    Info = {'nickname':nickname,'mblog_num':mblog_num,
            'verified':verified,'verified_reason':verified_reason,
            'gender':gender,'urank':urank,'mbrank':mbrank,'followers_count':followers_count,
            'follow_count':follow_count,'uid':uid,'fid':fid,
            'cardid':cardid,'containerid':containerid,'oid':oid
            }
    print(Info)
    return Info
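
# A minimal sketch (not part of the original script): when the field lookups above start
# failing (see Note 1 at the bottom), dumping the raw response makes it easy to spot renamed
# keys. The output file name 'user_<usr_id>.json' is an illustrative assumption.
def dump_user_json(usr_id):
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value={usr_id}'.format(usr_id=usr_id)
    resp = requests.get(url, headers=headers, cookies=Cookies)
    with open('user_%s.json' % usr_id, 'w', encoding='utf-8') as f:
        json.dump(resp.json(), f, ensure_ascii=False, indent=2)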

#fetch all popular Weibo posts (post text, each post's id for comments, repost count, comment count, ...)
def mblog_list(uid,oid):
    Mblog_list = []
    base_url = 'https://m.weibo.cn/api/container/getIndex?containerid={oid}'
    #base_url='https://m.weibo.cn/api/container/getIndex?containerid={oid}&luicode=10000011&lfid=1005051282005885&featurecode=20000320'
    #base_url= 'https://m.weibo.cn/api/container/getIndex?containerid={uid}'
    page_url = 'https://m.weibo.cn/api/container/getIndex?containerid={oid}&type=uid&value={uid}&page={page}'
    #page_url ='https://m.weibo.cn/api/container/getIndex?containerid={uid}&page={page}'
    url = base_url.format(oid=oid)
    print(url)
    resp = requests.get(url, headers=headers, cookies=Cookies)
    response = resp.json()
    #print(response)
    #total number of popular posts
    total = response['data']['cardlistInfo']['total']
    print(total)
    #number of result pages (10 posts per page)
    page_num = int(int(total)/10)+1
    for i in range(1, page_num+1):
        p_url = page_url.format(oid=oid, uid=uid, page=i)
        #print(p_url)
        page_resp = requests.get(p_url,headers=headers,cookies=Cookies)
        page_data = page_resp.json()
        '''filename='22.json'
        with open(filename,'w') as f:
            json.dump(page_data,f)'''
        try:
            cards = page_data['data']['cards']
            #print(cards)
            for card in cards:
                #print(card)
                try:                    
                    mblog = card['mblog']
                    created_at = mblog['created_at']
                    id = mblog['id']
                    dirty_text = mblog['text']  #dirty_text contains a lot of HTML link markup noise
                    cleaned1 = re.sub(r'<span .*?</span>', '', dirty_text)
                    text = re.sub(r"<a .*?</a>", '', cleaned1)
                    reposts_count = mblog['reposts_count']
                    comments_count = mblog['comments_count']
                    attitudes_count = mblog['attitudes_count']
                    mblog_data = {'created_at': created_at, 'id': id, 'text': text, 'reposts_count': reposts_count,
                                  'comments_count': comments_count, 'attitudes_count': attitudes_count}
                    Mblog_list.append(mblog_data)
                    #print(' '*10,mblog_data)
                except:                 
                    continue
                
                
        except:
            continue
        time.sleep(1)
    return Mblog_list
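
# A minimal sketch (not part of the original script): persist the post list that mblog_list()
# returns, since main() below otherwise discards it. The helper name save_mblog_list and the
# file name 'mblogs_<uid>.csv' are illustrative assumptions; only fields the function above
# already collects are written.
def save_mblog_list(uid, mblogs):
    path = os.getcwd() + '/mblogs_%s.csv' % uid
    with open(path, 'w', encoding='gb18030', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['created_at', 'id', 'text', 'reposts_count',
                                               'comments_count', 'attitudes_count'])
        writer.writeheader()
        writer.writerows(mblogs)
    return path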


#fetch the comments on one Weibo post and save them to wb_id.csv inside a folder named after usr_id
def get_comments(usr_id, wb_id):
    url = 'https://m.weibo.cn/api/comments/show?id={id}'.format(id=wb_id)
    page_url = 'https://m.weibo.cn/api/comments/show?id={id}&page={page}'
    Resp = requests.get(url, headers=headers, cookies=Cookies)
    #print(url)
    page_max_num = Resp.json()['data']['max']
    path = os.getcwd()+'/{dirname}/'.format(dirname=usr_id)
    os.makedirs(path, exist_ok=True)  #avoid crashing when the folder already exists
    path2 = os.getcwd() + '/%s/%s.csv'%(usr_id,wb_id)
    csvfile = open(path2, 'a+', encoding='gb18030', newline='')
    writer = csv.writer(csvfile)
    writer.writerow(('username','verified','verified_type','profile_url','source','review_id','like_counts','image','date','comment'))
    for i in range(1,page_max_num+1):
        if i == 100:  #the comments endpoint only serves about 100 pages (see Note 2)
            break
            
        p_url = page_url.format(id=wb_id,page=i)
        resp = requests.get(p_url, cookies=Cookies, headers=headers)
        print(resp.status_code)
        resp_data = resp.json()
        try:
            data = resp_data.get('data').get('data')
            #print(data)
            for d in data:
                review_id = d['id']
                like_counts = d['like_counts']
                source = d['source']
                username = d['user']['screen_name']
                image = d['user']['profile_image_url']
                verified = d['user']['verified']
                verified_type = d['user']['verified_type']
                profile_url = d['user']['profile_url']
                dirty_text = d['text']
                cleaned1 = re.sub(r'<span .*?</span>', '', dirty_text)
                comment = re.sub(r"<a .*?</a>", '', cleaned1)
                date = d['created_at']
                print(comment)
                writer.writerow((username, verified, verified_type, profile_url, source, review_id, like_counts, image,
                                 date, comment))
                print('%d pages in total, %d pages scraped   %s' % (page_max_num, i, comment))
        except:
            print(resp_data['msg'])
            continue
        time.sleep(1)
    csvfile.close()
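
# A minimal sketch (not part of the original script): chain mblog_list() and get_comments()
# to fetch the comments of every hot post instead of the single hard-coded wb_id used in
# main() below. The helper name and the 2-second pause are illustrative assumptions.
def get_all_comments(usr_id, uid, oid):
    for mblog in mblog_list(uid, oid):
        get_comments(usr_id, mblog['id'])
        time.sleep(2)  #pause between posts to reduce the risk of being rate-limited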
        
        
def main():
    #user_id= '1655128924'
    #user_id='2736225585'
    #user_id = '2386831995'
    user_id= '1721030997'
    wb_id='4223783835437135'
    user_info = get_user_info(user_id)
    uid = user_info.get('uid')
    oid = user_info.get('oid')
    print(uid,oid)
    mblog_list(uid,oid) 
    print('............')
    get_comments(user_id,wb_id)


if __name__ == '__main__':
    main()

#Note 1: the JSON Weibo returns keeps changing. If a field comes back empty, print the JSON and check whether the corresponding dictionary keys have changed.
#Note 2: also, for now only the first 100 pages of comments per post can be scraped.
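
# A minimal sketch (not part of the original script), following Note 1: a helper that walks
# a nested dict/list safely, so a renamed or missing key returns a default instead of raising.
# The name safe_get is an illustrative choice; e.g.
# safe_get(jsondata, 'data', 'userInfo', 'screen_name') could replace the chained .get()
# calls in get_user_info().
def safe_get(obj, *keys, default=None):
    for key in keys:
        try:
            obj = obj[key]
        except (KeyError, IndexError, TypeError):
            return default
    return obj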
 
