# 备忘5:爬取微博热门信息以及所有热门微博评论的用户信息
# (Memo 5: crawl Weibo trending posts and the user info of every hot-post commenter)


import requests
import os
import re
import csv
import time
import json




# Request headers: pretend to be a desktop browser, otherwise the API throttles us.
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
Cookies = {"Cookie": "_T_WM=ac858e32012c2c4bfcf46782f5928d99; WEIBOCN_FROM=1110006030; ALF=1525581075; SCF=AktDkPHfGtZ_G6P28yFN5QufvOsFbI5pFfURfdnppHMybWDbsdZlH3-nlg4Hh6tSqjgFhEvVQ1lq059Wazz218Y.; SUB=_2A253wogLDeRhGeVO41YZ8ijOwjyIHXVVTChDrDV6PUJbktAKLRbfkW1NTSfFYWsnMJ4LU8NJxBU3Eb81A5_vuVyZ; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WFEZ5vWGfwYo6aYTg5NIEeO5JpX5K-hUgL.Foe71hBReoqE1K52dJLoIpeLxKqL1-BLBKnLxKqL1KnL128j; SUHB=0jB402nFCGohUc; SSOLoginState=1522989147; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D1076033084826290%26fid%3D1005053084826290%26uicode%3D10000011"}
# When requests start failing for no obvious reason, try refreshing these Cookies
# (they are session-bound and expire).


#用户信息,同时也能获取到uid、fid、oid等关键参数
def get_user_info(usr_id):
    """Fetch a Weibo user's profile from the m.weibo.cn container API.

    Args:
        usr_id: numeric Weibo user id (string or int).

    Returns:
        dict with profile fields (nickname, post/follower counts, verification
        info, gender, ranks) plus the uid/fid/oid/cardid/containerid request
        parameters needed by the other crawler functions.  fid/oid/cardid fall
        back to '' when the toolbar_menus structure does not contain them.
    """
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value={usr_id}'.format(usr_id=usr_id)
    resp = requests.get(url, headers=headers, cookies=Cookies)
    jsondata = resp.json()
    # Hoist the repeated jsondata.get('data').get('userInfo') chain.
    user_info = jsondata.get('data').get('userInfo')
    nickname = user_info.get('screen_name')
    mblog_num = user_info.get('statuses_count')
    verified = user_info.get('verified')
    verified_reason = user_info.get('verified_reason')
    gender = user_info.get('gender')
    urank = user_info.get('urank')  # user level
    mbrank = user_info.get('mbrank')
    followers_count = user_info.get('followers_count')
    follow_count = user_info.get('follow_count')
    toolbar_menus = user_info.get('toolbar_menus')
    uid = toolbar_menus[0].get('params').get('uid')
    try:
        fid = toolbar_menus[1].get('actionlog').get('fid')
        # If the extracted oid is not followed by "Hotblog", try swapping
        # menu_list index 0 <-> 1 (the API sometimes reorders the menus).
        oid = toolbar_menus[2].get('params').get('menu_list')[0].get('actionlog').get('oid')
        cardid = toolbar_menus[1].get('actionlog').get('cardid')
    except (AttributeError, IndexError, KeyError, TypeError):
        # Bug fix: the original also reset uid to '' here, discarding the uid
        # that was already extracted successfully above the try block.
        fid = ''
        oid = ''
        cardid = ''
    containerid = jsondata.get('data').get('tabsInfo').get('tabs')[0].get('containerid')
    Info = {'nickname':nickname,'mblog_num':mblog_num,
            'verified':verified,'verified_reason':verified_reason,
            'gender':gender,'urank':urank,'mbrank':mbrank,'followers_count':followers_count,
            'follow_count':follow_count,'uid':uid,'fid':fid,
            'cardid':cardid,'containerid':containerid,'oid':oid
            }
    print(Info)
    return Info


#获取所有热门微博信息(所发微博内容,创建时间,每条热门微博id,每条微博的评论数,转发数,评论数...)
def mblog_list(uid,oid):
    """Crawl all "hot" posts of one blogger and append them to a CSV file.

    Saves id / repost / comment / attitude counts, date and cleaned text for
    every hot post into 博主微博热门信息汇总/<uid>.csv.

    Args:
        uid: user id placed in the paged request url.
        oid: hot-blog container id returned by get_user_info.

    Returns:
        list of post id strings, later used to fetch each post's comments.
    """
    ids=[]
    base_url = 'https://m.weibo.cn/api/container/getIndex?containerid={oid}'
    page_url = 'https://m.weibo.cn/api/container/getIndex?containerid={oid}&type=uid&value={uid}&page={page}'
    url = base_url.format(oid=oid)
    print(url)
    resp = requests.get(url, headers=headers, cookies=Cookies)
    # Bug fix: dropped `resp.encoding = 'gbk'` — the API serves UTF-8 JSON and
    # forcing gbk garbles Chinese text before .json() decodes it.
    response = resp.json()
    # Total number of hot posts; the API pages them 10 at a time.
    total = response['data']['cardlistInfo']['total']
    print(total)

    # Bug fix: os.mkdir raised FileExistsError on every run after the first;
    # makedirs(..., exist_ok=True) is idempotent.
    path = os.getcwd()+'/{dirname}/'.format(dirname='博主微博热门信息汇总')
    os.makedirs(path, exist_ok=True)
    path2 = os.getcwd() + '/%s/%s.csv'%('博主微博热门信息汇总',uid)
    page_num = int(int(total)/10)+1

    # Bug fix: the csv file was never closed; the context manager guarantees it.
    with open(path2, 'a+', encoding='gb18030', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('id','reposts_count','comments_count','attitudes_count','date','text'))

        for i in range(1, page_num+1):
            p_url = page_url.format(oid=oid, uid=uid, page=i)
            page_resp = requests.get(p_url,headers=headers,cookies=Cookies)
            # Bug fix: the original set `resp.encoding = 'gbk'` here — on the
            # WRONG response object (resp, not page_resp) and harmful anyway.
            page_data = page_resp.json()
            try:
                cards = page_data['data']['cards']
            except (KeyError, TypeError):
                # Page with no card list (rate-limited or empty) — skip it.
                continue
            for card in cards:
                try:
                    mblog = card['mblog']
                    date = mblog['created_at']
                    mblog_id = mblog['id']  # renamed: `id` shadowed the builtin
                    ids.append(mblog_id)
                    dirty_text = mblog['text']  # raw text still contains html link markup
                    cleaned1 = re.sub(r'<span .*?</span>', '', dirty_text)
                    text = re.sub(r"<a .*?</a>", '', cleaned1)
                    reposts_count = mblog['reposts_count']
                    comments_count = mblog['comments_count']
                    attitudes_count = mblog['attitudes_count']
                    writer.writerow((mblog_id,reposts_count,comments_count,attitudes_count,date,text))
                    print('有%d页,已经爬了%d页   %s'%(page_num, i, mblog_id))
                except (KeyError, TypeError):
                    # Cards without an 'mblog' payload (ads, headers) — skip.
                    continue
            time.sleep(1)  # be polite to the API
    return ids


#获取某微博评论,保存到usr_id下的文件夹wb_id.csv文件中
def get_comments(usr_id, wb_id):
    """Crawl the comments of one post and save them to <usr_id>/<wb_id>.csv.

    For every comment, records the commenter's screen name, verification info,
    profile url, avatar, the comment source/date/like count and cleaned text.
    Prints the API 'msg' field and returns early when the post has no
    crawlable comment data.

    Args:
        usr_id: blogger's user id — used as the output directory name
                (the directory must already exist, see main()).
        wb_id:  the post (mblog) id whose comments are fetched.
    """
    url = 'https://m.weibo.cn/api/comments/show?id={id}'.format(id=wb_id)
    page_url = 'https://m.weibo.cn/api/comments/show?id={id}&page={page}'
    Resp = requests.get(url, headers=headers, cookies=Cookies)
    # Bug fix: dropped `Resp.encoding = 'gbk'` — the API serves UTF-8 JSON and
    # forcing gbk garbled the Chinese comment text.
    Resp_data = Resp.json()
    try:
        page_max_num = Resp_data['data']['max']
    except (KeyError, TypeError):
        # No comment data for this post — the API explains why in 'msg'.
        print(Resp_data.get('msg'))
        return

    path2 = os.getcwd() + '/%s/%s.csv'%(usr_id,wb_id)
    # Bug fix: the csv file was leaked when an exception fired mid-loop;
    # the context manager closes it on every exit path.
    with open(path2, 'a+', encoding='gb18030', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(('username','verified','verified_type','profile_url','source','review_id','like_counts','image','date','comment'))
        # NOTE(review): the original starts at page_max_num//2, skipping the
        # first half of the pages — kept as-is; confirm whether intentional.
        for i in range(page_max_num//2, page_max_num+1):
            if i==101:
                # The mobile API refuses pages beyond 100.
                break
            p_url = page_url.format(id=wb_id,page=i)
            resp = requests.get(p_url, cookies=Cookies, headers=headers)
            print(resp.status_code)
            resp_data = resp.json()
            try:
                data = resp_data.get('data').get('data')
                for d in data:
                    review_id = d['id']
                    like_counts = d['like_counts']
                    source = d['source']
                    username = d['user']['screen_name']
                    image = d['user']['profile_image_url']
                    verified = d['user']['verified']
                    verified_type = d['user']['verified_type']
                    profile_url = d['user']['profile_url']
                    dirty_text = d['text']  # raw text still contains html link markup
                    cleaned1 = re.sub(r'<span .*?</span>', '', dirty_text)
                    comment = re.sub(r"<a .*?</a>", '', cleaned1)
                    date = d['created_at']
                    print(comment)
                    writer.writerow((username, verified, verified_type, profile_url, source, review_id, like_counts, image,
                                     date, comment))
                    print('有%d页,已经爬了%d页   %s'%(page_max_num, i, comment))
            except (AttributeError, KeyError, TypeError):
                # Page without usable data — report the API message and move on.
                print(resp_data.get('msg'))
                continue
            time.sleep(1)  # be polite to the API
        
        
def main():
    """Entry point: crawl one blogger's hot posts, then each post's commenters.

    Workflow: resolve the blogger's uid/oid, dump all hot posts to CSV via
    mblog_list, then fetch the commenter info of every hot post into a
    per-user directory.
    """
    user_id= '3118861807'
    user_info = get_user_info(user_id)
    uid = user_info.get('uid')
    oid = user_info.get('oid')
    print(uid,oid)
    hot_ids = mblog_list(uid,oid)
    print('............')

    # Create the per-user output directory that get_comments writes into.
    # Bug fix: os.mkdir crashed when the directory already existed.
    path = os.getcwd()+'/{dirname}/'.format(dirname=user_id)
    os.makedirs(path, exist_ok=True)
    # Crawl the commenter info of every hot post.
    for i, wb_id in enumerate(hot_ids):
        print('这是第'+str(i)+'条热门微博')
        get_comments(user_id,wb_id)




# Bug fix: guard the entry point so importing this module does not
# immediately start crawling.
if __name__ == '__main__':
    main()







  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值