# Scrape a user's hot (popular) Weibo posts -- updated 2018-03-27

 
import requests
import os
import re
import csv
import time
import json



# Browser-like User-Agent sent with every request made by this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
    ),
}

# Placeholder: replace the value with the cookie string of a logged-in session.
# When requests start failing for no obvious reason, try refreshing this value.
# NOTE(review): this mapping is passed as ``cookies=`` to requests, which
# treats each key as a cookie *name* -- so this sends a cookie literally named
# "Cookie". Confirm that is intended rather than a raw ``Cookie`` header.
Cookies = {
    "Cookie": "你的cookies",
}

# User profile lookup; the same response also carries the key ids
# (uid, fid, oid, ...) needed by the other API calls.
def get_user_info(usr_id, timeout=10):
    """Fetch profile data for one Weibo user from the m.weibo.cn container API.

    Args:
        usr_id: User id substituted into the ``value`` query parameter.
        timeout: Seconds to wait for the HTTP response. Optional; without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        dict with the keys ``nickname``, ``mblog_num``, ``verified``,
        ``verified_reason``, ``gender``, ``urank``, ``mbrank``,
        ``followers_count``, ``follow_count``, ``uid``, ``fid``, ``cardid``,
        ``containerid`` and ``oid``. ``fid``, ``oid`` and ``cardid`` are ``''``
        when the toolbar menus do not have the expected layout. The dict is
        also printed before being returned.

    Raises:
        AttributeError, TypeError, IndexError: when the response lacks the
            ``data`` / ``userInfo`` / ``toolbar_menus`` / ``tabsInfo``
            structure this function reads unconditionally.
    """
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value={usr_id}'.format(usr_id=usr_id)
    resp = requests.get(url, headers=headers, cookies=Cookies, timeout=timeout)
    data = resp.json().get('data')
    user = data.get('userInfo')
    menus = user.get('toolbar_menus')

    uid = menus[0].get('params').get('uid')
    try:
        action_log = menus[1].get('actionlog')
        fid = action_log.get('fid')
        oid = menus[2].get('params').get('menu_list')[0].get('actionlog').get('oid')
        cardid = action_log.get('cardid')
    except (AttributeError, IndexError, KeyError, TypeError):
        # Only the ids read inside this block are blanked; the uid obtained
        # above is valid and is kept rather than being discarded with them.
        fid = ''
        oid = ''
        cardid = ''

    containerid = data.get('tabsInfo').get('tabs')[0].get('containerid')

    info = {
        'nickname': user.get('screen_name'),
        'mblog_num': user.get('statuses_count'),
        'verified': user.get('verified'),
        'verified_reason': user.get('verified_reason'),
        'gender': user.get('gender'),
        'urank': user.get('urank'),  # user level
        'mbrank': user.get('mbrank'),
        'followers_count': user.get('followers_count'),
        'follow_count': user.get('follow_count'),
        'uid': uid,
        'fid': fid,
        'cardid': cardid,
        'containerid': containerid,
        'oid': oid,
    }
    print(info)
    return info

# Patterns for the inline markup embedded in a post's raw text.
_SPAN_TAG_RE = re.compile(r'<span .*?</span>')
_ANCHOR_TAG_RE = re.compile(r"<a .*?</a>")


def _extract_cards(page_data):
    """Return the list at page_data['data']['cards'], or [] if absent/malformed."""
    try:
        cards = page_data['data']['cards']
    except (KeyError, TypeError):
        return []
    return cards if isinstance(cards, list) else []


def _parse_mblog_card(card):
    """Flatten one card into the per-post record; raises KeyError/TypeError
    when the card carries no post or lacks one of the expected fields."""
    mblog = card['mblog']
    # The raw text contains link/markup clutter: drop <span> blocks, then <a> blocks.
    text = _ANCHOR_TAG_RE.sub('', _SPAN_TAG_RE.sub('', mblog['text']))
    return {
        'created_at': mblog['created_at'],
        'id': mblog['id'],
        'text': text,
        'reposts_count': mblog['reposts_count'],
        'comments_count': mblog['comments_count'],
        'attitudes_count': mblog['attitudes_count'],
    }


# Collect every hot post of a user (post text, post id, repost count,
# comment count, ...).
def mblog_list(uid, oid, timeout=10):
    """Download all pages of a user's hot posts and return them as records.

    Args:
        uid: User id, sent as the ``value`` query parameter of each page.
        oid: Container id of the hot-post list, sent as ``containerid``.
        timeout: Seconds to wait for each HTTP response. Optional; without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        list of dicts with the keys ``created_at``, ``id``, ``text`` (markup
        stripped), ``reposts_count``, ``comments_count`` and
        ``attitudes_count``. Cards without a post, or with missing fields,
        are skipped. Each collected record is also printed.

    Raises:
        KeyError, TypeError: when the first response has no
            ``data.cardlistInfo.total`` entry.
    """
    posts = []
    base_url = 'https://m.weibo.cn/api/container/getIndex?containerid={oid}'
    page_url = 'https://m.weibo.cn/api/container/getIndex?containerid={oid}&type=uid&value={uid}&page={page}'

    # No encoding override on the responses: forcing a legacy codec such as
    # 'gbk' onto a JSON body can garble non-ASCII text or break parsing.
    resp = requests.get(base_url.format(oid=oid), headers=headers,
                        cookies=Cookies, timeout=timeout)
    response = resp.json()

    # Total number of hot posts reported by the API.
    total = response['data']['cardlistInfo']['total']
    print(total)

    # Number of result pages, assuming ten posts per page. One page more than
    # strictly needed is requested when total is a multiple of ten.
    page_num = int(total) // 10 + 1

    for page in range(1, page_num + 1):
        p_url = page_url.format(oid=oid, uid=uid, page=page)
        page_resp = requests.get(p_url, headers=headers, cookies=Cookies,
                                 timeout=timeout)
        page_data = page_resp.json()

        for card in _extract_cards(page_data):
            try:
                record = _parse_mblog_card(card)
            except (KeyError, TypeError):
                # Not a post card, or a field is missing: skip it.
                continue
            posts.append(record)
            try:
                print(' ' * 10, record)
                print('................')
            except UnicodeEncodeError:
                # Progress output is best-effort; a stream that cannot encode
                # the post text must not abort the crawl.
                pass

        # Throttle after every page, including pages that yielded nothing.
        time.sleep(1)
    return posts



def main(user_id='1282005885'):
    """Look up one user and crawl that user's hot posts.

    Args:
        user_id: Weibo user id to crawl. Defaults to the id that was
            previously hard-coded here.
    """
    # Other ids used during development: '1655128924', '2736225585', '2386831995'
    user_info = get_user_info(user_id)
    uid = user_info.get('uid')
    oid = user_info.get('oid')
    print(uid, oid)
    if not oid:
        # get_user_info reports '' for oid when the profile response lacks the
        # expected menu layout; there is no hot-post container to query then.
        print('no oid found for user', user_id, '- skipping hot posts')
        return
    mblog_list(uid, oid)
    print('............')


# Start the crawl only when this file is executed as a script, so importing
# the module does not trigger network requests.
if __name__ == '__main__':
    main()








  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值