通过微博用户名称获取用户id及用户的文章详情

最新推荐文章于 2024-11-24 19:20:31 发布

自沉于海

最新推荐文章于 2024-11-24 19:20:31 发布

阅读量2.5k

点赞数 1

分类专栏： python 爬虫

本文链接：https://blog.csdn.net/m0_37719865/article/details/107713827

版权

python 同时被 2 个专栏收录

11 篇文章

订阅专栏

爬虫

1 篇文章

订阅专栏

import requests
import re
from lxml import etree

# Default HTTP headers shared by the requests in this script: a desktop
# Chrome User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/66.0.3359.139 Safari/537.36"
    ),
}

def get_userinfo(user_id):
    """Return the screen name reported by the m.weibo.cn API for a user id.

    Args:
        user_id: Numeric Weibo user id (string or int); it is interpolated
            into both the ``value`` and ``containerid`` query parameters.

    Returns:
        The ``data.userInfo.screen_name`` field of the JSON response, or
        ``None`` when the response does not contain that field.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # URL of the user-info endpoint.
    userapi = "https://m.weibo.cn/api/container/getIndex?type=uid&value={0}&containerid=100505{0}".format(user_id)
    # Send the same headers as every other request in this script, and never
    # wait forever on an unresponsive server.
    response = requests.get(userapi, headers=headers, timeout=10)
    response.raise_for_status()
    usdata = response.json()
    # Either level may be absent (e.g. an error payload); fall back to empty
    # dicts instead of failing with AttributeError on None.
    user_info = (usdata.get('data') or {}).get('userInfo') or {}
    return user_info.get('screen_name')
def _weibo_plain_text(text):
    """Parse a post's HTML fragment; return ``(tree, plain_text)``.

    ``tree`` is ``None`` and ``plain_text`` is ``''`` when there is nothing
    to parse.
    """
    if not text or not text.strip():
        return None, ''
    tree = etree.HTML(text)
    if tree is None or len(tree) == 0:
        return None, ''
    # tree[0] is the first child of the parsed root; string(.) flattens it to
    # its text content. Spaces are removed as in the original output format.
    return tree, tree[0].xpath('string(.)').strip().replace(" ", "")


def get_userdata(name):
    """Print title, creation time, content and author of a user's posts.

    Resolves ``name`` to a user id, fetches page 1 of that user's feed from
    the m.weibo.cn container API and prints one line per post. When a post's
    HTML contains the "全文" marker, the long text is fetched from the
    ``statuses/extend`` endpoint; if that is unavailable the short text is
    printed instead.

    Args:
        name: Weibo screen name to look up.

    Returns:
        None. Results are written to stdout.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    user_id = get_userid(name)
    # URL of the user-feed endpoint.
    dataapi = "https://m.weibo.cn/api/container/getIndex?type=uid&value={0}&containerid=107603{0}".format(user_id)
    parm = {
        "page": 1
    }
    response = requests.get(dataapi, params=parm, headers=headers, timeout=10)
    response.raise_for_status()
    userdata = response.json()
    # Cards of the profile feed (all posts on the requested page). Tolerate a
    # payload without "data"/"cards" rather than crashing on None.
    content_infos = (userdata.get('data') or {}).get('cards') or []
    article_author = get_userinfo(user_id)
    line_format = "文章标题是{},文章的创建时间是{},文章的内容是{},文章的作者是{}"
    for content in content_infos:
        con = content.get('mblog')
        if not con:
            # Not every card carries a post.
            continue
        create_time = con.get('created_at')
        text = con.get('text') or ''
        tree, cont = _weibo_plain_text(text)
        article_title = cont[0:10]
        body = cont
        if tree is not None and '全文' in text:
            hrefs = tree.xpath("//a/@href")
            if hrefs:
                # The last link in the fragment is used to obtain the
                # full-text id (its final path segment).
                wzid = hrefs[-1].split('/')[-1]
                print(wzid)
                Longarticle_url = 'https://m.weibo.cn/statuses/extend?id=' + str(wzid)
                print(Longarticle_url)
                long_response = requests.get(Longarticle_url, headers=headers, timeout=10)
                long_response.raise_for_status()
                long_data = long_response.json()
                # Fall back to the truncated text when no long text is returned.
                body = (long_data.get('data') or {}).get('longTextContent') or cont
        print(line_format.format(article_title, create_time, body, article_author))

def _parse_cookie_string(cookie):
    """Turn a ``"k1=v1; k2=v2"`` cookie header string into a dict."""
    cookie_dict = {}
    for pair in cookie.split(";"):
        # Split on the FIRST "=" only, so values that themselves contain "="
        # are kept whole; segments without "=" (e.g. a trailing ";") are skipped.
        key, sep, value = pair.strip().partition("=")
        if sep and key:
            cookie_dict[key] = value
    return cookie_dict


def get_userid(wbname):
    """Resolve a Weibo screen name to its user id.

    Searches s.weibo.com for ``wbname``, follows the first result's profile
    link and extracts the ``$CONFIG['oid']`` value from the profile page.

    Args:
        wbname: Screen name to search for.

    Returns:
        The user id as a string.

    Raises:
        IndexError: If the search page yields no profile link, or the profile
            page does not contain an ``oid`` entry.
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
    """
    # Let requests build the query string so reserved characters in the name
    # ("&", "#", ...) are percent-encoded instead of corrupting the URL.
    res = requests.get("https://s.weibo.com/user", params={"q": str(wbname)}, headers=headers, timeout=10)
    res.raise_for_status()
    profile_links = etree.HTML(res.text).xpath('//div[@id="pl_user_feedList"]/div[@class="card card-user-b s-pg16 s-brt1"][1]/div[@class="avator"]/a/@href')
    if not profile_links:
        raise IndexError("no profile link found in search results for {!r}".format(wbname))
    # The href is concatenated after an explicit scheme, as in the original.
    mainpage = 'https:' + profile_links[0] + '?profile_ftype=1&is_all=1'
    print(mainpage)
    # NOTE(review): hard-coded session cookie captured from a browser. It is a
    # credential and will expire; consider loading it from configuration or
    # the environment instead of source code.
    cookie='SINAGLOBAL=9822604201551.475.1595077260698; un=15716291684; _s_tentry=ent.sina.com.cn; Apache=9492128753449.434.1596156568474; ULV=1596156568505:19:19:8:9492128753449.434.1596156568474:1596070600206; Ugrow-G0=589da022062e21d675f389ce54f2eae7; YF-V5-G0=b1b8bc404aec69668ba2d36ae39dd980; login_sid_t=663ba9eef66a71c4cad7386a937936e2; cross_origin_proto=SSL; wb_view_log=1920*10801; SCF=AlxhmsjmtfEtlIbAJfz-KnNyRXrZnfobCuff1q1sEZefLNec4FxXGnj_vVZS3sWMzHS6lMNVFIZYaQwRFeprR3w.; SUHB=04WuXIH_Q_UReY; UOR=cn.bing.com,www.weibo.com,login.sina.com.cn; wb_view_log_7318407842=1920*10801; SUB=_2AkMof_wSdcPxrARSmfARzGzqaI9H-jybqpXkAn7uJhMyAxh77lEqqSVutBF-XIS9aO0dDN10f4PaBo9YoHl4So_J; SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WW9jKcDfaemH.d.96zLM1IR5JpVF02fS02ceo.pSonX; YF-Page-G0=b7e3c62ec2c0b957a92ff634c16e7b3f|1596164740|1596164727; webim_unReadCount=%7B%22time%22%3A1596165384782%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D'
    cookie_dict = _parse_cookie_string(cookie)
    page_content = requests.get(mainpage, headers=headers, cookies=cookie_dict, timeout=10)
    page_content.raise_for_status()
    # Raw string avoids invalid "\[" escapes; the non-greedy group stops at
    # the first closing "';" instead of the last one on the line.
    match = re.search(r"CONFIG\['oid'\]='(.*?)';", page_content.text)
    if match is None:
        raise IndexError("CONFIG['oid'] not found on profile page {}".format(mainpage))
    userid = match.group(1)
    print(userid)
    return userid

# Script entry point: when run directly, print the posts of one sample account.
if __name__ == "__main__":
    get_userdata("宁波晚报")

参考文章:

https://blog.csdn.net/u012813109/article/details/107659556