"""Scrape the first page of a Weibo account's posts, looked up by screen name.

Flow: screen name -> numeric user id (``get_userid``) -> post feed
(``get_userdata``), with the display name resolved via ``get_userinfo``.
"""
import re

import requests
from lxml import etree

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"
    ),
}

# Seconds to wait for any single HTTP request; requests never times out
# unless a timeout is passed explicitly.
REQUEST_TIMEOUT = 10

# NOTE(security): this is a hard-coded browser session cookie. It is kept
# verbatim so the script behaves as before, but it should be loaded from
# configuration or the environment rather than committed to source.
COOKIE = (
    "SINAGLOBAL=9822604201551.475.1595077260698; "
    "un=15716291684; "
    "_s_tentry=ent.sina.com.cn; "
    "Apache=9492128753449.434.1596156568474; "
    "ULV=1596156568505:19:19:8:9492128753449.434.1596156568474:1596070600206; "
    "Ugrow-G0=589da022062e21d675f389ce54f2eae7; "
    "YF-V5-G0=b1b8bc404aec69668ba2d36ae39dd980; "
    "login_sid_t=663ba9eef66a71c4cad7386a937936e2; "
    "cross_origin_proto=SSL; "
    "wb_view_log=1920*10801; "
    "SCF=AlxhmsjmtfEtlIbAJfz-KnNyRXrZnfobCuff1q1sEZefLNec4FxXGnj_vVZS3sWMzHS6lMNVFIZYaQwRFeprR3w.; "
    "SUHB=04WuXIH_Q_UReY; "
    "UOR=cn.bing.com,www.weibo.com,login.sina.com.cn; "
    "wb_view_log_7318407842=1920*10801; "
    "SUB=_2AkMof_wSdcPxrARSmfARzGzqaI9H-jybqpXkAn7uJhMyAxh77lEqqSVutBF-XIS9aO0dDN10f4PaBo9YoHl4So_J; "
    "SUBP=0033WrSXqPxfM72wWs9jqgMF55529P9D9WW9jKcDfaemH.d.96zLM1IR5JpVF02fS02ceo.pSonX; "
    "YF-Page-G0=b7e3c62ec2c0b957a92ff634c16e7b3f|1596164740|1596164727; "
    "webim_unReadCount=%7B%22time%22%3A1596165384782%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D"
)

# Matches the ``$CONFIG['oid']='<id>';`` assignment embedded in the profile
# page. ``[^']*`` stops at the closing quote, so several assignments on one
# line cannot be swallowed into a single capture.
_OID_PATTERN = re.compile(r"CONFIG\['oid'\]='([^']*)';")

_POST_TEMPLATE = "文章标题是{},文章的创建时间是{},文章的内容是{},文章的作者是{}"


def _parse_cookie_string(raw):
    """Turn a ``name=value; name=value`` cookie header into a dict.

    Only the first ``=`` separates name from value, so values that contain
    ``=`` are kept whole. Empty segments (e.g. a trailing ``;``) are skipped.
    """
    cookies = {}
    for segment in raw.strip("\n").split(";"):
        segment = segment.strip()
        if not segment:
            continue
        key, _, value = segment.partition("=")
        cookies[key] = value
    return cookies


def _parse_html(fragment):
    """Parse an HTML fragment with lxml; return ``None`` for blank input."""
    if not fragment or not fragment.strip():
        return None
    return etree.HTML(fragment)


def _plain_text(root):
    """Return the text of the first child of ``root`` with spaces removed."""
    if root is None or len(root) == 0:
        return ""
    return root[0].xpath("string(.)").strip().replace(" ", "")


def get_userinfo(user_id):
    """Return the screen name for ``user_id``, or ``None`` if it is absent."""
    # User profile URL.
    userapi = (
        "https://m.weibo.cn/api/container/getIndex"
        "?type=uid&value={0}&containerid=100505{0}".format(user_id)
    )
    usdata = requests.get(
        userapi, headers=headers, timeout=REQUEST_TIMEOUT
    ).json()
    user_info = (usdata.get("data") or {}).get("userInfo") or {}
    return user_info.get("screen_name")


def _fetch_long_text(status_id):
    """Fetch the full text of a truncated post; ``None`` if not returned."""
    Longarticle_url = "https://m.weibo.cn/statuses/extend?id=" + str(status_id)
    print(Longarticle_url)
    payload = requests.get(
        Longarticle_url, headers=headers, timeout=REQUEST_TIMEOUT
    ).json()
    return (payload.get("data") or {}).get("longTextContent")


def get_userdata(name, page=1):
    """Print title, creation time, content and author for each post on a page.

    Args:
        name: Weibo screen name to look up.
        page: Feed page number to request (defaults to the first page).
    """
    user_id = get_userid(name)
    # User feed URL.
    dataapi = (
        "https://m.weibo.cn/api/container/getIndex"
        "?type=uid&value={0}&containerid=107603{0}".format(user_id)
    )
    userdata = requests.get(
        dataapi,
        params={"page": page},
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    ).json()
    # Home-page feed: every post card on the requested page.
    content_infos = (userdata.get("data") or {}).get("cards") or []
    article_author = get_userinfo(user_id)

    for card in content_infos:
        post = card.get("mblog")
        if not post:
            # Cards without an ``mblog`` entry carry no post; skip them.
            continue

        create_time = post.get("created_at")
        text = post.get("text") or ""
        root = _parse_html(text)
        cont = _plain_text(root)
        article_title = cont[0:10]
        article_body = cont

        if "全文" in text and root is not None:
            links = root.xpath("//a/@href")
            if links:
                # The last link is the one used to fetch the full text; its
                # final path segment is the status id.
                wzid = links[-1].split("/")[-1]
                print(wzid)
                longarticle = _fetch_long_text(wzid)
                # Fall back to the truncated text if no long text came back.
                if longarticle is not None:
                    article_body = longarticle

        print(
            _POST_TEMPLATE.format(
                article_title, create_time, article_body, article_author
            )
        )


def get_userid(wbname):
    """Resolve a Weibo screen name to its ``oid`` string.

    Searches for the name, follows the first result to its profile page and
    reads the ``oid`` from the page's embedded ``CONFIG`` assignments.

    Raises:
        IndexError: if the search yields no user card or the profile page
            contains no ``oid`` assignment.
    """
    res = requests.get(
        "https://s.weibo.com/user",
        params={"q": str(wbname)},  # let requests do the URL encoding
        headers=headers,
        timeout=REQUEST_TIMEOUT,
    )
    profile_links = etree.HTML(res.text).xpath(
        '//div[@id="pl_user_feedList"]/div[@class="card card-user-b s-pg16 s-brt1"][1]/div[@class="avator"]/a/@href'
    )
    if not profile_links:
        raise IndexError("no Weibo user found for {!r}".format(wbname))

    # The href carries no scheme, so one is prepended here.
    mainpage = "https:" + profile_links[0] + "?profile_ftype=1&is_all=1"
    print(mainpage)

    page_content = requests.get(
        mainpage,
        headers=headers,
        cookies=_parse_cookie_string(COOKIE),
        timeout=REQUEST_TIMEOUT,
    )
    match = _OID_PATTERN.search(page_content.text)
    if match is None:
        raise IndexError("no CONFIG['oid'] found on {}".format(mainpage))
    userid = match.group(1)
    print(userid)
    return userid


if __name__ == '__main__':
    get_userdata('宁波晚报')
# Reference article: