新浪微博自己的首页数据抓取(微博列表)。
header 里的 cookie 参数需要自己从浏览器或者抓包获取。

# -*- coding: utf-8 -*-
#获取第一页的信息,这是第一部分,由于后续的微博列表是动态json数据加载的,所以要有所不同
"""

@author: Administrator
"""
from bs4 import BeautifulSoup
import re
import requests
url='https://weibo.com/u/3861053083/home'


def getheader():
    """Return the HTTP request headers used to impersonate a logged-in browser session.

    The Cookie value carries the Weibo login state and must be captured from
    a real browser or a packet sniffer; it expires and needs refreshing.
    """
    request_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'SINAGLOBAL=3879284861414.9067.1563893990409; UOR=,,login.sina.com.cn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WW83xapWzevNz9e.2c0oy8V5JpX5KMhUgL.FoeRSo27SKe71he2dJLoIf2LxKqL1hnL1K2LxKBLBonL12BLxKBLB.eL1-2LxKqLBonLBonLxKqLBoeLBoeLxKBLB.BL1hqLxKqL1hnL1K2LxKBLB.eL1-2LxK-L1KqL1-Bt; wvr=6; wb_timefeed_3861053083=1; YF-V5-G0=86b4280420ced6d22f1c1e4dc25fe846; ALF=1597144486; SSOLoginState=1565608487; SCF=Amciou6xV3kvPiARuklGgLMkQki_V-kvu1iMGsyIQ4m_rmkQwvjv8eNyX6EuuOVG4Qsgmve5TBtaOt1_7bbzYKw.; SUB=_2A25wVTp3DeRhGeVG7VMR9S3Mwz-IHXVTIyy_rDV8PUNbmtANLW36kW9NT7XzYkmw6kaN3IclVMa-sVWgPZPbCk21; SUHB=0pHhc6amYcCm2h; _s_tentry=login.sina.com.cn; Apache=973419693606.7429.1565608492909; ULV=1565608492959:29:17:4:973419693606.7429.1565608492909:1565527206425; YF-Page-G0=89906ffc3e521323122dac5d52f3e959|1565608492|1565608492; wb_view_log_3861053083=1360*7681; webim_unReadCount=%7B%22time%22%3A1565608493916%2C%22dm_pub_total%22%3A0%2C%22chat_group_client%22%3A0%2C%22allcountNum%22%3A0%2C%22msgbox%22%3A0%7D',
        'Host': 'weibo.com',
        'Referer': 'https://weibo.com/u/3861053083/home',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    return request_headers
def getmaininfo():
    """Fetch the Weibo homepage and print each post's author nickname and text.

    Fixes over the original: converts Python-2-only ``print`` statements to
    the function form (valid on both Python 2 and 3 for single arguments),
    drops a leftover debug ``print type(...)`` line, and removes the
    redundant re-parsing of fragments that were already parsed soup nodes.
    Behavior otherwise matches the original: prints name, a blank gap,
    the post text, then a separator, for every WB_detail block found.
    """
    html_doc = requests.get(url, headers=getheader())
    # The feed markup is embedded in the page as a backslash-escaped JS
    # string; strip the escape sequences so it parses as plain HTML.
    s = html_doc.text.replace('\\n', '')
    s = s.replace('\\r', '')
    s = s.replace('\\t', '')
    s = s.replace('\\', '')
    # Everything between the homefeed container and the closing script tag.
    fragments = re.findall('<div node-type="homefeed">(.*?)</script>', s, re.M | re.I)
    soup = BeautifulSoup(str(fragments), 'html.parser')
    for detail in soup.find_all('div', class_='WB_detail'):
        # Author nickname is stored in the nick-name attribute of the
        # profile link inside the WB_info div.
        name = detail.find('div', class_='WB_info').a['nick-name']
        # Post body lives in the "WB_text W_f14" div; it can be missing for
        # some card types, in which case the original printed 'None'.
        text_div = detail.find('div', class_='WB_text W_f14')
        content_text = text_div.get_text() if text_div is not None else 'None'
        print(name)
        print('\n')
        print(content_text)
        print('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
    
getmaininfo()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值