Python爬虫 爬取新浪微博热搜
文章目录
网页分析
找到热搜的排名、标题和热度，发现它们位于同一路径（同一个 tr 节点）下
数据爬取
import requests
from lxml import etree
# Fetch the Weibo real-time hot-search page and select its table rows.
url = 'https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6'
# Send a desktop-browser User-Agent instead of the default python-requests one.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36'
}
# requests applies no timeout by default, so set one to avoid hanging forever
# on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page into an empty result.
response.raise_for_status()
# Debugging aid: uncomment to inspect the raw HTML that was returned.
# print(response.text)
html = etree.HTML(response.text)
# Each <tr> under the #pl_top_realtimehot table body is one row of the list.
datas = html.xpath('//*[@id="pl_top_realtimehot"]/table/tbody/tr')
for data in datas:
data_title=data.xpath('td[2]/a/text()'