#>>>>>>>>>>>>>>>>>>>>>>> Baidu trending-topics (top.baidu.com) real-time scraper
from bs4 import BeautifulSoup
import requests
import time
from fake_useragent import UserAgent
# Fake a Chrome User-Agent so the request is not rejected as an obvious bot.
use = UserAgent().chrome
url = 'http://top.baidu.com/buzz?b=1&c=513&fr=topbuzz_b341_c513'
headers = {
'User-Agent':use
}
# Fetch the page. A timeout prevents the script from hanging forever on a
# stalled connection, and raise_for_status() fails fast on 4xx/5xx instead of
# letting us parse an error page as if it were the ranking table.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
# The page is not UTF-8 (the original author saw garbled text without this);
# apparent_encoding sniffs the real charset so .text decodes correctly.
response.encoding = response.apparent_encoding
html = response.text
# Parse the ranking table: each entry is a <tr>; the first row is the table
# header and carries no data, so it is skipped.
soup = BeautifulSoup(html, 'lxml')
rows = soup.find_all('tr')[1:]

def _parse_row(row):
    """Extract one ranking entry from a table <tr>.

    Returns a dict with keys 'rand' (rank), 'content' (headline text),
    'head' (heat/search volume as int) and 'href' (link), or None when the
    row is missing any of those cells (e.g. unexpected markup).
    """
    # Ranks 1-3 are styled with class "num-top", the rest with "num-normal";
    # the original code handled these in two separate (duplicated) loops.
    rank_tag = row.find(class_="num-top") or row.find(class_="num-normal")
    # The headline anchor carries both the title text and the target link.
    title_tag = row.find('a', attrs={'class': "list-title", 'target': "_blank"})
    heat_tag = row.find(class_="icon-rise")  # heat / search-volume cell
    if rank_tag is None or title_tag is None or heat_tag is None:
        return None
    # Heat values may include thousands separators; strip them before int().
    heat_text = heat_tag.get_text().replace(',', '')
    return {
        'rand': rank_tag.get_text(),
        'content': title_tag.get_text(),
        'head': int(heat_text),
        'href': title_tag['href'],
    }

# Single loop over every row. This fixes two defects of the original:
# the zip-loop variables no longer shadow (rebind) the per-row result lists,
# and the top-3 entries are actually printed instead of being built into a
# dict whose print statement was commented out.
for row in rows:
    data = _parse_row(row)
    if data is not None:
        print(data)