百度
链接:https://top.baidu.com/board?tab=realtime
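找到每条热搜所在的 div 标签(class 为 category-wrap_iQLoo)
在其中找到标题所在的 div 标签(class 为 c-single-text-ellipsis)
找到热搜链接(class 为 img-wrapper_29V76 的 a 标签的 href 属性)
找到热搜指数所在的 div 标签(class 为 hot-index_1Bl1a)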
核心代码
def crawl_baidu():
    """Scrape the Baidu realtime hot-search board.

    Downloads ``https://top.baidu.com/board?tab=realtime``, parses it with
    BeautifulSoup's ``html.parser`` and collects one record per
    ``div.category-wrap_iQLoo`` element.

    Returns:
        A ``(titles, urls, hot_indices)`` tuple of three parallel lists of
        strings: the stripped title text, the link ``href`` (``""`` when the
        entry has no ``a.img-wrapper_29V76`` anchor) and the stripped
        hot-index text. Entries missing a title tag, a hot-index tag, or
        whose anchor carries no ``href`` are skipped, so the lists always
        have equal length.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            does not complete within the timeout.
    """
    # requests applies no timeout by default; without one a stalled
    # connection would block the caller forever.
    response = requests.get('https://top.baidu.com/board?tab=realtime', timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    record_tags = soup.find_all('div', {'class': 'category-wrap_iQLoo'})

    titles, urls, hot_indices = [], [], []
    for item in record_tags:
        title_tag = item.find('div', {'class': 'c-single-text-ellipsis'})
        hot_index_tag = item.find('div', {'class': 'hot-index_1Bl1a'})

        # When several anchors match, the last one wins; with no anchor at
        # all the URL stays an empty string.
        link_tags = item.find_all('a', {'class': 'img-wrapper_29V76'})
        url = link_tags[-1].get('href') if link_tags else ""

        # Check the per-item ``url`` (not the ``urls`` accumulator, which is
        # never None) so that a missing href cannot put None into the result.
        if title_tag is None or hot_index_tag is None or url is None:
            continue

        titles.append(title_tag.text.strip())
        urls.append(url)
        hot_indices.append(hot_index_tag.text.strip())
    return titles, urls, hot_indices