原文:https://blog.csdn.net/weixin_43881394/article/details/108200983
新学 requests-html 模块:用它抓取百度新闻首页的新闻标题和热搜词,并保存为 CSV 文件。
import pandas as pd
from requests_html import HTMLSession
# Scrape the Baidu News front page with requests-html and export the
# front-page headlines and hot-search words (with their links) to a CSV file.


def _extract_links(container):
    """Return ``(texts, hrefs)`` for every ``<a>`` element inside *container*.

    *container* is the result of ``find(..., first=True)``, which is ``None``
    when the selector matched nothing; two empty lists are returned then
    instead of failing with ``AttributeError``.  An anchor without an
    ``href`` attribute contributes ``None`` to ``hrefs`` so that both lists
    keep the same length and stay aligned row by row.
    """
    if container is None:
        return [], []
    anchors = container.find('a')
    texts = [a.text for a in anchors]
    hrefs = [a.attrs.get('href') for a in anchors]
    return texts, hrefs


session = HTMLSession()
news_dict = {}
# A timeout keeps the script from hanging on a stalled connection, and
# raise_for_status() stops early rather than parsing an HTTP error page.
r = session.get('http://news.baidu.com/', timeout=10)
r.raise_for_status()

# Front-page news: headline text and link of each anchor in the news pane.
hot_news = r.html.find('div#pane-news', first=True)
news_dict['首页新闻标题'], news_dict['首页新闻链接'] = _extract_links(hot_news)

# Hot-search words: text and link of each anchor in the hot-words list.
hot_news_words = r.html.find('ul.hotwords', first=True)
news_dict['热搜新闻词'], news_dict['热搜链接'] = _extract_links(hot_news_words)

# Write the CSV file.  Each list is wrapped in a Series so that columns of
# different lengths can share one DataFrame (shorter ones are padded with NaN).
dataframe = pd.DataFrame({k: pd.Series(v) for k, v in news_dict.items()})
# utf-8-sig writes a BOM at the start of the file.
dataframe.to_csv('首页新闻.csv', sep=',', encoding='utf-8-sig')
输出: