主要爬取财联社网站中头条页面、金融页面。
def cls_news(start_time, source, title_list, url_list, time_list, timeout=10):
    """Append recent news items from the configured 财联社 endpoints.

    Every URL in ``url_dic['财联社']`` is requested with the headers from
    ``headers_dic['财联社']`` and the response body is decoded as JSON. Each
    entry of the response's ``data`` field whose ``ctime`` is greater than or
    equal to *start_time* is recorded in the four output lists, which are
    kept index-aligned (entry ``i`` of each list describes the same item).

    Args:
        start_time: Lower bound compared directly against each item's
            ``ctime``. ``ctime`` is also passed to
            ``datetime.fromtimestamp``, so both are treated as POSIX
            timestamps in seconds.
        source: List that receives the site name once per accepted item.
        title_list: List that receives each accepted item's ``title``.
        url_list: List that receives the detail-page URL built from the
            item's ``id``.
        time_list: List that receives the item's ``ctime`` converted to a
            ``datetime``.
        timeout: Seconds to wait for the server before giving up on a URL.

    Returns:
        None. The four lists are mutated in place.

    Failures (network errors, bad HTTP status, invalid JSON, missing keys)
    are reported on stdout and abort only the URL being processed; items
    already collected from it are kept and the remaining URLs are still
    tried.
    """
    web_name = '财联社'
    web_headers = headers_dic[web_name]
    urls = url_dic[web_name]
    for url in urls:
        try:
            # A timeout keeps one unresponsive endpoint from hanging the run.
            res = requests.get(url, headers=web_headers, timeout=timeout)
            # Surface HTTP errors directly instead of as a confusing
            # JSON/KeyError further down.
            res.raise_for_status()
            res.encoding = res.apparent_encoding
            data = res.json()
            for item in data['data']:
                news_time = item['ctime']
                if news_time >= start_time:
                    # Resolve every field BEFORE touching the output lists:
                    # if any lookup or conversion raises, none of the lists
                    # has been extended, so they cannot drift out of step.
                    title = item['title']
                    link = 'https://www.cls.cn/detail/' + str(item['id'])
                    published = dt.datetime.fromtimestamp(news_time)
                    source.append(web_name)
                    title_list.append(title)
                    url_list.append(link)
                    time_list.append(published)
        except Exception as e:
            # Best-effort scraper boundary: report the cause and move on to
            # the next URL rather than crashing the whole collection run.
            print(web_name + "抓取出错,此条新闻略过", e)
if __name__ == '__main__':
    # Four parallel accumulators that cls_news fills in place.
    source, title_list, url_list, time_list = [], [], [], []
    current_time = time.time()
    # Cut-off passed to cls_news: 2024-01-03 19:00:00 as a POSIX timestamp
    # (a naive datetime, so it is interpreted in the machine's local time).
    start_time = dt.datetime(2024, 1, 3, 19).timestamp()
    cls_news(start_time, source, title_list, url_list, time_list)
返回新闻发布时间、新闻标题、新闻链接。