1.准备工作
- url: "https://www.sina.com.cn/"
- 分析目标: 1.新闻详情页
2.新闻标题
3.新闻内容
开始爬取
目录结构
代码
import os
import re

import requests
def get_sina_news(url):
    """Crawl news articles linked from the Sina homepage and save each as a text file.

    Fetches ``url``, extracts every ``href="https://news.sina..."`` link,
    downloads each linked article, pulls its ``<h1 class="main-title">`` title
    and ``<p>`` paragraphs, and writes the text to
    ``news/sina_news_1130/<title>.txt``.

    Args:
        url: Homepage URL to scan for news links
            (e.g. ``"https://www.sina.com.cn/"``).

    Side effects:
        Creates the output directory if missing and writes one ``.txt`` file
        per successfully parsed article; prints progress to stdout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
    }
    # Fetch the homepage; timeout prevents hanging forever on a dead connection.
    req = requests.get(url=url, headers=headers, timeout=10)
    html = req.content.decode()

    # Matches look like: href="https://news.sina..." (quote included).
    # The original skipped the first two matches via [1::] + pop(0); [2:] is
    # the same slice — presumably the first two are non-article links.
    reg = r'href="https:\/\/news\.sina\..*?"'
    news_url_list = re.findall(reg, html)[2:]

    # Ensure the output directory exists before writing any article.
    out_dir = "news/sina_news_1130"
    os.makedirs(out_dir, exist_ok=True)

    for count, news_url in enumerate(news_url_list, start=1):
        # news_url is the full attribute 'href="..."'; strip the leading
        # 'href="' (6 chars) and the trailing quote to get the bare URL.
        news_html = requests.get(url=news_url[6:-1], timeout=10).content.decode()

        # Some linked pages use a different layout with no main-title;
        # skip those instead of crashing (explicit check replaces the
        # original broad try/except around an IndexError).
        title_match = re.findall(r'<h1 class="main-title">(.*)</h1>', news_html)
        if not title_match:
            continue
        # Strip characters that are invalid in filenames so open() can't fail
        # on titles containing e.g. '/' or '?'.
        news_title = re.sub(r'[\\/:*?"<>|]', '_', title_match[0]) + ".txt"

        # Crude paragraph extraction kept from the original: concatenate every
        # <p>...</p> body on the page.
        content = "".join(re.findall(r'<p>(.*)</p>', news_html))
        # Fullwidth-space pairs mark paragraph starts in Sina's markup;
        # turn them into a newline plus indent for readable output.
        content = content.replace("\u3000\u3000", '\n ')
        # Drop <strong> emphasis tags, keeping their inner text.
        content = content.replace('<strong>', '').replace('</strong>', '')

        # encoding= makes the output UTF-8 regardless of platform default;
        # write() (not writelines) since content is a single string.
        with open(os.path.join(out_dir, news_title), 'w', encoding='utf-8') as f:
            f.write(content)
        print("已完成第%d篇新闻爬取" % count)
if __name__ == '__main__':
    # Script entry point. get_sina_news returns None, so the original
    # `ret = ...` binding was dead weight — call it directly.
    get_sina_news("https://www.sina.com.cn/")