# Code
import requests
import re
# Crawl the People's Daily archive on data.people.com.cn issue by issue,
# from 2020-01-01 through 2020-02-29, appending every article to a text file.

# Listing page of the first issue to crawl (2020-01-01).
url_cur = 'http://data.people.com.cn/rmrb/20200101/1'
# Without a browser-like User-Agent the site answers 403 Forbidden.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.59 Safari/537.36"}

SITE_ROOT = 'http://data.people.com.cn'
# Last issue date (YYYYMMDD) to crawl, taken from the stated crawl range.
END_DATE = '20200229'
OUTPUT_PATH = 'crawled news.txt'
REQUEST_TIMEOUT = 30

# NOTE(review): the HTML tags inside every pattern below were stripped from
# the text this script was recovered from; only the capture groups and the
# literal "下一期" survived. The tag names and attributes here are best-guess
# reconstructions -- confirm each one against the live page markup.
WEEKDAY_RE = re.compile(r'<span class="week">(.*?)</span>')
# Two groups, as in the original unpacking `(_, href)`; only href is used.
ARTICLE_LINK_RE = re.compile(r'<a title="(.*?)" href="(.*?)">.*?</a>')
NEXT_ISSUE_RE = re.compile(r'<a href="(.*?)">下一期')
TITLE_RE = re.compile(r'<title>(.*?)</title>')
AUTHOR_RE = re.compile(r'<div class="author">(.*?)</div>')
# re.S is required: per the original author, nothing matches without it.
PARAGRAPH_RE = re.compile(r'<p>(.*?)</p>', re.S)

# One-pass replacement table for the body text: drop all whitespace and map
# double curly quotes to single ones. '\xa0' is included because the original
# removed "space" twice, which presumably targeted a non-breaking space once
# -- TODO confirm. The original no-op replace('…', '…') is dropped.
_BODY_CLEANUP = str.maketrans({
    ' ': None,
    '\xa0': None,
    '\u3000': None,
    '\r': None,
    '\n': None,
    '\t': None,
    '”': '’',
    '“': '‘',
})


def first_match(pattern, text, default=None):
    """Return group 1 of the first match of *pattern* in *text*, else *default*."""
    found = pattern.search(text)
    return found.group(1) if found else default


def issue_date(url):
    """Return the YYYYMMDD path segment of an issue listing URL."""
    return url.split('/')[-2]


def format_issue_header(date, weekday):
    """Format a YYYYMMDD date and a weekday name as the per-issue header line."""
    return date[:4] + '年' + date[4:6] + '月' + date[6:] + '日' + ',' + weekday


def clean_body(raw):
    """Strip all whitespace from *raw* and normalise curly double quotes."""
    return raw.translate(_BODY_CLEANUP)


def build_body(paragraphs):
    """Join the paragraph texts into one cleaned body string.

    The first paragraph is skipped, as in the original script (presumably it
    is not part of the article text -- TODO confirm). The paragraph tags used
    as separators are a reconstruction; see the NOTE on the patterns above.
    """
    return clean_body('<p>' + '</p><p>'.join(paragraphs[1:]) + '</p>')


def render_article(index, total, article_html):
    """Return the text written for one article: '[i/n]' + title + author + body.

    Raises:
        ValueError: if no title can be found in *article_html*.
    """
    title = first_match(TITLE_RE, article_html)
    if title is None:
        raise ValueError('no title found; check TITLE_RE against the page markup')
    author = first_match(AUTHOR_RE, article_html, 'UNKNOWN')
    body = build_body(PARAGRAPH_RE.findall(article_html))
    return '[{}/{}]'.format(index, total) + title + author + body + '\n'


def fetch(url):
    """Download *url* and return its body decoded as UTF-8.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text


def crawl(start_url, end_date=END_DATE, output_path=OUTPUT_PATH):
    """Crawl issues from *start_url* until *end_date* or the last issue.

    Each issue's header and articles are appended to *output_path*. The crawl
    stops after the issue dated *end_date* or when a listing page has no
    "next issue" link, whichever comes first.

    Raises:
        ValueError: if a listing page or article does not match the patterns.
    """
    current_url = start_url
    while current_url:
        date = issue_date(current_url)
        # YYYYMMDD strings order chronologically, so plain comparison works.
        if date > end_date:
            break

        page = fetch(current_url)
        weekday = first_match(WEEKDAY_RE, page)
        if weekday is None:
            raise ValueError(
                'no weekday found on {}; check WEEKDAY_RE'.format(current_url))
        article_links = ARTICLE_LINK_RE.findall(page)
        total = len(article_links)
        # Look up the next issue now, but crawl this issue's articles before
        # acting on it so the final issue is not skipped.
        next_path = first_match(NEXT_ISSUE_RE, page)

        # Reopen in append mode for every issue so finished days are flushed
        # to disk; the context manager closes the file on every exit path.
        with open(output_path, 'a', encoding='utf-8') as out:
            out.write(format_issue_header(date, weekday))
            out.write('\n')
            for index, (_, href) in enumerate(article_links, start=1):
                article_url = '/'.join([current_url, href.split('/')[-1]])
                out.write(render_article(index, total, fetch(article_url)))

        current_url = SITE_ROOT + next_path if next_path else None


if __name__ == '__main__':
    crawl(url_cur)
    print('done!')