In this post we will scrape a news page with requests plus regular expressions: the regex parses the HTML and pulls out the article's title, date, source, and body text.
First, set up the main program skeleton around the four steps of a crawler: specify the URL, send the request and get the response, parse the response, and store the data:
import json
import requests
from requests.exceptions import RequestException
import re
import time

def get_page(url):
    pass

def get_parser(html):
    pass

def write_tofile(title, article):
    pass

if __name__ == '__main__':
    # the news URL to scrape
    url = 'http://it.people.com.cn/n1/2019/0325/c1009-30993121.html'
    # send the request, get the response
    html = get_page(url)
    # parse the response
    title, article = get_parser(html)
    # store the data
    write_tofile(title, article)
Send the request and get the response by writing the get_page() function:
def get_page(url):
    try:
        # set a User-Agent header so the request looks like it comes from a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # let requests guess the page encoding to avoid garbled Chinese text
            response.encoding = response.apparent_encoding
            return response.text
        return None
    except RequestException:
        return None
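One caveat: requests.get() has no default timeout, so a dead server can hang the crawler indefinitely. Here is a minimal hardened sketch (get_page_with_timeout and the 10-second value are my additions, not part of the original post):

def get_page_with_timeout(url, timeout=10):
    # same idea as get_page(), but fail fast instead of hanging forever
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # treat 4xx/5xx responses as errors too
        response.encoding = response.apparent_encoding
        return response.text
    except RequestException:
        return None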
Next, parse the response and extract the title, date, source, and body. Inspecting the page source shows that the title sits in an <h1> tag, the date and source share a <div class="fl"> element, and the body text is spread across the individual <p> tags.
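Two regex features do the heavy lifting in get_parser() below: the re.S (DOTALL) flag lets . match newlines so one pattern can span multiple lines of HTML, and (.*?) matches non-greedily, stopping at the first closing tag rather than the last. A quick illustration:

import re

html = '<p>first</p>\n<p>second</p>'
# greedy: .* runs to the LAST </p>, swallowing both paragraphs
print(re.findall('<p>(.*)</p>', html, re.S))   # ['first</p>\n<p>second']
# non-greedy: .*? stops at the FIRST </p>, one match per paragraph
print(re.findall('<p>(.*?)</p>', html, re.S))  # ['first', 'second']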
def get_parser(html):
    # extract the title, date, and source in one pass
    html = str(html)
    pattern = re.compile('<h1>(.*?)</h1>.*?<div class="fl">(.*?) 来源:<a.*?>(.*?)</a></div>', re.S)
    title = pattern.findall(html)[0]
    print(title)
    # body text: extract in two passes, first narrowing down to the
    # article container, then pulling each <p> out of that smaller chunk
    pattern = re.compile('<div class="fl text_con_left">(.*?)<div class="edit clearfix">', re.S)
    article = pattern.findall(html)[0]
    pattern = re.compile('<p>(.*?)</p>', re.S)
    article = pattern.findall(article)
    return title, article
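To see the two-pass extraction in action without hitting the network, here is a made-up HTML fragment that mimics the structure the patterns expect (the tag layout is inferred from the regexes above; the real page is more complex):

sample = '''<h1>示例标题</h1>
<div class="fl">2019年03月25日08:00 来源:<a href="#">人民网</a></div>
<div class="fl text_con_left">
<p>第一段。</p>
<p>第二段。</p>
<div class="edit clearfix">'''

title, article = get_parser(sample)
# title   -> ('示例标题', '2019年03月25日08:00', '人民网')
# article -> ['第一段。', '第二段。']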
Store the data:
def write_tofile(title, article):
    with open('news.txt', 'w', encoding='utf-8') as f:
        # first line: title, date, and source, tab-separated
        f.write(title[0].replace(' ', '') + '\t' + title[1] + '\t' + title[2] + '\n')
        # then one paragraph per line (without '\n' they would run together)
        for i in article:
            f.write(i.strip() + '\n')
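Note that the captured paragraph text can still contain inline markup such as <strong> or <a> tags. If that shows up in news.txt, a regex substitution strips it (a quick sketch, not part of the original post):

def strip_tags(text):
    # remove any remaining inline HTML tags from a paragraph
    return re.sub(r'<[^>]+>', '', text)

print(strip_tags('<strong>5G</strong> is coming'))  # 5G is coming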
Full code:
import json
import requests
from requests.exceptions import RequestException
import re
import time

def get_page(url):
    try:
        # set a User-Agent header so the request looks like it comes from a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # let requests guess the page encoding to avoid garbled Chinese text
            response.encoding = response.apparent_encoding
            return response.text
        return None
    except RequestException:
        return None

def get_parser(html):
    # extract the title, date, and source in one pass
    html = str(html)
    pattern = re.compile('<h1>(.*?)</h1>.*?<div class="fl">(.*?) 来源:<a.*?>(.*?)</a></div>', re.S)
    title = pattern.findall(html)[0]
    print(title)
    # body text: extract in two passes, first narrowing down to the
    # article container, then pulling each <p> out of that smaller chunk
    pattern = re.compile('<div class="fl text_con_left">(.*?)<div class="edit clearfix">', re.S)
    article = pattern.findall(html)[0]
    pattern = re.compile('<p>(.*?)</p>', re.S)
    article = pattern.findall(article)
    return title, article

def write_tofile(title, article):
    with open('news.txt', 'w', encoding='utf-8') as f:
        # first line: title, date, and source, tab-separated
        f.write(title[0].replace(' ', '') + '\t' + title[1] + '\t' + title[2] + '\n')
        # then one paragraph per line
        for i in article:
            f.write(i.strip() + '\n')

if __name__ == '__main__':
    # the news URL to scrape
    url = 'http://tc.people.com.cn/n1/2019/0227/c183008-30906013.html'
    # send the request, get the response
    html = get_page(url)
    # parse the response
    title, article = get_parser(html)
    # store the data
    write_tofile(title, article)
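The script imports json and time without using them; one natural extension is storing the result as structured JSON instead of flat text, and sleeping between requests when crawling several URLs. A hedged sketch (write_tofile_json, the file naming, and the one-second delay are my additions, not from the original post):

def write_tofile_json(title, article, path='news.json'):
    # keep the fields structured instead of flattening to plain text
    data = {'title': title[0], 'date': title[1], 'source': title[2],
            'paragraphs': [p.strip() for p in article]}
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

urls = [
    'http://it.people.com.cn/n1/2019/0325/c1009-30993121.html',
    'http://tc.people.com.cn/n1/2019/0227/c183008-30906013.html',
]
for i, url in enumerate(urls):
    html = get_page(url)
    if html:  # skip pages that failed to download
        title, article = get_parser(html)
        write_tofile_json(title, article, path=f'news_{i}.json')
    time.sleep(1)  # be polite to the server between requests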