爬取的网址为：http://www.tianqihoubao.com/lishi/chengdu/month/201802.html
在用正则表达式爬取时，要注意对换行符的处理。
import csv
import re
import requests
import time
def urlPool(year=2018, city='chengdu'):
    """Build the pool of monthly history-page URLs for one city and year.

    Args:
        year: Year placed in the URL's ``<year><month>`` segment.
            Defaults to 2018, the value the original code hard-coded.
        city: City slug placed in the URL path. Defaults to 'chengdu',
            the value the original code hard-coded.

    Returns:
        A list of 12 URL strings, ordered January through December, each
        ending in ``<year><two-digit month>.html``.
    """
    template = 'http://www.tianqihoubao.com/lishi/%s/month/%s%02d.html'
    # %02d zero-pads single-digit months (1 -> "01"), so months below 10
    # no longer need a separate branch with a literal "0" in the URL.
    return [template % (city, year, month) for month in range(1, 13)]
def getHtml(url):
'''构造请求url函数,返回html文本'''
try:
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
}
response = requests.get(url, headers=headers)
if response.status_code == 200:
return response.text
except reques