前言:
前几日参加亚太杯数学建模,需要相关的环境指数数据,于是我决定去爬取有关天气数据。但天气网有自己的一套反爬机制,加上我电脑的 IP 劣迹斑斑,爬取时多次被反爬拦截,后来多次更换请求头(headers)才得以爬取相应数据。此代码可以爬取天气网任意地区、任意时间的天气数据,只需更改相应的 url 和 year 即可。
代码分享:
import io
import sys
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import csv
import time
import urllib
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gbk') # rewrap stdout to emit GBK so Chinese text prints without mojibake on a GBK console
def get_soup(year, month, city='changsha'):
    """Fetch and parse one monthly history page from tianqihoubao.com.

    Parameters
    ----------
    year : str or int
        Four-digit year, e.g. '2011'.
    month : str or int
        Zero-padded two-digit month, e.g. '01'.
    city : str, optional
        City slug used in the URL path (default 'changsha'), so the
        scraper is no longer hard-coded to a single city.

    Returns
    -------
    BeautifulSoup on success, or the string 'Request Error' on any
    network/HTTP failure (sentinel kept for backward compatibility
    with existing callers).
    """
    url = 'http://www.tianqihoubao.com/lishi/' + city + '/month/' + str(year) + str(month) + '.html'
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # raise HTTPError for non-2xx responses
        # r.encoding = 'gbk'  # NOTE(review): site is likely GBK-encoded; enable if mojibake appears
        return BeautifulSoup(r.text, 'lxml')
    except requests.RequestException:
        # Narrowed from a bare `except:` so programming errors (NameError,
        # KeyboardInterrupt, ...) are no longer silently swallowed; only
        # network/HTTP failures map to the sentinel string.
        return "Request Error"
def saveTocsv(data, fileName):
    """Persist one month of scraped weather records as a GBK-encoded CSV.

    `data` is any row structure pandas accepts with four columns; the
    output columns are labelled date / tq / temp / wind, without an
    index column.
    """
    columns = ['date', 'tq', 'temp', 'wind']
    pd.DataFrame(data, columns=columns).to_csv(fileName, index=False, encoding='gbk')
    print('Save all weather success!')
def get_data(year=None, month=None):
    """Scrape one month of daily weather rows via `get_soup`.

    Parameters
    ----------
    year, month : str or int, optional
        Which month to scrape. When omitted, fall back to the
        module-level globals `year`/`month` — this preserves the
        original no-argument call sites in the `__main__` loop.

    Returns
    -------
    numpy.ndarray
        An (n_days, 4) array of strings (date, weather, temperature,
        wind) with all internal whitespace stripped from each cell.
        Returns an empty (0, 4) array when the page could not be
        fetched.
    """
    if year is None:
        year = globals().get('year')
    if month is None:
        month = globals().get('month')
    soup = get_soup(year, month)
    # get_soup returns the string 'Request Error' on failure instead of
    # raising; guard so a failed request no longer crashes with
    # AttributeError on soup.find.
    if isinstance(soup, str):
        return np.empty((0, 4), dtype=str)
    table = soup.find('div', class_="wdetail").find('table')
    data = list()
    for tr in table.find_all("tr")[1:]:  # [1:] skips the table header row
        for td in tr.find_all("td"):
            # collapse newlines and internal whitespace in each cell
            data.append("".join(td.get_text().split()))
    return np.array(data).reshape(-1, 4)
if __name__ == '__main__':
    # Scrape every month from 2011-01 through 2019-12 and save one CSV
    # per month. NOTE: get_data() reads the loop variables `year` and
    # `month` as module globals, so those names must not be renamed.
    years = [str(y) for y in range(2011, 2020)]
    months = ['%02d' % m for m in range(1, 13)]
    for year in years:
        for month in months:
            saveTocsv(get_data(), '长沙' + year + month + '.csv')
效果展示: