历史天气数据获取 – Python 爬虫(http://lishi.tianqi.com/)
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 15 11:20:40 2018
@author: CSM
"""
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
def get_weather_historic_data(city, *years):
    """Scrape daily historical weather for one city from lishi.tianqi.com.

    One page per month is requested for every year from ``years[0]`` through
    ``years[-1]`` (inclusive); passing a single year fetches just that year.
    Months whose page lacks the expected weather table or header row are
    skipped. The original author notes that the site's oldest data is
    from 2011.

    Args:
        city: City name in pinyin as it appears in the site's URLs,
            e.g. ``'shanghai'``.
        *years: One or more ints. Only the first and the last value are
            used, as the start and end year of the range.

    Returns:
        A ``pandas.DataFrame`` with one row per scraped day. Column names
        are taken from the header row of the last successfully parsed page,
        with ``' 星期'`` and ``'风速'`` inserted. When no page yields data
        (or the year range is empty) an empty DataFrame is returned.

    Raises:
        IndexError: If no year is passed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59'}
    res = []
    # Stays None when nothing could be parsed, so the final DataFrame
    # construction yields an empty frame instead of hitting an unbound name.
    columns = None
    for year in range(years[0], years[-1] + 1):
        print('正在获取%d年数据...' % (year))
        for month in range(1, 13):
            # Month pages are addressed as <year><two-digit month>.html.
            url = 'http://lishi.tianqi.com/%s/%d%02d.html' % (city, year, month)
            response = requests.get(url, headers=headers).text
            soup = BeautifulSoup(response, "html.parser")
            # Skip to the next month when the page has no weather data.
            table = soup.find(name='div', attrs={"class": "tian_three"})
            header = soup.find(name='div', attrs={"class": "flex thalin"})
            if table is None or header is None:
                continue
            ul = table.find_all('ul')
            if not ul:
                continue
            # Take every second child of the first <ul>, starting at index 1,
            # and drop the last of those (the original author's note on that
            # drop just says "days").
            # NOTE(review): presumably the skipped children are whitespace
            # text nodes and the dropped entry is a non-data trailer --
            # confirm against the live page.
            data = ul[0].contents[1::2][:-1]
            # The header row text provides the DataFrame column names.
            columns = header.get_text().split()
            columns.insert(1, ' 星期')
            # insert(-1, ...) places the new label before the last header label.
            # NOTE(review): verify this label order matches the field order
            # of the data rows.
            columns.insert(-1, '风速')
            res.extend(item.get_text().split() for item in data)
            # Random pause between successfully parsed pages.
            time.sleep(random.uniform(1, 2.5))
    # Return the collected rows as a pandas DataFrame.
    return pd.DataFrame(res, columns=columns)
# Record the start time so the total run duration can be reported at the end.
st = time.time()
# 'shanghai' is the city's pinyin name; 2019 is the start year and 2021 the end
# year, so this fetches Shanghai's weather data for 2019 through 2021.
df = get_weather_historic_data('shanghai', 2019, 2021)
# Save the result to a local Excel file.
df.to_excel(r'D:\历史天气数据.xlsx')
print('完成,用时', round(time.time() - st, 3), 's')
已根据新版网页的内容更新了数据解析格式,以适配当前页面结构。
部分引用自 这里【点击以转到】