本文示例程序的功能主要是获取湖北省各个地市每日最高温、最低温、风速、风向等数据。包括历史气温数据以及未来40天预报气温数据。
历史气温数据地址:http://www.tianqihoubao.com/lishi/wuhan/month/202309.html
天气预报数据地址:http://www.weather.com.cn/weather40d/101200101.shtml
代码比较简单,就不详细注释了。
一、全省各地市历史气温数据爬取处理
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
# Mapping of tianqihoubao.com URL slugs to Chinese city/region display names
# (all prefecture-level divisions of Hubei province).
cities = dict(
    huangshi='黄石市',
    yichang='宜昌市',
    ezhou='鄂州市',
    jingmen='荆门市',
    suizhou='随州市',
    enshi='恩施州',
    jingzhou='荆州市',
    shennongjia='神农架',
    tianmen='天门市',
    qianjiang='潜江市',
    xiantao='仙桃市',
    wuhan='武汉市',
    xianning='咸宁市',
    huanggang='黄冈市',
    shiyan='十堰市',
    xiangyang='襄阳市',
    xiaogan='孝感市',
)
# Year/month to crawl. Updated by hand each quarter; the resulting increment
# is appended to the stock table.
years = ['2023']
months = ['09']
# Accumulator for every city's daily weather records.
history_columns = [
    'city', 'date', 'tempture_low', 'tempture_high',
    'weather_day', 'weather_night',
    'winddirect_day', 'windpwr_day',
    'winddirect_night', 'windpwr_night',
]
df = pd.DataFrame(columns=history_columns)
# Crawl one month of daily history per city and append to `df`.
# Fix: the original sent the header value 'User-Agent:Mozilla/5.0', i.e. the
# header *name* was duplicated inside the value. Hoisted out of the loop —
# it is invariant.
headers = {'User-Agent': 'Mozilla/5.0'}
for slug, city_name in cities.items():
    for year in years:
        for month in months:
            url = ('http://www.tianqihoubao.com/lishi/'
                   + slug + '/month/' + year + month + '.html')
            print('正在加载' + url)
            # timeout so a dead server cannot hang the crawl;
            # raise_for_status so HTTP errors fail loudly instead of
            # silently parsing an error page.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, features="html.parser")
            # One entry per <td> of the history table; each is cleaned into
            # a comma-free list of fields.
            cell_rows = []
            for td in soup.select('table.b td'):
                joined = ",".join(td.text.split())
                cleaned = (joined.replace('\n', '')
                                 .replace('/', '')
                                 .replace(',,', ',')
                                 .replace('℃', ''))
                cell_rows.append(cleaned.split(','))
            # The first 4 cells are the table header row — drop them, then
            # flatten. 9 fields per day: date, day/night weather, low/high
            # temperature, day/night wind direction and strength.
            flat = [val for sublist in cell_rows[4:] for val in sublist]
            day_rows = np.reshape(flat, (len(flat) // 9, 9))
            d = pd.DataFrame(
                day_rows,
                columns=['date', 'weather_day', 'weather_night',
                         'tempture_low', 'tempture_high',
                         'winddirect_day', 'windpwr_day',
                         'winddirect_night', 'windpwr_night'])
            d['city'] = city_name
            df = pd.concat([df, d], ignore_index=True)
# Temperatures arrive as strings from the HTML table; convert so downstream
# consumers can do numeric comparisons.
df['tempture_low'] = pd.to_numeric(df['tempture_low'])
df['tempture_high'] = pd.to_numeric(df['tempture_high'])
df = df.drop_duplicates()
# Save the increment as a spreadsheet. Fix: raw string — the original
# non-raw literal relied on '\p' / '\气' not being escape sequences, which
# emits a SyntaxWarning on modern Python.
df.to_excel(r'D:\python代码\气温数据爬虫\湖北省各地区气温数据-增量.xlsx', index=False)
# Store into the local PostgreSQL database (best-effort: a failure is
# logged, not fatal, so the spreadsheet above is still produced).
# NOTE(review): credentials are hard-coded; move to config/env before
# sharing this script.
engine1 = create_engine('postgresql+psycopg2://postgres:root@localhost:5432/postgres')
try:
    df.to_sql('hb_qy_qw_zl', engine1, schema='sjsj', index=False, if_exists='replace')
except Exception as e:
    print(e)
二、全省各地市预报气温数据爬取处理
import requests
import json
import pandas as pd
from sqlalchemy import create_engine
# Request headers for weather.com.cn; the Referer is required or the CDN
# rejects the request.
headers = {
    "Referer": "http://www.weather.com.cn/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
}
# weather.com.cn area codes -> Chinese city/region display names.
_city_pairs = [
    ('101200101', '武汉市'),
    ('101200201', '襄阳市'),
    ('101200301', '鄂州市'),
    ('101200401', '孝感市'),
    ('101200501', '黄冈市'),
    ('101200601', '黄石市'),
    ('101200701', '咸宁市'),
    ('101200801', '荆州市'),
    ('101200901', '宜昌市'),
    ('101201001', '恩施州'),
    ('101201101', '十堰市'),
    ('101201201', '神农架'),
    ('101201301', '随州市'),
    ('101201401', '荆门市'),
    ('101201501', '天门市'),
    ('101201601', '仙桃市'),
    ('101201701', '潜江市'),
]
cities = dict(_city_pairs)
# Forecast months to fetch; updated by hand.
yms = ['202309', '202310']
# Accumulator for every city's forecast rows.
sheet1 = pd.DataFrame(columns=['city', 'date', 'tempture_low', 'tempture_high'])
# Fetch the 40-day forecast JSONP blob per city/month and append to `sheet1`.
for code, city_name in cities.items():
    for ym in yms:
        url = 'http://d1.weather.com.cn/calendar_new/2023/' + code + '_' + ym + '.html'
        print(city_name, ym, url)
        rows = []
        # Fix: added a timeout so an unresponsive server cannot hang the
        # crawl forever. Non-200 responses are skipped, matching the
        # original best-effort behaviour (an empty frame is appended).
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 200:
            content = response.content.decode('utf-8')
            # The payload is JSONP-like: 'var fc40 = [...]'. The first
            # 11 characters are that assignment prefix; strip it before
            # parsing the JSON array.
            weathers = json.loads(content[11:])
            for day in weathers:
                rows.append([day['date'], day['max'], day['min']])
        content_df = pd.DataFrame(rows, columns=['date', 'tempture_high', 'tempture_low'])
        content_df['city'] = city_name
        sheet1 = pd.concat([sheet1, content_df], ignore_index=True)
# Clean the forecast data and export three horizons (40-day / 16-day /
# 8-day) to one workbook.
sheet2 = sheet1.copy()
# 'YYYYMMDD' -> 'YYYY年MM月DD日' for display.
sheet2['date'] = (sheet2['date'].str.slice(0, 4) + '年'
                  + sheet2['date'].str.slice(4, 6) + '月'
                  + sheet2['date'].str.slice(6, 8) + '日')
sheet2['tempture_high'] = pd.to_numeric(sheet2['tempture_high'])
sheet2['tempture_low'] = pd.to_numeric(sheet2['tempture_low'])
sheet2.dropna(inplace=True)
sheet2 = sheet2.drop_duplicates()
# Fix: ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0;
# a context manager closes/saves the workbook reliably even on error.
with pd.ExcelWriter(r'D:\python代码\气温数据爬虫\湖北省各地区天气预报数据.xlsx',
                    engine='xlsxwriter') as writer1:
    # Written before row_num is added so the 40-day sheet keeps its
    # original columns.
    sheet2.to_excel(writer1, sheet_name='未来40天预报', index=False)
    # Per-city chronological rank, used to cut the 16-day and 8-day windows.
    sheet2['row_num'] = sheet2.groupby('city')['date'].rank(ascending=True)
    sheet2['row_num'] = sheet2['row_num'].values.astype(int)
    sheet3 = sheet2[['city', 'date', 'tempture_low', 'tempture_high']][sheet2['row_num'] < 17]
    sheet3.to_excel(writer1, sheet_name='未来半个月预报', index=False)
    sheet4 = sheet2[['city', 'date', 'tempture_low', 'tempture_high']][sheet2['row_num'] < 9]
    sheet4.to_excel(writer1, sheet_name='未来一周预报', index=False)
# Persist the three forecast tables to the local PostgreSQL database
# (best-effort: any failure is logged and the script continues).
engine1 = create_engine('postgresql+psycopg2://postgres:root@localhost:5432/postgres')
try:
    for frame, table in ((sheet2, 'hb_qy_qw_yb_1m'),
                         (sheet3, 'hb_qy_qw_yb_hm'),
                         (sheet4, 'hb_qy_qw_yb_1w')):
        frame.to_sql(table, engine1, schema='sjsj', index=False, if_exists='replace')
except Exception as e:
    print(e)