气温数据爬虫(历史+预报)

本文示例程序的功能主要是获取湖北省各个地市每日最高温、最低温、风速、风向等数据。包括历史气温数据以及未来40天预报气温数据。

历史气温数据地址:http://www.tianqihoubao.com/lishi/wuhan/month/202309.html

天气预报数据地址:http://www.weather.com.cn/weather40d/101200101.shtml

代码比较简单,就不详细注释了。

一、全省各地市历史气温数据爬取处理

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

# Slug used in tianqihoubao.com URLs -> display name of the Hubei city/prefecture.
cities = {
    'huangshi': '黄石市',
    'yichang': '宜昌市',
    'ezhou': '鄂州市',
    'jingmen': '荆门市',
    'suizhou': '随州市',
    'enshi': '恩施州',
    'jingzhou': '荆州市',
    'shennongjia': '神农架',
    'tianmen': '天门市',
    'qianjiang': '潜江市',
    'xiantao': '仙桃市',
    'wuhan': '武汉市',
    'xianning': '咸宁市',
    'huanggang': '黄冈市',
    'shiyan': '十堰市',
    'xiangyang': '襄阳市',
    'xiaogan': '孝感市',
}

# Year/month to crawl; changed by hand each quarter when appending
# incremental data to the cumulative table.
years = ['2023']
months = ['09']


# Accumulator: one row per city per day, filled by the crawl loop below.
_HISTORY_COLUMNS = ['city', 'date', 'tempture_low', 'tempture_high',
                    'weather_day', 'weather_night',
                    'winddirect_day', 'windpwr_day',
                    'winddirect_night', 'windpwr_night']
df = pd.DataFrame(columns=_HISTORY_COLUMNS)

# Crawl one month page per (city, year, month) and append the parsed rows to df.
for slug, city_name in cities.items():
    for year in years:
        for month in months:
            url = f'http://www.tianqihoubao.com/lishi/{slug}/month/{year}{month}.html'
            print('正在加载' + url)
            # NOTE(review): the header value literally contains the
            # "User-Agent:" prefix; the site appears to accept it, but a plain
            # "Mozilla/5.0" is the conventional value — confirm before changing.
            headers = {'User-Agent': 'User-Agent:Mozilla/5.0'}
            # Timeout added so a stalled connection cannot hang the crawl.
            response = requests.get(url, headers=headers, timeout=30)
            html = response.content

            soup = BeautifulSoup(html, features="html.parser")
            cells = soup.select('table.b td')

            # Normalise each <td> into a list of comma-free tokens.
            # (Original looped with enumerate but indexed headData[index]
            # instead of using the enumerated value.)
            parsed_cells = []
            for cell in cells:
                joined = ",".join(cell.text.split())
                cleaned = (joined.replace('\n', '')
                                 .replace('/', '')
                                 .replace(',,', ',')
                                 .replace('℃', ''))
                parsed_cells.append(cleaned.split(','))

            # Skip the 4 header cells, flatten the rest, and regroup into
            # 9-column rows: date, day/night weather, low/high temperature,
            # day/night wind direction and power.
            flat = [tok for cell_tokens in parsed_cells[4:] for tok in cell_tokens]
            rows = np.reshape(flat, (int(len(flat) / 9), 9))

            d = pd.DataFrame(rows, columns=['date', 'weather_day', 'weather_night',
                                            'tempture_low', 'tempture_high',
                                            'winddirect_day', 'windpwr_day',
                                            'winddirect_night', 'windpwr_night'])
            d['city'] = city_name

            df = pd.concat([df, d], ignore_index=True)


# Coerce the temperature columns to numbers (cells arrive as digit strings).
df['tempture_low'] = pd.to_numeric(df['tempture_low'])
df['tempture_high'] = pd.to_numeric(df['tempture_high'])
df = df.drop_duplicates()

# Save the incremental extract as a spreadsheet.
# BUG FIX: the path is now a raw string — the original non-raw literal relied
# on '\p'/'\气' not being recognised escape sequences, which emits
# SyntaxWarning on modern Python and is slated to become an error.
df.to_excel(r'D:\python代码\气温数据爬虫\湖北省各地区气温数据-增量.xlsx', index=False)

# Store into the local PostgreSQL database.
# NOTE(review): credentials are hard-coded; move them to env vars/config
# before sharing or deploying this script.
engine1 = create_engine('postgresql+psycopg2://postgres:root@localhost:5432/postgres')

try:
    df.to_sql('hb_qy_qw_zl', engine1, schema='sjsj', index=False, if_exists='replace')
except Exception as e:
    # Best-effort persistence: report the failure but keep the Excel output.
    print(e)
        

二、全省各地市预报气温数据爬取处理

import requests
import json
import pandas as pd
from sqlalchemy import create_engine


# Request headers sent to weather.com.cn — a Referer plus a desktop Chrome UA.
# (Presumably the CDN rejects requests without them — verify if it breaks.)
headers = {
    "Referer": "http://www.weather.com.cn/",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36",
}

# weather.com.cn area code -> display name of the Hubei city/prefecture.
cities = {
    '101200101': '武汉市',
    '101200201': '襄阳市',
    '101200301': '鄂州市',
    '101200401': '孝感市',
    '101200501': '黄冈市',
    '101200601': '黄石市',
    '101200701': '咸宁市',
    '101200801': '荆州市',
    '101200901': '宜昌市',
    '101201001': '恩施州',
    '101201101': '十堰市',
    '101201201': '神农架',
    '101201301': '随州市',
    '101201401': '荆门市',
    '101201501': '天门市',
    '101201601': '仙桃市',
    '101201701': '潜江市',
}

# Forecast months (YYYYMM) to fetch; changed by hand.
yms = ['202309', '202310']

# Accumulator: one row per city per forecast day.
sheet1 = pd.DataFrame(columns=['city', 'date', 'tempture_low', 'tempture_high'])

# Fetch the 40-day calendar payload for every (city, month) pair and append
# the per-day temperature rows to sheet1.
for code, city_name in cities.items():
    for ym in yms:
        # GENERALIZED: the year path segment was hard-coded to '2023'; it is
        # now derived from ym (identical output for the current yms values).
        url = f'http://d1.weather.com.cn/calendar_new/{ym[:4]}/{code}_{ym}.html'
        print(city_name, ym, url)
        rows = []
        # Timeout added so a stalled connection cannot hang the crawl.
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code == 200:
            content = response.content.decode('utf-8')
            # The payload is JSONP-like: the first 11 characters are a JS
            # assignment prefix, the rest is the JSON array of day records.
            weathers = json.loads(content[11:])
            for day in weathers:
                rows.append([day['date'], day['max'], day['min']])
        # On a non-200 response, rows stays empty and an empty frame is
        # appended — a deliberate best-effort skip, kept as-is.

        content_df = pd.DataFrame(rows, columns=['date', 'tempture_high', 'tempture_low'])
        content_df['city'] = city_name
        sheet1 = pd.concat([sheet1, content_df], ignore_index=True)


# Build the three report views and write them into one workbook.
# BUG FIX: ExcelWriter.save() was removed in pandas 2.0; the `with` block
# closes (and thereby saves) the workbook instead.
with pd.ExcelWriter(r'D:\python代码\气温数据爬虫\湖北省各地区天气预报数据.xlsx',
                    engine='xlsxwriter') as writer1:
    sheet2 = sheet1.copy()
    # Reformat 'YYYYMMDD' -> 'YYYY年MM月DD日'.
    sheet2['date'] = (sheet2['date'].str.slice(0, 4) + '年'
                      + sheet2['date'].str.slice(4, 6) + '月'
                      + sheet2['date'].str.slice(6, 8) + '日')
    sheet2['tempture_high'] = pd.to_numeric(sheet2['tempture_high'])
    sheet2['tempture_low'] = pd.to_numeric(sheet2['tempture_low'])
    sheet2.dropna(inplace=True)
    sheet2 = sheet2.drop_duplicates()
    # Written BEFORE row_num is added so the 40-day sheet keeps the
    # original four columns.
    sheet2.to_excel(writer1, sheet_name='未来40天预报', index=False)

    # Per-city day ordinal (1-based) used to cut the half-month / one-week views.
    sheet2['row_num'] = sheet2.groupby('city')['date'].rank(ascending=True)
    sheet2['row_num'] = sheet2['row_num'].values.astype(int)

    sheet3 = sheet2[['city', 'date', 'tempture_low', 'tempture_high']][sheet2['row_num'] < 17]
    sheet3.to_excel(writer1, sheet_name='未来半个月预报', index=False)

    sheet4 = sheet2[['city', 'date', 'tempture_low', 'tempture_high']][sheet2['row_num'] < 9]
    sheet4.to_excel(writer1, sheet_name='未来一周预报', index=False)


# Persist the three forecast views into the local PostgreSQL database,
# replacing any previous table contents.
engine1 = create_engine('postgresql+psycopg2://postgres:root@localhost:5432/postgres')

_TARGETS = (
    (sheet2, 'hb_qy_qw_yb_1m'),
    (sheet3, 'hb_qy_qw_yb_hm'),
    (sheet4, 'hb_qy_qw_yb_1w'),
)
try:
    for frame, table_name in _TARGETS:
        frame.to_sql(table_name, engine1, schema='sjsj', index=False, if_exists='replace')
except Exception as e:
    # Best-effort persistence: report and continue, same as the Excel path.
    print(e)

  • 1
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值