Predicting Freight Transit Time with XGBoost - Part 1: Fetching Weather Alert Data

  • The model is built on data collected from the public internet.
  • Background 1: Transit time is affected by severe weather such as heavy fog, thunderstorms, and typhoons. For both air and sea freight, bad weather before or at departure can delay takeoff and therefore push back the arrival time.
  • Background 2: Transit time is also affected by holidays such as National Day, Chinese New Year, and ordinary weekends. A long holiday can add substantially to the total lead time: a mid-week pickup may have a short lead time to departure, a Friday or weekend pickup may not fly until Monday, and a pickup on the eve of National Day may wait up to seven days.
  • We therefore need a dataset that, for any pickup → departure window, tells us how many weekends/holidays and how many extreme-weather days it spans (a minimal sketch of that target feature follows below).
  • With that goal set, we can start collecting the data from the web.
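As a concrete statement of the target, here is a minimal sketch of the pickup → departure rest-day count, assuming both dates fall inside the years the chinese_calendar package covers (it only knows officially announced Chinese holidays and raises an error outside that range):

# Minimal sketch: count rest days between pickup (exclusive) and flight (inclusive).
# Assumes both dates are within chinese_calendar's supported range.
import datetime
import chinese_calendar

def count_non_workdays(pickup, flight):
    days = (pickup + datetime.timedelta(n) for n in range(1, (flight - pickup).days + 1))
    return sum(1 for d in days if not chinese_calendar.is_workday(d))

# A Friday pickup just before National Day spans the whole holiday week:
count_non_workdays(datetime.date(2022, 9, 30), datetime.date(2022, 10, 8))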
import prestodb
import pandas as pd
import numpy as np
import math
import pymysql
import pymssql
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings("ignore")
from IPython.display import display
from urllib import parse
import datetime
import xlwings as xw
import matplotlib.pyplot as plt
import chinese_calendar
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import requests
from scipy.stats import f_oneway
from bs4 import BeautifulSoup
import re
import time
import random  # used below to pick a random User-Agent
headers_lists = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) '
           'Chrome/65.0.3325.181 Safari/537.36',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
    'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/adr-1107051709; U; zh-cn) Presto/2.8.149 Version/11.10',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/5.0 (Android; Linux armv7l; rv:5.0) Gecko/ Firefox/5.0 fennec/5.0',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',)

Step 1: Get Climate Data

# Scrape historical daily weather (lishi.tianqi.com)
# e.g. url = 'https://lishi.tianqi.com/wujiang/202201.html'
headers = {
    'User-Agent': random.choice(headers_lists),
    # The original post pasted a long, stale third-party cookie here; these
    # history pages normally load without one. Add your own 'Cookie' entry
    # only if the site starts blocking requests.
}

def set_link(year):
    # year: the year to fetch; build one URL per month, zero-padded (e.g. 202201)
    link = []
    for i in range(1, 13):
        url = 'https://lishi.tianqi.com/shanghai/{}{:02d}.html'.format(year, i)
        link.append(url)
    return link
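For instance, set_link(2022) returns the twelve monthly pages:

print(set_link(2022)[0])    # https://lishi.tianqi.com/shanghai/202201.html
print(set_link(2022)[-1])   # https://lishi.tianqi.com/shanghai/202212.html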

def get_page(url, headers):
    # Return the decoded page text, or None on a non-200 response
    html = requests.get(url, headers=headers)
    if html.status_code == 200:
        html.encoding = html.apparent_encoding
        return html.text
    else:
        return None

# Accumulators shared across the monthly pages
date_box = []
max_temp = []
min_temp = []
weh = []
wind = []
week_box = []

def get_data(years):
    # Scrape one year of daily weather from the monthly pages
    link = set_link(years)
    for url in link:
        html = get_page(url, headers)
        bs = BeautifulSoup(html, 'html.parser')

        data = bs.find_all(class_='thrui')
        date = re.compile('class="th200">(.*?)</')
        tem = re.compile('class="th140">(.*?)</')
        days = re.findall(date, str(data))
        for item in days:
            # Each th200 cell reads 'YYYY-MM-DD 星期X': split into date and weekday
            week = item[10:]
            week_box.append(week)
            date_box.append(item[:10])
        temp = re.findall(tem, str(data))
        for i in range(len(days)):
            # Four th140 cells per day (max temp, min temp, weather, wind), so
            # stride by 4; this works however many days the month has
            max_temp.append(temp[i*4+0])
            min_temp.append(temp[i*4+1])
            weh.append(temp[i*4+2])
            wind.append(temp[i*4+3])
get_data(2022)
get_data(2023)
datas = pd.DataFrame({'日期':date_box,'星期':week_box,'最高温度':max_temp,'最低温度':min_temp,'天气':weh,'风向':wind})
df_climate = datas.drop_duplicates()
# Backup source: tianqi.2345.com keeps the same history (58362 = Shanghai station);
# reuse the headers defined above.

url='https://tianqi.2345.com/wea_history/58362.htm'
html = get_page(url,headers)
bs = BeautifulSoup(html,'html.parser')
list_final = []
# Each table row: date+weekday in one cell, then temps, weather, wind, AQI
for i in bs.find("table", class_="history-table").find_all('tr'):
    if len(i.find_all('td')) > 0:
        list_ = []
        for i_info in i.find_all('td'):
            if len(i_info.text.split(' ')) == 2:
                # '2022-01-01 周六' -> date plus weekday ('周' normalised to '星期')
                list_.append(i_info.text.split(' ')[0])
                list_.append(i_info.text.split(' ')[1].replace('周','星期'))
            else:
                list_.append(i_info.text)
        list_final.append(list_)
df_climate_backup = pd.DataFrame(list_final,columns=['日期','星期','最高温度','最低温度','天气','风向','空气质量指数','空气质量等级'])
df_climate_backup = df_climate_backup[['日期','星期','最高温度','最低温度','天气','风向']]
# Append any dates the primary source missed
for i_date in df_climate_backup['日期']:
    if i_date not in df_climate['日期'].tolist():
        df_climate = pd.concat([df_climate,df_climate_backup[df_climate_backup['日期']==i_date]])
df_climate = df_climate.reset_index().drop('index',axis=1)
df_climate['城市'] = '上海'
df_climate
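The temperature columns come back as strings; on lishi.tianqi.com they typically carry a ℃ suffix (an assumption about the page markup, so inspect a few rows first). A small cast makes them usable as numeric features later:

# Hypothetical cleanup, assuming values look like '26℃'; adjust if the markup differs
for col in ['最高温度', '最低温度']:
    df_climate[col] = (df_climate[col].astype(str)
                                      .str.extract(r'(-?\d+)')[0]
                                      .astype(float))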

Weather Alerts

# Scrape the national weather-alert archive (tianqi.com/alarmnews), reusing the
# imports and headers defined above.
def get_climate(x):
    # Page x of the alert archive; on failure wait and retry once without the
    # trailing slash
    url = 'https://www.tianqi.com/alarmnews_02/{0}/'.format(x)
    def get_page(url, headers):
        html = requests.get(url, headers=headers)
        if html.status_code == 200:
            html.encoding = html.apparent_encoding
            return html.text
        else:
            time.sleep(1)
            url = 'https://www.tianqi.com/alarmnews_02/{0}'.format(x)
            print('time sleep')
            html = requests.get(url, headers=headers)
            if html.status_code == 200:
                return html.text

    html = get_page(url, headers)
    bs = BeautifulSoup(html, 'html.parser')
    return bs
list_alarm = []
list_timestamp = []
for i_x in range(1, 50):
    try:
        bs = get_climate(i_x)
        for i_index in bs.find('ul', class_='leftlist').find_all('li'):
            if len(i_index.find('img').text) > 0:
                # Alert text sits in the <img> node: keep what follows '发布'
                alarm_ = i_index.find('img').text.split('发布')[1]
                list_alarm.append(alarm_)
                list_timestamp.append(i_index.find('span').text.split(' ')[0])
            if len(i_index.find('img').text) == 0:
                # Otherwise the <a> text carries both the alert and its timestamp
                time_str = i_index.find('span').text
                alarm_ = i_index.find('a').text.split('发布')[1].replace('\n','').replace(time_str,'')
                time_str = time_str.split(' ')[0]
                list_alarm.append(alarm_)
                list_timestamp.append(time_str)
    except Exception:
        # Some archive pages are missing or malformed; skip them
        pass
df_climate_alarm = pd.DataFrame([list_alarm,list_timestamp]).T.drop_duplicates()
df_climate_alarm.columns = ['Alarm_info','date']
df_climate_alarm = df_climate_alarm.groupby('date').agg({'Alarm_info':lambda x:x.str.cat(sep='/')}).reset_index()
df_climate_alarm
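Each Alarm_info entry is the text that followed '发布', so strings in the style of '雷电黄色预警' (hazard + colour level) are expected, though the exact wording is an assumption about tianqi.com's format. A hedged sketch for splitting hazard and severity out of the '/'-joined daily string:

def parse_alarm(text):
    # Extract (hazard, colour) pairs such as ('雷电', '黄色'); [] for missing values
    if pd.isna(text):
        return []
    return re.findall(r'([^/]+?)(蓝色|黄色|橙色|红色)预警', text)

parse_alarm('雷电黄色预警/大雾橙色预警')  # hypothetical input -> [('雷电', '黄色'), ('大雾', '橙色')]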
cur_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
df_climate_alarm['unic_version'] = cur_time
return_to_mysql(df_climate_alarm,'raw_climate_alarm_unic')
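return_to_mysql is the author's own persistence helper and its body is not shown in this post. One plausible implementation with the already-imported sqlalchemy/pymysql stack, assuming a local MySQL instance (host, user, password, and schema below are placeholders, not the author's):

def return_to_mysql(df, table_name):
    # Hypothetical connection string; every credential here is a placeholder
    engine = create_engine(
        'mysql+pymysql://user:{}@localhost:3306/analytics'.format(parse.quote_plus('p@ssword')))
    df.to_sql(table_name, engine, if_exists='append', index=False)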

Step 2: Get Workalendar Data

from datetime import date
from workalendar.asia import China

# Chinese business calendar from workalendar
cal = China()

# Date range to cover
start_date = date(2021, 10, 1)
end_date = date(2023, 12, 31)

# Build one row per day; collecting rows in a list avoids the deprecated df.append
rows = []
for dt in pd.date_range(start=start_date, end=end_date):
    dt = dt.date()
    is_work = cal.is_working_day(dt)
    is_holi = not is_work and cal.is_holiday(dt)
    holi_name = cal.get_holiday_label(dt) if is_holi else ""
    rows.append({"date": dt, "is_workday": is_work, "is_holiday": is_holi, "holiday_name": holi_name})
df = pd.DataFrame(rows, columns=["date", "is_workday", "is_holiday", "holiday_name"])
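As a quick sanity check, workalendar can also list a year's official holidays directly:

cal.holidays(2023)[:3]  # e.g. [(datetime.date(2023, 1, 1), 'New year'), ...]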
# Helper: convert an ISO date string to a weekday index
from datetime import datetime
def get_weekday(date_str):
    date_obj = datetime.strptime(str(date_str), "%Y-%m-%d")
    return date_obj.weekday()

# Apply get_weekday to every date to create a weekday column
df["weekday"] = df["date"].apply(get_weekday)

# Label each day as workday/holiday from the is_workday flag
df["weekday_cat"] = df["is_workday"].map(lambda x: 'workday' if x else 'holiday')
df_workday = df.drop_duplicates()

# Align dtypes before merging; re-import the datetime module because
# `from datetime import datetime` above shadowed the name
import datetime
df_climate['日期'] = pd.to_datetime(df_climate['日期'])
df_workday['date'] = pd.to_datetime(df_workday['date'])
df_climate_alarm['date'] = pd.to_datetime(df_climate_alarm['date'])

df_climate_workday = pd.merge(df_climate,df_workday,left_on='日期',right_on='date',how='left')
df_climate_workday  = pd.merge(df_climate_workday,df_climate_alarm.rename(columns={'date':'date_alarm'}),left_on='日期',right_on='date_alarm',how='left')

Shanghai lockdown: 2022-03-18 to 2022-04-27

# Tag the lockdown window directly via a boolean mask
mask_lockdown = (df_climate_workday['日期'] >= '2022-03-18') & (df_climate_workday['日期'] < '2022-04-27')
df_climate_workday.loc[mask_lockdown, 'weekday_cat'] = 'lockdown day'
def get_key_alarm(x):
    # Flag alerts that mention thunderstorms, fog, typhoons or rainstorms
    if pd.isna(x):
        return None
    if any(k in x for k in ('雷电', '雾', '台风', '暴雨')):
        return 'Abnormal climate'
    return 'Normal climate'
df_climate_workday['Alarm_info_cat'] = df_climate_workday['Alarm_info'].map(lambda x:get_key_alarm(x))
df_climate_workday = df_climate_workday.drop('date',axis=1)
df_climate_workday = df_climate_workday.rename(columns={'日期':'Date','星期':'Week','最高温度':'maximum_temperature','最低温度':'minimum_temperature','天气':'climate','风向':'wind_direction','城市':'city'})
df_climate_workday = df_climate_workday[['Date', 'Week', 'maximum_temperature', 'minimum_temperature', 'climate',
       'wind_direction', 'city', 'is_workday', 'is_holiday', 'holiday_name',
       'weekday', 'weekday_cat', 'date_alarm', 'Alarm_info','Alarm_info_cat', 'unic_version'
       ]]
# df_climate_workday['unic_version'] = cur_time
# return_to_mysql(df_climate_workday,'raw_climate_workday_unic')
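With the calendar and alert flags merged, the feature described in the background section (how many rest days and abnormal-climate days a pickup → departure window spans) can be read straight off this table. A sketch, assuming both inputs are timestamps inside the table's date range:

def count_risk_days(pickup, etd):
    # Days strictly after pickup and up to the (estimated) departure date
    window = df_climate_workday[(df_climate_workday['Date'] > pickup) &
                                (df_climate_workday['Date'] <= etd)]
    n_rest = (window['weekday_cat'] != 'workday').sum()
    n_abnormal = (window['Alarm_info_cat'] == 'Abnormal climate').sum()
    return n_rest, n_abnormal

count_risk_days(pd.Timestamp('2022-09-30'), pd.Timestamp('2022-10-08'))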

Step 3: Read Back the Stored Data

def Connect_MySQL_Get_data(sql):
    # ************** body omitted in the original post
    return data
df_climate_workday = Connect_MySQL_Get_data(sql="""
""")
df_climate_alarm = Connect_MySQL_Get_data(sql="""
""")

Step 4: Get History Replenishment Data

df_replen = Get_df_source('2022-01-21','2023-05-15')

Step 5: Get New Replenishment Data

df_replen_open = Get_df_source(s=""" 
    """)
