Untitled555555555

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import os
import re
import gc
import warnings


warnings.filterwarnings('ignore')

plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
path = 'traina/'  
path1 = 'trainb/'

入住信息

cust_info = pd.read_csv(path + '网约房平台入住人表.csv')
cust_info1 = pd.read_csv(path1 + '网约房平台入住人表.csv')
cust = pd.concat([cust_info, cust_info1])
cust.head(2)
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIME
007C5BF73B18B44B0877DEED007F8771DNaN19800627222405NaNNaN
13525D57CAE104A078E4962B2B89377B0371099C301202107090001199510253705232.021071e+112.021071e+11
gc.collect()
216
cust['BDATE'] = cust['BDATE'].astype('str')
cust['XZQH'] = cust['XZQH'].astype('str')
cust['IN_TIME'] = cust['IN_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
cust['OUT_TIME'] = cust['OUT_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
cust.head(2)
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIME
007C5BF73B18B44B0877DEED007F8771DNaN198006272224050.00.0
13525D57CAE104A078E4962B2B89377B0371099C30120210709000119951025370523202107092103202107111158
cust.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44030 entries, 0 to 3343
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ORDER_PRIMARY_ID  44030 non-null  object
 1   GUEST_ID          5158 non-null   object
 2   BDATE             44030 non-null  object
 3   XZQH              44030 non-null  object
 4   IN_TIME           44030 non-null  object
 5   OUT_TIME          44030 non-null  object
dtypes: object(6)
memory usage: 2.4+ MB
cust.nunique()
ORDER_PRIMARY_ID    43883
GUEST_ID             3698
BDATE                9503
XZQH                 2659
IN_TIME              3396
OUT_TIME              236
dtype: int64
cust['ORDER_PRIMARY_ID'].value_counts()[5:10]
03158D5CA62E42339D697612EA347FB3    2
0A99D74F1E0248D68A729CB79FC640E6    2
1FE3E448558347D89C56B0AFCEC8ACFB    2
864E417B7BDE4C7B9449B7E087E9F21E    2
3DEFA75566934CA5929D0C47F0B95FDE    2
Name: ORDER_PRIMARY_ID, dtype: int64
cust.sort_values(['ORDER_PRIMARY_ID', 'GUEST_ID'], inplace=True)
stat = cust.groupby(['ORDER_PRIMARY_ID'])['BDATE'].count().reset_index()
stat.columns = ['ORDER_PRIMARY_ID', 'guest_sum']
stat['guest_sum_notnull'] = cust.groupby(['ORDER_PRIMARY_ID'])['GUEST_ID'].agg('count').values
cust = cust.merge(stat, on=['ORDER_PRIMARY_ID'], how='left')
cust.drop_duplicates('ORDER_PRIMARY_ID', keep='first', inplace=True)
cust[cust['ORDER_PRIMARY_ID']=='70798782D6C04A438360D80AFE4845C1']
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnull
2280170798782D6C04A438360D80AFE4845C1371099B389202107040001199903141309830.00.032
cust['IN_TIME'].max(), cust['IN_TIME'].min()
('202110041413', '0.0')

订单信息

# platform infomation
order_info = pd.read_csv(path + '网约平台旅客订单信息.csv') 
order1 = pd.read_csv(path1 + '网约平台旅客订单信息.csv')
order = pd.concat([order_info, order1])
order.head(2)
ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
092352196BBDB7CC049421C85826AE07020B1392783372020080112002020080212001.596120e+111NaN20200730224152202007302241523
1923696C72F20539AD1447D86CD1A8E5EAEC63A2829322020080414002020080612001.596121e+111NaN20200730225524202007302255243
order[order['ORDER_PRIMARY_ID']=='3CDEDB5E03534D379687645675898CA4']
ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
3097044220273CDEDB5E03534D379687645675898CA41708982021080813582021081011582.021071e+113NaN20210712210153202107122101533
order['PRE_IN_TIME'] = order['PRE_IN_TIME'].astype('str')
order['PRE_OUT_TIME'] = order['PRE_OUT_TIME'].astype('str')
# order['ORDER_TIME'] = order['ORDER_TIME'].astype('str')
order['INSERT_TIME'] = order['INSERT_TIME'].astype('str')
order['MODIFY_TIME'] = order['MODIFY_TIME'].astype('str')
order['CANCEL_TIME'] = order['CANCEL_TIME'].astype('str')
order['ORDER_TIME'] = order['ORDER_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])

##########################没有考虑包含取消时间的订单

order = order[(order['CANCEL_TIME']=='nan')]
order = order.sort_values(['ORDER_ID', 'MODIFY_TIME'])
order.drop_duplicates('ORDER_ID', keep='last', inplace=True)
order.shape
(29941, 11)
order[order['ORDER_ID']==4402846]
ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
301414402846BC0E8E3602434EA6A5F29A6F6FF422331001772021071113582021071211582021071110321nan20210711140042202107111400423
order.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 2 to 18590
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ORDER_ID          29941 non-null  int64 
 1   ORDER_PRIMARY_ID  29941 non-null  object
 2   HOTELID           29941 non-null  object
 3   PRE_IN_TIME       29941 non-null  object
 4   PRE_OUT_TIME      29941 non-null  object
 5   ORDER_TIME        29941 non-null  object
 6   STATUS            29941 non-null  int64 
 7   CANCEL_TIME       29941 non-null  object
 8   INSERT_TIME       29941 non-null  object
 9   MODIFY_TIME       29941 non-null  object
 10  FIRM              29941 non-null  int64 
dtypes: int64(3), object(8)
memory usage: 2.7+ MB
order.nunique()
ORDER_ID            29941
ORDER_PRIMARY_ID    29941
HOTELID              4801
PRE_IN_TIME          6875
PRE_OUT_TIME          729
ORDER_TIME          26818
STATUS                  3
CANCEL_TIME             1
INSERT_TIME         25432
MODIFY_TIME         25432
FIRM                    2
dtype: int64
order_info['FIRM'].value_counts()
3     32029
10     8521
Name: FIRM, dtype: int64
order['STATUS'].value_counts()
1    26137
2     3221
3      583
Name: STATUS, dtype: int64
#插入时间不等于修改时间,就是进行过修改过的订单,最后发现为空,等于没有修改过
order[order['INSERT_TIME']!=order['MODIFY_TIME']]
ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM

合并数据order和cust

# order时间包含了cust时间
df = pd.merge(order, cust, on='ORDER_PRIMARY_ID')

月份分为1、3、5、7、8、10、12:31天;2:28天;4、6、9、11:30天

df['IN_TIME'] = df['IN_TIME'].apply(lambda x: np.nan if x=='0.0' else x)
df['OUT_TIME'] = df['OUT_TIME'].apply(lambda x: np.nan if x=='0.0' else x)
df['IN_TIME'].fillna(df['PRE_IN_TIME'], inplace=True)
df['OUT_TIME'].fillna(df['PRE_OUT_TIME'], inplace=True)
df['OUT_TIME'] = df.apply(lambda x: x['OUT_TIME'] if x['OUT_TIME'] >= x['PRE_OUT_TIME'] else x['PRE_OUT_TIME'], axis=1)
df['IN_TIME'] = df.apply(lambda x: x['IN_TIME'] if x['IN_TIME'] >= x['PRE_IN_TIME'] else x['PRE_IN_TIME'], axis=1)
df.drop(['PRE_IN_TIME', 'PRE_OUT_TIME', 'CANCEL_TIME'], axis=1, inplace=True)

#######订单插入时间和订单时间的时间差

df.head(2)
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnull
0706648A4AACE06B518418C8A8CA1935DDD1C5A227185202007092241120200714171021202007141710213NaN2001040937100220200715130020200716120010
174864767D551AACFD049AE9CC2AB65E98706789483202007132109120201002101040202010021010403NaN1988102334120220201001140020201002101110
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 0 to 29940
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ORDER_ID           29941 non-null  int64 
 1   ORDER_PRIMARY_ID   29941 non-null  object
 2   HOTELID            29941 non-null  object
 3   ORDER_TIME         29941 non-null  object
 4   STATUS             29941 non-null  int64 
 5   INSERT_TIME        29941 non-null  object
 6   MODIFY_TIME        29941 non-null  object
 7   FIRM               29941 non-null  int64 
 8   GUEST_ID           3561 non-null   object
 9   BDATE              29941 non-null  object
 10  XZQH               29941 non-null  object
 11  IN_TIME            29941 non-null  object
 12  OUT_TIME           29941 non-null  object
 13  guest_sum          29941 non-null  int64 
 14  guest_sum_notnull  29941 non-null  int64 
dtypes: int64(5), object(10)
memory usage: 3.7+ MB
df.nunique();
df['in_time'] = pd.to_datetime(df['IN_TIME'], errors='coerce', format='%Y%m%d%H%M')
df['out_time'] = pd.to_datetime(df['OUT_TIME'], errors='coerce', format='%Y%m%d%H%M')
df = df[~df['out_time'].isnull()]
df.head(2)
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnullin_timeout_time
0706648A4AACE06B518418C8A8CA1935DDD1C5A227185202007092241120200714171021202007141710213NaN20010409371002202007151300202007161200102020-07-15 13:00:002020-07-16 12:00:00
174864767D551AACFD049AE9CC2AB65E98706789483202007132109120201002101040202010021010403NaN19881023341202202010011400202010021011102020-10-01 14:00:002020-10-02 10:11:00
# df['in_time_year'] = df['in_time'].dt.year.fillna(0).astype('int')                 #入住时间的年份
# df['out_time_year'] = df['out_time'].dt.year.fillna(0).astype('int')               #退房时间的年份
# df['in_time_month'] = df['in_time'].dt.month.fillna(0).astype('int')               #入住时间的月份
# df['out_time_month'] = df['out_time'].dt.month.fillna(0).astype('int')             #退房时间的月份
# df['in_time_day'] = df['in_time'].dt.day.fillna(0).astype('int')                   #入住时间的日期
# df['out_time_day'] = df['out_time'].dt.day.fillna(0).astype('int')                 #退房时间的日期
# df['in_time_hour'] = df['in_time'].dt.hour.fillna(0).astype('int')                 #入住时间的小时
# df['out_time_hour'] = df['out_time'].dt.hour.fillna(0).astype('int')               #退房时间的小时
#2月份没有
# df['day_gap'] = df['out_time_day'] - df['in_time_day'] + (df['out_time_month'] - df['in_time_month']).apply(lambda x: x*30 if x>0 else 0)
# df['day_gap'] = df.apply(lambda x: x['day_gap']+31 if x['in_time_month']==12 and x['out_time_month']==1 else x['day_gap'], axis=1)
# month_ = [1, 3, 5, 7, 8, 10]
# df['day_gap'] = df.apply(lambda x: x['day_gap']+1 if x['in_time_month'] in month_ and x['out_time_month']>x['in_time_month'] else x['day_gap'], axis=1)
# df['day_gap'].sum()

#后移一天
df[‘lock_time’][0] + pd.Timedelta(days=1)
#前移一天
df[‘lock_time’][0] + pd.Timedelta(days=-1)
#df[‘in_time’] - timedelta(days=1) #将day_gap列为0的数据全部往后推迟一天;;;;;;;还有钟点房

# df_0 = df[(df['day_gap']==0)]
# df_0['date'] = df_0['IN_TIME']
# 这里边包含了钟点房和当天入住并退房的。
dfs = []
for idx, group in df.groupby(['ORDER_PRIMARY_ID']):
    stat1 = pd.DataFrame()
    stat1['datetime'] = pd.date_range(start=group['IN_TIME'].values[0], end=group['OUT_TIME'].values[0], freq='D', normalize=False, closed=None)
    stat1['datetime'] = stat1['datetime'].dt.date.fillna(0).astype('str')
    stat1['ORDER_PRIMARY_ID'] = group['ORDER_PRIMARY_ID'].values[0]
    dfs.append(stat1)
gc.collect()
17
df_date = pd.concat(dfs).reset_index(drop=True)
df_date[:3]
datetimeORDER_PRIMARY_ID
02021-06-3000003FC18B254E86803C00F4BBA382E4
12021-07-0100003FC18B254E86803C00F4BBA382E4
22021-07-0200003FC18B254E86803C00F4BBA382E4
dfs1 = []
for idx, group in df.groupby(['HOTELID']):
    stat = pd.DataFrame()
    stat['datetime'] = pd.date_range(start='20200601', end='20210830', freq='D', normalize=False, closed=None)
    stat['datetime'] = stat['datetime'].astype('str')
    stat['HOTELID'] = group['HOTELID'].values[0]
    dfs1.append(stat)
df_date1 = pd.concat(dfs1).reset_index(drop=True)
df_date1[:10]
datetimeORDER_PRIMARY_ID
02020-06-0100003FC18B254E86803C00F4BBA382E4
12020-06-0200003FC18B254E86803C00F4BBA382E4
22020-06-0300003FC18B254E86803C00F4BBA382E4
32020-06-0400003FC18B254E86803C00F4BBA382E4
42020-06-0500003FC18B254E86803C00F4BBA382E4
52020-06-0600003FC18B254E86803C00F4BBA382E4
62020-06-0700003FC18B254E86803C00F4BBA382E4
72020-06-0800003FC18B254E86803C00F4BBA382E4
82020-06-0900003FC18B254E86803C00F4BBA382E4
92020-06-1000003FC18B254E86803C00F4BBA382E4
df11 = df.merge(df_date, on=['ORDER_PRIMARY_ID'], how='left')
_df1 = df_date1.merge(df11, on=['HOTELID', 'datetime'], how='left') 
# df['guest_sum'] = 0
# df['guest_sum_notnull'] = 0
ddf = _df1[_df1['ORDER_ID'].isnull()]
df11['in_date'] = df11['in_time'].dt.date
df11['out_date'] = df11['out_time'].dt.date
df11['datetime'] = pd.to_datetime(df11['datetime'], errors='coerce', format='%Y-%m-%d')
df11['date'] = df11['datetime'].dt.date
_df2 = df11[(df11['out_date']!=df11['date']) | ((df11['out_date']==df11['date']) & (df11['in_date']==df11['out_date']))]
_df2.drop(['in_time', 'out_time', 'in_date', 'out_date', 'datetime'], axis=1, inplace=True)
_df2.rename(columns={'date':'datetime'}, inplace=True)
#增加特征:民宿单日的总订单量
in_num = _df2.groupby(['HOTELID', 'datetime'])['ORDER_ID'].count().reset_index()
in_num.columns = ['HOTELID', 'datetime', 'in_hotel_num']
_df2 = _df2.merge(in_num, on=['HOTELID', 'datetime'], how='left')
_df2.columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'STATUS',
       'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH',
       'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull', 'datetime',
       'in_hotel_num'],
      dtype='object')
ddf.columns
Index(['datetime', 'HOTELID', 'ORDER_ID', 'ORDER_PRIMARY_ID', 'ORDER_TIME',
       'STATUS', 'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE',
       'XZQH', 'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull',
       'in_time', 'out_time'],
      dtype='object')
ddf.drop(['in_time', 'out_time'], axis=1, inplace=True)
ddf['in_hotel_num'] = 0
df1 = pd.concat([_df2, ddf])
df1 = df1.sort_values(['HOTELID', 'datetime'])
df1.head(2).append(df1.tail(2))
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnulldatetimein_hotel_num
0NaNNaN100177NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2020-06-010
1NaNNaN100177NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2020-06-020
2196800NaNNaNB109977684NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2021-08-290
2196801NaNNaNB109977684NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2021-08-300
df1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2186104 entries, 0 to 2196801
Data columns (total 17 columns):
 #   Column             Dtype  
---  ------             -----  
 0   ORDER_ID           float64
 1   ORDER_PRIMARY_ID   object 
 2   HOTELID            object 
 3   ORDER_TIME         object 
 4   STATUS             float64
 5   INSERT_TIME        object 
 6   MODIFY_TIME        object 
 7   FIRM               float64
 8   GUEST_ID           object 
 9   BDATE              object 
 10  XZQH               object 
 11  IN_TIME            object 
 12  OUT_TIME           object 
 13  guest_sum          float64
 14  guest_sum_notnull  float64
 15  datetime           object 
 16  in_hotel_num       int64  
dtypes: float64(5), int64(1), object(11)
memory usage: 300.2+ MB
test = pd.read_csv('testb/submit_example_2.csv')
test.head()
HOTELIDDATEROOM_EMPTY
03037602021-09-040.1
13037602021-09-050.1
23037602021-09-110.1
33037602021-09-120.1
43037602021-09-190.1
# #增加特征:民宿单日的总订单量
# in_num = df2.groupby(['HOTELID', 'datetime'])['ORDER_ID'].count().reset_index()
# in_num.columns = ['HOTELID', 'datetime', 'in_hotel_num']
# df2 = df2.merge(in_num, on=['HOTELID', 'datetime'], how='left')
test['type'] = 'test'
df1['type'] = 'train'
df1['DATE'] = df1['datetime'].astype('str')
df2 = df1[df1['DATE'] < '2021-09-01']
df2.shape
(2185510, 19)
df2['datetime'] = pd.to_datetime(df2['datetime'], errors='coerce', format='%Y-%m-%d')
df2['in_year'] = df2['datetime'].dt.year.fillna(0).astype('int')
df2['in_month'] = df2['datetime'].dt.month.fillna(0).astype('int')
df2['in_day'] = df2['datetime'].dt.day.fillna(0).astype('int')
df2['in_quarter'] = df2['datetime'].dt.quarter.fillna(0).astype('int')
df2['in_dayofweek'] = df2['datetime'].dt.dayofweek.fillna(0).astype('int')
df2['in_dayofyear'] = df2['datetime'].dt.dayofyear.fillna(0).astype('int')
df2['in_weekofyear'] = df2['datetime'].dt.weekofyear.fillna(0).astype('int')
df2['in_is_wknd'] = df2['datetime'].dt.dayofweek // 5                 #是否周末
df2.head(3)
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnulldatetimein_hotel_numtypeDATEin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearin_weekofyearin_is_wknd
0NaNNaN100177NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2020-06-010train2020-06-0120206120153230
1NaNNaN100177NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2020-06-020train2020-06-0220206221154230
2NaNNaN100177NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2020-06-030train2020-06-0320206322155230
#总数据量49547
# 周末数据量15755
use_month = [5, 6, 7, 8, 9]
df_1 = df2[(df2['in_is_wknd']==1) & (df2['in_month'].isin(use_month))]
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14', 
           '2021-09-19', '2021-09-20', '2021-09-21']
df_2 = df2[df2['DATE'].isin(holiday)]
# 节假日为1
df_2['holiday'] = 1
df_1['holiday'] = 0
df3 = pd.concat([df_2, df_1])
df3.shape
(373093, 28)
drop_col = df3.columns.tolist()
drop_col.remove('holiday')
df3.drop_duplicates(drop_col, keep='first', inplace=True)
df3.shape
(354019, 28)
test['datetime'] = pd.to_datetime(test['DATE'], errors='coerce', format='%Y-%m-%d')
test['in_year'] = test['datetime'].dt.year.fillna(0).astype('int')
test['in_month'] = test['datetime'].dt.month.fillna(0).astype('int')
test['in_day'] = test['datetime'].dt.day.fillna(0).astype('int')
test['in_quarter'] = test['datetime'].dt.quarter.fillna(0).astype('int')
test['in_dayofweek'] = test['datetime'].dt.dayofweek.fillna(0).astype('int')
test['in_dayofyear'] = test['datetime'].dt.dayofyear.fillna(0).astype('int')
test['in_weekofyear'] = test['datetime'].dt.weekofyear.fillna(0).astype('int')
test['in_is_wknd'] = test['datetime'].dt.dayofweek // 5                 #是否周末
test[:3]
HOTELIDDATEROOM_EMPTYtypedatetimein_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearin_weekofyearin_is_wknd
03037602021-09-040.1test2021-09-0420219435247351
13037602021-09-050.1test2021-09-0520219536248351
23037602021-09-110.1test2021-09-11202191135254361
test['holiday'] = 0
test['holiday'] = test.apply(lambda x: x['holiday']+1 if x['DATE'] in holiday else x['holiday'], axis=1)
del test['datetime'], df3['datetime'], test['ROOM_EMPTY']
df3.columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'STATUS',
       'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH',
       'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull', 'in_hotel_num',
       'type', 'DATE', 'in_year', 'in_month', 'in_day', 'in_quarter',
       'in_dayofweek', 'in_dayofyear', 'in_weekofyear', 'in_is_wknd',
       'holiday'],
      dtype='object')
df3.drop(['IN_TIME', 'OUT_TIME'], axis=1, inplace=True)
df4 = df3.merge(test, on=['HOTELID', 'DATE', 'type', 'in_year', 'in_month', 'in_day', 'in_quarter', 'in_dayofweek', 
                          'in_dayofyear', 'in_weekofyear', 'in_is_wknd', 'holiday'], how='outer')
df4.shape
(357330, 25)
dd = df4[df4['DATE']>'2021-09-01']
dd['type'].value_counts()
test    3311
Name: type, dtype: int64
df4.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 357330 entries, 0 to 357329
Data columns (total 25 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ORDER_ID           12543 non-null   float64
 1   ORDER_PRIMARY_ID   12543 non-null   object 
 2   HOTELID            357330 non-null  object 
 3   ORDER_TIME         12543 non-null   object 
 4   STATUS             12543 non-null   float64
 5   INSERT_TIME        12543 non-null   object 
 6   MODIFY_TIME        12543 non-null   object 
 7   FIRM               12543 non-null   float64
 8   GUEST_ID           1877 non-null    object 
 9   BDATE              12543 non-null   object 
 10  XZQH               12543 non-null   object 
 11  guest_sum          12543 non-null   float64
 12  guest_sum_notnull  12543 non-null   float64
 13  in_hotel_num       354019 non-null  float64
 14  type               357330 non-null  object 
 15  DATE               357330 non-null  object 
 16  in_year            357330 non-null  int64  
 17  in_month           357330 non-null  int64  
 18  in_day             357330 non-null  int64  
 19  in_quarter         357330 non-null  int64  
 20  in_dayofweek       357330 non-null  int64  
 21  in_dayofyear       357330 non-null  int64  
 22  in_weekofyear      357330 non-null  int64  
 23  in_is_wknd         357330 non-null  int64  
 24  holiday            357330 non-null  int64  
dtypes: float64(6), int64(9), object(10)
memory usage: 70.9+ MB

room信息

room_info = pd.read_csv(path + '网约房注册民宿.csv')
room_info.head(1)
CODEHOTELIDJYMJROOM_NUMBED_NUMFWLYCZLYCALLEDCITY_CODEBUR_CODESTA_CODESSXADDRESSMPHMJYQKFIRMDJSJBGSJSTATUSAUDITSTATUS
010017737108300020.01121【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽371000000000NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM001【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽32020-05-18 10:33:552020-07-15 10:23:58NaNNaN
room_info['STATUS'].value_counts()
1.0    2658
Name: STATUS, dtype: int64
room_info.drop(['HOTELID', 'FWLY', 'CITY_CODE', 'STATUS','AUDITSTATUS', 'JYQK'], axis=1, inplace=True)
room_info.rename(columns={'CODE':'HOTELID'}, inplace= True)
room_info['FIRM'].value_counts()
3     3429
10    1878
Name: FIRM, dtype: int64
room_info.nunique()
HOTELID     5307
JYMJ         173
ROOM_NUM      11
BED_NUM       17
CZLY           4
CALLED      4232
BUR_CODE       7
STA_CODE      54
SSX            5
ADDRESS     4850
MPHM         584
FIRM           2
DJSJ        3753
BGSJ        3294
dtype: int64

登记时间和变更时间无变化的只有两家

#经营面积为0的修改为空值
room_info['JYMJ'] = room_info['JYMJ'].apply(lambda x: np.nan if x==0 else x)
#登记时间和变更时间的差值
room_use_col = ['HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'FIRM', 'STATUS']
freq_col = ['CALLED', 'ADDRESS', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY', 'BUR_CODE', 'STA_CODE', 'SSX', 'MPHM', 'FIRM']
for col in freq_col:
    st = room_info[col].value_counts().reset_index()
    st.columns = [col, col+'_freq']
    room_info = room_info.merge(st, on=col, how='left')
#房间面积率和床位率\单间房床位量
room_info['room_ratio'] = room_info['JYMJ'] / room_info['ROOM_NUM']
room_info['bed_ratio'] = room_info['JYMJ'] / room_info['BED_NUM']
room_info['room_bed'] = room_info['BED_NUM'] / room_info['ROOM_NUM']
room_info['DJSJ'] = pd.to_datetime(room_info['DJSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['BGSJ'] = pd.to_datetime(room_info['BGSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['DJ_date'] = room_info['DJSJ'].dt.date
room_info['BG_date'] = room_info['BGSJ'].dt.date
#等级时间和变更时间的时间差
room_info['DJ_gap'] = (room_info['BG_date'] - room_info['DJ_date']).dt.days
room_info['DJSJ'].min(), room_info['DJSJ'].max(), room_info['BGSJ'].min(), room_info['BGSJ'].max()
(Timestamp('2020-05-12 13:16:24'),
 Timestamp('2021-10-19 16:05:41'),
 Timestamp('2020-07-15 10:23:06'),
 Timestamp('2021-10-19 16:09:15'))
# room['BG_dayofweek'].value_counts()
plt.figure(figsize=(20, 8))
sns.countplot(room_info['JYMJ'])
<AxesSubplot:xlabel='JYMJ', ylabel='count'>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-xMlhlyik-1646247028436)(output_119_1.png)]

经营面积缺失超过一半,3000多,考虑删除

合并room到df中

del room_info['FIRM']
df = df4.merge(room_info, on=['HOTELID'], how='left')
df = df.sort_values(by=['HOTELID', 'DATE'])
gc.collect()
170
#增加民宿单日的总订单量
# in_num = df3.groupby(['HOTELID', 'date'])['ORDER_ID'].count().reset_index()
# in_num.columns = ['HOTELID', 'date', 'in_hotel_num']
# df4 = df3.merge(in_num, on=['HOTELID', 'date'], how='left')
df.head(2)
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHguest_sumguest_sum_notnullin_hotel_numtypeDATEin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearin_weekofyearin_is_wkndholidayJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMDJSJBGSJCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gap
37946NaNNaN100177NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.0train2020-06-06202066251582310NaN111【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM0012020-05-18 10:33:552020-07-15 10:23:5811NaN309115024727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.0
37947NaNNaN100177NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.0train2020-06-07202067261592310NaN111【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM0012020-05-18 10:33:552020-07-15 10:23:5811NaN309115024727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.0
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 357330 entries, 37946 to 354018
Data columns (total 54 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   ORDER_ID           12543 non-null   float64       
 1   ORDER_PRIMARY_ID   12543 non-null   object        
 2   HOTELID            357330 non-null  object        
 3   ORDER_TIME         12543 non-null   object        
 4   STATUS             12543 non-null   float64       
 5   INSERT_TIME        12543 non-null   object        
 6   MODIFY_TIME        12543 non-null   object        
 7   FIRM               12543 non-null   float64       
 8   GUEST_ID           1877 non-null    object        
 9   BDATE              12543 non-null   object        
 10  XZQH               12543 non-null   object        
 11  guest_sum          12543 non-null   float64       
 12  guest_sum_notnull  12543 non-null   float64       
 13  in_hotel_num       354019 non-null  float64       
 14  type               357330 non-null  object        
 15  DATE               357330 non-null  object        
 16  in_year            357330 non-null  int64         
 17  in_month           357330 non-null  int64         
 18  in_day             357330 non-null  int64         
 19  in_quarter         357330 non-null  int64         
 20  in_dayofweek       357330 non-null  int64         
 21  in_dayofyear       357330 non-null  int64         
 22  in_weekofyear      357330 non-null  int64         
 23  in_is_wknd         357330 non-null  int64         
 24  holiday            357330 non-null  int64         
 25  JYMJ               141308 non-null  float64       
 26  ROOM_NUM           357330 non-null  int64         
 27  BED_NUM            357330 non-null  int64         
 28  CZLY               357330 non-null  int64         
 29  CALLED             357330 non-null  object        
 30  BUR_CODE           167625 non-null  float64       
 31  STA_CODE           167625 non-null  object        
 32  SSX                357330 non-null  int64         
 33  ADDRESS            357330 non-null  object        
 34  MPHM               357330 non-null  object        
 35  DJSJ               356948 non-null  datetime64[ns]
 36  BGSJ               357330 non-null  datetime64[ns]
 37  CALLED_freq        357330 non-null  int64         
 38  ADDRESS_freq       357330 non-null  int64         
 39  JYMJ_freq          141308 non-null  float64       
 40  ROOM_NUM_freq      357330 non-null  int64         
 41  BED_NUM_freq       357330 non-null  int64         
 42  CZLY_freq          357330 non-null  int64         
 43  BUR_CODE_freq      167625 non-null  float64       
 44  STA_CODE_freq      167625 non-null  float64       
 45  SSX_freq           357330 non-null  int64         
 46  MPHM_freq          357330 non-null  int64         
 47  FIRM_freq          357330 non-null  int64         
 48  room_ratio         141308 non-null  float64       
 49  bed_ratio          141308 non-null  float64       
 50  room_bed           357330 non-null  float64       
 51  DJ_date            356948 non-null  object        
 52  BG_date            357330 non-null  object        
 53  DJ_gap             356948 non-null  float64       
dtypes: datetime64[ns](2), float64(15), int64(21), object(16)
memory usage: 149.9+ MB
missing = df.isnull().sum()
missing = missing[missing>180000]
missing_col = missing.index.tolist()
df.drop(columns=missing_col, inplace=True)
df['DATE'].min(), df['DATE'].max()
('2020-06-06', '2021-09-21')
df.drop(columns=['CALLED', 'ADDRESS'], inplace=True)
df[:5]
HOTELIDin_hotel_numtypeDATEin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearin_weekofyearin_is_wkndholidayROOM_NUMBED_NUMCZLYSSXMPHMDJSJBGSJCALLED_freqADDRESS_freqROOM_NUM_freqBED_NUM_freqCZLY_freqSSX_freqMPHM_freqFIRM_freqroom_bedDJ_dateBG_dateDJ_gap
379461001770.0train2020-06-06202066251582310111371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811309115024727150256534291.02020-05-182020-07-1558.0
379471001770.0train2020-06-07202067261592310111371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811309115024727150256534291.02020-05-182020-07-1558.0
379481001770.0train2020-06-132020613251652410111371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811309115024727150256534291.02020-05-182020-07-1558.0
379491001770.0train2020-06-142020614261662410111371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811309115024727150256534291.02020-05-182020-07-1558.0
379501001770.0train2020-06-202020620251722510111371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811309115024727150256534291.02020-05-182020-07-1558.0
def qty_shift(df, val):
    #昨天,上周,上个月, 去年
    df['last_1_qty'] = df.groupby('HOTELID')[val].shift(1).fillna(method='ffill').reset_index().sort_index().set_index('index')
    df['last_2_qty'] = df.groupby('HOTELID')[val].shift(2).fillna(method='ffill').reset_index().sort_index().set_index('index')
    df['last_3_qty'] = df.groupby('HOTELID')[val].shift(3).fillna(method='ffill').reset_index().sort_index().set_index('index')
    df['last_4_qty'] = df.groupby('HOTELID')[val].shift(4).fillna(method='ffill').reset_index().sort_index().set_index('index')
    df['last_5_qty'] = df.groupby('HOTELID')[val].shift(5).fillna(method='ffill').reset_index().sort_index().set_index('index')
    df['last_6_qty'] = df.groupby('HOTELID')[val].shift(6).fillna(method='ffill').reset_index().sort_index().set_index('index')
    df['last_7_qty'] = df.groupby('HOTELID')[val].shift(7).fillna(method='ffill').reset_index().sort_index().set_index('index')
    df['last_8_qty'] = df.groupby('HOTELID')[val].shift(8).fillna(method='ffill').reset_index().sort_index().set_index('index')
    df['last_10_qty'] = df.groupby('HOTELID')[val].shift(10).fillna(method='ffill').reset_index().sort_index().set_index('index')
    df['last_12_qty'] = df.groupby('HOTELID')[val].shift(12).fillna(method='ffill').reset_index().sort_index().set_index('index')
    df['last_15_qty'] = df.groupby('HOTELID')[val].shift(15).fillna(method='ffill').reset_index().sort_index().set_index('index')
    df['last_20_qty'] = df.groupby('HOTELID')[val].shift(20).fillna(method='ffill').reset_index().sort_index().set_index('index')
    return df
vals = ['in_hotel_num', 'in_year', 'in_month', 'in_dayofyear', 'in_quarter']           
for val in vals:
    print(val)
    df = qty_shift(df, val)
in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
# #昨天,上周,上个月, 去年
# df['yesterday_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(1).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_2_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(2).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_3_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(3).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_4_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(4).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_52_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(52).fillna(method='ffill').reset_index().sort_index().set_index('index')
def qty_rolling(df, window, val, keys):
    df['qty_rolling'+str(window)+'_mean'] = df.groupby(keys)[val].transform(  
              lambda x: x.shift(1).rolling(window=window, min_periods=3, win_type="triang").mean()).values.tolist()
    df['qty_rolling'+str(window)+'_max'] = df.groupby(keys)[val].transform(  
              lambda x: x.shift(1).rolling(window=window, min_periods=3).max()).values.tolist()
    df['qty_rolling'+str(window)+'_sum'] = df.groupby(keys)[val].transform(  
              lambda x: x.shift(1).rolling(window=window, min_periods=3).sum()).values.tolist()
#     df['qty_rolling'+str(window)+'_std'] = df.groupby(keys)[val].transform(  
#               lambda x: x.shift(1).rolling(window=window, min_periods=3, win_type="triang").std()).values.tolist()
#     df['qty_rolling'+str(window)+'_skew'] = df.groupby(keys)[val].transform(  
#               lambda x: x.shift(1).rolling(window=window, min_periods=3).skew()).values.tolist()
#     df['qty_rolling'+str(window)+'_kurt'] = df.groupby(keys)[val].transform(  
#               lambda x: x.shift(1).rolling(window=window, min_periods=3).kurt()).values.tolist()
    #df['qty_rolling'+str(window)+'_quantile'] = df.groupby(keys)[val].transform(  
              #lambda x: x.rolling(window=window, min_periods=3).quantile()).values.tolist()
#     df['qty_rolling'+str(window)+'_corr'] = df.groupby(keys)[val].transform(  
#               lambda x: x.shift(1).rolling(window=window, min_periods=3).corr()).values.tolist()
    return df

gc.collect()
34
# 滚动7天和14天
keys = 'HOTELID'
for val in vals:
    print(val)
    df = qty_rolling(df, 3, val, keys)
    df = qty_rolling(df, 4, val, keys)
    df = qty_rolling(df, 6, val, keys)
# keys = ['HOTELID']
# df = qty_rolling(df, 2, 'in_hotel_num', keys)

in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
def qty_ewm(df, alpha, val, keys):
    df['qty_ewm'+'_mean'] = df.groupby(keys)[val].transform(lambda x: x.shift(1).ewm(alpha=alpha).mean()).values.tolist()
    df['qty_ewm'+'_std'] = df.groupby(keys)[val].transform(lambda x: x.shift(1).ewm(alpha=alpha).std()).values.tolist()
    df['qty_ewm'+'_corr'] = df.groupby(keys)[val].transform(lambda x: x.shift(1).ewm(alpha=alpha).corr()).values.tolist()
    return df
for val in vals:
    print(val)
    df = qty_ewm(df, 0.95, val, keys)
in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
df.select_dtypes(include='object').columns
Index(['HOTELID', 'type', 'DATE', 'MPHM', 'DJ_date', 'BG_date'], dtype='object')
df['in_hotel_num'] = df['in_hotel_num'].apply(lambda x: 1 if x==0 else 0)
df[:5]
HOTELIDin_hotel_numtypeDATEin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearin_weekofyearin_is_wkndholidayROOM_NUMBED_NUMCZLYSSXMPHMDJSJBGSJCALLED_freqADDRESS_freqROOM_NUM_freqBED_NUM_freqCZLY_freqSSX_freqMPHM_freqFIRM_freqroom_bedDJ_dateBG_dateDJ_gaplast_1_qtylast_2_qtylast_3_qtylast_4_qtylast_5_qtylast_6_qtylast_7_qtylast_8_qtylast_10_qtylast_12_qtylast_15_qtylast_20_qtyqty_rolling3_meanqty_rolling3_maxqty_rolling3_sumqty_rolling4_meanqty_rolling4_maxqty_rolling4_sumqty_rolling6_meanqty_rolling6_maxqty_rolling6_sumqty_ewm_meanqty_ewm_stdqty_ewm_corr
379461001771train2020-06-06202066251582310111371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811309115024727150256534291.02020-05-182020-07-1558.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
379471001771train2020-06-07202067261592310111371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811309115024727150256534291.02020-05-182020-07-1558.02.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2.0NaNNaN
379481001771train2020-06-132020613251652410111371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811309115024727150256534291.02020-05-182020-07-1558.02.02.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2.00.0NaN
379491001771train2020-06-142020614261662410111371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811309115024727150256534291.02020-05-182020-07-1558.02.02.02.0NaNNaNNaNNaNNaNNaNNaNNaNNaN2.02.06.02.02.06.02.02.06.02.00.0NaN
379501001771train2020-06-202020620251722510111371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811309115024727150256534291.02020-05-182020-07-1558.02.02.02.02.0NaNNaNNaNNaNNaNNaNNaNNaN2.02.06.02.02.08.02.02.08.02.00.0NaN
df = df.fillna(0)
col_list = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
col_list.remove('in_hotel_num')
col_list
['in_year',
 'in_month',
 'in_day',
 'in_quarter',
 'in_dayofweek',
 'in_dayofyear',
 'in_weekofyear',
 'in_is_wknd',
 'holiday',
 'ROOM_NUM',
 'BED_NUM',
 'CZLY',
 'SSX',
 'CALLED_freq',
 'ADDRESS_freq',
 'ROOM_NUM_freq',
 'BED_NUM_freq',
 'CZLY_freq',
 'SSX_freq',
 'MPHM_freq',
 'FIRM_freq',
 'room_bed',
 'DJ_gap',
 'last_1_qty',
 'last_2_qty',
 'last_3_qty',
 'last_4_qty',
 'last_5_qty',
 'last_6_qty',
 'last_7_qty',
 'last_8_qty',
 'last_10_qty',
 'last_12_qty',
 'last_15_qty',
 'last_20_qty',
 'qty_rolling3_mean',
 'qty_rolling3_max',
 'qty_rolling3_sum',
 'qty_rolling4_mean',
 'qty_rolling4_max',
 'qty_rolling4_sum',
 'qty_rolling6_mean',
 'qty_rolling6_max',
 'qty_rolling6_sum',
 'qty_ewm_mean',
 'qty_ewm_std',
 'qty_ewm_corr']
gc.collect()
102
used_features = col_list
cate_cols = ['HOTELID', 'MPHM']
X_train = df[df["DATE"] < '2021-08-02'][used_features].reset_index(drop=True)
y_train = df[df["DATE"] < '2021-08-02']["in_hotel_num"]
X_valid = df[(df["DATE"] > '2021-08-02') & (df["DATE"] < '2021-09-01')][used_features].reset_index(drop=True)
y_valid = df[(df["DATE"] > '2021-08-02') & (df["DATE"] < '2021-09-01')]["in_hotel_num"]
X_test = df[df["type"] == 'test'][used_features].reset_index(drop=True)

clf_1 = LGBMClassifier(num_leaves = 256,
                     n_estimators = 20000,
                     learning_rate = 0.005,
                     verbose = -1,
                     max_bin = 100,
                     max_depth = 10,
                     feature_fraction_seed = 66,
                     feature_fraction = 0.7,
                     bagging_seed = 66,
                     bagging_freq = 1,
                     bagging_fraction = 0.95,
                     metric = 'auc', # MultiAuc_score
                     lambda_l1 = 0.1,
                     lambda_l2 = 0.1, 
                     min_child_weight = 30,
                     n_jobs=80)

clf_1.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100,verbose=200) #, categorical_feature = object_list

gc.collect()
[LightGBM] [Warning] feature_fraction is set=0.7, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7
[LightGBM] [Warning] lambda_l1 is set=0.1, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.1
[LightGBM] [Warning] bagging_fraction is set=0.95, subsample=1.0 will be ignored. Current value: bagging_fraction=0.95
[LightGBM] [Warning] lambda_l2 is set=0.1, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.1
[LightGBM] [Warning] bagging_freq is set=1, subsample_freq=0 will be ignored. Current value: bagging_freq=1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[10]	valid_0's auc: 0.541449





357
oof_prob = clf_1.predict_proba(X_valid[used_features])[:, 1]  
oof_prob.min(), oof_prob.max()
(0.9610922808728843, 0.9625584615170373)
oof_prob1 = clf_1.predict_proba(X_test[used_features])[:, 1]  
oof_prob1.min(), oof_prob1.max()
(0.960959321386754, 0.9624418595928675)
X_test[used_features][:10]
STATUSFIRMguest_sumguest_sum_notnullin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearin_weekofyearin_is_wkndholidayJYMJROOM_NUMBED_NUMCZLYBUR_CODESSXCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_gaplast_1_qtylast_2_qtylast_3_qtylast_4_qtylast_5_qtylast_6_qtylast_7_qtylast_8_qtylast_10_qtylast_12_qtylast_15_qtylast_20_qtyqty_rolling3_meanqty_rolling3_maxqty_rolling3_sumqty_rolling4_meanqty_rolling4_maxqty_rolling4_sumqty_rolling6_meanqty_rolling6_maxqty_rolling6_sumqty_ewm_meanqty_ewm_stdqty_ewm_corr
0NaNNaNNaNNaN202194352473510NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN3.03.03.03.03.03.03.03.03.03.03.02.03.03.09.03.03.012.03.03.018.03.06.328848e-121.0
1NaNNaNNaNNaN202195362483510NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN3.03.03.03.03.03.03.03.03.03.03.02.03.03.09.03.03.012.03.03.018.03.01.415174e-121.0
2NaNNaNNaNNaN2021911352543610NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN3.03.03.03.03.03.03.03.03.03.03.03.03.03.09.03.03.012.03.03.018.03.03.164424e-131.0
3NaNNaNNaNNaN2021912362553610NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN3.03.03.03.03.03.03.03.03.03.03.03.03.03.09.03.03.012.03.03.018.03.07.075868e-141.0
4NaNNaNNaNNaN2021919362623711NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN3.03.03.03.03.03.03.03.03.03.03.03.03.03.09.03.03.012.03.03.018.03.01.582212e-141.0
5NaNNaNNaNNaN2021920302633801NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN3.03.03.03.03.03.03.03.03.03.03.03.03.03.09.03.03.012.03.03.018.03.03.537934e-151.0
6NaNNaNNaNNaN2021921312643801NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN3.03.03.03.03.03.03.03.03.03.03.03.03.03.09.03.03.012.03.03.018.03.07.911060e-161.0
7NaNNaNNaNNaN202194352473510NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN3.03.03.03.03.03.03.03.03.03.03.02.03.03.09.03.03.012.03.03.018.03.06.328848e-121.0
8NaNNaNNaNNaN202195362483510NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN3.03.03.03.03.03.03.03.03.03.03.02.03.03.09.03.03.012.03.03.018.03.01.415174e-121.0
9NaNNaNNaNNaN2021911352543610NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN3.03.03.03.03.03.03.03.03.03.03.03.03.03.09.03.03.012.03.03.018.03.03.164424e-131.0
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
from lightgbm import LGBMClassifier
import gc
cate_cols = []
drop_col = ['in_hotel_num', 'type']

train = df[df['type'] == 'train']
labels = np.array(train['in_hotel_num'].values.tolist())
train.drop(drop_col, axis=1, inplace=True)
train = train[used_features] 
test = df[df['type'] == 'test']
test_label = test['in_hotel_num'].values.tolist()
test.drop(drop_col, axis=1, inplace=True)
test = test[used_features]

# used_features = importance_fea
ts_folds = TimeSeriesSplit(n_splits = 5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'

params = {
#     'objective': 'binary',
    'n_estimators': 20000,
    'boosting': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': {'auc'},
    'verbose': -1
}

for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    if fold_n in [0, 1, 2, 3]:  
        continue  
  
    print('Training with validation') 
    trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
                          categorical_feature=cate_cols)  
    val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
                          categorical_feature=cate_cols)  
    clf = LGBMClassifier.fit(params, trn_data, valid_sets=[trn_data, val_data], verbose_eval=Verbose,  
    early_stopping_rounds=Early_Stopping_Rounds)
    val = clf.predict(train.iloc[valid_index])   
    mae_ = mean_absolute_error(labels[valid_index], val)  
  
    print('MAE: {}'.format(mae_))  
  
    print("ReTraining on all data")  
    gc.enable()  
    del trn_data, val_data  
    gc.collect()  
    Best_iteration = clf.best_iteration  
    print("Best_iteration: ", Best_iteration)  
    trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)  
#     clf = LGBMClassifier.fit(params, trn_data, num_boost_round=int(Best_iteration * 1.2))
  #valid_sets=[trn_data], verbose_eval=Verbose)  
  #pred = clf.predict(test[used_features])

Training with validation



---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

<ipython-input-535-a56605d14a52> in <module>
     49     val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
     50                           categorical_feature=cate_cols)  
---> 51     clf = LGBMClassifier.fit(params, train.iloc[train_index], df[df['type'] == 'train'].iloc[train_index]['in_hotel_num'], eval_set=[trn_data, val_data], verbose=Verbose, early_stopping_rounds=Early_Stopping_Rounds)
     52     val = clf.predict(train.iloc[valid_index])
     53     mae_ = mean_absolute_error(labels[valid_index], val)


F:\anaconda\lib\site-packages\lightgbm\sklearn.py in fit(self, X, y, sample_weight, init_score, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks, init_model)
    783         _LGBMAssertAllFinite(y)
    784         _LGBMCheckClassificationTargets(y)
--> 785         self._le = _LGBMLabelEncoder().fit(y)
    786         _y = self._le.transform(y)
    787         self._class_map = dict(zip_(self._le.classes_, self._le.transform(self._le.classes_)))


AttributeError: 'dict' object has no attribute '_le'
df['HOTELID'] = df['HOTELID'].astype('category')
df['MPHM'] = df['MPHM'].astype('category')
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import gc
cate_cols = []
drop_col = ['in_hotel_num', 'type']

train = df[df['type'] == 'train']
labels = np.array(train['in_hotel_num'].values.tolist())
train.drop(drop_col, axis=1, inplace=True)
train = train[used_features] 
test = df[df['type'] == 'test']
test_label = test['in_hotel_num'].values.tolist()
test.drop(drop_col, axis=1, inplace=True)
test = test[used_features]

# used_features = importance_fea
ts_folds = TimeSeriesSplit(n_splits = 5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'

params = {
    'objective': 'regression',
    'boosting': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': {'l2', 'l1'},
    'verbose': -1
}

for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    if fold_n in [0, 1, 2, 3]:  
        continue  
  
    print('Training with validation') 
    trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
                          categorical_feature=cate_cols)  
    val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
                          categorical_feature=cate_cols)  
    clf = lgb.train(params, trn_data, num_boost_round=N_round, valid_sets=[trn_data, val_data], verbose_eval=Verbose,  
    early_stopping_rounds=Early_Stopping_Rounds)
    val = clf.predict(train.iloc[valid_index])   
    mae_ = mean_absolute_error(labels[valid_index], val)  
  
    print('MAE: {}'.format(mae_))  
  
    print("ReTraining on all data")  
    gc.enable()  
    del trn_data, val_data  
    gc.collect()  
    Best_iteration = clf.best_iteration  
    print("Best_iteration: ", Best_iteration)  
    trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)  
    clf = lgb.train(params, trn_data, num_boost_round=int(Best_iteration * 1.2))
  #valid_sets=[trn_data], verbose_eval=Verbose)  
  #pred = clf.predict(test[used_features])

Training with validation
Training until validation scores don't improve for 100 rounds
[500]	training's l2: 0.0357266	training's l1: 0.0733432	valid_1's l2: 0.0140039	valid_1's l1: 0.0424839
[1000]	training's l2: 0.034654	training's l1: 0.0715026	valid_1's l2: 0.0136241	valid_1's l1: 0.0362867
[1500]	training's l2: 0.0340906	training's l1: 0.0702526	valid_1's l2: 0.0134038	valid_1's l1: 0.0323129
[2000]	training's l2: 0.0337541	training's l1: 0.0694053	valid_1's l2: 0.0132898	valid_1's l1: 0.0298233
[2500]	training's l2: 0.0335207	training's l1: 0.0688021	valid_1's l2: 0.0131952	valid_1's l1: 0.028312
[3000]	training's l2: 0.0333523	training's l1: 0.068367	valid_1's l2: 0.0131123	valid_1's l1: 0.0273034
[3500]	training's l2: 0.033219	training's l1: 0.0680436	valid_1's l2: 0.0130372	valid_1's l1: 0.0264549
[4000]	training's l2: 0.033109	training's l1: 0.0678005	valid_1's l2: 0.0129586	valid_1's l1: 0.0259314
[4500]	training's l2: 0.0330175	training's l1: 0.0676386	valid_1's l2: 0.0129	valid_1's l1: 0.0256456
[5000]	training's l2: 0.0329394	training's l1: 0.0675283	valid_1's l2: 0.0128614	valid_1's l1: 0.0254616
[5500]	training's l2: 0.0328693	training's l1: 0.0674366	valid_1's l2: 0.0128206	valid_1's l1: 0.0253768
Early stopping, best iteration is:
[5586]	training's l2: 0.0328584	training's l1: 0.067422	valid_1's l2: 0.0128154	valid_1's l1: 0.0253677
MAE: 0.025367655553334097
ReTraining on all data
Best_iteration:  5586
pred1 = clf.predict(test)
pred1.min(), pred1.max()
(0.2106748391712024, 0.9860467852769609)
pre_d = df[df['type'] == 'test']
pre_d['ROOM_EMPTY'] = pred1.tolist()
pre_d1 = pre_d[['HOTELID', 'DATE', 'ROOM_EMPTY']]
pp = pre_d1[pre_d1['DATE']=='2021-09-21']
pp[:5]
HOTELIDDATEROOM_EMPTY
356552100832021-09-210.726068
355831101252021-09-210.825109
354872102372021-09-210.841640
356181102732021-09-210.841640
3568251048142021-09-210.726068
np.median(pp['ROOM_EMPTY'])
0.8079157059926819
pre_d1[pre_d1['ROOM_EMPTY']>0.807].shape
(2667, 3)
pre_d1['ROOM_EMPTY'] = pre_d1['ROOM_EMPTY'].apply(lambda x: 0 if x>0.807 else 1)
pre_d1['ROOM_EMPTY'].sum()
644
pre_d1.to_csv('pre_02_reg.csv', index=False)


pre_d1['ROOM_EMPTY'].median()
0.8752981387595966
pre_d1['ROOM_EMPTY'].quantile(0.2)
0.8070988321951651
sns.countplot(pre_d1['ROOM_EMPTY'])
<AxesSubplot:xlabel='ROOM_EMPTY', ylabel='count'>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-281lRSik-1646247028443)(output_177_1.png)]

pre_d1.shape
(3311, 3)
import numpy as np
from sklearn import metrics
# y = np.array([1, 1, 2, 2])
# pred = np.array([0.1, 0.4, 0.35, 0.8])

fpr, tpr, thresholds = metrics.roc_curve(labels[valid_index], val, pos_label=2)
metrics.auc(fpr, tpr)
nan




pred.min(), pred.max()
(0.03637936444857589, 1.8554153046371984)
pre_df = df[df['type'] == 'test']
pre_df['ROOM_EMPTY'] = pred.tolist()
pre_df1.to_csv('origion.csv', index=False)
pre_df[:5]
HOTELIDin_hotel_numtypeDATEin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearin_weekofyearin_is_wkndholidaylast_1_qtylast_2_qtylast_3_qtylast_4_qtylast_5_qtylast_6_qtylast_7_qtylast_8_qtylast_10_qtylast_12_qtylast_15_qtylast_20_qtyqty_rolling3_meanqty_rolling3_maxqty_rolling3_sumqty_rolling4_meanqty_rolling4_maxqty_rolling4_sumqty_rolling6_meanqty_rolling6_maxqty_rolling6_sumqty_ewm_meanqty_ewm_stdqty_ewm_corrROOM_EMPTY
356546100830test2021-09-042021943524735103.03.03.03.03.03.03.03.03.03.03.02.03.03.09.03.03.012.03.03.018.03.06.328848e-121.00.054686
356547100830test2021-09-052021953624835103.03.03.03.03.03.03.03.03.03.03.02.03.03.09.03.03.012.03.03.018.03.01.415174e-121.00.055554
356548100830test2021-09-1120219113525436103.03.03.03.03.03.03.03.03.03.03.03.03.03.09.03.03.012.03.03.018.03.03.164424e-131.00.247202
356549100830test2021-09-1220219123625536103.03.03.03.03.03.03.03.03.03.03.03.03.03.09.03.03.012.03.03.018.03.07.075868e-141.00.105092
356550100830test2021-09-1920219193626237113.03.03.03.03.03.03.03.03.03.03.03.03.03.09.03.03.012.03.03.018.03.01.582212e-141.00.094995
pre_df1 = pre_df[['HOTELID', 'DATE', 'ROOM_EMPTY']]
# 8月份500多
pre_df1[pre_df1['ROOM_EMPTY']>0.07].shape
(2380, 3)
pre_df1['ROOM_EMPTY'] = pre_df1['ROOM_EMPTY'].apply(lambda x: 0 if x>=0.25 else 1)

# mean_absolute_error(labels[valid_index], val)  
import numpy as np
from sklearn import metrics
# y = np.array([1, 1, 2, 2])
# pred = np.array([0.1, 0.4, 0.35, 0.8])

fpr, tpr, thresholds = metrics.roc_curve(labels[valid_index], val, pos_label=2)
metrics.auc(fpr, tpr)
0.7065505364632024
a = pd.DataFrame()
a['y'] = labels[valid_index].tolist()
a['val'] = val.tolist()
# _val = np.array(a['val'].values.tolist())
# _y = np.array(a['y'].values.tolist())
fpr, tpr, thresholds = metrics.roc_curve(a['y'], a['val'], pos_label=2)
metrics.auc(fpr, tpr)
nan
np.array(a['val'].values.tolist())
array([0.98077955, 0.98028159, 0.98090901, ..., 0.99187334, 0.9917258 ,
       0.99187287])
a['y'].max()
1
a['y'] = a['y'].apply(lambda x: 1 if x==0 else 0)
a['val'] = a['val'].apply(lambda x: (1-x/(4.03212491)))
a[:5]
yval
010.980780
110.980282
210.980909
310.986415
410.988064
val.max()-val.min()
4.0321249098355265
labels[valid_index]
array([0., 0., 0., 0., 0., 0., 2., 2., 2., 2., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])
val[140:160]
array([0.02691592, 0.02701038, 0.02690596, 0.02699238, 0.02690596,
       0.07806598, 0.07950708, 0.29049198, 0.77018916, 0.98434103,
       0.05606983, 0.06270357, 0.04589361, 0.05028036, 0.04938166,
       0.05054352, 0.04737767, 0.11532118, 0.05313139, 0.04558791])

gc.collect()
388
from tqdm import tqdm
from sklearn.metrics import *
print("#############################输出第一个模型的评价参数及结果#############################")

def find_best_threshold(y_valid, oof_prob):
    best_f2 = 0
    recall = 0
    precision = 0 
    best_th = 0
    for th in tqdm([i/2000 for i in range(100, 2000)]):
        oof_prob_copy = oof_prob.copy()
        oof_prob_copy[oof_prob_copy >= th] = 1
        oof_prob_copy[oof_prob_copy < th] = 0

#         recall = recall_score(y_valid, oof_prob_copy)
#         precision = precision_score(y_valid, oof_prob_copy)
        fpr, tpr, thresholds = metrics.roc_curve(oof_prob_copy, y_valid, pos_label=2)
        aucs = metrics.auc(fpr, tpr)
        if aucs > best_f2:
            best_th = th
            best_f2 = aucs
#         gc.collect()
#     recall = recall
#     precision = precision
        
    return best_th, best_f2
val1 = val
y1 = labels[valid_index]
best_th, aucs = find_best_threshold(val1, y1)
print("分界值", best_th)
print("F2评价分数", aucs)
# print("recall召回率", recall)
# print("precision精确度", precision)
  1%|▏                                       | 10/1900 [00:00<00:23, 80.05it/s]

#############################输出第一个模型的评价参数及结果#############################


100%|██████████████████████████████████████| 1900/1900 [00:20<00:00, 94.61it/s]

分界值 0.05
F2评价分数 0.7065505364632024
pre_df1[20:33]
HOTELIDDATEROOM_EMPTY
354872102372021-09-210
356175102732021-09-040
356176102732021-09-050
356177102732021-09-110
356178102732021-09-120
356179102732021-09-190
356180102732021-09-200
356181102732021-09-210
3568191048142021-09-040
3568201048142021-09-050
3568211048142021-09-110
3568221048142021-09-120
3568231048142021-09-190
pre_df1.to_csv('pre_025_regression.csv', index=False)
pre_df1['ROOM_EMPTY'].mode()
0    0.20966
dtype: float64
pre_df1['ROOM_EMPTY'].median()
0.10509209231427108



plt.figure(figsize=(16,8))
sns.countplot(pre_df1['ROOM_EMPTY'])
<AxesSubplot:xlabel='ROOM_EMPTY', ylabel='count'>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-uSxKnO4p-1646247028446)(output_213_1.png)]


# df4['in_year'] = df4['datetime'].dt.year.fillna(0).astype('int')
# df4['in_month'] = df4['datetime'].dt.month.fillna(0).astype('int')
# df4['in_day'] = df4['datetime'].dt.day.fillna(0).astype('int')
# df4['in_quarter'] = df4['datetime'].dt.quarter.fillna(0).astype('int')
# df4['in_dayofweek'] = df4['datetime'].dt.dayofweek.fillna(0).astype('int')
# df4['in_dayofyear'] = df4['datetime'].dt.dayofyear.fillna(0).astype('int')
# df4['in_is_wknd'] = df4['datetime'].dt.dayofweek // 5                 #是否周末
df4['date1'] = df4['date'].astype('str')
#总数据量49547
# 周末数据量15755
use_month = [5, 6, 7, 8, 9]
df_1 = df4[(df4['in_is_wknd']==1) & (df4['in_month'].isin(use_month))]
df_1.shape
(11583, 59)
df_1['in_month'].value_counts()
7    4391
5    2520
6    2192
8    1656
9     824
Name: in_month, dtype: int64
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14', 
           '2021-09-19', '2021-09-20', '2021-09-21']
df4[df4['date']=='2021-05-03']
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnullin_timeout_timedatetimein_dateout_datedateJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMDJSJBGSJCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gapin_hotel_numin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearrecord_is_wkndin_is_wknd
df_2 = df4[df4['date1'].isin(holiday)]
df_2['holiday'] = 1
df_1['holiday'] = 0
df5 = pd.concat([df_2, df_1])
df5.shape
(14662, 61)
df5.drop(['CALLED', 'ADDRESS', 'INSERT_TIME', 'MODIFY_TIME', 'IN_TIME', 'OUT_TIME'], axis=1, inplace=True)
df5[:3]
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSFIRMGUEST_IDBDATEXZQHguest_sumin_timeout_timedatetimein_dateout_datedateJYMJROOM_NUMBED_NUMCZLYBUR_CODESTA_CODESSXMPHMDJSJBGSJCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gapin_hotel_numin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearrecord_is_wkndin_is_wknddate1holiday
93597092CFD17F6BB142485DB3DEC6B5E2D0B66410019320210501204613NaN1977100813282612021-05-01 20:46:002021-05-02 11:58:002021-05-01 20:46:002021-05-012021-05-022021-05-01NaN221NaNNaN371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811NaN124922814727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.0120215125121112021-05-011
1136018375C96E986C1F04691A3EB07962BCC7B0010019520210502143513NaN1995100937020312021-05-02 14:35:002021-05-03 11:58:002021-05-02 14:35:002021-05-022021-05-032021-05-02NaN231NaNNaN371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811NaN12498954727NaNNaN15025653429NaNNaN1.52020-05-182020-07-1558.0120215226122112021-05-021
533314117C0C5516462D3487C895DD654B836F8551003720210408213013NaN1973031312010312021-04-30 13:58:002021-05-03 11:58:002021-05-01 13:58:002021-04-302021-05-032021-05-01NaN121NaNNaN371002ROOM0012020-05-18 10:22:132020-07-15 10:23:5822NaN309122814727NaNNaN466125653429NaNNaN2.02020-05-182020-07-1558.0120215125121112021-05-011
drop_col = df5.columns.tolist()
drop_col.remove('holiday')
df5.drop_duplicates(drop_col, keep='first', inplace=True)
df5.shape
(12708, 54)
df5['HOTELID'].nunique()
3133
test = pd.read_csv('testb/submit_example_2.csv')
test[:2]
HOTELIDDATEROOM_EMPTY
03037602021-09-040.1
13037602021-09-050.1
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3311 entries, 0 to 3310
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   HOTELID     3311 non-null   object 
 1   DATE        3311 non-null   object 
 2   ROOM_EMPTY  3311 non-null   float64
dtypes: float64(1), object(2)
memory usage: 77.7+ KB
hotel = test['HOTELID'].values.tolist()
df5 = df5[df5['HOTELID'].isin(hotel)]
df5.shape
(3708, 54)
d = df5[df5['in_month']==9]
d['date'].value_counts()
2021-09-04    42
2021-09-05    29
2021-09-11    16
2021-09-19    14
2021-09-20    13
2021-09-18    12
2021-09-12    11
2020-09-06     9
2020-09-13     9
2020-09-05     9
2021-09-21     7
2020-09-19     7
2021-09-26     6
2020-09-27     6
2021-09-25     6
2020-09-26     6
2020-09-20     6
2020-09-12     4
Name: date, dtype: int64
test['date1'] = test['DATE']
test1 = test.merge(df5, on=['HOTELID', 'date1'], how='left')
test.shape
(3311, 4)
test1.shape
(3311, 56)
test1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3311 entries, 0 to 3310
Data columns (total 56 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   HOTELID           3311 non-null   object        
 1   DATE              3311 non-null   object        
 2   ROOM_EMPTY        3311 non-null   float64       
 3   date1             3311 non-null   object        
 4   ORDER_ID          132 non-null    float64       
 5   ORDER_PRIMARY_ID  132 non-null    object        
 6   ORDER_TIME        132 non-null    object        
 7   STATUS            132 non-null    float64       
 8   FIRM              132 non-null    float64       
 9   GUEST_ID          16 non-null     object        
 10  BDATE             132 non-null    object        
 11  XZQH              132 non-null    object        
 12  guest_sum         132 non-null    float64       
 13  in_time           132 non-null    datetime64[ns]
 14  out_time          132 non-null    datetime64[ns]
 15  datetime          132 non-null    datetime64[ns]
 16  in_date           132 non-null    object        
 17  out_date          132 non-null    object        
 18  date              132 non-null    object        
 19  JYMJ              1 non-null      float64       
 20  ROOM_NUM          132 non-null    float64       
 21  BED_NUM           132 non-null    float64       
 22  CZLY              132 non-null    float64       
 23  BUR_CODE          86 non-null     float64       
 24  STA_CODE          86 non-null     object        
 25  SSX               132 non-null    float64       
 26  MPHM              132 non-null    object        
 27  DJSJ              132 non-null    datetime64[ns]
 28  BGSJ              132 non-null    datetime64[ns]
 29  CALLED_freq       132 non-null    float64       
 30  ADDRESS_freq      132 non-null    float64       
 31  JYMJ_freq         1 non-null      float64       
 32  ROOM_NUM_freq     132 non-null    float64       
 33  BED_NUM_freq      132 non-null    float64       
 34  CZLY_freq         132 non-null    float64       
 35  BUR_CODE_freq     86 non-null     float64       
 36  STA_CODE_freq     86 non-null     float64       
 37  SSX_freq          132 non-null    float64       
 38  MPHM_freq         132 non-null    float64       
 39  FIRM_freq         132 non-null    float64       
 40  room_ratio        1 non-null      float64       
 41  bed_ratio         1 non-null      float64       
 42  room_bed          132 non-null    float64       
 43  DJ_date           132 non-null    object        
 44  BG_date           132 non-null    object        
 45  DJ_gap            132 non-null    float64       
 46  in_hotel_num      132 non-null    float64       
 47  in_year           132 non-null    float64       
 48  in_month          132 non-null    float64       
 49  in_day            132 non-null    float64       
 50  in_quarter        132 non-null    float64       
 51  in_dayofweek      132 non-null    float64       
 52  in_dayofyear      132 non-null    float64       
 53  record_is_wknd    132 non-null    float64       
 54  in_is_wknd        132 non-null    float64       
 55  holiday           132 non-null    float64       
dtypes: datetime64[ns](5), float64(36), object(15)
memory usage: 1.4+ MB
test1[:5]
HOTELIDDATEROOM_EMPTYdate1ORDER_IDORDER_PRIMARY_IDORDER_TIMESTATUSFIRMGUEST_IDBDATEXZQHguest_sumin_timeout_timedatetimein_dateout_datedateJYMJROOM_NUMBED_NUMCZLYBUR_CODESTA_CODESSXMPHMDJSJBGSJCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gapin_hotel_numin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearrecord_is_wkndin_is_wkndholiday
03037602021-09-040.12021-09-04NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
13037602021-09-050.12021-09-05NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
23037602021-09-110.12021-09-11NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
33037602021-09-120.12021-09-12NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
43037602021-09-190.12021-09-19NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
test1['holiday1'] = test1['holiday'].apply(lambda x: 0 if x else 1)
test1['holiday1'].value_counts()
0    3213
1      98
Name: holiday1, dtype: int64
test1[:10]
HOTELIDDATEROOM_EMPTYdate1ORDER_IDORDER_PRIMARY_IDORDER_TIMESTATUSFIRMGUEST_IDBDATEXZQHguest_sumin_timeout_timedatetimein_dateout_datedateJYMJROOM_NUMBED_NUMCZLYBUR_CODESTA_CODESSXMPHMDJSJBGSJCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gapin_hotel_numin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearrecord_is_wkndin_is_wkndholidayholiday1
03037602021-09-040.12021-09-04NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0
13037602021-09-050.12021-09-05NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0
23037602021-09-110.12021-09-11NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0
33037602021-09-120.12021-09-12NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0
43037602021-09-190.12021-09-19NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0
53037602021-09-200.12021-09-20NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0
63037602021-09-210.12021-09-21NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0
73154712021-09-040.12021-09-044798603.0A139D5C20BB44529A66957AA491FB88E2021082123271.03.0NaN200011013310211.02021-09-03 13:58:002021-09-05 11:58:002021-09-04 13:58:002021-09-032021-09-052021-09-04NaN1.02.01.03.710980e+11371000000007371002.0ROOM0012021-05-05 13:58:582021-05-05 14:02:452.01.0NaN3091.02281.04727.0852.0433.04661.02565.03429.0NaNNaN2.02021-05-052021-05-050.01.02021.09.04.03.05.0247.01.01.00.01
83154712021-09-050.12021-09-05NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0
93154712021-09-110.12021-09-11NaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaTNaTNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0
test1['ROOM_EMPTY'] = test1['holiday1']
test2 = test1[['HOTELID', 'DATE', 'ROOM_EMPTY']]
test2[:5]
HOTELIDDATEROOM_EMPTY
03037602021-09-040
13037602021-09-050
23037602021-09-110
33037602021-09-120
43037602021-09-190
test2.to_csv('pre.csv', index=False)
pre = pd.read_csv('file41704172.csv')
pre[:2]
HOTELIDDATEroom
03037602021-09-040.0
13037602021-09-050.0
pre.rename(columns={'ROOM_EMPTY':'room'}, inplace=True)
pr = test2.merge(pre, on=['HOTELID', 'DATE'], how='left')
pr['ROOM_EMPTY'] = pr['ROOM_EMPTY'] + pr['room']
del pr['room']
pre['room'].value_counts()
0.0    2838
1.0     473
Name: room, dtype: int64
pr.to_csv('pre1.csv', index=False)



dd = df5.groupby(['in_year', 'in_month'])['ORDER_ID'].count().reset_index()
dd
in_yearin_monthORDER_ID
02020712
120208102
22020956
320215735
420216833
5202171279
620218535
720219156
dd['ym'] = dd['in_year']*100+dd['in_month']
dd['ym'].value_counts()
202105    1
202007    1
202008    1
202009    1
202106    1
202107    1
202108    1
202109    1
Name: ym, dtype: int64
plt.figure(figsize=(15,8))
plt.scatter(dd.index, dd['ORDER_ID'])
<matplotlib.collections.PathCollection at 0x1356dc08520>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-fiksPl1J-1646247028449)(output_270_1.png)]










  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值