%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns
import scipy. stats as st
import os
import re
import gc
import warnings
# Notebook-wide configuration.
warnings.filterwarnings('ignore')

# SimHei font so Chinese labels render; keep the minus sign displayable
# with a non-ASCII font.  (The original set these twice — deduplicated.)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Show full frames when inspecting intermediate results.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)
# Directories holding the two training splits.
path = 'traina/'
path1 = 'trainb/'

# 入住信息 — guest check-in records: read both splits and stack them.
cust_info = pd.read_csv(path + '网约房平台入住人表.csv')
cust_info1 = pd.read_csv(path1 + '网约房平台入住人表.csv')
cust = pd.concat([cust_info, cust_info1])
cust.head(2)
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME 0 07C5BF73B18B44B0877DEED007F8771D NaN 19800627 222405 NaN NaN 1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 2.021071e+11 2.021071e+11
gc. collect( )
216
# Normalise dtypes: birth date and district code become plain strings; the
# check-in/check-out timestamps get NaNs replaced by 0, are stringified and
# truncated to 12 characters (YYYYMMDDhhmm, i.e. minute precision).
cust['BDATE'] = cust['BDATE'].astype('str')
cust['XZQH'] = cust['XZQH'].astype('str')
cust['IN_TIME'] = cust['IN_TIME'].fillna(0).astype('str').str[:12]
cust['OUT_TIME'] = cust['OUT_TIME'].fillna(0).astype('str').str[:12]
cust.head(2)
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME 0 07C5BF73B18B44B0877DEED007F8771D NaN 19800627 222405 0.0 0.0 1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 202107092103 202107111158
cust. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44030 entries, 0 to 3343
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_PRIMARY_ID 44030 non-null object
1 GUEST_ID 5158 non-null object
2 BDATE 44030 non-null object
3 XZQH 44030 non-null object
4 IN_TIME 44030 non-null object
5 OUT_TIME 44030 non-null object
dtypes: object(6)
memory usage: 2.4+ MB
cust. nunique( )
ORDER_PRIMARY_ID 43883
GUEST_ID 3698
BDATE 9503
XZQH 2659
IN_TIME 3396
OUT_TIME 236
dtype: int64
cust[ 'ORDER_PRIMARY_ID' ] . value_counts( ) [ 5 : 10 ]
03158D5CA62E42339D697612EA347FB3 2
0A99D74F1E0248D68A729CB79FC640E6 2
1FE3E448558347D89C56B0AFCEC8ACFB 2
864E417B7BDE4C7B9449B7E087E9F21E 2
3DEFA75566934CA5929D0C47F0B95FDE 2
Name: ORDER_PRIMARY_ID, dtype: int64
# Per-order guest counts: total rows (guest_sum) and rows with a non-null
# GUEST_ID (guest_sum_notnull), then collapse cust to one row per order.
cust.sort_values(['ORDER_PRIMARY_ID', 'GUEST_ID'], inplace=True)

order_stats = cust.groupby(['ORDER_PRIMARY_ID'])['BDATE'].count().reset_index()
order_stats.columns = ['ORDER_PRIMARY_ID', 'guest_sum']
# Raw .values assignment is safe here because both groupbys emit the same
# (sorted) ORDER_PRIMARY_ID order.
order_stats['guest_sum_notnull'] = cust.groupby(['ORDER_PRIMARY_ID'])['GUEST_ID'].agg('count').values

cust = cust.merge(order_stats, on=['ORDER_PRIMARY_ID'], how='left')
cust.drop_duplicates('ORDER_PRIMARY_ID', keep='first', inplace=True)
cust[ cust[ 'ORDER_PRIMARY_ID' ] == '70798782D6C04A438360D80AFE4845C1' ]
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull 22801 70798782D6C04A438360D80AFE4845C1 371099B389202107040001 19990314 130983 0.0 0.0 3 2
cust[ 'IN_TIME' ] . max ( ) , cust[ 'IN_TIME' ] . min ( )
('202110041413', '0.0')
# 订单信息 — order records from both training splits.
order_info = pd.read_csv(path + '网约平台旅客订单信息.csv')
order1 = pd.read_csv(path1 + '网约平台旅客订单信息.csv')
order = pd.concat([order_info, order1])
order. head( 2 )
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM 0 923521 96BBDB7CC049421C85826AE07020B139 278337 202008011200 202008021200 1.596120e+11 1 NaN 20200730224152 20200730224152 3 1 923696 C72F20539AD1447D86CD1A8E5EAEC63A 282932 202008041400 202008061200 1.596121e+11 1 NaN 20200730225524 20200730225524 3
order[ order[ 'ORDER_PRIMARY_ID' ] == '3CDEDB5E03534D379687645675898CA4' ]
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM 30970 4422027 3CDEDB5E03534D379687645675898CA4 170898 202108081358 202108101158 2.021071e+11 3 NaN 20210712210153 20210712210153 3
# Cast the raw timestamp columns to strings; ORDER_TIME additionally gets
# NaNs replaced by 0 and is truncated to minute precision (12 chars).
for col in ['PRE_IN_TIME', 'PRE_OUT_TIME', 'INSERT_TIME', 'MODIFY_TIME', 'CANCEL_TIME']:
    order[col] = order[col].astype('str')
order['ORDER_TIME'] = order['ORDER_TIME'].fillna(0).astype('str').str[:12]

# NOTE: orders that have a cancellation time are dropped entirely
# (CANCEL_TIME was NaN, which stringifies to 'nan').
order = order[order['CANCEL_TIME'] == 'nan']

# Deduplicate by ORDER_ID, keeping only the most recent modification.
order = order.sort_values(['ORDER_ID', 'MODIFY_TIME'])
order.drop_duplicates('ORDER_ID', keep='last', inplace=True)
order.shape
(29941, 11)
order[ order[ 'ORDER_ID' ] == 4402846 ]
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM 30141 4402846 BC0E8E3602434EA6A5F29A6F6FF42233 100177 202107111358 202107121158 202107111032 1 nan 20210711140042 20210711140042 3
order. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 2 to 18590
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 29941 non-null int64
1 ORDER_PRIMARY_ID 29941 non-null object
2 HOTELID 29941 non-null object
3 PRE_IN_TIME 29941 non-null object
4 PRE_OUT_TIME 29941 non-null object
5 ORDER_TIME 29941 non-null object
6 STATUS 29941 non-null int64
7 CANCEL_TIME 29941 non-null object
8 INSERT_TIME 29941 non-null object
9 MODIFY_TIME 29941 non-null object
10 FIRM 29941 non-null int64
dtypes: int64(3), object(8)
memory usage: 2.7+ MB
order. nunique( )
ORDER_ID 29941
ORDER_PRIMARY_ID 29941
HOTELID 4801
PRE_IN_TIME 6875
PRE_OUT_TIME 729
ORDER_TIME 26818
STATUS 3
CANCEL_TIME 1
INSERT_TIME 25432
MODIFY_TIME 25432
FIRM 2
dtype: int64
order_info[ 'FIRM' ] . value_counts( )
3 32029
10 8521
Name: FIRM, dtype: int64
order[ 'STATUS' ] . value_counts( )
1 26137
2 3221
3 583
Name: STATUS, dtype: int64
order[ order[ 'INSERT_TIME' ] != order[ 'MODIFY_TIME' ] ]
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM
# Merge order and cust (one row per order after the earlier dedup steps).
df = pd.merge(order, cust, on='ORDER_PRIMARY_ID')

# Month lengths: 1/3/5/7/8/10/12 have 31 days; 2 has 28; 4/6/9/11 have 30.
# Treat the '0.0' placeholder as missing, fall back to the booked (PRE_)
# times, then clamp actual times so they never precede the booked window.
# String comparison is valid because all are zero-padded YYYYMMDDhhmm.
df['IN_TIME'] = df['IN_TIME'].replace('0.0', np.nan)
df['OUT_TIME'] = df['OUT_TIME'].replace('0.0', np.nan)
df['IN_TIME'].fillna(df['PRE_IN_TIME'], inplace=True)
df['OUT_TIME'].fillna(df['PRE_OUT_TIME'], inplace=True)
df['OUT_TIME'] = np.where(df['OUT_TIME'] >= df['PRE_OUT_TIME'], df['OUT_TIME'], df['PRE_OUT_TIME'])
df['IN_TIME'] = np.where(df['IN_TIME'] >= df['PRE_IN_TIME'], df['IN_TIME'], df['PRE_IN_TIME'])
df.drop(['PRE_IN_TIME', 'PRE_OUT_TIME', 'CANCEL_TIME'], axis=1, inplace=True)
# TODO(review): the gap between INSERT_TIME and ORDER_TIME is noted in the
# original as a candidate feature but never computed.
df.head(2)
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull 0 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007092241 1 20200714171021 20200714171021 3 NaN 20010409 371002 202007151300 202007161200 1 0 1 748647 67D551AACFD049AE9CC2AB65E9870678 9483 202007132109 1 20201002101040 20201002101040 3 NaN 19881023 341202 202010011400 202010021011 1 0
df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 0 to 29940
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 29941 non-null int64
1 ORDER_PRIMARY_ID 29941 non-null object
2 HOTELID 29941 non-null object
3 ORDER_TIME 29941 non-null object
4 STATUS 29941 non-null int64
5 INSERT_TIME 29941 non-null object
6 MODIFY_TIME 29941 non-null object
7 FIRM 29941 non-null int64
8 GUEST_ID 3561 non-null object
9 BDATE 29941 non-null object
10 XZQH 29941 non-null object
11 IN_TIME 29941 non-null object
12 OUT_TIME 29941 non-null object
13 guest_sum 29941 non-null int64
14 guest_sum_notnull 29941 non-null int64
dtypes: int64(5), object(10)
memory usage: 3.7+ MB
df.nunique();
# Parse the minute-precision strings into real datetimes; rows whose
# OUT_TIME fails to parse are dropped.
df['in_time'] = pd.to_datetime(df['IN_TIME'], errors='coerce', format='%Y%m%d%H%M')
df['out_time'] = pd.to_datetime(df['OUT_TIME'], errors='coerce', format='%Y%m%d%H%M')
df = df[df['out_time'].notnull()]
df. head( 2 )
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time 0 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007092241 1 20200714171021 20200714171021 3 NaN 20010409 371002 202007151300 202007161200 1 0 2020-07-15 13:00:00 2020-07-16 12:00:00 1 748647 67D551AACFD049AE9CC2AB65E9870678 9483 202007132109 1 20201002101040 20201002101040 3 NaN 19881023 341202 202010011400 202010021011 1 0 2020-10-01 14:00:00 2020-10-02 10:11:00
#后移一天 df[‘lock_time’][0] + pd.Timedelta(days=1) #前移一天 df[‘lock_time’][0] + pd.Timedelta(days=-1) #df[‘in_time’] - timedelta(days=1) #将day_gap列为0的数据全部往后推迟一天;;;;;;;还有钟点房
# Expand every order into one row per calendar day of its stay.
dfs = []
for _, grp in df.groupby(['ORDER_PRIMARY_ID']):
    days = pd.DataFrame()
    days['datetime'] = pd.date_range(start=grp['IN_TIME'].values[0],
                                     end=grp['OUT_TIME'].values[0],
                                     freq='D', normalize=False, closed=None)
    days['datetime'] = days['datetime'].dt.date.fillna(0).astype('str')
    days['ORDER_PRIMARY_ID'] = grp['ORDER_PRIMARY_ID'].values[0]
    dfs.append(days)
gc.collect()

df_date = pd.concat(dfs).reset_index(drop=True)
df_date[ : 3 ]
datetime ORDER_PRIMARY_ID 0 2021-06-30 00003FC18B254E86803C00F4BBA382E4 1 2021-07-01 00003FC18B254E86803C00F4BBA382E4 2 2021-07-02 00003FC18B254E86803C00F4BBA382E4
# Build a full 2020-06-01..2021-08-30 daily calendar for every hotel.
# NOTE(review): the range is identical per hotel, so this could be built
# once and cross-joined — kept as a per-group loop to preserve behaviour.
dfs1 = []
for _, grp in df.groupby(['HOTELID']):
    cal = pd.DataFrame()
    cal['datetime'] = pd.date_range(start='20200601', end='20210830',
                                    freq='D', normalize=False, closed=None)
    cal['datetime'] = cal['datetime'].astype('str')
    cal['HOTELID'] = grp['HOTELID'].values[0]
    dfs1.append(cal)
df_date1 = pd.concat(dfs1).reset_index(drop=True)
df_date1[ : 10 ]
datetime ORDER_PRIMARY_ID 0 2020-06-01 00003FC18B254E86803C00F4BBA382E4 1 2020-06-02 00003FC18B254E86803C00F4BBA382E4 2 2020-06-03 00003FC18B254E86803C00F4BBA382E4 3 2020-06-04 00003FC18B254E86803C00F4BBA382E4 4 2020-06-05 00003FC18B254E86803C00F4BBA382E4 5 2020-06-06 00003FC18B254E86803C00F4BBA382E4 6 2020-06-07 00003FC18B254E86803C00F4BBA382E4 7 2020-06-08 00003FC18B254E86803C00F4BBA382E4 8 2020-06-09 00003FC18B254E86803C00F4BBA382E4 9 2020-06-10 00003FC18B254E86803C00F4BBA382E4
# Attach every order's stay-day rows to its (single) order row.
df11 = df. merge( df_date, on= [ 'ORDER_PRIMARY_ID' ] , how= 'left' )
# Left-join the full per-hotel calendar against the occupied order-days;
# calendar rows with no matching order (ORDER_ID null) are the empty days.
_df1 = df_date1. merge( df11, on= [ 'HOTELID' , 'datetime' ] , how= 'left' )
ddf = _df1[ _df1[ 'ORDER_ID' ] . isnull( ) ]
# Date-only versions of check-in/check-out for day-level comparison.
df11[ 'in_date' ] = df11[ 'in_time' ] . dt. date
df11[ 'out_date' ] = df11[ 'out_time' ] . dt. date
df11[ 'datetime' ] = pd. to_datetime( df11[ 'datetime' ] , errors= 'coerce' , format = '%Y-%m-%d' )
df11[ 'date' ] = df11[ 'datetime' ] . dt. date
# Keep a stay-day row unless it is the check-out day of a multi-day stay:
# the check-out day only counts as occupied when check-in == check-out
# (an hourly / same-day booking).
_df2 = df11[ ( df11[ 'out_date' ] != df11[ 'date' ] ) | ( ( df11[ 'out_date' ] == df11[ 'date' ] ) & ( df11[ 'in_date' ] == df11[ 'out_date' ] ) ) ]
_df2. drop( [ 'in_time' , 'out_time' , 'in_date' , 'out_date' , 'datetime' ] , axis= 1 , inplace= True )
_df2. rename( columns= { 'date' : 'datetime' } , inplace= True )
# Per hotel-day occupancy count, merged back onto the day-level rows.
in_num = _df2. groupby( [ 'HOTELID' , 'datetime' ] ) [ 'ORDER_ID' ] . count( ) . reset_index( )
in_num. columns = [ 'HOTELID' , 'datetime' , 'in_hotel_num' ]
_df2 = _df2. merge( in_num, on= [ 'HOTELID' , 'datetime' ] , how= 'left' )
_df2. columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'STATUS',
'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH',
'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull', 'datetime',
'in_hotel_num'],
dtype='object')
ddf. columns
Index(['datetime', 'HOTELID', 'ORDER_ID', 'ORDER_PRIMARY_ID', 'ORDER_TIME',
'STATUS', 'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE',
'XZQH', 'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull',
'in_time', 'out_time'],
dtype='object')
# ddf holds calendar days with no order: drop the (all-NaN) parsed time
# columns and mark occupancy as 0, then stack with the occupied days.
ddf. drop( [ 'in_time' , 'out_time' ] , axis= 1 , inplace= True )
ddf[ 'in_hotel_num' ] = 0
df1 = pd. concat( [ _df2, ddf] )
df1 = df1. sort_values( [ 'HOTELID' , 'datetime' ] )
# NOTE: DataFrame.append is deprecated (removed in pandas 2.0); this line
# only previews the first/last rows.
df1. head( 2 ) . append( df1. tail( 2 ) )
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull datetime in_hotel_num 0 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020-06-01 0 1 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020-06-02 0 2196800 NaN NaN B109977684 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2021-08-29 0 2196801 NaN NaN B109977684 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2021-08-30 0
df1. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2186104 entries, 0 to 2196801
Data columns (total 17 columns):
# Column Dtype
--- ------ -----
0 ORDER_ID float64
1 ORDER_PRIMARY_ID object
2 HOTELID object
3 ORDER_TIME object
4 STATUS float64
5 INSERT_TIME object
6 MODIFY_TIME object
7 FIRM float64
8 GUEST_ID object
9 BDATE object
10 XZQH object
11 IN_TIME object
12 OUT_TIME object
13 guest_sum float64
14 guest_sum_notnull float64
15 datetime object
16 in_hotel_num int64
dtypes: float64(5), int64(1), object(11)
memory usage: 300.2+ MB
# Submission skeleton: the hotel-days whose emptiness must be predicted.
test = pd.read_csv('testb/submit_example_2.csv')
test.head()
HOTELID DATE ROOM_EMPTY 0 303760 2021-09-04 0.1 1 303760 2021-09-05 0.1 2 303760 2021-09-11 0.1 3 303760 2021-09-12 0.1 4 303760 2021-09-19 0.1
# Tag rows so train/test can be told apart after later concatenation, and
# keep only training days strictly before 2021-09-01.
test['type'] = 'test'
df1['type'] = 'train'
df1['DATE'] = df1['datetime'].astype('str')
df2 = df1[df1['DATE'] < '2021-09-01']
df2.shape
(2185510, 19)
# Derive calendar features from the (string) datetime column.
df2['datetime'] = pd.to_datetime(df2['datetime'], errors='coerce', format='%Y-%m-%d')
dt = df2['datetime'].dt
df2['in_year'] = dt.year.fillna(0).astype('int')
df2['in_month'] = dt.month.fillna(0).astype('int')
df2['in_day'] = dt.day.fillna(0).astype('int')
df2['in_quarter'] = dt.quarter.fillna(0).astype('int')
df2['in_dayofweek'] = dt.dayofweek.fillna(0).astype('int')
df2['in_dayofyear'] = dt.dayofyear.fillna(0).astype('int')
# NOTE: .dt.weekofyear is deprecated in newer pandas (use isocalendar().week).
df2['in_weekofyear'] = dt.weekofyear.fillna(0).astype('int')
df2['in_is_wknd'] = dt.dayofweek // 5  # 1 for Sat/Sun, 0 otherwise
df2.head(3)
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull datetime in_hotel_num type DATE in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd 0 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020-06-01 0 train 2020-06-01 2020 6 1 2 0 153 23 0 1 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020-06-02 0 train 2020-06-02 2020 6 2 2 1 154 23 0 2 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020-06-03 0 train 2020-06-03 2020 6 3 2 2 155 23 0
# Training rows are restricted to May-September weekends plus national
# holidays (the test set only covers such days).
use_month = [5, 6, 7, 8, 9]
df_1 = df2[(df2['in_is_wknd'] == 1) & (df2['in_month'].isin(use_month))]

holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14',
           '2021-09-19', '2021-09-20', '2021-09-21']
df_2 = df2[df2['DATE'].isin(holiday)]
df_2['holiday'] = 1
df_1['holiday'] = 0
df3 = pd.concat([df_2, df_1])
df3.shape

# A day can be both a holiday and a weekend; since df_2 comes first in the
# concat, keep='first' retains the holiday=1 copy of each duplicate.
drop_col = df3.columns.tolist()
drop_col.remove('holiday')
df3.drop_duplicates(drop_col, keep='first', inplace=True)
df3.shape
(354019, 28)
# Same calendar features for the test skeleton.
test['datetime'] = pd.to_datetime(test['DATE'], errors='coerce', format='%Y-%m-%d')
tdt = test['datetime'].dt
test['in_year'] = tdt.year.fillna(0).astype('int')
test['in_month'] = tdt.month.fillna(0).astype('int')
test['in_day'] = tdt.day.fillna(0).astype('int')
test['in_quarter'] = tdt.quarter.fillna(0).astype('int')
test['in_dayofweek'] = tdt.dayofweek.fillna(0).astype('int')
test['in_dayofyear'] = tdt.dayofyear.fillna(0).astype('int')
test['in_weekofyear'] = tdt.weekofyear.fillna(0).astype('int')
test['in_is_wknd'] = tdt.dayofweek // 5
test[:3]
HOTELID DATE ROOM_EMPTY type datetime in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd 0 303760 2021-09-04 0.1 test 2021-09-04 2021 9 4 3 5 247 35 1 1 303760 2021-09-05 0.1 test 2021-09-05 2021 9 5 3 6 248 35 1 2 303760 2021-09-11 0.1 test 2021-09-11 2021 9 11 3 5 254 36 1
# Flag test days that fall on a national holiday (0/1).
test['holiday'] = 0
test['holiday'] = test['DATE'].isin(holiday).astype(int)
# Drop helper columns; ROOM_EMPTY holds only placeholder values here.
del test['datetime'], df3['datetime'], test['ROOM_EMPTY']
df3. columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'STATUS',
'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH',
'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull', 'in_hotel_num',
'type', 'DATE', 'in_year', 'in_month', 'in_day', 'in_quarter',
'in_dayofweek', 'in_dayofyear', 'in_weekofyear', 'in_is_wknd',
'holiday'],
dtype='object')
# Drop the raw time strings, then outer-merge the training features with
# the test skeleton on every shared column.
df3.drop(['IN_TIME', 'OUT_TIME'], axis=1, inplace=True)
merge_keys = ['HOTELID', 'DATE', 'type', 'in_year', 'in_month', 'in_day',
              'in_quarter', 'in_dayofweek', 'in_dayofyear', 'in_weekofyear',
              'in_is_wknd', 'holiday']
df4 = df3.merge(test, on=merge_keys, how='outer')
df4.shape
# Sanity check: every row after 2021-09-01 should come from the test set.
dd = df4[df4['DATE'] > '2021-09-01']
dd['type'].value_counts()
test 3311
Name: type, dtype: int64
df4. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 357330 entries, 0 to 357329
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 12543 non-null float64
1 ORDER_PRIMARY_ID 12543 non-null object
2 HOTELID 357330 non-null object
3 ORDER_TIME 12543 non-null object
4 STATUS 12543 non-null float64
5 INSERT_TIME 12543 non-null object
6 MODIFY_TIME 12543 non-null object
7 FIRM 12543 non-null float64
8 GUEST_ID 1877 non-null object
9 BDATE 12543 non-null object
10 XZQH 12543 non-null object
11 guest_sum 12543 non-null float64
12 guest_sum_notnull 12543 non-null float64
13 in_hotel_num 354019 non-null float64
14 type 357330 non-null object
15 DATE 357330 non-null object
16 in_year 357330 non-null int64
17 in_month 357330 non-null int64
18 in_day 357330 non-null int64
19 in_quarter 357330 non-null int64
20 in_dayofweek 357330 non-null int64
21 in_dayofyear 357330 non-null int64
22 in_weekofyear 357330 non-null int64
23 in_is_wknd 357330 non-null int64
24 holiday 357330 non-null int64
dtypes: float64(6), int64(9), object(10)
memory usage: 70.9+ MB
# room 信息 — registered homestay / room metadata.
room_info = pd.read_csv(path + '网约房注册民宿.csv')
room_info.head(1)
CODE HOTELID JYMJ ROOM_NUM BED_NUM FWLY CZLY CALLED CITY_CODE BUR_CODE STA_CODE SSX ADDRESS MPHM JYQK FIRM DJSJ BGSJ STATUS AUDITSTATUS 0 100177 3710830002 0.0 1 1 2 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 371000000000 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN NaN
room_info['STATUS'].value_counts()
# Drop duplicated/unused columns; CODE is the key that matches the order
# data's HOTELID, so rename it accordingly.
room_info.drop(['HOTELID', 'FWLY', 'CITY_CODE', 'STATUS', 'AUDITSTATUS', 'JYQK'], axis=1, inplace=True)
room_info.rename(columns={'CODE': 'HOTELID'}, inplace=True)
room_info['FIRM'].value_counts()
3 3429
10 1878
Name: FIRM, dtype: int64
room_info. nunique( )
HOTELID 5307
JYMJ 173
ROOM_NUM 11
BED_NUM 17
CZLY 4
CALLED 4232
BUR_CODE 7
STA_CODE 54
SSX 5
ADDRESS 4850
MPHM 584
FIRM 2
DJSJ 3753
BGSJ 3294
dtype: int64
# Only two listings have identical registration and modification times.

# A floor area (JYMJ) of 0 means missing.
room_info['JYMJ'] = room_info['JYMJ'].replace(0, np.nan)
# NOTE(review): room_use_col still lists 'STATUS', which was dropped above;
# the variable is never used afterwards — kept for fidelity.
room_use_col = ['HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'FIRM', 'STATUS']

# Frequency-encode the categorical / repeated columns.
freq_col = ['CALLED', 'ADDRESS', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY',
            'BUR_CODE', 'STA_CODE', 'SSX', 'MPHM', 'FIRM']
for col in freq_col:
    counts = room_info[col].value_counts().reset_index()
    counts.columns = [col, col + '_freq']
    room_info = room_info.merge(counts, on=col, how='left')

# Density-style ratios and the registration-to-modification gap in days.
room_info['room_ratio'] = room_info['JYMJ'] / room_info['ROOM_NUM']
room_info['bed_ratio'] = room_info['JYMJ'] / room_info['BED_NUM']
room_info['room_bed'] = room_info['BED_NUM'] / room_info['ROOM_NUM']
room_info['DJSJ'] = pd.to_datetime(room_info['DJSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['BGSJ'] = pd.to_datetime(room_info['BGSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['DJ_date'] = room_info['DJSJ'].dt.date
room_info['BG_date'] = room_info['BGSJ'].dt.date
room_info['DJ_gap'] = (room_info['BG_date'] - room_info['DJ_date']).dt.days
room_info[ 'DJSJ' ] . min ( ) , room_info[ 'DJSJ' ] . max ( ) , room_info[ 'BGSJ' ] . min ( ) , room_info[ 'BGSJ' ] . max ( )
(Timestamp('2020-05-12 13:16:24'),
Timestamp('2021-10-19 16:05:41'),
Timestamp('2020-07-15 10:23:06'),
Timestamp('2021-10-19 16:09:15'))
# Distribution of floor areas (JYMJ).
plt.figure(figsize=(20, 8))
sns.countplot(room_info['JYMJ'])
<AxesSubplot:xlabel='JYMJ', ylabel='count'>
(Figure: count plot of the JYMJ floor-area column — original external image link broken; saved as output_119_1.png.)
经营面积缺失超过一半,3000多,考虑删除
# Merge the room metadata into the day-level frame (FIRM already exists
# on the order side, so drop the duplicate first).
del room_info['FIRM']
df = df4.merge(room_info, on=['HOTELID'], how='left')
df = df.sort_values(by=['HOTELID', 'DATE'])
gc.collect()
170
df. head( 2 )
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH guest_sum guest_sum_notnull in_hotel_num type DATE in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd holiday JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap 37946 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 train 2020-06-06 2020 6 6 2 5 158 23 1 0 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 37947 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 train 2020-06-07 2020 6 7 2 6 159 23 1 0 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0
df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 357330 entries, 37946 to 354018
Data columns (total 54 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 12543 non-null float64
1 ORDER_PRIMARY_ID 12543 non-null object
2 HOTELID 357330 non-null object
3 ORDER_TIME 12543 non-null object
4 STATUS 12543 non-null float64
5 INSERT_TIME 12543 non-null object
6 MODIFY_TIME 12543 non-null object
7 FIRM 12543 non-null float64
8 GUEST_ID 1877 non-null object
9 BDATE 12543 non-null object
10 XZQH 12543 non-null object
11 guest_sum 12543 non-null float64
12 guest_sum_notnull 12543 non-null float64
13 in_hotel_num 354019 non-null float64
14 type 357330 non-null object
15 DATE 357330 non-null object
16 in_year 357330 non-null int64
17 in_month 357330 non-null int64
18 in_day 357330 non-null int64
19 in_quarter 357330 non-null int64
20 in_dayofweek 357330 non-null int64
21 in_dayofyear 357330 non-null int64
22 in_weekofyear 357330 non-null int64
23 in_is_wknd 357330 non-null int64
24 holiday 357330 non-null int64
25 JYMJ 141308 non-null float64
26 ROOM_NUM 357330 non-null int64
27 BED_NUM 357330 non-null int64
28 CZLY 357330 non-null int64
29 CALLED 357330 non-null object
30 BUR_CODE 167625 non-null float64
31 STA_CODE 167625 non-null object
32 SSX 357330 non-null int64
33 ADDRESS 357330 non-null object
34 MPHM 357330 non-null object
35 DJSJ 356948 non-null datetime64[ns]
36 BGSJ 357330 non-null datetime64[ns]
37 CALLED_freq 357330 non-null int64
38 ADDRESS_freq 357330 non-null int64
39 JYMJ_freq 141308 non-null float64
40 ROOM_NUM_freq 357330 non-null int64
41 BED_NUM_freq 357330 non-null int64
42 CZLY_freq 357330 non-null int64
43 BUR_CODE_freq 167625 non-null float64
44 STA_CODE_freq 167625 non-null float64
45 SSX_freq 357330 non-null int64
46 MPHM_freq 357330 non-null int64
47 FIRM_freq 357330 non-null int64
48 room_ratio 141308 non-null float64
49 bed_ratio 141308 non-null float64
50 room_bed 357330 non-null float64
51 DJ_date 356948 non-null object
52 BG_date 357330 non-null object
53 DJ_gap 356948 non-null float64
dtypes: datetime64[ns](2), float64(15), int64(21), object(16)
memory usage: 149.9+ MB
# Drop columns that are more than ~50% missing (>180k of ~357k rows).
missing = df.isnull().sum()
missing = missing[missing > 180000]
missing_col = missing.index.tolist()
df.drop(columns=missing_col, inplace=True)
df['DATE'].min(), df['DATE'].max()
# High-cardinality free-text columns are unusable as raw features
# (their frequency encodings were kept above).
df.drop(columns=['CALLED', 'ADDRESS'], inplace=True)
df[ : 5 ]
HOTELID in_hotel_num type DATE in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd holiday ROOM_NUM BED_NUM CZLY SSX MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq SSX_freq MPHM_freq FIRM_freq room_bed DJ_date BG_date DJ_gap 37946 100177 0.0 train 2020-06-06 2020 6 6 2 5 158 23 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 37947 100177 0.0 train 2020-06-07 2020 6 7 2 6 159 23 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 37948 100177 0.0 train 2020-06-13 2020 6 13 2 5 165 24 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 37949 100177 0.0 train 2020-06-14 2020 6 14 2 6 166 24 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 37950 100177 0.0 train 2020-06-20 2020 6 20 2 5 172 25 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0
def qty_shift(df, val, lags=(1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20)):
    """Add per-hotel lag features of column *val* to *df* (in place).

    For each lag ``k`` in *lags* a column ``last_<k>_qty`` is created
    holding the value of *val* ``k`` rows earlier within the same
    HOTELID group; NaNs produced by shifting are then forward-filled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'HOTELID' column and the column named by *val*.
    val : str
        Name of the column to lag.
    lags : sequence of int, optional
        Lag offsets to generate; defaults to the original hard-coded set.

    Returns
    -------
    pandas.DataFrame
        The same *df* object with the lag columns added.

    NOTE(review): the forward fill runs over the whole column rather than
    per group, so the leading rows of one hotel can be filled with values
    from the previous hotel — preserved from the original code; confirm
    whether this leakage is intended.
    """
    grouped = df.groupby('HOTELID')[val]
    for k in lags:
        # groupby.shift is already aligned to df's index, so direct
        # assignment suffices (the original reset_index/sort_index/
        # set_index round trip was a no-op).  .ffill() replaces the
        # deprecated fillna(method='ffill').
        df['last_%d_qty' % k] = grouped.shift(k).ffill()
    return df
vals = ['in_hotel_num', 'in_year', 'in_month', 'in_dayofyear', 'in_quarter']
for val in vals:
    print(val)
    # NOTE(review): qty_shift writes the same last_<k>_qty column names on
    # every iteration, so after this loop the lag features describe only
    # the last element of vals ('in_quarter') — confirm this is intended.
    df = qty_shift(df, val)
in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
def qty_rolling(df, window, val, keys):
    """Add rolling-window features of *val* over the preceding rows.

    Within each *keys* group the series is shifted by one (so a row never
    sees its own value) and aggregated over a *window*-row window with at
    least 3 observations required:

    * ``qty_rolling<window>_mean`` — triangular-weighted mean
    * ``qty_rolling<window>_max``  — plain maximum
    * ``qty_rolling<window>_sum``  — plain sum

    Returns the mutated *df*.
    """
    prefix = 'qty_rolling' + str(window)
    grouped = df.groupby(keys)[val]

    def shifted_window(x, **kw):
        # One-step shift keeps the feature strictly historical.
        return x.shift(1).rolling(window=window, min_periods=3, **kw)

    df[prefix + '_mean'] = grouped.transform(
        lambda x: shifted_window(x, win_type="triang").mean()).values.tolist()
    df[prefix + '_max'] = grouped.transform(
        lambda x: shifted_window(x).max()).values.tolist()
    df[prefix + '_sum'] = grouped.transform(
        lambda x: shifted_window(x).sum()).values.tolist()
    return df
gc. collect( )
34
keys = 'HOTELID'
for val in vals:
    print(val)
    # NOTE(review): the qty_rolling<w>_* column names do not include val,
    # so each val iteration overwrites the previous one's features.
    df = qty_rolling(df, 3, val, keys)
    df = qty_rolling(df, 4, val, keys)
    df = qty_rolling(df, 6, val, keys)
in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
def qty_ewm(df, alpha, val, keys):
    """Append exponentially-weighted features of `val` within each `keys` group.

    Adds qty_ewm_mean / qty_ewm_std / qty_ewm_corr, each computed on the series
    lagged one step (shift(1)) so a row never sees its own value.

    NOTE(review): the column names include neither `val` nor `alpha`, so
    repeated calls overwrite earlier results; `.corr()` of a series with itself
    is degenerate (constant ~1 or NaN) — confirm this feature is intended.

    Returns the mutated DataFrame.
    """
    grouped = df.groupby(keys)[val]
    for stat in ('mean', 'std', 'corr'):
        df['qty_ewm' + '_' + stat] = grouped.transform(
            lambda s: getattr(s.shift(1).ewm(alpha=alpha), stat)()
        ).values.tolist()
    return df
# Exponentially-weighted features (alpha=0.95 puts most weight on the latest lag).
# NOTE(review): qty_ewm writes fixed column names, so each iteration overwrites
# the previous column's ewm features.
for val in vals:
    print(val)
    df = qty_ewm(df, 0.95, val, keys)
in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
df. select_dtypes( include= 'object' ) . columns
Index(['HOTELID', 'type', 'DATE', 'MPHM', 'DJ_date', 'BG_date'], dtype='object')
# Invert the target: 1 = no check-ins that day (room presumed empty),
# 0 = at least one check-in.
df['in_hotel_num'] = df['in_hotel_num'].apply(lambda x: 1 if x == 0 else 0)
df[ : 5 ]
HOTELID in_hotel_num type DATE in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd holiday ROOM_NUM BED_NUM CZLY SSX MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq SSX_freq MPHM_freq FIRM_freq room_bed DJ_date BG_date DJ_gap last_1_qty last_2_qty last_3_qty last_4_qty last_5_qty last_6_qty last_7_qty last_8_qty last_10_qty last_12_qty last_15_qty last_20_qty qty_rolling3_mean qty_rolling3_max qty_rolling3_sum qty_rolling4_mean qty_rolling4_max qty_rolling4_sum qty_rolling6_mean qty_rolling6_max qty_rolling6_sum qty_ewm_mean qty_ewm_std qty_ewm_corr 37946 100177 1 train 2020-06-06 2020 6 6 2 5 158 23 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 37947 100177 1 train 2020-06-07 2020 6 7 2 6 159 23 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 NaN NaN 37948 100177 1 train 2020-06-13 2020 6 13 2 5 165 24 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 2.0 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 0.0 NaN 37949 100177 1 train 2020-06-14 2020 6 14 2 6 166 24 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 2.0 2.0 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 2.0 6.0 2.0 2.0 6.0 2.0 2.0 6.0 2.0 0.0 NaN 37950 100177 1 train 2020-06-20 2020 6 20 2 5 172 25 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 2.0 2.0 2.0 2.0 NaN NaN NaN NaN NaN NaN NaN NaN 2.0 2.0 6.0 2.0 2.0 8.0 2.0 2.0 8.0 2.0 
0.0 NaN
# Replace remaining NaNs (mostly lag/rolling warm-up rows) with 0.
df = df.fillna(0)
# Candidate features: every numeric column except the target itself.
col_list = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
col_list.remove('in_hotel_num')
col_list
['in_year',
'in_month',
'in_day',
'in_quarter',
'in_dayofweek',
'in_dayofyear',
'in_weekofyear',
'in_is_wknd',
'holiday',
'ROOM_NUM',
'BED_NUM',
'CZLY',
'SSX',
'CALLED_freq',
'ADDRESS_freq',
'ROOM_NUM_freq',
'BED_NUM_freq',
'CZLY_freq',
'SSX_freq',
'MPHM_freq',
'FIRM_freq',
'room_bed',
'DJ_gap',
'last_1_qty',
'last_2_qty',
'last_3_qty',
'last_4_qty',
'last_5_qty',
'last_6_qty',
'last_7_qty',
'last_8_qty',
'last_10_qty',
'last_12_qty',
'last_15_qty',
'last_20_qty',
'qty_rolling3_mean',
'qty_rolling3_max',
'qty_rolling3_sum',
'qty_rolling4_mean',
'qty_rolling4_max',
'qty_rolling4_sum',
'qty_rolling6_mean',
'qty_rolling6_max',
'qty_rolling6_sum',
'qty_ewm_mean',
'qty_ewm_std',
'qty_ewm_corr']
gc. collect( )
102
used_features = col_list
cate_cols = ['HOTELID', 'MPHM']
# Time-based split: train before 2021-08-02, validate on August, test by flag.
# NOTE(review): both filters use strict inequalities, so rows dated exactly
# 2021-08-02 fall into neither train nor validation — confirm intentional.
X_train = df[df["DATE"] < '2021-08-02'][used_features].reset_index(drop=True)
y_train = df[df["DATE"] < '2021-08-02']["in_hotel_num"]
X_valid = df[(df["DATE"] > '2021-08-02') & (df["DATE"] < '2021-09-01')][used_features].reset_index(drop=True)
y_valid = df[(df["DATE"] > '2021-08-02') & (df["DATE"] < '2021-09-01')]["in_hotel_num"]
X_test = df[df["type"] == 'test'][used_features].reset_index(drop=True)
# First model: LightGBM classifier on the time split above, AUC early stopping
# against the August validation month.
# NOTE(review): native-API aliases (feature_fraction, lambda_l*, bagging_*) are
# passed to the sklearn wrapper; LightGBM honours them but emits the alias
# warnings shown in the cell output below.
clf_1 = LGBMClassifier(num_leaves=256,
                       n_estimators=20000,       # large cap; early stopping decides
                       learning_rate=0.005,
                       verbose=-1,
                       max_bin=100,
                       max_depth=10,
                       feature_fraction_seed=66,
                       feature_fraction=0.7,
                       bagging_seed=66,
                       bagging_freq=1,
                       bagging_fraction=0.95,
                       metric='auc',
                       lambda_l1=0.1,
                       lambda_l2=0.1,
                       min_child_weight=30,
                       n_jobs=80)
clf_1.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds=100, verbose=200)
gc.collect()
[LightGBM] [Warning] feature_fraction is set=0.7, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7
[LightGBM] [Warning] lambda_l1 is set=0.1, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.1
[LightGBM] [Warning] bagging_fraction is set=0.95, subsample=1.0 will be ignored. Current value: bagging_fraction=0.95
[LightGBM] [Warning] lambda_l2 is set=0.1, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.1
[LightGBM] [Warning] bagging_freq is set=1, subsample_freq=0 will be ignored. Current value: bagging_freq=1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[10] valid_0's auc: 0.541449
357
oof_prob = clf_1. predict_proba( X_valid[ used_features] ) [ : , 1 ]
oof_prob. min ( ) , oof_prob. max ( )
(0.9610922808728843, 0.9625584615170373)
oof_prob1 = clf_1. predict_proba( X_test[ used_features] ) [ : , 1 ]
oof_prob1. min ( ) , oof_prob1. max ( )
(0.960959321386754, 0.9624418595928675)
X_test[ used_features] [ : 10 ]
STATUS FIRM guest_sum guest_sum_notnull in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd holiday JYMJ ROOM_NUM BED_NUM CZLY BUR_CODE SSX CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_gap last_1_qty last_2_qty last_3_qty last_4_qty last_5_qty last_6_qty last_7_qty last_8_qty last_10_qty last_12_qty last_15_qty last_20_qty qty_rolling3_mean qty_rolling3_max qty_rolling3_sum qty_rolling4_mean qty_rolling4_max qty_rolling4_sum qty_rolling6_mean qty_rolling6_max qty_rolling6_sum qty_ewm_mean qty_ewm_std qty_ewm_corr 0 NaN NaN NaN NaN 2021 9 4 3 5 247 35 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 6.328848e-12 1.0 1 NaN NaN NaN NaN 2021 9 5 3 6 248 35 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 1.415174e-12 1.0 2 NaN NaN NaN NaN 2021 9 11 3 5 254 36 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 3.164424e-13 1.0 3 NaN NaN NaN NaN 2021 9 12 3 6 255 36 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 7.075868e-14 1.0 4 NaN NaN NaN NaN 2021 9 19 3 6 262 37 1 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 1.582212e-14 1.0 5 NaN NaN NaN NaN 2021 9 20 3 0 263 38 0 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 
3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 3.537934e-15 1.0 6 NaN NaN NaN NaN 2021 9 21 3 1 264 38 0 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 7.911060e-16 1.0 7 NaN NaN NaN NaN 2021 9 4 3 5 247 35 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 6.328848e-12 1.0 8 NaN NaN NaN NaN 2021 9 5 3 6 248 35 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 1.415174e-12 1.0 9 NaN NaN NaN NaN 2021 9 11 3 5 254 36 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 3.164424e-13 1.0
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
from lightgbm import LGBMClassifier
import gc

cate_cols = []
drop_col = ['in_hotel_num', 'type']

# Build the train/test matrices from the combined frame.
# NOTE(review): `train`/`test` are filtered views of df; drop(..., inplace=True)
# on them can trigger SettingWithCopyWarning — a .copy() would be safer.
train = df[df['type'] == 'train']
labels = np.array(train['in_hotel_num'].values.tolist())
train.drop(drop_col, axis=1, inplace=True)
train = train[used_features]
test = df[df['type'] == 'test']
test_label = test['in_hotel_num'].values.tolist()
test.drop(drop_col, axis=1, inplace=True)
test = test[used_features]

# Expanding-window CV; only the last (largest-history) fold is trained below.
ts_folds = TimeSeriesSplit(n_splits=5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'

# Native-API parameter dict (AUC tracked for early stopping).
params = {
    'n_estimators': 20000,
    'boosting': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': {'auc'},
    'verbose': -1
}

for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    if fold_n in [0, 1, 2, 3]:
        continue  # skip the four smaller folds
    print('Training with validation')
    trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
                           categorical_feature=cate_cols)
    val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
                           categorical_feature=cate_cols)
    # BUG FIX: the original called LGBMClassifier.fit(params, trn_data, ...),
    # i.e. the unbound sklearn method with a params dict as `self`, which raises
    # AttributeError ('dict' object has no attribute '_le'). With lgb.Dataset
    # inputs the native training API must be used instead:
    clf = lgb.train(params, trn_data, num_boost_round=N_round,
                    valid_sets=[trn_data, val_data], verbose_eval=Verbose,
                    early_stopping_rounds=Early_Stopping_Rounds)
    val = clf.predict(train.iloc[valid_index])
    mae_ = mean_absolute_error(labels[valid_index], val)
    print('MAE: {}'.format(mae_))

# Prepare a refit on all training data using the early-stopped best iteration.
print("ReTraining on all data")
gc.enable()
del trn_data, val_data
gc.collect()
Best_iteration = clf.best_iteration
print("Best_iteration: ", Best_iteration)
trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)
Training with validation
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-535-a56605d14a52> in <module>
49 val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
50 categorical_feature=cate_cols)
---> 51 clf = LGBMClassifier.fit(params, train.iloc[train_index], df[df['type'] == 'train'].iloc[train_index]['in_hotel_num'], eval_set=[trn_data, val_data], verbose=Verbose, early_stopping_rounds=Early_Stopping_Rounds)
52 val = clf.predict(train.iloc[valid_index])
53 mae_ = mean_absolute_error(labels[valid_index], val)
F:\anaconda\lib\site-packages\lightgbm\sklearn.py in fit(self, X, y, sample_weight, init_score, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks, init_model)
783 _LGBMAssertAllFinite(y)
784 _LGBMCheckClassificationTargets(y)
--> 785 self._le = _LGBMLabelEncoder().fit(y)
786 _y = self._le.transform(y)
787 self._class_map = dict(zip_(self._le.classes_, self._le.transform(self._le.classes_)))
AttributeError: 'dict' object has no attribute '_le'
# Cast high-cardinality ID columns to pandas 'category' (LightGBM-friendly),
# though cate_cols below is left empty so they are not actually declared.
df['HOTELID'] = df['HOTELID'].astype('category')
df['MPHM'] = df['MPHM'].astype('category')
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import gc
cate_cols = []
drop_col = ['in_hotel_num', 'type']
# NOTE(review): `train`/`test` are filtered views of df; drop(..., inplace=True)
# on them can trigger SettingWithCopyWarning — a .copy() would be safer.
train = df[df['type'] == 'train']
labels = np.array(train['in_hotel_num'].values.tolist())
train.drop(drop_col, axis=1, inplace=True)
train = train[used_features]
test = df[df['type'] == 'test']
test_label = test['in_hotel_num'].values.tolist()
test.drop(drop_col, axis=1, inplace=True)
test = test[used_features]
# Expanding-window CV; only the last (largest-history) fold is trained below.
ts_folds = TimeSeriesSplit(n_splits=5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'
# Regression objective on the target; l1/l2 tracked for early stopping.
params = {
    'objective': 'regression',
    'boosting': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': {'l2', 'l1'},
    'verbose': -1
}
for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    if fold_n in [0, 1, 2, 3]:
        continue  # skip the four smaller folds
    print('Training with validation')
    trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
                           categorical_feature=cate_cols)
    val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
                           categorical_feature=cate_cols)
    clf = lgb.train(params, trn_data, num_boost_round=N_round,
                    valid_sets=[trn_data, val_data], verbose_eval=Verbose,
                    early_stopping_rounds=Early_Stopping_Rounds)
    val = clf.predict(train.iloc[valid_index])
    mae_ = mean_absolute_error(labels[valid_index], val)
    print('MAE: {}'.format(mae_))
# Refit on all training data for ~20% more rounds than the early-stopped best.
print("ReTraining on all data")
gc.enable()
del trn_data, val_data
gc.collect()
Best_iteration = clf.best_iteration
print("Best_iteration: ", Best_iteration)
trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)
clf = lgb.train(params, trn_data, num_boost_round=int(Best_iteration * 1.2))
Training with validation
Training until validation scores don't improve for 100 rounds
[500] training's l2: 0.0357266 training's l1: 0.0733432 valid_1's l2: 0.0140039 valid_1's l1: 0.0424839
[1000] training's l2: 0.034654 training's l1: 0.0715026 valid_1's l2: 0.0136241 valid_1's l1: 0.0362867
[1500] training's l2: 0.0340906 training's l1: 0.0702526 valid_1's l2: 0.0134038 valid_1's l1: 0.0323129
[2000] training's l2: 0.0337541 training's l1: 0.0694053 valid_1's l2: 0.0132898 valid_1's l1: 0.0298233
[2500] training's l2: 0.0335207 training's l1: 0.0688021 valid_1's l2: 0.0131952 valid_1's l1: 0.028312
[3000] training's l2: 0.0333523 training's l1: 0.068367 valid_1's l2: 0.0131123 valid_1's l1: 0.0273034
[3500] training's l2: 0.033219 training's l1: 0.0680436 valid_1's l2: 0.0130372 valid_1's l1: 0.0264549
[4000] training's l2: 0.033109 training's l1: 0.0678005 valid_1's l2: 0.0129586 valid_1's l1: 0.0259314
[4500] training's l2: 0.0330175 training's l1: 0.0676386 valid_1's l2: 0.0129 valid_1's l1: 0.0256456
[5000] training's l2: 0.0329394 training's l1: 0.0675283 valid_1's l2: 0.0128614 valid_1's l1: 0.0254616
[5500] training's l2: 0.0328693 training's l1: 0.0674366 valid_1's l2: 0.0128206 valid_1's l1: 0.0253768
Early stopping, best iteration is:
[5586] training's l2: 0.0328584 training's l1: 0.067422 valid_1's l2: 0.0128154 valid_1's l1: 0.0253677
MAE: 0.025367655553334097
ReTraining on all data
Best_iteration: 5586
pred1 = clf. predict( test)
pred1. min ( ) , pred1. max ( )
(0.2106748391712024, 0.9860467852769609)
# Attach the regression predictions to the test rows and keep submission columns.
pre_d = df[df['type'] == 'test']
pre_d['ROOM_EMPTY'] = pred1.tolist()
pre_d1 = pre_d[['HOTELID', 'DATE', 'ROOM_EMPTY']]
# Inspect one day's predictions.
pp = pre_d1[pre_d1['DATE'] == '2021-09-21']
pp[ : 5 ]
HOTELID DATE ROOM_EMPTY 356552 10083 2021-09-21 0.726068 355831 10125 2021-09-21 0.825109 354872 10237 2021-09-21 0.841640 356181 10273 2021-09-21 0.841640 356825 104814 2021-09-21 0.726068
np. median( pp[ 'ROOM_EMPTY' ] )
0.8079157059926819
pre_d1[ pre_d1[ 'ROOM_EMPTY' ] > 0.807 ] . shape
(2667, 3)
# Binarize near the observed median score (0.807).
# NOTE(review): scores above the cut map to 0 and below to 1 — verify this
# matches the intended direction of the ROOM_EMPTY label.
pre_d1['ROOM_EMPTY'] = pre_d1['ROOM_EMPTY'].apply(lambda x: 0 if x > 0.807 else 1)
pre_d1[ 'ROOM_EMPTY' ] . sum ( )
644
pre_d1. to_csv( 'pre_02_reg.csv' , index= False )
pre_d1[ 'ROOM_EMPTY' ] . median( )
0.8752981387595966
pre_d1[ 'ROOM_EMPTY' ] . quantile( 0.2 )
0.8070988321951651
sns. countplot( pre_d1[ 'ROOM_EMPTY' ] )
<AxesSubplot:xlabel='ROOM_EMPTY', ylabel='count'>
[figure: output_177_1.png — count plot of ROOM_EMPTY predictions; the external image link was broken during export, re-insert the saved image]
pre_d1. shape
(3311, 3)
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics. roc_curve( labels[ valid_index] , val, pos_label= 2 )
metrics. auc( fpr, tpr)
nan
pred. min ( ) , pred. max ( )
(0.03637936444857589, 1.8554153046371984)
pre_df = df[ df[ 'type' ] == 'test' ]
pre_df[ 'ROOM_EMPTY' ] = pred. tolist( )
pre_df1. to_csv( 'origion.csv' , index= False )
pre_df[ : 5 ]
HOTELID in_hotel_num type DATE in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd holiday last_1_qty last_2_qty last_3_qty last_4_qty last_5_qty last_6_qty last_7_qty last_8_qty last_10_qty last_12_qty last_15_qty last_20_qty qty_rolling3_mean qty_rolling3_max qty_rolling3_sum qty_rolling4_mean qty_rolling4_max qty_rolling4_sum qty_rolling6_mean qty_rolling6_max qty_rolling6_sum qty_ewm_mean qty_ewm_std qty_ewm_corr ROOM_EMPTY 356546 10083 0 test 2021-09-04 2021 9 4 3 5 247 35 1 0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 6.328848e-12 1.0 0.054686 356547 10083 0 test 2021-09-05 2021 9 5 3 6 248 35 1 0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 1.415174e-12 1.0 0.055554 356548 10083 0 test 2021-09-11 2021 9 11 3 5 254 36 1 0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 3.164424e-13 1.0 0.247202 356549 10083 0 test 2021-09-12 2021 9 12 3 6 255 36 1 0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 7.075868e-14 1.0 0.105092 356550 10083 0 test 2021-09-19 2021 9 19 3 6 262 37 1 1 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 1.582212e-14 1.0 0.094995
pre_df1 = pre_df[ [ 'HOTELID' , 'DATE' , 'ROOM_EMPTY' ] ]
pre_df1[ pre_df1[ 'ROOM_EMPTY' ] > 0.07 ] . shape
(2380, 3)
pre_df1[ 'ROOM_EMPTY' ] = pre_df1[ 'ROOM_EMPTY' ] . apply ( lambda x: 0 if x>= 0.25 else 1 )
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics. roc_curve( labels[ valid_index] , val, pos_label= 2 )
metrics. auc( fpr, tpr)
0.7065505364632024
a = pd. DataFrame( )
a[ 'y' ] = labels[ valid_index] . tolist( )
a[ 'val' ] = val. tolist( )
fpr, tpr, thresholds = metrics. roc_curve( a[ 'y' ] , a[ 'val' ] , pos_label= 2 )
metrics. auc( fpr, tpr)
nan
np. array( a[ 'val' ] . values. tolist( ) )
array([0.98077955, 0.98028159, 0.98090901, ..., 0.99187334, 0.9917258 ,
0.99187287])
a[ 'y' ] . max ( )
1
a[ 'y' ] = a[ 'y' ] . apply ( lambda x: 1 if x== 0 else 0 )
a[ 'val' ] = a[ 'val' ] . apply ( lambda x: ( 1 - x/ ( 4.03212491 ) ) )
a[ : 5 ]
y val 0 1 0.980780 1 1 0.980282 2 1 0.980909 3 1 0.986415 4 1 0.988064
val. max ( ) - val. min ( )
4.0321249098355265
labels[ valid_index]
array([0., 0., 0., 0., 0., 0., 2., 2., 2., 2., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0.])
val[ 140 : 160 ]
array([0.02691592, 0.02701038, 0.02690596, 0.02699238, 0.02690596,
0.07806598, 0.07950708, 0.29049198, 0.77018916, 0.98434103,
0.05606983, 0.06270357, 0.04589361, 0.05028036, 0.04938166,
0.05054352, 0.04737767, 0.11532118, 0.05313139, 0.04558791])
gc. collect( )
388
from tqdm import tqdm
from sklearn. metrics import *
print ( "#############################输出第一个模型的评价参数及结果#############################" )
def find_best_threshold(y_valid, oof_prob):
    """Grid-search a binarization threshold that maximizes ROC-AUC.

    NOTE(review): despite the parameter names, the call site below passes the
    predictions as `y_valid` and the true labels as `oof_prob`
    (find_best_threshold(val1, y1)), so inside this function `oof_prob` holds
    labels and `y_valid` holds scores; since roc_curve's signature is
    roc_curve(y_true, y_score), the two swaps cancel in the call order.
    pos_label=2 matches the raw label encoding (values 0 and 2 seen earlier),
    but after the >= th binarization positives become 1, which makes the
    pos_label choice questionable — verify. The returned "best_f2" is an AUC,
    not an F2 score; the naming is historical.
    """
    best_f2 = 0
    recall = 0      # unused — kept as in the original
    precision = 0   # unused — kept as in the original
    best_th = 0
    # Sweep thresholds 0.05 .. 0.9995 in steps of 1/2000.
    for th in tqdm([i / 2000 for i in range(100, 2000)]):
        oof_prob_copy = oof_prob.copy()
        oof_prob_copy[oof_prob_copy >= th] = 1
        oof_prob_copy[oof_prob_copy < th] = 0
        fpr, tpr, thresholds = metrics.roc_curve(oof_prob_copy, y_valid, pos_label=2)
        aucs = metrics.auc(fpr, tpr)
        if aucs > best_f2:
            best_th = th
            best_f2 = aucs
    return best_th, best_f2
val1 = val
y1 = labels[ valid_index]
best_th, aucs = find_best_threshold( val1, y1)
print ( "分界值" , best_th)
print ( "F2评价分数" , aucs)
1%|▏ | 10/1900 [00:00<00:23, 80.05it/s]
#############################输出第一个模型的评价参数及结果#############################
100%|██████████████████████████████████████| 1900/1900 [00:20<00:00, 94.61it/s]
分界值 0.05
F2评价分数 0.7065505364632024
pre_df1[ 20 : 33 ]
HOTELID DATE ROOM_EMPTY 354872 10237 2021-09-21 0 356175 10273 2021-09-04 0 356176 10273 2021-09-05 0 356177 10273 2021-09-11 0 356178 10273 2021-09-12 0 356179 10273 2021-09-19 0 356180 10273 2021-09-20 0 356181 10273 2021-09-21 0 356819 104814 2021-09-04 0 356820 104814 2021-09-05 0 356821 104814 2021-09-11 0 356822 104814 2021-09-12 0 356823 104814 2021-09-19 0
pre_df1. to_csv( 'pre_025_regression.csv' , index= False )
pre_df1[ 'ROOM_EMPTY' ] . mode( )
0 0.20966
dtype: float64
pre_df1[ 'ROOM_EMPTY' ] . median( )
0.10509209231427108
plt. figure( figsize= ( 16 , 8 ) )
sns. countplot( pre_df1[ 'ROOM_EMPTY' ] )
<AxesSubplot:xlabel='ROOM_EMPTY', ylabel='count'>
[figure: output_213_1.png — count plot of ROOM_EMPTY predictions; the external image link was broken during export, re-insert the saved image]
df4[ 'date1' ] = df4[ 'date' ] . astype( 'str' )
use_month = [ 5 , 6 , 7 , 8 , 9 ]
df_1 = df4[ ( df4[ 'in_is_wknd' ] == 1 ) & ( df4[ 'in_month' ] . isin( use_month) ) ]
df_1. shape
(11583, 59)
df_1[ 'in_month' ] . value_counts( )
7 4391
5 2520
6 2192
8 1656
9 824
Name: in_month, dtype: int64
holiday = [ '2021-05-01' , '2021-05-02' , '2021-05-03' , '2021-05-04' , '2021-05-05' , '2021-06-12' , '2021-06-13' , '2021-06-14' ,
'2021-09-19' , '2021-09-20' , '2021-09-21' ]
df4[ df4[ 'date' ] == '2021-05-03' ]
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num in_year in_month in_day in_quarter in_dayofweek in_dayofyear record_is_wknd in_is_wknd
df_2 = df4[ df4[ 'date1' ] . isin( holiday) ]
df_2[ 'holiday' ] = 1
df_1[ 'holiday' ] = 0
df5 = pd. concat( [ df_2, df_1] )
df5. shape
(14662, 61)
df5. drop( [ 'CALLED' , 'ADDRESS' , 'INSERT_TIME' , 'MODIFY_TIME' , 'IN_TIME' , 'OUT_TIME' ] , axis= 1 , inplace= True )
df5[ : 3 ]
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS FIRM GUEST_ID BDATE XZQH guest_sum in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY BUR_CODE STA_CODE SSX MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num in_year in_month in_day in_quarter in_dayofweek in_dayofyear record_is_wknd in_is_wknd date1 holiday 9 3597092 CFD17F6BB142485DB3DEC6B5E2D0B664 100193 202105012046 1 3 NaN 19771008 132826 1 2021-05-01 20:46:00 2021-05-02 11:58:00 2021-05-01 20:46:00 2021-05-01 2021-05-02 2021-05-01 NaN 2 2 1 NaN NaN 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 1249 2281 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 1 2021 5 1 2 5 121 1 1 2021-05-01 1 11 3601837 5C96E986C1F04691A3EB07962BCC7B00 100195 202105021435 1 3 NaN 19951009 370203 1 2021-05-02 14:35:00 2021-05-03 11:58:00 2021-05-02 14:35:00 2021-05-02 2021-05-03 2021-05-02 NaN 2 3 1 NaN NaN 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 1249 895 4727 NaN NaN 150 2565 3429 NaN NaN 1.5 2020-05-18 2020-07-15 58.0 1 2021 5 2 2 6 122 1 1 2021-05-02 1 53 3314117 C0C5516462D3487C895DD654B836F855 10037 202104082130 1 3 NaN 19730313 120103 1 2021-04-30 13:58:00 2021-05-03 11:58:00 2021-05-01 13:58:00 2021-04-30 2021-05-03 2021-05-01 NaN 1 2 1 NaN NaN 371002 ROOM001 2020-05-18 10:22:13 2020-07-15 10:23:58 2 2 NaN 3091 2281 4727 NaN NaN 4661 2565 3429 NaN NaN 2.0 2020-05-18 2020-07-15 58.0 1 2021 5 1 2 5 121 1 1 2021-05-01 1
drop_col = df5. columns. tolist( )
drop_col. remove( 'holiday' )
df5. drop_duplicates( drop_col, keep= 'first' , inplace= True )
df5. shape
(12708, 54)
df5[ 'HOTELID' ] . nunique( )
3133
test = pd. read_csv( 'testb/submit_example_2.csv' )
test[ : 2 ]
HOTELID DATE ROOM_EMPTY 0 303760 2021-09-04 0.1 1 303760 2021-09-05 0.1
test. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3311 entries, 0 to 3310
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 HOTELID 3311 non-null object
1 DATE 3311 non-null object
2 ROOM_EMPTY 3311 non-null float64
dtypes: float64(1), object(2)
memory usage: 77.7+ KB
hotel = test[ 'HOTELID' ] . values. tolist( )
df5 = df5[ df5[ 'HOTELID' ] . isin( hotel) ]
df5. shape
(3708, 54)
d = df5[ df5[ 'in_month' ] == 9 ]
d[ 'date' ] . value_counts( )
2021-09-04 42
2021-09-05 29
2021-09-11 16
2021-09-19 14
2021-09-20 13
2021-09-18 12
2021-09-12 11
2020-09-06 9
2020-09-13 9
2020-09-05 9
2021-09-21 7
2020-09-19 7
2021-09-26 6
2020-09-27 6
2021-09-25 6
2020-09-26 6
2020-09-20 6
2020-09-12 4
Name: date, dtype: int64
test[ 'date1' ] = test[ 'DATE' ]
test1 = test. merge( df5, on= [ 'HOTELID' , 'date1' ] , how= 'left' )
test. shape
(3311, 4)
test1. shape
(3311, 56)
test1. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3311 entries, 0 to 3310
Data columns (total 56 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 HOTELID 3311 non-null object
1 DATE 3311 non-null object
2 ROOM_EMPTY 3311 non-null float64
3 date1 3311 non-null object
4 ORDER_ID 132 non-null float64
5 ORDER_PRIMARY_ID 132 non-null object
6 ORDER_TIME 132 non-null object
7 STATUS 132 non-null float64
8 FIRM 132 non-null float64
9 GUEST_ID 16 non-null object
10 BDATE 132 non-null object
11 XZQH 132 non-null object
12 guest_sum 132 non-null float64
13 in_time 132 non-null datetime64[ns]
14 out_time 132 non-null datetime64[ns]
15 datetime 132 non-null datetime64[ns]
16 in_date 132 non-null object
17 out_date 132 non-null object
18 date 132 non-null object
19 JYMJ 1 non-null float64
20 ROOM_NUM 132 non-null float64
21 BED_NUM 132 non-null float64
22 CZLY 132 non-null float64
23 BUR_CODE 86 non-null float64
24 STA_CODE 86 non-null object
25 SSX 132 non-null float64
26 MPHM 132 non-null object
27 DJSJ 132 non-null datetime64[ns]
28 BGSJ 132 non-null datetime64[ns]
29 CALLED_freq 132 non-null float64
30 ADDRESS_freq 132 non-null float64
31 JYMJ_freq 1 non-null float64
32 ROOM_NUM_freq 132 non-null float64
33 BED_NUM_freq 132 non-null float64
34 CZLY_freq 132 non-null float64
35 BUR_CODE_freq 86 non-null float64
36 STA_CODE_freq 86 non-null float64
37 SSX_freq 132 non-null float64
38 MPHM_freq 132 non-null float64
39 FIRM_freq 132 non-null float64
40 room_ratio 1 non-null float64
41 bed_ratio 1 non-null float64
42 room_bed 132 non-null float64
43 DJ_date 132 non-null object
44 BG_date 132 non-null object
45 DJ_gap 132 non-null float64
46 in_hotel_num 132 non-null float64
47 in_year 132 non-null float64
48 in_month 132 non-null float64
49 in_day 132 non-null float64
50 in_quarter 132 non-null float64
51 in_dayofweek 132 non-null float64
52 in_dayofyear 132 non-null float64
53 record_is_wknd 132 non-null float64
54 in_is_wknd 132 non-null float64
55 holiday 132 non-null float64
dtypes: datetime64[ns](5), float64(36), object(15)
memory usage: 1.4+ MB
test1[ : 5 ]
HOTELID DATE ROOM_EMPTY date1 ORDER_ID ORDER_PRIMARY_ID ORDER_TIME STATUS FIRM GUEST_ID BDATE XZQH guest_sum in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY BUR_CODE STA_CODE SSX MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num in_year in_month in_day in_quarter in_dayofweek in_dayofyear record_is_wknd in_is_wknd holiday 0 303760 2021-09-04 0.1 2021-09-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1 303760 2021-09-05 0.1 2021-09-05 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2 303760 2021-09-11 0.1 2021-09-11 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3 303760 2021-09-12 0.1 2021-09-12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4 303760 2021-09-19 0.1 2021-09-19 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
# Rule-based guess: rows whose merged `holiday` flag is exactly 0 (a matched
# non-holiday weekend stay) get 1, everything else 0.
# NOTE(review): NaN is truthy in Python, so unmatched rows (holiday == NaN)
# also map to 0 — confirm that default is intended.
test1['holiday1'] = test1['holiday'].apply(lambda x: 0 if x else 1)
test1[ 'holiday1' ] . value_counts( )
0 3213
1 98
Name: holiday1, dtype: int64
test1[ : 10 ]
HOTELID DATE ROOM_EMPTY date1 ORDER_ID ORDER_PRIMARY_ID ORDER_TIME STATUS FIRM GUEST_ID BDATE XZQH guest_sum in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY BUR_CODE STA_CODE SSX MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num in_year in_month in_day in_quarter in_dayofweek in_dayofyear record_is_wknd in_is_wknd holiday holiday1 0 303760 2021-09-04 0.1 2021-09-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 1 303760 2021-09-05 0.1 2021-09-05 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 2 303760 2021-09-11 0.1 2021-09-11 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 3 303760 2021-09-12 0.1 2021-09-12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 4 303760 2021-09-19 0.1 2021-09-19 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 5 303760 2021-09-20 0.1 2021-09-20 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 6 303760 
2021-09-21 0.1 2021-09-21 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 7 315471 2021-09-04 0.1 2021-09-04 4798603.0 A139D5C20BB44529A66957AA491FB88E 202108212327 1.0 3.0 NaN 20001101 331021 1.0 2021-09-03 13:58:00 2021-09-05 11:58:00 2021-09-04 13:58:00 2021-09-03 2021-09-05 2021-09-04 NaN 1.0 2.0 1.0 3.710980e+11 371000000007 371002.0 ROOM001 2021-05-05 13:58:58 2021-05-05 14:02:45 2.0 1.0 NaN 3091.0 2281.0 4727.0 852.0 433.0 4661.0 2565.0 3429.0 NaN NaN 2.0 2021-05-05 2021-05-05 0.0 1.0 2021.0 9.0 4.0 3.0 5.0 247.0 1.0 1.0 0.0 1 8 315471 2021-09-05 0.1 2021-09-05 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 9 315471 2021-09-11 0.1 2021-09-11 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
test1[ 'ROOM_EMPTY' ] = test1[ 'holiday1' ]
test2 = test1[ [ 'HOTELID' , 'DATE' , 'ROOM_EMPTY' ] ]
test2[ : 5 ]
HOTELID DATE ROOM_EMPTY 0 303760 2021-09-04 0 1 303760 2021-09-05 0 2 303760 2021-09-11 0 3 303760 2021-09-12 0 4 303760 2021-09-19 0
test2. to_csv( 'pre.csv' , index= False )
pre = pd. read_csv( 'file41704172.csv' )
pre[ : 2 ]
HOTELID DATE room 0 303760 2021-09-04 0.0 1 303760 2021-09-05 0.0
pre. rename( columns= { 'ROOM_EMPTY' : 'room' } , inplace= True )
pr = test2. merge( pre, on= [ 'HOTELID' , 'DATE' ] , how= 'left' )
pr[ 'ROOM_EMPTY' ] = pr[ 'ROOM_EMPTY' ] + pr[ 'room' ]
del pr[ 'room' ]
pre[ 'room' ] . value_counts( )
0.0 2838
1.0 473
Name: room, dtype: int64
pr. to_csv( 'pre1.csv' , index= False )
dd = df5. groupby( [ 'in_year' , 'in_month' ] ) [ 'ORDER_ID' ] . count( ) . reset_index( )
dd
in_year in_month ORDER_ID 0 2020 7 12 1 2020 8 102 2 2020 9 56 3 2021 5 735 4 2021 6 833 5 2021 7 1279 6 2021 8 535 7 2021 9 156
dd[ 'ym' ] = dd[ 'in_year' ] * 100 + dd[ 'in_month' ]
dd[ 'ym' ] . value_counts( )
202105 1
202007 1
202008 1
202009 1
202106 1
202107 1
202108 1
202109 1
Name: ym, dtype: int64
plt. figure( figsize= ( 15 , 8 ) )
plt. scatter( dd. index, dd[ 'ORDER_ID' ] )
<matplotlib.collections.PathCollection at 0x1356dc08520>
[figure: output_270_1.png — scatter of monthly order counts; the external image link was broken during export, re-insert the saved image]