% matplotlib inline
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns
import scipy. stats as st
import os
import re
import gc
import warnings
warnings. filterwarnings( 'ignore' )
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
pd. set_option( 'display.max_columns' , None )
pd. set_option( 'display.max_rows' , None )
pd. set_option( 'display.max_colwidth' , 100 )
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
path = 'traina/'
path1 = 'trainb/'
入住信息
cust_info = pd. read_csv( path + '网约房平台入住人表.csv' )
cust_info1 = pd. read_csv( path1 + '网约房平台入住人表.csv' )
cust = pd. concat( [ cust_info, cust_info1] )
cust. head( 2 )
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME 0 07C5BF73B18B44B0877DEED007F8771D NaN 19800627 222405 NaN NaN 1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 2.021071e+11 2.021071e+11
gc. collect( )
89
cust[ 'BDATE' ] = cust[ 'BDATE' ] . astype( 'str' )
cust[ 'XZQH' ] = cust[ 'XZQH' ] . astype( 'str' )
cust[ 'IN_TIME' ] = cust[ 'IN_TIME' ] . fillna( 0 ) . astype( 'str' ) . apply ( lambda x: x[ : 12 ] )
cust[ 'OUT_TIME' ] = cust[ 'OUT_TIME' ] . fillna( 0 ) . astype( 'str' ) . apply ( lambda x: x[ : 12 ] )
cust. head( 2 )
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME 0 07C5BF73B18B44B0877DEED007F8771D NaN 19800627 222405 0.0 0.0 1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 202107092103 202107111158
cust. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44030 entries, 0 to 3343
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_PRIMARY_ID 44030 non-null object
1 GUEST_ID 5158 non-null object
2 BDATE 44030 non-null object
3 XZQH 44030 non-null object
4 IN_TIME 44030 non-null object
5 OUT_TIME 44030 non-null object
dtypes: object(6)
memory usage: 2.4+ MB
cust. nunique( )
ORDER_PRIMARY_ID 43883
GUEST_ID 3698
BDATE 9503
XZQH 2659
IN_TIME 3396
OUT_TIME 236
dtype: int64
cust[ 'ORDER_PRIMARY_ID' ] . value_counts( ) [ 5 : 10 ]
03158D5CA62E42339D697612EA347FB3 2
0A99D74F1E0248D68A729CB79FC640E6 2
1FE3E448558347D89C56B0AFCEC8ACFB 2
864E417B7BDE4C7B9449B7E087E9F21E 2
3DEFA75566934CA5929D0C47F0B95FDE 2
Name: ORDER_PRIMARY_ID, dtype: int64
cust. sort_values( [ 'ORDER_PRIMARY_ID' , 'GUEST_ID' ] , inplace= True )
stat = cust. groupby( [ 'ORDER_PRIMARY_ID' ] ) [ 'BDATE' ] . count( ) . reset_index( )
stat. columns = [ 'ORDER_PRIMARY_ID' , 'guest_sum' ]
stat[ 'guest_sum_notnull' ] = cust. groupby( [ 'ORDER_PRIMARY_ID' ] ) [ 'GUEST_ID' ] . agg( 'count' ) . values
cust = cust. merge( stat, on= [ 'ORDER_PRIMARY_ID' ] , how= 'left' )
cust. drop_duplicates( 'ORDER_PRIMARY_ID' , keep= 'first' , inplace= True )
cust[ cust[ 'ORDER_PRIMARY_ID' ] == '70798782D6C04A438360D80AFE4845C1' ]
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull 22801 70798782D6C04A438360D80AFE4845C1 371099B389202107040001 19990314 130983 0.0 0.0 3 2
cust[ 'IN_TIME' ] . max ( ) , cust[ 'IN_TIME' ] . min ( )
('202110041413', '0.0')
订单信息
order_info = pd. read_csv( path + '网约平台旅客订单信息.csv' )
order1 = pd. read_csv( path1 + '网约平台旅客订单信息.csv' )
order = pd. concat( [ order_info, order1] )
order. head( 2 )
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM 0 923521 96BBDB7CC049421C85826AE07020B139 278337 202008011200 202008021200 1.596120e+11 1 NaN 20200730224152 20200730224152 3 1 923696 C72F20539AD1447D86CD1A8E5EAEC63A 282932 202008041400 202008061200 1.596121e+11 1 NaN 20200730225524 20200730225524 3
order[ order[ 'ORDER_PRIMARY_ID' ] == '3CDEDB5E03534D379687645675898CA4' ]
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM 30970 4422027 3CDEDB5E03534D379687645675898CA4 170898 202108081358 202108101158 2.021071e+11 3 NaN 20210712210153 20210712210153 3
order[ 'PRE_IN_TIME' ] = order[ 'PRE_IN_TIME' ] . astype( 'str' )
order[ 'PRE_OUT_TIME' ] = order[ 'PRE_OUT_TIME' ] . astype( 'str' )
order[ 'INSERT_TIME' ] = order[ 'INSERT_TIME' ] . astype( 'str' )
order[ 'MODIFY_TIME' ] = order[ 'MODIFY_TIME' ] . astype( 'str' )
order[ 'CANCEL_TIME' ] = order[ 'CANCEL_TIME' ] . astype( 'str' )
order[ 'ORDER_TIME' ] = order[ 'ORDER_TIME' ] . fillna( 0 ) . astype( 'str' ) . apply ( lambda x: x[ : 12 ] )
##########################没有考虑包含取消时间的订单
order = order[ ( order[ 'CANCEL_TIME' ] == 'nan' ) ]
order = order. sort_values( [ 'ORDER_ID' , 'MODIFY_TIME' ] )
order. drop_duplicates( 'ORDER_ID' , keep= 'last' , inplace= True )
order. shape
(29941, 11)
order[ order[ 'ORDER_ID' ] == 4402846 ]
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM 30141 4402846 BC0E8E3602434EA6A5F29A6F6FF42233 100177 202107111358 202107121158 202107111032 1 nan 20210711140042 20210711140042 3
order. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 2 to 18590
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 29941 non-null int64
1 ORDER_PRIMARY_ID 29941 non-null object
2 HOTELID 29941 non-null object
3 PRE_IN_TIME 29941 non-null object
4 PRE_OUT_TIME 29941 non-null object
5 ORDER_TIME 29941 non-null object
6 STATUS 29941 non-null int64
7 CANCEL_TIME 29941 non-null object
8 INSERT_TIME 29941 non-null object
9 MODIFY_TIME 29941 non-null object
10 FIRM 29941 non-null int64
dtypes: int64(3), object(8)
memory usage: 2.7+ MB
order. nunique( )
ORDER_ID 29941
ORDER_PRIMARY_ID 29941
HOTELID 4801
PRE_IN_TIME 6875
PRE_OUT_TIME 729
ORDER_TIME 26818
STATUS 3
CANCEL_TIME 1
INSERT_TIME 25432
MODIFY_TIME 25432
FIRM 2
dtype: int64
order_info[ 'FIRM' ] . value_counts( )
3 32029
10 8521
Name: FIRM, dtype: int64
order[ 'STATUS' ] . value_counts( )
1 26137
2 3221
3 583
Name: STATUS, dtype: int64
order[ order[ 'INSERT_TIME' ] != order[ 'MODIFY_TIME' ] ]
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM
合并数据order和cust
df = pd. merge( order, cust, on= 'ORDER_PRIMARY_ID' )
月份分为1、3、5、7、8、10、12:31天;2:28天;4、6、9、11:30天
df[ 'IN_TIME' ] = df[ 'IN_TIME' ] . apply ( lambda x: np. nan if x== '0.0' else x)
df[ 'OUT_TIME' ] = df[ 'OUT_TIME' ] . apply ( lambda x: np. nan if x== '0.0' else x)
df[ 'IN_TIME' ] . fillna( df[ 'PRE_IN_TIME' ] , inplace= True )
df[ 'OUT_TIME' ] . fillna( df[ 'PRE_OUT_TIME' ] , inplace= True )
df[ 'OUT_TIME' ] = df. apply ( lambda x: x[ 'OUT_TIME' ] if x[ 'OUT_TIME' ] >= x[ 'PRE_OUT_TIME' ] else x[ 'PRE_OUT_TIME' ] , axis= 1 )
df[ 'IN_TIME' ] = df. apply ( lambda x: x[ 'IN_TIME' ] if x[ 'IN_TIME' ] >= x[ 'PRE_IN_TIME' ] else x[ 'PRE_IN_TIME' ] , axis= 1 )
df. drop( [ 'PRE_IN_TIME' , 'PRE_OUT_TIME' , 'CANCEL_TIME' ] , axis= 1 , inplace= True )
df. head( 2 )
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull 0 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007092241 1 20200714171021 20200714171021 3 NaN 20010409 371002 202007151300 202007161200 1 0 1 748647 67D551AACFD049AE9CC2AB65E9870678 9483 202007132109 1 20201002101040 20201002101040 3 NaN 19881023 341202 202010011400 202010021011 1 0
df. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 0 to 29940
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 29941 non-null int64
1 ORDER_PRIMARY_ID 29941 non-null object
2 HOTELID 29941 non-null object
3 ORDER_TIME 29941 non-null object
4 STATUS 29941 non-null int64
5 INSERT_TIME 29941 non-null object
6 MODIFY_TIME 29941 non-null object
7 FIRM 29941 non-null int64
8 GUEST_ID 3561 non-null object
9 BDATE 29941 non-null object
10 XZQH 29941 non-null object
11 IN_TIME 29941 non-null object
12 OUT_TIME 29941 non-null object
13 guest_sum 29941 non-null int64
14 guest_sum_notnull 29941 non-null int64
dtypes: int64(5), object(10)
memory usage: 3.7+ MB
df. nunique( ) ;
df[ 'in_time' ] = pd. to_datetime( df[ 'IN_TIME' ] , errors= 'coerce' , format = '%Y%m%d%H%M' )
df[ 'out_time' ] = pd. to_datetime( df[ 'OUT_TIME' ] , errors= 'coerce' , format = '%Y%m%d%H%M' )
df = df[ ~ df[ 'out_time' ] . isnull( ) ]
df. head( 2 )
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time 0 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007092241 1 20200714171021 20200714171021 3 NaN 20010409 371002 202007151300 202007161200 1 0 2020-07-15 13:00:00 2020-07-16 12:00:00 1 748647 67D551AACFD049AE9CC2AB65E9870678 9483 202007132109 1 20201002101040 20201002101040 3 NaN 19881023 341202 202010011400 202010021011 1 0 2020-10-01 14:00:00 2020-10-02 10:11:00
#后移一天 df[‘lock_time’][0] + pd.Timedelta(days=1) #前移一天 df[‘lock_time’][0] + pd.Timedelta(days=-1) #df[‘in_time’] - timedelta(days=1) #将day_gap列为0的数据全部往后推迟一天;;;;;;;还有钟点房
dfs = [ ]
for idx, group in df. groupby( [ 'ORDER_PRIMARY_ID' ] ) :
stat1 = pd. DataFrame( )
stat1[ 'datetime' ] = pd. date_range( start= group[ 'IN_TIME' ] . values[ 0 ] , end= group[ 'OUT_TIME' ] . values[ 0 ] , freq= 'D' , normalize= False , closed= None )
stat1[ 'ORDER_PRIMARY_ID' ] = group[ 'ORDER_PRIMARY_ID' ] . values[ 0 ]
dfs. append( stat1)
df_date = pd. concat( dfs) . reset_index( drop= True )
df1 = df. merge( df_date, on= [ 'ORDER_PRIMARY_ID' ] , how= 'left' )
df1. shape
(60973, 18)
df1[ 'in_date' ] = df1[ 'in_time' ] . dt. date
df1[ 'out_date' ] = df1[ 'out_time' ] . dt. date
df1[ 'date' ] = df1[ 'datetime' ] . dt. date
df2 = df1[ ( df1[ 'out_date' ] != df1[ 'date' ] ) | ( ( df1[ 'out_date' ] == df1[ 'date' ] ) & ( df1[ 'in_date' ] == df1[ 'out_date' ] ) ) ]
df2. shape
(49547, 21)
df2. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 49547 entries, 0 to 60971
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 49547 non-null int64
1 ORDER_PRIMARY_ID 49547 non-null object
2 HOTELID 49547 non-null object
3 ORDER_TIME 49547 non-null object
4 STATUS 49547 non-null int64
5 INSERT_TIME 49547 non-null object
6 MODIFY_TIME 49547 non-null object
7 FIRM 49547 non-null int64
8 GUEST_ID 5963 non-null object
9 BDATE 49547 non-null object
10 XZQH 49547 non-null object
11 IN_TIME 49547 non-null object
12 OUT_TIME 49547 non-null object
13 guest_sum 49547 non-null int64
14 guest_sum_notnull 49547 non-null int64
15 in_time 49547 non-null datetime64[ns]
16 out_time 49547 non-null datetime64[ns]
17 datetime 49547 non-null datetime64[ns]
18 in_date 49547 non-null object
19 out_date 49547 non-null object
20 date 49547 non-null object
dtypes: datetime64[ns](3), int64(5), object(13)
memory usage: 8.3+ MB
room_info = pd. read_csv( path + '网约房注册民宿.csv' )
room_info. head( 1 )
CODE HOTELID JYMJ ROOM_NUM BED_NUM FWLY CZLY CALLED CITY_CODE BUR_CODE STA_CODE SSX ADDRESS MPHM JYQK FIRM DJSJ BGSJ STATUS AUDITSTATUS 0 100177 3710830002 0.0 1 1 2 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 371000000000 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN NaN
room_info[ 'STATUS' ] . value_counts( )
1.0 2658
Name: STATUS, dtype: int64
room_info. drop( [ 'HOTELID' , 'FWLY' , 'CITY_CODE' , 'STATUS' , 'AUDITSTATUS' , 'JYQK' ] , axis= 1 , inplace= True )
room_info. rename( columns= { 'CODE' : 'HOTELID' } , inplace= True )
room_info[ 'FIRM' ] . value_counts( )
3 3429
10 1878
Name: FIRM, dtype: int64
room_info. nunique( )
HOTELID 5307
JYMJ 173
ROOM_NUM 11
BED_NUM 17
CZLY 4
CALLED 4232
BUR_CODE 7
STA_CODE 54
SSX 5
ADDRESS 4850
MPHM 584
FIRM 2
DJSJ 3753
BGSJ 3294
dtype: int64
登记时间和变更时间无变化的只有两家
room_info[ 'JYMJ' ] = room_info[ 'JYMJ' ] . apply ( lambda x: np. nan if x== 0 else x)
room_use_col = [ 'HOTELID' , 'JYMJ' , 'ROOM_NUM' , 'BED_NUM' , 'FIRM' , 'STATUS' ]
freq_col = [ 'CALLED' , 'ADDRESS' , 'JYMJ' , 'ROOM_NUM' , 'BED_NUM' , 'CZLY' , 'BUR_CODE' , 'STA_CODE' , 'SSX' , 'MPHM' , 'FIRM' ]
for col in freq_col:
st = room_info[ col] . value_counts( ) . reset_index( )
st. columns = [ col, col+ '_freq' ]
room_info = room_info. merge( st, on= col, how= 'left' )
room_info[ 'room_ratio' ] = room_info[ 'JYMJ' ] / room_info[ 'ROOM_NUM' ]
room_info[ 'bed_ratio' ] = room_info[ 'JYMJ' ] / room_info[ 'BED_NUM' ]
room_info[ 'room_bed' ] = room_info[ 'BED_NUM' ] / room_info[ 'ROOM_NUM' ]
room_info[ 'DJSJ' ] = pd. to_datetime( room_info[ 'DJSJ' ] , format = '%Y-%m-%d %H:%M:%S' , errors= 'coerce' )
room_info[ 'BGSJ' ] = pd. to_datetime( room_info[ 'BGSJ' ] , format = '%Y-%m-%d %H:%M:%S' , errors= 'coerce' )
room_info[ 'DJ_date' ] = room_info[ 'DJSJ' ] . dt. date
room_info[ 'BG_date' ] = room_info[ 'BGSJ' ] . dt. date
room_info[ 'DJ_gap' ] = ( room_info[ 'BG_date' ] - room_info[ 'DJ_date' ] ) . dt. days
room_info[ 'DJSJ' ] . min ( ) , room_info[ 'DJSJ' ] . max ( ) , room_info[ 'BGSJ' ] . min ( ) , room_info[ 'BGSJ' ] . max ( )
(Timestamp('2020-05-12 13:16:24'),
Timestamp('2021-10-19 16:05:41'),
Timestamp('2020-07-15 10:23:06'),
Timestamp('2021-10-19 16:09:15'))
plt. figure( figsize= ( 20 , 8 ) )
sns. countplot( room_info[ 'JYMJ' ] )
<AxesSubplot:xlabel='JYMJ', ylabel='count'>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ayRq9a5M-1646146611632)(output_79_1.png)]
经营面积缺失超过一半,3000多,考虑删除
合并room到df中
df3 = df2. merge( room_info, on= [ 'HOTELID' , 'FIRM' ] , how= 'left' )
df3 = df3. sort_values( by= [ 'HOTELID' , 'date' ] )
in_num = df3. groupby( [ 'HOTELID' , 'date' ] ) [ 'ORDER_ID' ] . count( ) . reset_index( )
in_num. columns = [ 'HOTELID' , 'date' , 'in_hotel_num' ]
df4 = df3. merge( in_num, on= [ 'HOTELID' , 'date' ] , how= 'left' )
df4. head( 2 )
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num 0 4402846 BC0E8E3602434EA6A5F29A6F6FF42233 100177 202107111032 1 20210711140042 20210711140042 3 NaN 19731004 370602 202107111358 202107121158 1 0 2021-07-11 13:58:00 2021-07-12 11:58:00 2021-07-11 13:58:00 2021-07-11 2021-07-12 2021-07-11 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 1 1 4455098 01D7394D02B44376913536F6071151AD 100177 202107151446 1 20210724140006 20210724140006 3 NaN 19941001 429006 202107241358 202107261158 1 0 2021-07-24 13:58:00 2021-07-26 11:58:00 2021-07-24 13:58:00 2021-07-24 2021-07-26 2021-07-24 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 1
df4. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 49547 entries, 0 to 49546
Data columns (total 51 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 49547 non-null int64
1 ORDER_PRIMARY_ID 49547 non-null object
2 HOTELID 49547 non-null object
3 ORDER_TIME 49547 non-null object
4 STATUS 49547 non-null int64
5 INSERT_TIME 49547 non-null object
6 MODIFY_TIME 49547 non-null object
7 FIRM 49547 non-null int64
8 GUEST_ID 5963 non-null object
9 BDATE 49547 non-null object
10 XZQH 49547 non-null object
11 IN_TIME 49547 non-null object
12 OUT_TIME 49547 non-null object
13 guest_sum 49547 non-null int64
14 guest_sum_notnull 49547 non-null int64
15 in_time 49547 non-null datetime64[ns]
16 out_time 49547 non-null datetime64[ns]
17 datetime 49547 non-null datetime64[ns]
18 in_date 49547 non-null object
19 out_date 49547 non-null object
20 date 49547 non-null object
21 JYMJ 14248 non-null float64
22 ROOM_NUM 49547 non-null int64
23 BED_NUM 49547 non-null int64
24 CZLY 49547 non-null int64
25 CALLED 49547 non-null object
26 BUR_CODE 22916 non-null float64
27 STA_CODE 22916 non-null object
28 SSX 49547 non-null int64
29 ADDRESS 49547 non-null object
30 MPHM 49547 non-null object
31 DJSJ 49497 non-null datetime64[ns]
32 BGSJ 49547 non-null datetime64[ns]
33 CALLED_freq 49547 non-null int64
34 ADDRESS_freq 49547 non-null int64
35 JYMJ_freq 14248 non-null float64
36 ROOM_NUM_freq 49547 non-null int64
37 BED_NUM_freq 49547 non-null int64
38 CZLY_freq 49547 non-null int64
39 BUR_CODE_freq 22916 non-null float64
40 STA_CODE_freq 22916 non-null float64
41 SSX_freq 49547 non-null int64
42 MPHM_freq 49547 non-null int64
43 FIRM_freq 49547 non-null int64
44 room_ratio 14248 non-null float64
45 bed_ratio 14248 non-null float64
46 room_bed 49547 non-null float64
47 DJ_date 49497 non-null object
48 BG_date 49547 non-null object
49 DJ_gap 49497 non-null float64
50 in_hotel_num 49547 non-null int64
dtypes: datetime64[ns](5), float64(9), int64(18), object(19)
memory usage: 19.7+ MB
df4[ 'date' ] . min ( ) , df4[ 'date' ] . max ( )
(datetime.date(2020, 7, 15), datetime.date(2021, 11, 2))
df4[ 'in_year' ] = df4[ 'datetime' ] . dt. year. fillna( 0 ) . astype( 'int' )
df4[ 'in_month' ] = df4[ 'datetime' ] . dt. month. fillna( 0 ) . astype( 'int' )
df4[ 'in_day' ] = df4[ 'datetime' ] . dt. day. fillna( 0 ) . astype( 'int' )
df4[ 'in_quarter' ] = df4[ 'datetime' ] . dt. quarter. fillna( 0 ) . astype( 'int' )
df4[ 'in_dayofweek' ] = df4[ 'datetime' ] . dt. dayofweek. fillna( 0 ) . astype( 'int' )
df4[ 'in_dayofyear' ] = df4[ 'datetime' ] . dt. dayofyear. fillna( 0 ) . astype( 'int' )
df4[ 'in_is_wknd' ] = df4[ 'datetime' ] . dt. dayofweek // 5
df4[ : 2 ]
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num in_year in_month in_day in_quarter in_dayofweek in_dayofyear record_is_wknd 0 4402846 BC0E8E3602434EA6A5F29A6F6FF42233 100177 202107111032 1 20210711140042 20210711140042 3 NaN 19731004 370602 202107111358 202107121158 1 0 2021-07-11 13:58:00 2021-07-12 11:58:00 2021-07-11 13:58:00 2021-07-11 2021-07-12 2021-07-11 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 1 2021 7 11 3 6 192 1 1 4455098 01D7394D02B44376913536F6071151AD 100177 202107151446 1 20210724140006 20210724140006 3 NaN 19941001 429006 202107241358 202107261158 1 0 2021-07-24 13:58:00 2021-07-26 11:58:00 2021-07-24 13:58:00 2021-07-24 2021-07-26 2021-07-24 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 1 2021 7 24 3 5 205 1
df4[ 'date1' ] = df4[ 'date' ] . astype( 'str' )
use_month = [ 5 , 6 , 7 , 8 , 9 ]
df_1 = df4[ ( df4[ 'in_is_wknd' ] == 1 ) & ( df4[ 'in_month' ] . isin( use_month) ) ]
df_1. shape
(11583, 59)
df_1[ 'in_month' ] . value_counts( )
7 4391
5 2520
6 2192
8 1656
9 824
Name: in_month, dtype: int64
holiday = [ '2021-05-01' , '2021-05-02' , '2021-05-03' , '2021-05-04' , '2021-05-05' , '2021-06-12' , '2021-06-13' , '2021-06-14' ,
'2021-09-19' , '2021-09-20' , '2021-09-21' ]
df4[ df4[ 'date' ] == '2021-05-03' ]
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num in_year in_month in_day in_quarter in_dayofweek in_dayofyear record_is_wknd in_is_wknd
df_2 = df4[ df4[ 'date1' ] . isin( holiday) ]
df_2[ 'holiday' ] = 1
df_1[ 'holiday' ] = 0
df5 = pd. concat( [ df_2, df_1] )
df5. shape
(14662, 61)
df5. drop( [ 'CALLED' , 'ADDRESS' , 'INSERT_TIME' , 'MODIFY_TIME' , 'IN_TIME' , 'OUT_TIME' , 'guest_sum_notnull' ] , axis= 1 , inplace= True )
df5[ : 3 ]
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS FIRM GUEST_ID BDATE XZQH guest_sum in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY BUR_CODE STA_CODE SSX MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num in_year in_month in_day in_quarter in_dayofweek in_dayofyear record_is_wknd in_is_wknd date1 holiday 9 3597092 CFD17F6BB142485DB3DEC6B5E2D0B664 100193 202105012046 1 3 NaN 19771008 132826 1 2021-05-01 20:46:00 2021-05-02 11:58:00 2021-05-01 20:46:00 2021-05-01 2021-05-02 2021-05-01 NaN 2 2 1 NaN NaN 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 1249 2281 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 1 2021 5 1 2 5 121 1 1 2021-05-01 1 11 3601837 5C96E986C1F04691A3EB07962BCC7B00 100195 202105021435 1 3 NaN 19951009 370203 1 2021-05-02 14:35:00 2021-05-03 11:58:00 2021-05-02 14:35:00 2021-05-02 2021-05-03 2021-05-02 NaN 2 3 1 NaN NaN 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 1249 895 4727 NaN NaN 150 2565 3429 NaN NaN 1.5 2020-05-18 2020-07-15 58.0 1 2021 5 2 2 6 122 1 1 2021-05-02 1 53 3314117 C0C5516462D3487C895DD654B836F855 10037 202104082130 1 3 NaN 19730313 120103 1 2021-04-30 13:58:00 2021-05-03 11:58:00 2021-05-01 13:58:00 2021-04-30 2021-05-03 2021-05-01 NaN 1 2 1 NaN NaN 371002 ROOM001 2020-05-18 10:22:13 2020-07-15 10:23:58 2 2 NaN 3091 2281 4727 NaN NaN 4661 2565 3429 NaN NaN 2.0 2020-05-18 2020-07-15 58.0 1 2021 5 1 2 5 121 1 1 2021-05-01 1
drop_col = df5. columns. tolist( )
drop_col. remove( 'holiday' )
df5. drop_duplicates( drop_col, keep= 'first' , inplace= True )
df5. shape
(12708, 54)
dd = df_1. groupby( [ 'in_year' , 'in_month' ] ) [ 'ORDER_ID' ] . count( ) . reset_index( )
dd
in_year in_month ORDER_ID ym 0 2020 7 60 202007 1 2020 8 511 202008 2 2020 9 680 202009 3 2020 10 1487 202010 4 2020 11 665 202011 5 2020 12 904 202012 6 2021 1 999 202101 7 2021 2 71 202102 8 2021 3 2 202103 9 2021 4 4 202104 10 2021 5 2520 202105 11 2021 6 2192 202106 12 2021 7 4331 202107 13 2021 8 1145 202108 14 2021 9 144 202109 15 2021 10 40 202110
dd[ 'ym' ] = dd[ 'in_year' ] * 100 + dd[ 'in_month' ]
dd[ 'ym' ] . value_counts( )
202106 1
202008 1
202110 1
202109 1
202012 1
202011 1
202010 1
202009 1
202007 1
202104 1
202102 1
202101 1
202103 1
202108 1
202107 1
202105 1
202111 1
Name: ym, dtype: int64
plt. figure( figsize= ( 15 , 8 ) )
plt. scatter( dd. index, dd[ 'ORDER_ID' ] )
<matplotlib.collections.PathCollection at 0x13562173a00>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-EHvuTP3g-1646146611636)(output_112_1.png)]