import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import os
import re
import warnings
import seaborn as sns
import scipy. stats as st
import gc
warnings. filterwarnings( 'ignore' )
pd. set_option( 'display.max_columns' , None )
pd. set_option( 'display.max_rows' , None )
pd. set_option( 'max_colwidth' , 100 )
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
path = 'traina/'
path1 = 'trainb/'
cust_info = pd. read_csv( path + '网约房平台入住人表.csv' )
cust_info1 = pd. read_csv( path1 + '网约房平台入住人表.csv' )
cust = pd. concat( [ cust_info, cust_info1] )
cust. head( 2 )
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME 0 07C5BF73B18B44B0877DEED007F8771D NaN 19800627 222405 NaN NaN 1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 2.021071e+11 2.021071e+11
gc. collect( )
17
cust[ 'BDATE' ] = cust[ 'BDATE' ] . astype( 'str' ) cust[ 'XZQH' ] = cust[ 'XZQH' ] . astype( 'str' ) cust[ 'IN_TIME' ] = cust[ 'IN_TIME' ] . fillna( 0 ) . astype( 'str' ) . apply ( lambda x: x[ : 12 ] ) cust[ 'OUT_TIME' ] = cust[ 'OUT_TIME' ] . fillna( 0 ) . astype( 'str' ) . apply ( lambda x: x[ : 12 ] )
cust. head( 3 )
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME 0 07C5BF73B18B44B0877DEED007F8771D NaN 19800627 222405 0.0 0.0 1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 202107092103 202107111158 2 A1414E7D71E34E1DA743AC08BCF49151 3710021822202008240001 19860311 370781 202008241401 202008251200
cust. info( )
<class 'pandas.core.frame.DataFrame'>Int64Index: 44030 entries, 0 to 3343Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_PRIMARY_ID 44030 non-null object 1 GUEST_ID 5158 non-null object 2 BDATE 44030 non-null object 3 XZQH 44030 non-null object 4 IN_TIME 44030 non-null object 5 OUT_TIME 44030 non-null objectdtypes: object(6)memory usage: 2.4+ MB
cust. nunique( )
ORDER_PRIMARY_ID 43883GUEST_ID 3698BDATE 9503XZQH 2659IN_TIME 3396OUT_TIME 236dtype: int64
cust[ 'ORDER_PRIMARY_ID' ] . value_counts( ) [ 10 : 15 ]
134B74870F1D4087AB3F590D2A0AAFDC 23370A2886D8F4E11860A00FEB2289E56 223D42363F202408BA04CD3D5BBAE5508 219631D36ABA448D9B0AECF0A7200C4C5 29373325C891E42CB8A8F43AF14939954 2Name: ORDER_PRIMARY_ID, dtype: int64
cust. sort_values( [ 'ORDER_PRIMARY_ID' , 'GUEST_ID' ] , inplace= True )
stat = cust. groupby( [ 'ORDER_PRIMARY_ID' ] ) [ 'BDATE' ] . count( ) . reset_index( ) stat. columns = [ 'ORDER_PRIMARY_ID' , 'guest_sum' ] stat[ 'guest_sum_notnull' ] = cust. groupby( [ 'ORDER_PRIMARY_ID' ] ) [ 'GUEST_ID' ] . agg( 'count' ) . valuescust = cust. merge( stat, on= [ 'ORDER_PRIMARY_ID' ] , how= 'left' )
cust. drop_duplicates( 'ORDER_PRIMARY_ID' , keep= 'first' , inplace= True )
cust[ cust[ 'ORDER_PRIMARY_ID' ] == '70798782D6C04A438360D80AFE4845C1' ]
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull 22801 70798782D6C04A438360D80AFE4845C1 371099B389202107040001 19990314 130983 0.0 0.0 3 2
cust[ cust[ 'ORDER_PRIMARY_ID' ] == '5B94671A9390437E8E79A86D4B15A076' ]
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull 18631 5B94671A9390437E8E79A86D4B15A076 371098A226202107010001 20000114 372324 0.0 0.0 2 1
cust[ cust[ 'ORDER_PRIMARY_ID' ] == '479006B921F24969B60B68DAEDA91909' ]
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull 14578 479006B921F24969B60B68DAEDA91909 3710020982202010020001 19841013 210103 202010021601 202010061200 2 2
cust[ cust[ 'ORDER_PRIMARY_ID' ] == '1772C1DEE3F141D8960BB9044CA4CF3A' ]
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull 4753 1772C1DEE3F141D8960BB9044CA4CF3A 371098A783202107050001 19620916 422226 0.0 0.0 3 2
cust[ cust[ 'ORDER_PRIMARY_ID' ] == 'BD39A40287CE4E76B46D3BC85BFA1298' ]
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull 38447 BD39A40287CE4E76B46D3BC85BFA1298 371099A428202105100004 19710219 370204 0.0 202105211834 2 1
cust[ 'IN_TIME' ] . max ( )
'202110041413'
room_info = pd. read_csv( path + '网约房注册民宿.csv' )
room_info. head( 1 )
CODE HOTELID JYMJ ROOM_NUM BED_NUM FWLY CZLY CALLED CITY_CODE BUR_CODE STA_CODE SSX ADDRESS MPHM JYQK FIRM DJSJ BGSJ STATUS AUDITSTATUS 0 100177 3710830002 0.0 1 1 2 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 371000000000 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN NaN
room_info[ ( room_info[ 'CALLED' ] == room_info[ 'JYQK' ] ) == False ]
CODE HOTELID JYMJ ROOM_NUM BED_NUM FWLY CZLY CALLED CITY_CODE BUR_CODE STA_CODE SSX ADDRESS MPHM JYQK FIRM DJSJ BGSJ STATUS AUDITSTATUS
room_info[ 'CALLED' ] . nunique( )
4232
room_info[ 'CALLED' ] . value_counts( ) [ : 5 ]
高铁北站看海公寓 24韩乐坊/不夜城美食街/乐天世纪城 豪华大床房 景驰公寓 20国际海水浴场精致海景大床房 12九龙湾豪华一线海景loft复式 10高铁北站看海民宿 10Name: CALLED, dtype: int64
room_info[ 'HOTELID' ] . nunique( )
5307
room_info[ 'CODE' ] . nunique( )
5307
room_info. info( )
<class 'pandas.core.frame.DataFrame'>RangeIndex: 5307 entries, 0 to 5306Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CODE 5307 non-null object 1 HOTELID 5307 non-null object 2 JYMJ 5307 non-null float64 3 ROOM_NUM 5307 non-null int64 4 BED_NUM 5307 non-null int64 5 FWLY 5307 non-null int64 6 CZLY 5307 non-null int64 7 CALLED 5307 non-null object 8 CITY_CODE 5307 non-null int64 9 BUR_CODE 2658 non-null float64 10 STA_CODE 2658 non-null object 11 SSX 5307 non-null int64 12 ADDRESS 5307 non-null object 13 MPHM 5307 non-null object 14 JYQK 5307 non-null object 15 FIRM 5307 non-null int64 16 DJSJ 5307 non-null object 17 BGSJ 5307 non-null object 18 STATUS 2658 non-null float64 19 AUDITSTATUS 2658 non-null float64dtypes: float64(4), int64(7), object(9)memory usage: 829.3+ KB
room_info. nunique( )
CODE 5307HOTELID 5307JYMJ 173ROOM_NUM 11BED_NUM 17FWLY 1CZLY 4CALLED 4232CITY_CODE 1BUR_CODE 7STA_CODE 54SSX 5ADDRESS 4850MPHM 584JYQK 4232FIRM 2DJSJ 3753BGSJ 3294STATUS 1AUDITSTATUS 1dtype: int64
room_info[ 'STATUS' ] . value_counts( )
1.0 2658Name: STATUS, dtype: int64
room_info. drop( [ 'HOTELID' , 'FWLY' , 'CITY_CODE' , 'AUDITSTATUS' , 'JYQK' ] , axis= 1 , inplace= True )
room_info. rename( columns= { 'CODE' : 'HOTELID' } , inplace= True )
room_info[ 'FIRM' ] . value_counts( )
3 342910 1878Name: FIRM, dtype: int64
room_info. info( )
<class 'pandas.core.frame.DataFrame'>RangeIndex: 5307 entries, 0 to 5306Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 HOTELID 5307 non-null object 1 JYMJ 5307 non-null float64 2 ROOM_NUM 5307 non-null int64 3 BED_NUM 5307 non-null int64 4 CZLY 5307 non-null int64 5 CALLED 5307 non-null object 6 BUR_CODE 2658 non-null float64 7 STA_CODE 2658 non-null object 8 SSX 5307 non-null int64 9 ADDRESS 5307 non-null object 10 MPHM 5307 non-null object 11 FIRM 5307 non-null int64 12 DJSJ 5307 non-null object 13 BGSJ 5307 non-null object 14 STATUS 2658 non-null float64dtypes: float64(3), int64(5), object(7)memory usage: 622.0+ KB
room_info. nunique( )
HOTELID 5307JYMJ 173ROOM_NUM 11BED_NUM 17CZLY 4CALLED 4232BUR_CODE 7STA_CODE 54SSX 5ADDRESS 4850MPHM 584FIRM 2DJSJ 3753BGSJ 3294STATUS 1dtype: int64
CODE-------- 民宿编码与HOTELID关联 JYMJ-------- 经营面积 ROOM_NUM---- 房间数量 BED_NUM----- 床位数 CZLY-------- 出租类型 CALLED------ 名称 BUR_CODE---- 所属分局代码 STA_CODE---- 所属派出所代码 SSX--------- 省市县 ADDRESS----- 地址 MPHM-------- 门牌号 FIRM-------- 平台厂商 DJSJ-------- 登记时间 BGSJ-------- 变更时间 STATUS------ 状态
room_info[ : 2 ]
HOTELID JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM FIRM DJSJ BGSJ STATUS 0 100177 0.0 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN 1 100181 0.0 5 7 1 【山海边公寓】瞻星伴月阁 NaN NaN 371083 乳山市银滩 檀香丽湾小区2-4-603 ROOM001 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN
登记时间和变更时间无变化的只有两家
room_info[ room_info[ 'DJSJ' ] == room_info[ 'BGSJ' ] ]
HOTELID JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM FIRM DJSJ BGSJ STATUS 1150 296753 0.0 1 2 1 春润庭院客栈豪华家庭套房 NaN NaN 371002 环翠区环海路金海滩花园41号楼 ROOM001 3 2020-12-05 10:22:40 2020-12-05 10:22:40 NaN 1156 296772 0.0 1 1 1 哈工大旁春润庭院大床房 NaN NaN 371002 环翠区环海路金海滩花园41号楼 ROOM001 3 2020-12-05 10:22:40 2020-12-05 10:22:40 NaN
room_info[ 'JYMJ' ] = room_info[ 'JYMJ' ] . apply ( lambda x: np. nan if x== 0 else x)
room的统计量特征
room_info[ 'DJSJ' ] = pd. to_datetime( room_info[ 'DJSJ' ] , format = '%Y-%m-%d %H:%M:%S' , errors= 'coerce' ) room_info[ 'BGSJ' ] = pd. to_datetime( room_info[ 'BGSJ' ] , format = '%Y-%m-%d %H:%M:%S' , errors= 'coerce' )
room_info[ 'DJ_date' ] = room_info[ 'DJSJ' ] . dt. dateroom_info[ 'BG_date' ] = room_info[ 'BGSJ' ] . dt. date
room_info[ 'DJ_gap' ] = ( room_info[ 'BG_date' ] - room_info[ 'DJ_date' ] ) . dt. days
room_info[ : 2 ]
HOTELID JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM FIRM DJSJ BGSJ STATUS CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap 0 100177 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 1 100181 NaN 5 7 1 【山海边公寓】瞻星伴月阁 NaN NaN 371083 乳山市银滩 檀香丽湾小区2-4-603 ROOM001 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN 1 1 NaN 49 45 4727 NaN NaN 150 2565 3429 NaN NaN 1.4 2020-05-18 2020-07-15 58.0
room_info[ 'DJ_gap' ] . nunique( )
29
room_info[ 'DJSJ' ] . min ( ) , room_info[ 'DJSJ' ] . max ( ) , room_info[ 'BGSJ' ] . min ( ) , room_info[ 'BGSJ' ] . max ( )
(Timestamp('2020-05-12 13:16:24'), Timestamp('2021-10-19 16:05:41'), Timestamp('2020-07-15 10:23:06'), Timestamp('2021-10-19 16:09:15'))
room = room_info. copy( )
room[ 'DJ_year' ] = room[ 'DJSJ' ] . dt. year. fillna( 0 ) . astype( 'int' ) room[ 'DJ_month' ] = room[ 'DJSJ' ] . dt. month. fillna( 0 ) . astype( 'int' ) room[ 'DJ_day' ] = room[ 'DJSJ' ] . dt. day. fillna( 0 ) . astype( 'int' ) room[ 'DJ_quarter' ] = room[ 'DJSJ' ] . dt. quarter. fillna( 0 ) . astype( 'int' ) room[ 'DJ_dayofweek' ] = room[ 'DJSJ' ] . dt. dayofweek. fillna( 0 ) . astype( 'int' ) room[ 'DJ_dayofyear' ] = room[ 'DJSJ' ] . dt. dayofyear. fillna( 0 ) . astype( 'int' ) room[ 'BG_year' ] = room[ 'DJSJ' ] . dt. year. fillna( 0 ) . astype( 'int' ) room[ 'BG_month' ] = room[ 'DJSJ' ] . dt. month. fillna( 0 ) . astype( 'int' ) room[ 'BG_day' ] = room[ 'DJSJ' ] . dt. day. fillna( 0 ) . astype( 'int' ) room[ 'BG_quarter' ] = room[ 'DJSJ' ] . dt. quarter. fillna( 0 ) . astype( 'int' ) room[ 'BG_dayofweek' ] = room[ 'DJSJ' ] . dt. dayofweek. fillna( 0 ) . astype( 'int' ) room[ 'BG_dayofyear' ] = room[ 'DJSJ' ] . dt. dayofyear. fillna( 0 ) . astype( 'int' )
room[ 'BG_dayofweek' ] . value_counts( )
1 14014 9990 7615 6672 5963 4886 395Name: BG_dayofweek, dtype: int64
#DJ_gap|DJ_year|DJ_month|DJ_day|DJ_quarter|DJ_dayofweek|DJ_dayofyear… #freq特征
room[ : 3 ]
HOTELID JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM FIRM DJSJ BGSJ STATUS CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap DJ_year DJ_month DJ_day DJ_quarter DJ_dayofweek DJ_dayofyear BG_year BG_month BG_day BG_quarter BG_dayofweek BG_dayofyear 0 100177 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 2020 5 18 2 0 139 2020 5 18 2 0 139 1 100181 NaN 5 7 1 【山海边公寓】瞻星伴月阁 NaN NaN 371083 乳山市银滩 檀香丽湾小区2-4-603 ROOM001 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN 1 1 NaN 49 45 4727 NaN NaN 150 2565 3429 NaN NaN 1.4 2020-05-18 2020-07-15 58.0 2020 5 18 2 0 139 2020 5 18 2 0 139 2 100193 NaN 2 2 1 【山海边公寓】两居电梯洋房,背依藏龙山,面朝大海 NaN NaN 371083 乳山市长江路檀香丽湾1-16-901 ROOM001 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN 1 1 NaN 1249 2281 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 2020 5 18 2 0 139 2020 5 18 2 0 139
plt. figure( figsize= ( 20 , 8 ) ) sns. countplot( room[ 'JYMJ' ] )
<AxesSubplot:xlabel='JYMJ', ylabel='count'>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-O93ifdQO-1646128974748)(output_72_1.png)]
经营面积缺失超过一半,3000多,考虑删除
1 47270 5042 719 5Name: CZLY, dtype: int64
3.710990e+11 9543.710980e+11 8523.710020e+11 6683.710820e+11 1083.710810e+11 563.710830e+11 163.710960e+11 4Name: BUR_CODE, dtype: int64
371000000006 505371000000007 433371098060000 321371099100000 258371000000001 242Name: STA_CODE, dtype: int64
371002 4661371082 406371083 150371081 64371003 26Name: SSX, dtype: int64
ROOM001 2565无 1878ROOM002 57ROOM003 26ROOM004 17Name: MPHM, dtype: int64
order. head( 2 )
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM 0 923521 96BBDB7CC049421C85826AE07020B139 278337 202008011200 202008021200 1.596120e+11 1 NaN 20200730224152 20200730224152 3 1 923696 C72F20539AD1447D86CD1A8E5EAEC63A 282932 202008041400 202008061200 1.596121e+11 1 NaN 20200730225524 20200730225524 3
order[ order[ 'ORDER_PRIMARY_ID' ] == '3CDEDB5E03534D379687645675898CA4' ]
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM 30970 4422027 3CDEDB5E03534D379687645675898CA4 170898 202108081358 202108101158 2.021071e+11 3 NaN 20210712210153 20210712210153 3
cust[ cust[ 'ORDER_PRIMARY_ID' ] == '3CDEDB5E03534D379687645675898CA4' ]
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull 12421 3CDEDB5E03534D379687645675898CA4 371002B268202107280001 19871031 110224 0.0 202108020842 1 1
order[ 'PRE_IN_TIME' ] = order[ 'PRE_IN_TIME' ] . astype( 'str' ) order[ 'PRE_OUT_TIME' ] = order[ 'PRE_OUT_TIME' ] . astype( 'str' )
order[ 'ORDER_TIME' ] = order[ 'ORDER_TIME' ] . fillna( 0 ) . astype( 'str' ) . apply ( lambda x: x[ : 12 ] )
##########################没有考虑包含取消时间的订单
order = order[ ( order[ 'CANCEL_TIME' ] == 'nan' ) ]
order = order. sort_values( [ 'ORDER_ID' , 'MODIFY_TIME' ] )
order. drop_duplicates( 'ORDER_ID' , keep= 'last' , inplace= True )
order. shape
(29941, 11)
order[ order[ 'ORDER_ID' ] == 4402846 ]
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM 30141 4402846 BC0E8E3602434EA6A5F29A6F6FF42233 100177 202107111358 202107121158 202107111032 1 nan 20210711140042 20210711140042 3
order. info( )
<class 'pandas.core.frame.DataFrame'>Int64Index: 29941 entries, 2 to 18590Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 29941 non-null int64 1 ORDER_PRIMARY_ID 29941 non-null object 2 HOTELID 29941 non-null object 3 PRE_IN_TIME 29941 non-null object 4 PRE_OUT_TIME 29941 non-null object 5 ORDER_TIME 29941 non-null object 6 STATUS 29941 non-null int64 7 CANCEL_TIME 29941 non-null object 8 INSERT_TIME 29941 non-null object 9 MODIFY_TIME 29941 non-null object 10 FIRM 29941 non-null int64 dtypes: int64(3), object(8)memory usage: 2.7+ MB
order. nunique( )
ORDER_ID 29941ORDER_PRIMARY_ID 29941HOTELID 4801PRE_IN_TIME 6875PRE_OUT_TIME 729ORDER_TIME 26818STATUS 3CANCEL_TIME 1INSERT_TIME 25432MODIFY_TIME 25432FIRM 2dtype: int64
order_info[ 'FIRM' ] . value_counts( )
3 3202910 8521Name: FIRM, dtype: int64
order[ 'STATUS' ] . value_counts( )
1 261372 32213 583Name: STATUS, dtype: int64
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM
合并数据
月份分为1、3、5、7、8、10、12:31天;2:28天;4、6、9、11:30天
df[ 'IN_TIME' ] = df[ 'IN_TIME' ] . apply ( lambda x: np. nan if x== '0.0' else x) df[ 'OUT_TIME' ] = df[ 'OUT_TIME' ] . apply ( lambda x: np. nan if x== '0.0' else x) df[ 'IN_TIME' ] . fillna( df[ 'PRE_IN_TIME' ] , inplace= True ) df[ 'OUT_TIME' ] . fillna( df[ 'PRE_OUT_TIME' ] , inplace= True )
df[ 'OUT_TIME' ] = df. apply ( lambda x: x[ 'OUT_TIME' ] if x[ 'OUT_TIME' ] >= x[ 'PRE_OUT_TIME' ] else x[ 'PRE_OUT_TIME' ] , axis= 1 ) df[ 'IN_TIME' ] = df. apply ( lambda x: x[ 'IN_TIME' ] if x[ 'IN_TIME' ] >= x[ 'PRE_IN_TIME' ] else x[ 'PRE_IN_TIME' ] , axis= 1 )
df. drop( [ 'PRE_IN_TIME' , 'PRE_OUT_TIME' , 'CANCEL_TIME' ] , axis= 1 , inplace= True )
df. head( 2 )
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull 0 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007092241 1 20200714171021 20200714171021 3 NaN 20010409 371002 202007151300 202007161200 1 0 1 748647 67D551AACFD049AE9CC2AB65E9870678 9483 202007132109 1 20201002101040 20201002101040 3 NaN 19881023 341202 202010011400 202010021011 1 0
df. info( )
<class 'pandas.core.frame.DataFrame'>Int64Index: 29941 entries, 0 to 29940Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 29941 non-null int64 1 ORDER_PRIMARY_ID 29941 non-null object 2 HOTELID 29941 non-null object 3 ORDER_TIME 29941 non-null object 4 STATUS 29941 non-null int64 5 INSERT_TIME 29941 non-null object 6 MODIFY_TIME 29941 non-null object 7 FIRM 29941 non-null int64 8 GUEST_ID 3561 non-null object 9 BDATE 29941 non-null object 10 XZQH 29941 non-null object 11 IN_TIME 29941 non-null object 12 OUT_TIME 29941 non-null object 13 guest_sum 29941 non-null int64 14 guest_sum_notnull 29941 non-null int64 dtypes: int64(5), object(10)memory usage: 3.7+ MB
df. nunique( ) ;
ORDER_ID 29941ORDER_PRIMARY_ID 29941HOTELID 4801ORDER_TIME 26818STATUS 3INSERT_TIME 25432MODIFY_TIME 25432FIRM 2GUEST_ID 3561BDATE 9036XZQH 2575IN_TIME 8953OUT_TIME 732guest_sum 3guest_sum_notnull 3dtype: int64
df[ 'in_time' ] = pd. to_datetime( df[ 'IN_TIME' ] , errors= 'coerce' , format = '%Y%m%d%H%M' ) df[ 'out_time' ] = pd. to_datetime( df[ 'OUT_TIME' ] , errors= 'coerce' , format = '%Y%m%d%H%M' )
df = df[ ~ df[ 'out_time' ] . isnull( ) ]
df. head( 2 )
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time 0 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007092241 1 20200714171021 20200714171021 3 NaN 20010409 371002 202007151300 202007161200 1 0 2020-07-15 13:00:00 2020-07-16 12:00:00 1 748647 67D551AACFD049AE9CC2AB65E9870678 9483 202007132109 1 20201002101040 20201002101040 3 NaN 19881023 341202 202010011400 202010021011 1 0 2020-10-01 14:00:00 2020-10-02 10:11:00
49261
#后移一天 df[‘lock_time’][0] + pd.Timedelta(days=1) #前移一天 df[‘lock_time’][0] + pd.Timedelta(days=-1) #df[‘in_time’] - timedelta(days=1) #将day_gap列为0的数据全部往后推迟一天;;;;;;;还有钟点房
dfs = [ ] for idx, group in df. groupby( [ 'ORDER_PRIMARY_ID' ] ) : stat = pd. DataFrame( ) stat[ 'datetime' ] = pd. date_range( start= group[ 'IN_TIME' ] . values[ 0 ] , end= group[ 'OUT_TIME' ] . values[ 0 ] , freq= 'D' , normalize= False , closed= None ) stat[ 'ORDER_PRIMARY_ID' ] = group[ 'ORDER_PRIMARY_ID' ] . values[ 0 ] dfs. append( stat)
df_date = pd. concat( dfs) . reset_index( drop= True )
df1 = df. merge( df_date, on= [ 'ORDER_PRIMARY_ID' ] , how= 'left' )
df1. shape
(60973, 27)
df1[ ( df1[ 'out_time' ] != df1[ 'datetime' ] ) & ( df1[ 'day_gap' ] != 0 ) ] . shape
(50058, 27)
df1[ : 2 ]
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time in_time_year out_time_year in_time_month out_time_month in_time_day out_time_day in_time_hour out_time_hour day_gap datetime 0 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007092241 1 20200714171021 20200714171021 3 NaN 20010409 371002 202007151300 202007161200 1 0 2020-07-15 13:00:00 2020-07-16 12:00:00 2020 2020 7 7 15 16 13 12 1 2020-07-15 13:00:00 1 748647 67D551AACFD049AE9CC2AB65E9870678 9483 202007132109 1 20201002101040 20201002101040 3 NaN 19881023 341202 202010011400 202010021011 1 0 2020-10-01 14:00:00 2020-10-02 10:11:00 2020 2020 10 10 1 2 14 10 1 2020-10-01 14:00:00
df1[ 'in_date' ] = df1[ 'in_time' ] . dt. datedf1[ 'out_date' ] = df1[ 'out_time' ] . dt. datedf1[ 'date' ] = df1[ 'datetime' ] . dt. date
df2 = df1[ ( df1[ 'out_date' ] != df1[ 'date' ] ) | ( ( df1[ 'out_date' ] == df1[ 'date' ] ) & ( df1[ 'in_date' ] == df1[ 'out_date' ] ) ) ]
df2. shape
(49547, 30)
stat = pd.DataFrame() stat[‘date’] = pd.date_range( start=group[‘IN_TIME’].values[0],#开始时间 end=group[‘OUT_TIME’].values[0],#截止时间 periods=4,#总长度 freq=‘D’,#时间间隔 tz=None,#时区 normalize=False,#是否标准化到midnight name=None,#date名称 closed=None,#首尾是否在内 **kwargs, )
df3 = df2. merge( room, on= [ 'HOTELID' , 'FIRM' ] , how= 'left' )
df3. info( )
<class 'pandas.core.frame.DataFrame'>Int64Index: 49547 entries, 0 to 49546Data columns (total 72 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 49547 non-null int64 1 ORDER_PRIMARY_ID 49547 non-null object 2 HOTELID 49547 non-null object 3 ORDER_TIME 49547 non-null object 4 STATUS_x 49547 non-null int64 5 INSERT_TIME 49547 non-null object 6 MODIFY_TIME 49547 non-null object 7 FIRM 49547 non-null int64 8 GUEST_ID 5963 non-null object 9 BDATE 49547 non-null object 10 XZQH 49547 non-null object 11 IN_TIME 49547 non-null object 12 OUT_TIME 49547 non-null object 13 guest_sum 49547 non-null int64 14 guest_sum_notnull 49547 non-null int64 15 in_time 49547 non-null datetime64[ns] 16 out_time 49547 non-null datetime64[ns] 17 in_time_year 49547 non-null int32 18 out_time_year 49547 non-null int32 19 in_time_month 49547 non-null int32 20 out_time_month 49547 non-null int32 21 in_time_day 49547 non-null int32 22 out_time_day 49547 non-null int32 23 in_time_hour 49547 non-null int32 24 out_time_hour 49547 non-null int32 25 day_gap 49547 non-null int64 26 datetime 49547 non-null datetime64[ns] 27 in_date 49547 non-null object 28 out_date 49547 non-null object 29 date 49547 non-null object 30 JYMJ 14248 non-null float64 31 ROOM_NUM 49547 non-null int64 32 BED_NUM 49547 non-null int64 33 CZLY 49547 non-null int64 34 CALLED 49547 non-null object 35 BUR_CODE 22916 non-null float64 36 STA_CODE 22916 non-null object 37 SSX 49547 non-null int64 38 ADDRESS 49547 non-null object 39 MPHM 49547 non-null object 40 DJSJ 49497 non-null datetime64[ns] 41 BGSJ 49547 non-null datetime64[ns] 42 STATUS_y 22916 non-null float64 43 CALLED_freq 49547 non-null int64 44 ADDRESS_freq 49547 non-null int64 45 JYMJ_freq 14248 non-null float64 46 ROOM_NUM_freq 49547 non-null int64 47 BED_NUM_freq 49547 non-null int64 48 CZLY_freq 49547 non-null int64 49 BUR_CODE_freq 22916 non-null float64 50 STA_CODE_freq 22916 non-null float64 51 SSX_freq 49547 non-null int64 52 MPHM_freq 49547 non-null int64 53 FIRM_freq 49547 non-null int64 54 room_ratio 14248 non-null float64 55 bed_ratio 14248 non-null float64 56 room_bed 49547 non-null float64 57 DJ_date 49497 non-null object 58 BG_date 49547 non-null object 59 DJ_gap 49497 non-null float64 60 DJ_year 49547 non-null int32 61 DJ_month 49547 non-null int32 62 DJ_day 49547 non-null int32 63 DJ_quarter 49547 non-null int32 64 DJ_dayofweek 49547 non-null int32 65 DJ_dayofyear 49547 non-null int32 66 BG_year 49547 non-null int32 67 BG_month 49547 non-null int32 68 BG_day 49547 non-null int32 69 BG_quarter 49547 non-null int32 70 BG_dayofweek 49547 non-null int32 71 BG_dayofyear 49547 non-null int32 dtypes: datetime64[ns](5), float64(10), int32(20), int64(18), object(19)memory usage: 23.8+ MB
df3. nunique( )
ORDER_ID 29930ORDER_PRIMARY_ID 29930HOTELID 4801ORDER_TIME 26807STATUS_x 3INSERT_TIME 25421MODIFY_TIME 25421FIRM 2GUEST_ID 3561BDATE 9035XZQH 2575IN_TIME 8953OUT_TIME 730guest_sum 3guest_sum_notnull 3in_time 8953out_time 730in_time_year 2out_time_year 2in_time_month 12out_time_month 12in_time_day 31out_time_day 31in_time_hour 23out_time_hour 15day_gap 33datetime 12076in_date 379out_date 387date 408JYMJ 172ROOM_NUM 11BED_NUM 17CZLY 4CALLED 3813BUR_CODE 7STA_CODE 53SSX 5ADDRESS 4377MPHM 534DJSJ 3358BGSJ 2871STATUS_y 1CALLED_freq 13ADDRESS_freq 10JYMJ_freq 40ROOM_NUM_freq 10BED_NUM_freq 15CZLY_freq 4BUR_CODE_freq 7STA_CODE_freq 27SSX_freq 5MPHM_freq 13FIRM_freq 2room_ratio 198bed_ratio 249room_bed 34DJ_date 365BG_date 321DJ_gap 29DJ_year 3DJ_month 13DJ_day 32DJ_quarter 5DJ_dayofweek 7DJ_dayofyear 322BG_year 3BG_month 13BG_day 32BG_quarter 5BG_dayofweek 7BG_dayofyear 322dtype: int64
df3 = df3. sort_values( by= [ 'HOTELID' , 'date' ] )
df3. head( 2 )
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS_x INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time in_time_year out_time_year in_time_month out_time_month in_time_day out_time_day in_time_hour out_time_hour day_gap datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM DJSJ BGSJ STATUS_y CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap DJ_year DJ_month DJ_day DJ_quarter DJ_dayofweek DJ_dayofyear BG_year BG_month BG_day BG_quarter BG_dayofweek BG_dayofyear 23158 4402846 BC0E8E3602434EA6A5F29A6F6FF42233 100177 202107111032 1 20210711140042 20210711140042 3 NaN 19731004 370602 202107111358 202107121158 1 0 2021-07-11 13:58:00 2021-07-12 11:58:00 2021 2021 7 7 11 12 13 11 1 2021-07-11 13:58:00 2021-07-11 2021-07-12 2021-07-11 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 2020 5 18 2 0 139 2020 5 18 2 0 139 25548 4455098 01D7394D02B44376913536F6071151AD 100177 202107151446 1 20210724140006 20210724140006 3 NaN 19941001 429006 202107241358 202107261158 1 0 2021-07-24 13:58:00 2021-07-26 11:58:00 2021 2021 7 7 24 26 13 11 2 2021-07-24 13:58:00 2021-07-24 2021-07-26 2021-07-24 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 2020 5 18 2 0 139 2020 5 18 2 0 139
in_num = df3. groupby( [ 'HOTELID' , 'date' ] ) [ 'ORDER_ID' ] . count( ) . reset_index( ) in_num. columns = [ 'HOTELID' , 'date' , 'in_hotel_num' ] df4 = df3. merge( in_num, on= [ 'HOTELID' , 'date' ] , how= 'left' )
dd = df4[ df4[ 'HOTELID' ] == '227185' ]
dd. drop( [ 'CALLED' , 'ADDRESS' ] , axis= 1 , inplace= True )
plt. figure( figsize= ( 20 , 8 ) ) plt. scatter( dd[ 'date' ] , dd[ 'in_hotel_num' ] )
<matplotlib.collections.PathCollection at 0x22da65e3640>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-5g1bAE7v-1646128974752)(output_146_1.png)]
dd[ : 10 ]
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS_x INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time in_time_year out_time_year in_time_month out_time_month in_time_day out_time_day in_time_hour out_time_hour day_gap datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY BUR_CODE STA_CODE SSX MPHM DJSJ BGSJ STATUS_y CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap DJ_year DJ_month DJ_day DJ_quarter DJ_dayofweek DJ_dayofyear BG_year BG_month BG_day BG_quarter BG_dayofweek BG_dayofyear in_hotel_num unuse_room 3338 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007092241 1 20200714171021 20200714171021 3 NaN 20010409 371002 202007151300 202007161200 1 0 2020-07-15 13:00:00 2020-07-16 12:00:00 2020 2020 7 7 15 16 13 12 1 2020-07-15 13:00:00 2020-07-15 2020-07-16 2020-07-15 NaN 1 1 1 NaN NaN 371002 ROOM001 2020-07-10 15:35:22 2020-07-15 10:24:14 NaN 2 1 NaN 3091 1502 4727 NaN NaN 4661 2565 3429 NaN NaN 1.0 2020-07-10 2020-07-15 5.0 2020 7 10 3 4 192 2020 7 10 3 4 192 1 0 3339 853709 73821DF4F47B401A84823D965A4A4F77 227185 202007232243 1 20200723224348 20200723224348 3 NaN 20000805 371083 202007251300 202007261200 1 0 2020-07-25 13:00:00 2020-07-26 12:00:00 2020 2020 7 7 25 26 13 12 1 2020-07-25 13:00:00 2020-07-25 2020-07-26 2020-07-25 NaN 1 1 1 NaN NaN 371002 ROOM001 2020-07-10 15:35:22 2020-07-15 10:24:14 NaN 2 1 NaN 3091 1502 4727 NaN NaN 4661 2565 3429 NaN NaN 1.0 2020-07-10 2020-07-15 5.0 2020 7 10 3 4 192 2020 7 10 3 4 192 1 0 3340 1111054 23FCC9D42D4D4379A17F4614597D055C 227185 202008162352 1 20200816235228 20200816235228 3 NaN 19950629 452627 202008171300 202008181200 1 0 2020-08-17 13:00:00 2020-08-18 12:00:00 2020 2020 8 8 17 18 13 12 1 2020-08-17 13:00:00 2020-08-17 2020-08-18 2020-08-17 NaN 1 1 1 NaN NaN 371002 ROOM001 2020-07-10 15:35:22 2020-07-15 10:24:14 NaN 2 1 NaN 3091 1502 4727 NaN NaN 4661 2565 3429 NaN NaN 1.0 2020-07-10 2020-07-15 5.0 2020 7 10 3 4 192 2020 7 10 3 4 192 1 0 3341 1089505 A96257531F3C40529329CE1DBE2CFE28 227185 202008142213 1 20200814221323 20200814221323 3 NaN 20011210 371523 202008181300 202008201200 1 0 2020-08-18 13:00:00 2020-08-20 12:00:00 2020 2020 8 8 18 20 13 12 2 2020-08-18 13:00:00 2020-08-18 2020-08-20 2020-08-18 NaN 1 1 1 NaN NaN 371002 ROOM001 2020-07-10 15:35:22 2020-07-15 10:24:14 NaN 2 1 NaN 3091 1502 4727 NaN NaN 4661 2565 3429 NaN NaN 1.0 2020-07-10 2020-07-15 5.0 2020 7 10 3 4 192 2020 7 10 3 4 192 1 0 3342 1089505 A96257531F3C40529329CE1DBE2CFE28 227185 202008142213 1 20200814221323 20200814221323 3 NaN 20011210 371523 202008181300 202008201200 1 0 2020-08-18 13:00:00 2020-08-20 12:00:00 2020 2020 8 8 18 20 13 12 2 2020-08-19 13:00:00 2020-08-18 2020-08-20 2020-08-19 NaN 1 1 1 NaN NaN 371002 ROOM001 2020-07-10 15:35:22 2020-07-15 10:24:14 NaN 2 1 NaN 3091 1502 4727 NaN NaN 4661 2565 3429 NaN NaN 1.0 2020-07-10 2020-07-15 5.0 2020 7 10 3 4 192 2020 7 10 3 4 192 1 0 3343 1160405 D7D528089DDA4804B7F7968F3E7539C0 227185 202008210828 1 20200821082858 20200821082858 3 NaN 19870304 232301 202008211300 202008221200 1 0 2020-08-21 13:00:00 2020-08-22 12:00:00 2020 2020 8 8 21 22 13 12 1 2020-08-21 13:00:00 2020-08-21 2020-08-22 2020-08-21 NaN 1 1 1 NaN NaN 371002 ROOM001 2020-07-10 15:35:22 2020-07-15 10:24:14 NaN 2 1 NaN 3091 1502 4727 NaN NaN 4661 2565 3429 NaN NaN 1.0 2020-07-10 2020-07-15 5.0 2020 7 10 3 4 192 2020 7 10 3 4 192 1 0 3344 1184347 74F501E396554E7CA9F2BCE9F33CD0CE 227185 202008230919 1 20200823091959 20200823091959 3 NaN 19981130 120222 202008231300 202008241200 1 0 2020-08-23 13:00:00 2020-08-24 12:00:00 2020 2020 8 8 23 24 13 12 1 2020-08-23 13:00:00 2020-08-23 2020-08-24 2020-08-23 NaN 1 1 1 NaN NaN 371002 ROOM001 2020-07-10 15:35:22 2020-07-15 10:24:14 NaN 2 1 NaN 3091 1502 4727 NaN NaN 4661 2565 3429 NaN NaN 1.0 2020-07-10 2020-07-15 5.0 2020 7 10 3 4 192 2020 7 10 3 4 192 1 0 3345 1198780 39460A2B095B47CC924CA5EFB2FAA18C 227185 202008241612 1 20200824170812 20200824170812 3 NaN 20000509 371083 202008241300 202008251200 1 0 2020-08-24 13:00:00 2020-08-25 12:00:00 2020 2020 8 8 24 25 13 12 1 2020-08-24 13:00:00 2020-08-24 2020-08-25 2020-08-24 NaN 1 1 1 NaN NaN 371002 ROOM001 2020-07-10 15:35:22 2020-07-15 10:24:14 NaN 2 1 NaN 3091 1502 4727 NaN NaN 4661 2565 3429 NaN NaN 1.0 2020-07-10 2020-07-15 5.0 2020 7 10 3 4 192 2020 7 10 3 4 192 1 0 3346 1008589 C50E6D8FCAFD4CAEB1BBE94FA1AF66A0 227185 202008072052 1 20200807205244 20200807205244 3 NaN 20000421 371122 202008251300 202008271200 1 0 2020-08-25 13:00:00 2020-08-27 12:00:00 2020 2020 8 8 25 27 13 12 2 2020-08-25 13:00:00 2020-08-25 2020-08-27 2020-08-25 NaN 1 1 1 NaN NaN 371002 ROOM001 2020-07-10 15:35:22 2020-07-15 10:24:14 NaN 2 1 NaN 3091 1502 4727 NaN NaN 4661 2565 3429 NaN NaN 1.0 2020-07-10 2020-07-15 5.0 2020 7 10 3 4 192 2020 7 10 3 4 192 1 0 3347 1008589 C50E6D8FCAFD4CAEB1BBE94FA1AF66A0 227185 202008072052 1 20200807205244 20200807205244 3 NaN 20000421 371122 202008251300 202008271200 1 0 2020-08-25 13:00:00 2020-08-27 12:00:00 2020 2020 8 8 25 27 13 12 2 2020-08-26 13:00:00 2020-08-25 2020-08-27 2020-08-26 NaN 1 1 1 NaN NaN 371002 ROOM001 2020-07-10 15:35:22 2020-07-15 10:24:14 NaN 2 1 NaN 3091 1502 4727 NaN NaN 4661 2565 3429 NaN NaN 1.0 2020-07-10 2020-07-15 5.0 2020 7 10 3 4 192 2020 7 10 3 4 192 1 0
可以预测入住量
df4[ 'unuse_room' ] = df4[ 'ROOM_NUM' ] - df4[ 'in_hotel_num' ]
df4[ 'unuse_room' ] . value_counts( )
0 28340 1 9755 2 4830-1 3180-2 1073 3 667-3 414 4 330-4 191 5 159-5 114-6 93-7 80-9 70 6 56-8 54 7 47-10 33-11 24 8 23 11 8 9 6Name: unuse_room, dtype: int64
in_num[ 'ORDER_ID' ] . value_counts( )
1 430502 20123 4084 1405 436 208 127 1110 79 611 312 2Name: ORDER_ID, dtype: int64
room_info[ room_info[ 'CODE' ] == '329845' ]
CODE HOTELID JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM FIRM DJSJ BGSJ 3051 329845 371002B740 0.0 2 3 1 温馨精致观海房/loft名宿/孙家疃油画小镇/临近海水浴场 3.710020e+11 371000000001 371002 环翠区孙家疃望海·山公馆公寓13-16_B304室---__-~ ROOM001 3 2021-08-03 09:58:05 2021-08-03 10:01:02
df_month = df. groupby( [ 'in_time_year' , 'in_time_month' ] ) [ 'ORDER_ID' ] df_month. count( )
in_time_year in_time_month2020 7 182 8 885 9 1571 10 2226 11 1179 12 22792021 1 1596 2 137 3 5 4 420 5 4893 6 3856 7 8507 8 2069 9 99 10 25 11 1Name: ORDER_ID, dtype: int64
df[ 'IN_TIME' ] . min ( ) , df[ 'IN_TIME' ] . max ( )
('202007151300', '202111022358')
df. info( )
<class 'pandas.core.frame.DataFrame'>Int64Index: 29930 entries, 0 to 29940Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 29930 non-null int64 1 ORDER_PRIMARY_ID 29930 non-null object 2 HOTELID 29930 non-null object 3 ORDER_TIME 29930 non-null object 4 STATUS 29930 non-null int64 5 INSERT_TIME 29930 non-null object 6 MODIFY_TIME 29930 non-null object 7 FIRM 29930 non-null int64 8 GUEST_ID 3561 non-null object 9 BDATE 29930 non-null object 10 XZQH 29930 non-null object 11 IN_TIME 29930 non-null object 12 OUT_TIME 29930 non-null object 13 guest_sum 29930 non-null int64 14 guest_sum_notnull 29930 non-null int64 15 in_time 29930 non-null datetime64[ns] 16 out_time 29930 non-null datetime64[ns] 17 in_time_year 29930 non-null int32 18 out_time_year 29930 non-null int32 19 in_time_month 29930 non-null int32 20 out_time_month 29930 non-null int32 21 in_time_day 29930 non-null int32 22 out_time_day 29930 non-null int32 23 in_time_hour 29930 non-null int32 24 out_time_hour 29930 non-null int32 25 day_gap 29930 non-null int64 dtypes: datetime64[ns](2), int32(8), int64(6), object(10)memory usage: 5.3+ MB
df. nunique( )
ORDER_ID 29930ORDER_PRIMARY_ID 29930HOTELID 4801ORDER_TIME 26807STATUS 3INSERT_TIME 25421MODIFY_TIME 25421FIRM 2GUEST_ID 3561BDATE 9035XZQH 2575IN_TIME 8953OUT_TIME 730guest_sum 3guest_sum_notnull 3in_time 8953out_time 730in_time_year 2out_time_year 2in_time_month 12out_time_month 12in_time_day 31out_time_day 31in_time_hour 23out_time_hour 15day_gap 33dtype: int64
df. rename( columns= { 'HOTELID' : 'CODE' } , inplace= True )
df = pd. merge( df, room_info, on= [ 'CODE' , 'FIRM' ] )
df. head( 2 )
ORDER_ID ORDER_PRIMARY_ID CODE PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME in_hour in_minute in_time out_hour out_minute out_time in_datetime out_datetime HOTELID JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM DJSJ BGSJ 0 923521 96BBDB7CC049421C85826AE07020B139 278337 202008011200 202008021200 159612011149.0 1 nan 20200730224152 20200730224152 3 NaN 19930804 320821 0.0 0.0 0 .0 0.0 0 .0 0.0 NaT NaT 37100A1620 0.0 1 2 1 金海滩临海落地窗公寓,北欧式装修 NaN NaN 371002 环翠区高区金海滩街道新浪屿花园8号楼1104 1104 2020-07-06 17:59:35 2020-07-15 10:24:21 1 761299 8923C26CC35F42C9A7C0A08BCC22179C 278337 202007221200 202007251200 202007150017.0 1 nan 20200715001752 20200715001752 3 NaN 19810628 120101 0.0 0.0 0 .0 0.0 0 .0 0.0 NaT NaT 37100A1620 0.0 1 2 1 金海滩临海落地窗公寓,北欧式装修 NaN NaN 371002 环翠区高区金海滩街道新浪屿花园8号楼1104 1104 2020-07-06 17:59:35 2020-07-15 10:24:21
df. info( )
<class 'pandas.core.frame.DataFrame'>Int64Index: 40043 entries, 0 to 40042Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 40043 non-null int64 1 ORDER_PRIMARY_ID 40043 non-null object 2 CODE 40043 non-null object 3 PRE_IN_TIME 40043 non-null object 4 PRE_OUT_TIME 40043 non-null object 5 ORDER_TIME 40043 non-null object 6 STATUS 40043 non-null int64 7 CANCEL_TIME 40043 non-null object 8 INSERT_TIME 40043 non-null object 9 MODIFY_TIME 40043 non-null object 10 FIRM 40043 non-null int64 11 GUEST_ID 5131 non-null object 12 BDATE 40043 non-null object 13 XZQH 40043 non-null object 14 IN_TIME 40043 non-null object 15 OUT_TIME 40043 non-null object 16 in_hour 40043 non-null object 17 in_minute 40043 non-null object 18 in_time 40043 non-null object 19 out_hour 40043 non-null object 20 out_minute 40043 non-null object 21 out_time 40043 non-null object 22 in_datetime 4913 non-null datetime64[ns] 23 out_datetime 644 non-null datetime64[ns] 24 HOTELID 40043 non-null object 25 JYMJ 40043 non-null float64 26 ROOM_NUM 40043 non-null int64 27 BED_NUM 40043 non-null int64 28 CZLY 40043 non-null int64 29 CALLED 40043 non-null object 30 BUR_CODE 20204 non-null float64 31 STA_CODE 20204 non-null object 32 SSX 40043 non-null int64 33 ADDRESS 40043 non-null object 34 MPHM 40043 non-null object 35 DJSJ 40043 non-null object 36 BGSJ 40043 non-null object dtypes: datetime64[ns](2), float64(2), int64(7), object(26)memory usage: 11.6+ MB
df. nunique( )
ORDER_ID 29941ORDER_PRIMARY_ID 39901CODE 4801PRE_IN_TIME 6932PRE_OUT_TIME 741ORDER_TIME 26892STATUS 3CANCEL_TIME 1INSERT_TIME 35319MODIFY_TIME 35319FIRM 2GUEST_ID 3675BDATE 9064XZQH 2578IN_TIME 3380OUT_TIME 235in_hour 24in_minute 61in_time 264out_hour 15out_minute 25out_time 211in_datetime 263out_datetime 210HOTELID 4801JYMJ 173ROOM_NUM 11BED_NUM 17CZLY 4CALLED 3813BUR_CODE 7STA_CODE 53SSX 5ADDRESS 4377MPHM 534DJSJ 3363BGSJ 2871dtype: int64
df. columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'CODE', 'PRE_IN_TIME', 'PRE_OUT_TIME', 'ORDER_TIME', 'STATUS', 'CANCEL_TIME', 'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH', 'IN_TIME', 'OUT_TIME', 'in_hour', 'in_minute', 'in_time', 'out_hour', 'out_minute', 'out_time', 'in_datetime', 'out_datetime', 'HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY', 'CALLED', 'BUR_CODE', 'STA_CODE', 'SSX', 'ADDRESS', 'MPHM', 'DJSJ', 'BGSJ'], dtype='object')
删除HOTELID,CANCEL_TIME
df1 = df[ [ 'ORDER_ID' , 'CODE' , 'PRE_IN_TIME' , 'PRE_OUT_TIME' , 'ORDER_TIME' , 'INSERT_TIME' , 'MODIFY_TIME' , 'IN_TIME' , 'OUT_TIME' , 'JYMJ' , 'ROOM_NUM' , 'BED_NUM' , 'STATUS' ] ]
0-撤销;1-预定;2-入住;3-离开
df1[ 'STATUS' ] . value_counts( )
1 344612 49213 661Name: STATUS, dtype: int64
df1. head( 50 )
ORDER_ID CODE PRE_IN_TIME PRE_OUT_TIME ORDER_TIME INSERT_TIME MODIFY_TIME IN_TIME OUT_TIME JYMJ ROOM_NUM BED_NUM STATUS 0 923521 278337 202008011200 202008021200 159612011149.0 20200730224152 20200730224152 0.0 0.0 0.0 1 2 1 1 761299 278337 202007221200 202007251200 202007150017.0 20200715001752 20200715001752 0.0 0.0 0.0 1 2 1 2 943703 278337 202008091200 202008121200 202008012010.0 20200801201019 20200801201019 0.0 0.0 0.0 1 2 1 3 1012389 278337 202008141200 202008171200 202008080900.0 20200808090100 20200808090100 0.0 0.0 0.0 1 2 1 4 1351873 278337 202010051200 202010061200 202009091245.0 20200909124547 20200909124547 0.0 0.0 0.0 1 2 1 5 1481730 278337 202010051200 202010071200 202009221039.0 20200922103911 20200922103911 202010051624 202010071200 0.0 1 2 3 6 1516263 278337 202010011200 202010041200 202009251502.0 20200925150231 20200925150231 0.0 0.0 0.0 1 2 1 7 1539785 278337 202010041200 202010051200 202009271942.0 20200927194258 20200927194258 0.0 0.0 0.0 1 2 1 8 1568043 278337 202010041200 202010051200 202009301837.0 20200930183801 20200930183801 0.0 0.0 0.0 1 2 1 9 1830343 278337 202010311200 202011011200 202010301016.0 20201031120256 20201031120256 0.0 0.0 0.0 1 2 1 10 1830343 278337 202010311200 202011011200 202010301016.0 20201030101655 20201030101655 0.0 0.0 0.0 1 2 1 11 1836522 278337 202010301200 202010311200 202010301933.0 20201030193317 20201030193317 0.0 0.0 0.0 1 2 1 12 1850449 278337 202011011200 202011021200 202011010951.0 20201101121128 20201101121128 0.0 0.0 0.0 1 2 1 13 1850449 278337 202011011200 202011021200 202011010951.0 20201101095111 20201101095111 0.0 0.0 0.0 1 2 1 14 2035663 278337 202011210022 202011211200 202011210022.0 20201121002216 20201121002216 0.0 0.0 0.0 1 2 1 15 2363298 278337 202101021200 202101041200 202012230028.0 20210102120800 20210102120800 0.0 0.0 0.0 1 2 1 16 2363298 278337 202101021200 202101041200 202012230028.0 20201223002859 20201223002859 0.0 0.0 0.0 1 2 1 17 2465555 278337 202101021200 202101031200 202101011632.0 20210101163250 20210101163250 0.0 0.0 0.0 1 2 1 18 2465555 278337 202101021200 202101031200 202101011632.0 20210102120701 20210102120701 0.0 0.0 0.0 1 2 1 19 3356024 278337 202104291158 202105051158 202104121231.0 20210429120002 20210429120002 0.0 0.0 0.0 1 2 1 20 3576969 278337 202105061158 202105091158 202104291925.0 20210506120002 20210506120002 0.0 0.0 0.0 1 2 1 21 3599404 278337 202105051158 202105061158 202105020724.0 20210505120008 20210505120008 0.0 0.0 0.0 1 2 1 22 3599404 278337 202105051158 202105061158 202105020724.0 20210502072638 20210502072638 0.0 0.0 0.0 1 2 1 23 3665000 278337 202106121158 202106151158 202105081926.0 20210508192825 20210508192825 0.0 0.0 0.0 1 2 1 24 3686275 278337 202105102004 202105111158 202105102004.0 20210510201002 20210510201002 0.0 0.0 0.0 1 2 1 25 3750952 278337 202105161158 202105171158 202105161150.0 20210516120228 20210516120228 0.0 0.0 0.0 1 2 1 26 3777571 278337 202105181904 202105191158 202105181904.0 20210518190621 20210518190621 0.0 0.0 0.0 1 2 1 27 3787561 278337 202105211158 202105231158 202105191621.0 20210519162332 20210519162332 0.0 0.0 0.0 1 2 1 28 3787561 278337 202105211158 202105231158 202105191621.0 20210521120006 20210521120006 0.0 0.0 0.0 1 2 1 29 3941122 278337 202106022128 202106031158 202106022128.0 20210602214001 20210602214001 0.0 0.0 0.0 1 2 1 30 3941122 278337 202106022128 202106031158 202106022128.0 20210602213051 20210602213051 0.0 0.0 0.0 1 2 1 31 3988599 278337 202106101158 202106111158 202106070140.0 20210610120016 20210610120016 0.0 0.0 0.0 1 2 1 32 3988599 278337 202106101158 202106111158 202106070140.0 20210607014246 20210607014246 0.0 0.0 0.0 1 2 1 33 4300717 278337 202107061158 202107071158 202107031110.0 20210706120003 20210706120003 0.0 0.0 0.0 1 2 1 34 4308464 278337 202107051158 202107061158 202107032053.0 20210703205553 20210703205553 0.0 0.0 0.0 1 2 1 35 4308464 278337 202107051158 202107061158 202107032053.0 20210705120003 20210705120003 0.0 0.0 0.0 1 2 1 36 923696 282932 202008041400 202008061200 159612092325.0 20200730225524 20200730225524 0.0 0.0 0.0 2 3 1 37 1065945 282932 202008131400 202008141200 202008122227.0 20200812222725 20200812222725 0.0 0.0 0.0 2 3 1 38 1370082 282932 202009181400 202009191200 202009110933.0 20200911093350 20200911093350 0.0 0.0 0.0 2 3 1 39 1370328 282932 202009191400 202009211200 202009111005.0 20200911100510 20200911100510 0.0 0.0 0.0 2 3 1 40 1550691 282932 202010011400 202010021200 202009282156.0 20200928215621 20200928215621 0.0 0.0 0.0 2 3 1 41 1612645 282932 202010081400 202010091200 202010061132.0 20201006113229 20201006113229 0.0 0.0 0.0 2 3 1 42 4497012 282932 202107221358 202107291158 202107181945.0 20210718194727 20210718194727 0.0 0.0 0.0 2 3 1 43 4505766 282932 202107201358 202107221158 202107191521.0 20210719152358 20210719152358 0.0 0.0 0.0 2 3 1 44 4505766 282932 202107201358 202107221158 202107191521.0 20210720140022 20210720140022 0.0 0.0 0.0 2 3 1 45 706648 227185 202007151300 202007161200 202007092241.0 20200714171021 20200714171021 0.0 0.0 0.0 1 1 1 46 853709 227185 202007251300 202007261200 202007232243.0 20200723224348 20200723224348 0.0 0.0 0.0 1 1 1 47 1008589 227185 202008251300 202008271200 202008072052.0 20200807205244 20200807205244 0.0 0.0 0.0 1 1 1 48 1089505 227185 202008181300 202008201200 202008142213.0 20200814221323 20200814221323 0.0 0.0 0.0 1 1 1 49 1111054 227185 202008171300 202008181200 202008162352.0 20200816235228 20200816235228 0.0 0.0 0.0 1 1 1
df[ 'CODE' ] . value_counts( ) [ : 10 ]
B106535897 420315790 278296093 222296537 187282520 177B105273690 142298705 124298707 122289326 116294473 116Name: CODE, dtype: int64
df[ 'HOTELID' ] . value_counts( ) [ : 10 ]
37100A3821 420371098A808 27837100A9707 222371098B022 18737100A1898 17737100A3565 142371099A016 124371099A013 12237100A9501 11637100A8976 116Name: HOTELID, dtype: int64
df_g = df. groupby( [ 'CODE' , 'PRE_IN_TIME' , 'PRE_OUT_TIME' ] ) [ 'ORDER_ID' ] . count( ) . reset_index( )
df_g. head( )
CODE PRE_IN_TIME PRE_OUT_TIME ORDER_ID 0 100177 202107111358 202107121158 2 1 100177 202107241358 202107261158 2 2 100181 202106271358 202106291158 2 3 100181 202107191358 202107201158 1 4 100181 202107221358 202107231158 2
ORDER_ID有重复,MODIFY_TIME修改过,按照最后一次的修改确定时间
df1[ df1[ 'CODE' ] == '100177' ]
ORDER_ID CODE PRE_IN_TIME PRE_OUT_TIME ORDER_TIME INSERT_TIME MODIFY_TIME IN_TIME OUT_TIME JYMJ ROOM_NUM BED_NUM STATUS 36937 4402846 100177 202107111358 202107121158 202107111032.0 20210711103458 20210711103458 0.0 0.0 0.0 1 1 1 36938 4402846 100177 202107111358 202107121158 202107111032.0 20210711140042 20210711140042 0.0 0.0 0.0 1 1 1 36939 4455098 100177 202107241358 202107261158 202107151446.0 20210724140006 20210724140006 0.0 0.0 0.0 1 1 1 36940 4455098 100177 202107241358 202107261158 202107151446.0 20210715144850 20210715144850 0.0 0.0 0.0 1 1 1
df1. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40043 entries, 0 to 40042
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 40043 non-null int64
1 CODE 40043 non-null object
2 PRE_IN_TIME 40043 non-null object
3 PRE_OUT_TIME 40043 non-null object
4 ORDER_TIME 40043 non-null object
5 INSERT_TIME 40043 non-null object
6 MODIFY_TIME 40043 non-null object
7 IN_TIME 40043 non-null object
8 OUT_TIME 40043 non-null object
9 JYMJ 40043 non-null float64
10 ROOM_NUM 40043 non-null int64
11 BED_NUM 40043 non-null int64
12 STATUS 40043 non-null int64
dtypes: float64(1), int64(4), object(8)
memory usage: 4.3+ MB