import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import os
import re
import warnings
import seaborn as sns
import scipy. stats as st
import gc
warnings. filterwarnings( 'ignore' )
pd. set_option( 'display.max_columns' , None )
pd. set_option( 'display.max_rows' , None )
pd. set_option( 'max_colwidth' , 100 )
plt. rcParams[ 'font.sans-serif' ] = [ 'SimHei' ]
plt. rcParams[ 'axes.unicode_minus' ] = False
path = 'traina/'
path1 = 'trainb/'
cust_info = pd. read_csv( path + '网约房平台入住人表.csv' )
cust_info1 = pd. read_csv( path1 + '网约房平台入住人表.csv' )
cust = pd. concat( [ cust_info, cust_info1] )
cust. head( 2 )
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME 0 07C5BF73B18B44B0877DEED007F8771D NaN 19800627 222405 NaN NaN 1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 2.021071e+11 2.021071e+11
gc. collect( )
133
cust[ 'BDATE' ] = cust[ 'BDATE' ] . astype( 'str' ) cust[ 'XZQH' ] = cust[ 'XZQH' ] . astype( 'str' ) cust[ 'IN_TIME' ] = cust[ 'IN_TIME' ] . fillna( 0 ) . astype( 'str' ) . apply ( lambda x: x[ : 12 ] ) cust[ 'OUT_TIME' ] = cust[ 'OUT_TIME' ] . fillna( 0 ) . astype( 'str' ) . apply ( lambda x: x[ : 12 ] )
cust. head( 3 )
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME 0 07C5BF73B18B44B0877DEED007F8771D NaN 19800627 222405 0.0 0.0 1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 202107092103 202107111158 2 A1414E7D71E34E1DA743AC08BCF49151 3710021822202008240001 19860311 370781 202008241401 202008251200
cust[ 'in_hour' ] = cust[ 'IN_TIME' ] . apply ( lambda x: x[ - 4 : - 2 ] ) cust[ 'in_minute' ] = cust[ 'IN_TIME' ] . apply ( lambda x: x[ - 2 : ] ) cust[ 'in_time' ] = cust[ 'IN_TIME' ] . apply ( lambda x: x[ : 8 ] ) cust[ 'out_hour' ] = cust[ 'OUT_TIME' ] . apply ( lambda x: x[ - 4 : - 2 ] ) cust[ 'out_minute' ] = cust[ 'OUT_TIME' ] . apply ( lambda x: x[ - 2 : ] ) cust[ 'out_time' ] = cust[ 'OUT_TIME' ] . apply ( lambda x: x[ : 8 ] )
cust[ 'in_datetime' ] = pd. to_datetime( cust[ 'in_time' ] , errors= 'coerce' ) cust[ 'out_datetime' ] = pd. to_datetime( cust[ 'out_time' ] , errors= 'coerce' )
cust. info( )
<class 'pandas.core.frame.DataFrame'>Int64Index: 44030 entries, 0 to 3343Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_PRIMARY_ID 44030 non-null object 1 GUEST_ID 5158 non-null object 2 BDATE 44030 non-null object 3 XZQH 44030 non-null object 4 IN_TIME 44030 non-null object 5 OUT_TIME 44030 non-null object 6 in_hour 44030 non-null object 7 in_minute 44030 non-null object 8 in_time 44030 non-null object 9 out_hour 44030 non-null object 10 out_minute 44030 non-null object 11 out_time 44030 non-null object 12 in_datetime 4933 non-null datetime64[ns] 13 out_datetime 649 non-null datetime64[ns]dtypes: datetime64[ns](2), object(12)memory usage: 5.0+ MB
cust. nunique( )
ORDER_PRIMARY_ID 43883GUEST_ID 3698BDATE 9503XZQH 2659IN_TIME 3396OUT_TIME 236in_hour 24in_minute 61in_time 264out_hour 15out_minute 25out_time 212in_datetime 263out_datetime 211dtype: int64
cust[ 'IN_TIME' ] . max ( )
'202110041413'
room_info = pd. read_csv( path + '网约房注册民宿.csv' )
room_info. head( 2 )
CODE HOTELID JYMJ ROOM_NUM BED_NUM FWLY CZLY CALLED CITY_CODE BUR_CODE STA_CODE SSX ADDRESS MPHM JYQK FIRM DJSJ BGSJ STATUS AUDITSTATUS 0 100177 3710830002 0.0 1 1 2 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 371000000000 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN NaN 1 100181 3710830003 0.0 5 7 2 1 【山海边公寓】瞻星伴月阁 371000000000 NaN NaN 371083 乳山市银滩 檀香丽湾小区2-4-603 ROOM001 【山海边公寓】瞻星伴月阁 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN NaN
room_info[ ( room_info[ 'CALLED' ] == room_info[ 'JYQK' ] ) == False ]
CODE HOTELID JYMJ ROOM_NUM BED_NUM FWLY CZLY CALLED CITY_CODE BUR_CODE STA_CODE SSX ADDRESS MPHM JYQK FIRM DJSJ BGSJ STATUS AUDITSTATUS
room_info[ 'CALLED' ] . nunique( )
4232
room_info[ 'CALLED' ] . value_counts( ) [ : 50 ]
高铁北站看海公寓 24韩乐坊/不夜城美食街/乐天世纪城 豪华大床房 景驰公寓 20国际海水浴场精致海景大床房 12高铁北站看海民宿 10九龙湾豪华一线海景loft复式 10超大山景海景房、近威海北站、小石岛赶海公园、国际海水浴场、威高海洋馆、景区中心 9【海居优选】海景房 威海北站海洋公园店 可住2~4人,知名经典云集。 8海之约民宿 7威海北站民宿 7【朵笆】纵览山海/山东大学/海水浴场/密码开门/落地大窗/望海一居室 7豪华海景家庭四人间 7观海一居室 7简约风格,观山看海,近高铁站。 7【海居优选】山景房,高铁北站海水浴场店,可住3人,景点云集。 7【朵笆】紧临山东大学/海水浴场/落地飘窗/密码锁/实木大床 榻榻米一居 6【温莎】观海大床房近/山大/国际海水浴场/金海滩海水浴场/小石岛/高铁北站/威高海洋馆/刘公岛/幸福门/东城夜市 6【遇见?AMOR】国际海水浴场/山东大学/温馨大床房 5简约风格,观山看海,近高铁站 5【暖居】国际海水浴场 海信海景壹号公寓 赶海 大白帆 火炬八街 林海公园 距海滩6米 情侣海景房 可做饭 亲子家庭房 5康业现代城优质房源 5【捷诚民宿】简约风格,观山看海,近高铁站。 5【温莎】温馨一室亲子房房近/山大/国际海水浴场/金海滩海水浴场/小石岛/高铁北站/威高海洋馆/刘公岛/幸福门/东城夜市 4【橙途】威海站/华夏城/舒适一居 4【已消毒】(地暖开放)月租环翠楼市中心智能门锁 小清新公寓 免费停车场 4海之星温馨家庭复式公寓 4荣成市那香海/成山头/鸡鸣岛/西霞口野生动物园港西陌野民宿 4北站温馨民宿,简约风。 4威海寨子大润发 北欧风格 全景落地窗大床公寓 免费停车场 4【橙途】威高广场/幸福门/临海时尚一居 4一线观海大床房 4威海浩然日租房普通大床 4威海市中心两居可做饭,近幸福门/刘公岛/海水浴场/韩国城/滨海公园,新鲜海鲜,白马南居懂你的舒适生活家 4一线观海简约整洁单间,内有独立卫生间 4国际海水浴场一线观海大床房(已消毒) 3韩乐坊不夜城精装公寓 3国际海水浴场两卧室四人套房 3山海相依复式民宿/近海水浴场/日式清新风格/可住4人 3孙家疃看海公寓,近沙滩 家庭房 3威海浩然日租房单人房 3一线海景房 3海里的民宿 3国际海景城 观海阳台 一线海景 空调双床可做饭 3韩乐坊豪华舒适大床房 3外滩临海市房屋,出门十分钟到达葡萄滩。交通便利,风景优美。当地人热情好客 3威海九龙湾海边齐鲁商城南田园小院民宿客栈 3【捷诚民宿】简约风格,观山看海,近高铁站 3【已消毒】寨子大润发 智能门锁 投影浪漫大床公寓 免费停车场 3悦客之家~欢迎你! 3荣成自在澜湾海景房大床房可以做饭观海 3九龙湾豪华一线海景Loft公寓 3Name: CALLED, dtype: int64
room_info[ 'HOTELID' ] . nunique( )
5307
room_info[ 'CODE' ] . nunique( )
5307
room_info. info( )
<class 'pandas.core.frame.DataFrame'>RangeIndex: 5307 entries, 0 to 5306Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CODE 5307 non-null object 1 HOTELID 5307 non-null object 2 JYMJ 5307 non-null float64 3 ROOM_NUM 5307 non-null int64 4 BED_NUM 5307 non-null int64 5 FWLY 5307 non-null int64 6 CZLY 5307 non-null int64 7 CALLED 5307 non-null object 8 CITY_CODE 5307 non-null int64 9 BUR_CODE 2658 non-null float64 10 STA_CODE 2658 non-null object 11 SSX 5307 non-null int64 12 ADDRESS 5307 non-null object 13 MPHM 5307 non-null object 14 JYQK 5307 non-null object 15 FIRM 5307 non-null int64 16 DJSJ 5307 non-null object 17 BGSJ 5307 non-null object 18 STATUS 2658 non-null float64 19 AUDITSTATUS 2658 non-null float64dtypes: float64(4), int64(7), object(9)memory usage: 829.3+ KB
room_info. nunique( )
CODE 5307HOTELID 5307JYMJ 173ROOM_NUM 11BED_NUM 17FWLY 1CZLY 4CALLED 4232CITY_CODE 1BUR_CODE 7STA_CODE 54SSX 5ADDRESS 4850MPHM 584JYQK 4232FIRM 2DJSJ 3753BGSJ 3294STATUS 1AUDITSTATUS 1dtype: int64
room_info. drop( [ 'FWLY' , 'CITY_CODE' , 'STATUS' , 'AUDITSTATUS' , 'JYQK' ] , axis= 1 , inplace= True )
room_info[ 'FIRM' ] . value_counts( )
3 342910 1878Name: FIRM, dtype: int64
order. head( )
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM 0 923521 96BBDB7CC049421C85826AE07020B139 278337 202008011200 202008021200 1.596120e+11 1 NaN 20200730224152 20200730224152 3 1 923696 C72F20539AD1447D86CD1A8E5EAEC63A 282932 202008041400 202008061200 1.596121e+11 1 NaN 20200730225524 20200730225524 3 2 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007151300 202007161200 2.020071e+11 1 NaN 20200714171021 20200714171021 3 3 748647 67D551AACFD049AE9CC2AB65E9870678 9483 202010011400 202010021011 2.020071e+11 1 NaN 20201002101040 20201002101040 3 4 757455 43F69E6DFE004103BC4741C355D2A793 275834 202007151400 202007161200 2.020071e+11 2 NaN 20200714181929 20200714181929 3
order[ 'PRE_IN_TIME' ] = order[ 'PRE_IN_TIME' ] . astype( 'str' ) order[ 'PRE_OUT_TIME' ] = order[ 'PRE_OUT_TIME' ] . astype( 'str' ) order[ 'ORDER_TIME' ] = order[ 'ORDER_TIME' ] . astype( 'str' ) order[ 'INSERT_TIME' ] = order[ 'INSERT_TIME' ] . astype( 'str' ) order[ 'MODIFY_TIME' ] = order[ 'MODIFY_TIME' ] . astype( 'str' ) order[ 'CANCEL_TIME' ] = order[ 'CANCEL_TIME' ] . astype( 'str' )
order = order[ ( order[ 'CANCEL_TIME' ] == 'nan' ) ]
order. info( )
<class 'pandas.core.frame.DataFrame'>Int64Index: 39901 entries, 0 to 3332Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 39901 non-null int64 1 ORDER_PRIMARY_ID 39901 non-null object 2 HOTELID 39901 non-null object 3 PRE_IN_TIME 39901 non-null object 4 PRE_OUT_TIME 39901 non-null object 5 ORDER_TIME 39901 non-null object 6 STATUS 39901 non-null int64 7 CANCEL_TIME 39901 non-null object 8 INSERT_TIME 39901 non-null object 9 MODIFY_TIME 39901 non-null object 10 FIRM 39901 non-null int64 dtypes: int64(3), object(8)memory usage: 3.7+ MB
order. nunique( )
ORDER_ID 29941ORDER_PRIMARY_ID 39901HOTELID 4801PRE_IN_TIME 6932PRE_OUT_TIME 741ORDER_TIME 26892STATUS 3CANCEL_TIME 1INSERT_TIME 35319MODIFY_TIME 35319FIRM 2dtype: int64
order_info[ 'FIRM' ] . value_counts( )
3 3202910 8521Name: FIRM, dtype: int64
order[ 'STATUS' ] . value_counts( )
1 344592 48333 609Name: STATUS, dtype: int64
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM
合并数据
df. head( )
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME in_hour in_minute in_time out_hour out_minute out_time in_datetime out_datetime 0 923521 96BBDB7CC049421C85826AE07020B139 278337 202008011200 202008021200 159612011149.0 1 nan 20200730224152 20200730224152 3 NaN 19930804 320821 0.0 0.0 0 .0 0.0 0 .0 0.0 NaT NaT 1 923696 C72F20539AD1447D86CD1A8E5EAEC63A 282932 202008041400 202008061200 159612092325.0 1 nan 20200730225524 20200730225524 3 NaN 19970614 370302 0.0 0.0 0 .0 0.0 0 .0 0.0 NaT NaT 2 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007151300 202007161200 202007092241.0 1 nan 20200714171021 20200714171021 3 NaN 20010409 371002 0.0 0.0 0 .0 0.0 0 .0 0.0 NaT NaT 3 748647 67D551AACFD049AE9CC2AB65E9870678 9483 202010011400 202010021011 202007132109.0 1 nan 20201002101040 20201002101040 3 NaN 19881023 341202 0.0 0.0 0 .0 0.0 0 .0 0.0 NaT NaT 4 757455 43F69E6DFE004103BC4741C355D2A793 275834 202007151400 202007161200 202007141819.0 2 nan 20200714181929 20200714181929 3 3710021366202007150001 19920216 370784 0.0 0.0 0 .0 0.0 0 .0 0.0 NaT NaT
df. info( )
<class 'pandas.core.frame.DataFrame'>Int64Index: 40043 entries, 0 to 40042Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 40043 non-null int64 1 ORDER_PRIMARY_ID 40043 non-null object 2 HOTELID 40043 non-null object 3 PRE_IN_TIME 40043 non-null object 4 PRE_OUT_TIME 40043 non-null object 5 ORDER_TIME 40043 non-null object 6 STATUS 40043 non-null int64 7 CANCEL_TIME 40043 non-null object 8 INSERT_TIME 40043 non-null object 9 MODIFY_TIME 40043 non-null object 10 FIRM 40043 non-null int64 11 GUEST_ID 5131 non-null object 12 BDATE 40043 non-null object 13 XZQH 40043 non-null object 14 IN_TIME 40043 non-null object 15 OUT_TIME 40043 non-null object 16 in_hour 40043 non-null object 17 in_minute 40043 non-null object 18 in_time 40043 non-null object 19 out_hour 40043 non-null object 20 out_minute 40043 non-null object 21 out_time 40043 non-null object 22 in_datetime 4913 non-null datetime64[ns] 23 out_datetime 644 non-null datetime64[ns]dtypes: datetime64[ns](2), int64(3), object(19)memory usage: 7.6+ MB
df. rename( columns= { 'HOTELID' : 'CODE' } , inplace= True )
df = pd. merge( df, room_info, on= [ 'CODE' , 'FIRM' ] )
df. head( 2 )
ORDER_ID ORDER_PRIMARY_ID CODE PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME in_hour in_minute in_time out_hour out_minute out_time in_datetime out_datetime HOTELID JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM DJSJ BGSJ 0 923521 96BBDB7CC049421C85826AE07020B139 278337 202008011200 202008021200 159612011149.0 1 nan 20200730224152 20200730224152 3 NaN 19930804 320821 0.0 0.0 0 .0 0.0 0 .0 0.0 NaT NaT 37100A1620 0.0 1 2 1 金海滩临海落地窗公寓,北欧式装修 NaN NaN 371002 环翠区高区金海滩街道新浪屿花园8号楼1104 1104 2020-07-06 17:59:35 2020-07-15 10:24:21 1 761299 8923C26CC35F42C9A7C0A08BCC22179C 278337 202007221200 202007251200 202007150017.0 1 nan 20200715001752 20200715001752 3 NaN 19810628 120101 0.0 0.0 0 .0 0.0 0 .0 0.0 NaT NaT 37100A1620 0.0 1 2 1 金海滩临海落地窗公寓,北欧式装修 NaN NaN 371002 环翠区高区金海滩街道新浪屿花园8号楼1104 1104 2020-07-06 17:59:35 2020-07-15 10:24:21
df. info( )
<class 'pandas.core.frame.DataFrame'>Int64Index: 40043 entries, 0 to 40042Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ORDER_ID 40043 non-null int64 1 ORDER_PRIMARY_ID 40043 non-null object 2 CODE 40043 non-null object 3 PRE_IN_TIME 40043 non-null object 4 PRE_OUT_TIME 40043 non-null object 5 ORDER_TIME 40043 non-null object 6 STATUS 40043 non-null int64 7 CANCEL_TIME 40043 non-null object 8 INSERT_TIME 40043 non-null object 9 MODIFY_TIME 40043 non-null object 10 FIRM 40043 non-null int64 11 GUEST_ID 5131 non-null object 12 BDATE 40043 non-null object 13 XZQH 40043 non-null object 14 IN_TIME 40043 non-null object 15 OUT_TIME 40043 non-null object 16 in_hour 40043 non-null object 17 in_minute 40043 non-null object 18 in_time 40043 non-null object 19 out_hour 40043 non-null object 20 out_minute 40043 non-null object 21 out_time 40043 non-null object 22 in_datetime 4913 non-null datetime64[ns] 23 out_datetime 644 non-null datetime64[ns] 24 HOTELID 40043 non-null object 25 JYMJ 40043 non-null float64 26 ROOM_NUM 40043 non-null int64 27 BED_NUM 40043 non-null int64 28 CZLY 40043 non-null int64 29 CALLED 40043 non-null object 30 BUR_CODE 20204 non-null float64 31 STA_CODE 20204 non-null object 32 SSX 40043 non-null int64 33 ADDRESS 40043 non-null object 34 MPHM 40043 non-null object 35 DJSJ 40043 non-null object 36 BGSJ 40043 non-null object dtypes: datetime64[ns](2), float64(2), int64(7), object(26)memory usage: 11.6+ MB
df. nunique( )
ORDER_ID 29941ORDER_PRIMARY_ID 39901CODE 4801PRE_IN_TIME 6932PRE_OUT_TIME 741ORDER_TIME 26892STATUS 3CANCEL_TIME 1INSERT_TIME 35319MODIFY_TIME 35319FIRM 2GUEST_ID 3675BDATE 9064XZQH 2578IN_TIME 3380OUT_TIME 235in_hour 24in_minute 61in_time 264out_hour 15out_minute 25out_time 211in_datetime 263out_datetime 210HOTELID 4801JYMJ 173ROOM_NUM 11BED_NUM 17CZLY 4CALLED 3813BUR_CODE 7STA_CODE 53SSX 5ADDRESS 4377MPHM 534DJSJ 3363BGSJ 2871dtype: int64
df. columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'CODE', 'PRE_IN_TIME', 'PRE_OUT_TIME', 'ORDER_TIME', 'STATUS', 'CANCEL_TIME', 'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH', 'IN_TIME', 'OUT_TIME', 'in_hour', 'in_minute', 'in_time', 'out_hour', 'out_minute', 'out_time', 'in_datetime', 'out_datetime', 'HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY', 'CALLED', 'BUR_CODE', 'STA_CODE', 'SSX', 'ADDRESS', 'MPHM', 'DJSJ', 'BGSJ'], dtype='object')
删除HOTELID,CANCEL_TIME
df1 = df[ [ 'ORDER_ID' , 'CODE' , 'PRE_IN_TIME' , 'PRE_OUT_TIME' , 'ORDER_TIME' , 'INSERT_TIME' , 'MODIFY_TIME' , 'IN_TIME' , 'OUT_TIME' , 'JYMJ' , 'ROOM_NUM' , 'BED_NUM' , 'STATUS' ] ]
0-撤销;1-预定;2-入住;3-离开
df1[ 'STATUS' ] . value_counts( )
1 344612 49213 661Name: STATUS, dtype: int64
df1. head( 50 )
ORDER_ID CODE PRE_IN_TIME PRE_OUT_TIME ORDER_TIME INSERT_TIME MODIFY_TIME IN_TIME OUT_TIME JYMJ ROOM_NUM BED_NUM STATUS 0 923521 278337 202008011200 202008021200 159612011149.0 20200730224152 20200730224152 0.0 0.0 0.0 1 2 1 1 761299 278337 202007221200 202007251200 202007150017.0 20200715001752 20200715001752 0.0 0.0 0.0 1 2 1 2 943703 278337 202008091200 202008121200 202008012010.0 20200801201019 20200801201019 0.0 0.0 0.0 1 2 1 3 1012389 278337 202008141200 202008171200 202008080900.0 20200808090100 20200808090100 0.0 0.0 0.0 1 2 1 4 1351873 278337 202010051200 202010061200 202009091245.0 20200909124547 20200909124547 0.0 0.0 0.0 1 2 1 5 1481730 278337 202010051200 202010071200 202009221039.0 20200922103911 20200922103911 202010051624 202010071200 0.0 1 2 3 6 1516263 278337 202010011200 202010041200 202009251502.0 20200925150231 20200925150231 0.0 0.0 0.0 1 2 1 7 1539785 278337 202010041200 202010051200 202009271942.0 20200927194258 20200927194258 0.0 0.0 0.0 1 2 1 8 1568043 278337 202010041200 202010051200 202009301837.0 20200930183801 20200930183801 0.0 0.0 0.0 1 2 1 9 1830343 278337 202010311200 202011011200 202010301016.0 20201031120256 20201031120256 0.0 0.0 0.0 1 2 1 10 1830343 278337 202010311200 202011011200 202010301016.0 20201030101655 20201030101655 0.0 0.0 0.0 1 2 1 11 1836522 278337 202010301200 202010311200 202010301933.0 20201030193317 20201030193317 0.0 0.0 0.0 1 2 1 12 1850449 278337 202011011200 202011021200 202011010951.0 20201101121128 20201101121128 0.0 0.0 0.0 1 2 1 13 1850449 278337 202011011200 202011021200 202011010951.0 20201101095111 20201101095111 0.0 0.0 0.0 1 2 1 14 2035663 278337 202011210022 202011211200 202011210022.0 20201121002216 20201121002216 0.0 0.0 0.0 1 2 1 15 2363298 278337 202101021200 202101041200 202012230028.0 20210102120800 20210102120800 0.0 0.0 0.0 1 2 1 16 2363298 278337 202101021200 202101041200 202012230028.0 20201223002859 20201223002859 0.0 0.0 0.0 1 2 1 17 2465555 278337 202101021200 202101031200 202101011632.0 20210101163250 20210101163250 0.0 0.0 0.0 1 2 1 18 2465555 278337 202101021200 202101031200 202101011632.0 20210102120701 20210102120701 0.0 0.0 0.0 1 2 1 19 3356024 278337 202104291158 202105051158 202104121231.0 20210429120002 20210429120002 0.0 0.0 0.0 1 2 1 20 3576969 278337 202105061158 202105091158 202104291925.0 20210506120002 20210506120002 0.0 0.0 0.0 1 2 1 21 3599404 278337 202105051158 202105061158 202105020724.0 20210505120008 20210505120008 0.0 0.0 0.0 1 2 1 22 3599404 278337 202105051158 202105061158 202105020724.0 20210502072638 20210502072638 0.0 0.0 0.0 1 2 1 23 3665000 278337 202106121158 202106151158 202105081926.0 20210508192825 20210508192825 0.0 0.0 0.0 1 2 1 24 3686275 278337 202105102004 202105111158 202105102004.0 20210510201002 20210510201002 0.0 0.0 0.0 1 2 1 25 3750952 278337 202105161158 202105171158 202105161150.0 20210516120228 20210516120228 0.0 0.0 0.0 1 2 1 26 3777571 278337 202105181904 202105191158 202105181904.0 20210518190621 20210518190621 0.0 0.0 0.0 1 2 1 27 3787561 278337 202105211158 202105231158 202105191621.0 20210519162332 20210519162332 0.0 0.0 0.0 1 2 1 28 3787561 278337 202105211158 202105231158 202105191621.0 20210521120006 20210521120006 0.0 0.0 0.0 1 2 1 29 3941122 278337 202106022128 202106031158 202106022128.0 20210602214001 20210602214001 0.0 0.0 0.0 1 2 1 30 3941122 278337 202106022128 202106031158 202106022128.0 20210602213051 20210602213051 0.0 0.0 0.0 1 2 1 31 3988599 278337 202106101158 202106111158 202106070140.0 20210610120016 20210610120016 0.0 0.0 0.0 1 2 1 32 3988599 278337 202106101158 202106111158 202106070140.0 20210607014246 20210607014246 0.0 0.0 0.0 1 2 1 33 4300717 278337 202107061158 202107071158 202107031110.0 20210706120003 20210706120003 0.0 0.0 0.0 1 2 1 34 4308464 278337 202107051158 202107061158 202107032053.0 20210703205553 20210703205553 0.0 0.0 0.0 1 2 1 35 4308464 278337 202107051158 202107061158 202107032053.0 20210705120003 20210705120003 0.0 0.0 0.0 1 2 1 36 923696 282932 202008041400 202008061200 159612092325.0 20200730225524 20200730225524 0.0 0.0 0.0 2 3 1 37 1065945 282932 202008131400 202008141200 202008122227.0 20200812222725 20200812222725 0.0 0.0 0.0 2 3 1 38 1370082 282932 202009181400 202009191200 202009110933.0 20200911093350 20200911093350 0.0 0.0 0.0 2 3 1 39 1370328 282932 202009191400 202009211200 202009111005.0 20200911100510 20200911100510 0.0 0.0 0.0 2 3 1 40 1550691 282932 202010011400 202010021200 202009282156.0 20200928215621 20200928215621 0.0 0.0 0.0 2 3 1 41 1612645 282932 202010081400 202010091200 202010061132.0 20201006113229 20201006113229 0.0 0.0 0.0 2 3 1 42 4497012 282932 202107221358 202107291158 202107181945.0 20210718194727 20210718194727 0.0 0.0 0.0 2 3 1 43 4505766 282932 202107201358 202107221158 202107191521.0 20210719152358 20210719152358 0.0 0.0 0.0 2 3 1 44 4505766 282932 202107201358 202107221158 202107191521.0 20210720140022 20210720140022 0.0 0.0 0.0 2 3 1 45 706648 227185 202007151300 202007161200 202007092241.0 20200714171021 20200714171021 0.0 0.0 0.0 1 1 1 46 853709 227185 202007251300 202007261200 202007232243.0 20200723224348 20200723224348 0.0 0.0 0.0 1 1 1 47 1008589 227185 202008251300 202008271200 202008072052.0 20200807205244 20200807205244 0.0 0.0 0.0 1 1 1 48 1089505 227185 202008181300 202008201200 202008142213.0 20200814221323 20200814221323 0.0 0.0 0.0 1 1 1 49 1111054 227185 202008171300 202008181200 202008162352.0 20200816235228 20200816235228 0.0 0.0 0.0 1 1 1
df[ 'CODE' ] . value_counts( ) [ : 10 ]
B106535897 420315790 278296093 222296537 187282520 177B105273690 142298705 124298707 122289326 116294473 116Name: CODE, dtype: int64
df[ 'HOTELID' ] . value_counts( ) [ : 10 ]
37100A3821 420371098A808 27837100A9707 222371098B022 18737100A1898 17737100A3565 142371099A016 124371099A013 12237100A9501 11637100A8976 116Name: HOTELID, dtype: int64
df_g = df. groupby( [ 'CODE' , 'PRE_IN_TIME' , 'PRE_OUT_TIME' ] ) [ 'ORDER_ID' ] . count( ) . reset_index( )
df_g. head( )
CODE PRE_IN_TIME PRE_OUT_TIME ORDER_ID 0 100177 202107111358 202107121158 2 1 100177 202107241358 202107261158 2 2 100181 202106271358 202106291158 2 3 100181 202107191358 202107201158 1 4 100181 202107221358 202107231158 2
ORDER_ID有重复,MODIFY_TIME修改过,按照最后一次的修改确定时间
df1[ df1[ 'CODE' ] == '100177' ]
ORDER_ID CODE PRE_IN_TIME PRE_OUT_TIME ORDER_TIME INSERT_TIME MODIFY_TIME IN_TIME OUT_TIME JYMJ ROOM_NUM BED_NUM STATUS 36937 4402846 100177 202107111358 202107121158 202107111032.0 20210711103458 20210711103458 0.0 0.0 0.0 1 1 1 36938 4402846 100177 202107111358 202107121158 202107111032.0 20210711140042 20210711140042 0.0 0.0 0.0 1 1 1 36939 4455098 100177 202107241358 202107261158 202107151446.0 20210724140006 20210724140006 0.0 0.0 0.0 1 1 1 36940 4455098 100177 202107241358 202107261158 202107151446.0 20210715144850 20210715144850 0.0 0.0 0.0 1 1 1
df1. info( )
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40043 entries, 0 to 40042
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 40043 non-null int64
1 CODE 40043 non-null object
2 PRE_IN_TIME 40043 non-null object
3 PRE_OUT_TIME 40043 non-null object
4 ORDER_TIME 40043 non-null object
5 INSERT_TIME 40043 non-null object
6 MODIFY_TIME 40043 non-null object
7 IN_TIME 40043 non-null object
8 OUT_TIME 40043 non-null object
9 JYMJ 40043 non-null float64
10 ROOM_NUM 40043 non-null int64
11 BED_NUM 40043 non-null int64
12 STATUS 40043 non-null int64
dtypes: float64(1), int64(4), object(8)
memory usage: 4.3+ MB