111111111111

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import warnings
import seaborn as sns
import scipy.stats as st
import gc

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 100)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# d.date_range("2018/01/01", "2018/01/10", freq="D")
path = 'traina/'  
path1 = 'trainb/'
cust_info = pd.read_csv(path + '网约房平台入住人表.csv')
cust_info1 = pd.read_csv(path1 + '网约房平台入住人表.csv')
cust = pd.concat([cust_info, cust_info1])
cust.head(2)
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIME
007C5BF73B18B44B0877DEED007F8771DNaN19800627222405NaNNaN
13525D57CAE104A078E4962B2B89377B0371099C301202107090001199510253705232.021071e+112.021071e+11
gc.collect()
17
cust['BDATE'] = cust['BDATE'].astype('str')cust['XZQH'] = cust['XZQH'].astype('str')cust['IN_TIME'] = cust['IN_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])cust['OUT_TIME'] = cust['OUT_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
cust.head(3)
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIME
007C5BF73B18B44B0877DEED007F8771DNaN198006272224050.00.0
13525D57CAE104A078E4962B2B89377B0371099C30120210709000119951025370523202107092103202107111158
2A1414E7D71E34E1DA743AC08BCF49151371002182220200824000119860311370781202008241401202008251200
# cust['in_hour'] = cust['IN_TIME'].apply(lambda x:x[-4:-2])# cust['in_minute'] = cust['IN_TIME'].apply(lambda x:x[-2:])# cust['in_time'] = cust['IN_TIME'].apply(lambda x:x[:8])# cust['out_hour'] = cust['OUT_TIME'].apply(lambda x:x[-4:-2])# cust['out_minute'] = cust['OUT_TIME'].apply(lambda x:x[-2:])# cust['out_time'] = cust['OUT_TIME'].apply(lambda x:x[:8])# cust['in_datetime'] = pd.to_datetime(cust['in_time'], errors='coerce')# cust['out_datetime'] = pd.to_datetime(cust['out_time'], errors='coerce')
cust.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 44030 entries, 0 to 3343Data columns (total 6 columns): #   Column            Non-Null Count  Dtype ---  ------            --------------  -----  0   ORDER_PRIMARY_ID  44030 non-null  object 1   GUEST_ID          5158 non-null   object 2   BDATE             44030 non-null  object 3   XZQH              44030 non-null  object 4   IN_TIME           44030 non-null  object 5   OUT_TIME          44030 non-null  objectdtypes: object(6)memory usage: 2.4+ MB
cust.nunique()
ORDER_PRIMARY_ID    43883GUEST_ID             3698BDATE                9503XZQH                 2659IN_TIME              3396OUT_TIME              236dtype: int64
cust['ORDER_PRIMARY_ID'].value_counts()[10:15]
134B74870F1D4087AB3F590D2A0AAFDC    23370A2886D8F4E11860A00FEB2289E56    223D42363F202408BA04CD3D5BBAE5508    219631D36ABA448D9B0AECF0A7200C4C5    29373325C891E42CB8A8F43AF14939954    2Name: ORDER_PRIMARY_ID, dtype: int64
cust.sort_values(['ORDER_PRIMARY_ID', 'GUEST_ID'], inplace=True)
stat = cust.groupby(['ORDER_PRIMARY_ID'])['BDATE'].count().reset_index()stat.columns = ['ORDER_PRIMARY_ID', 'guest_sum']stat['guest_sum_notnull'] = cust.groupby(['ORDER_PRIMARY_ID'])['GUEST_ID'].agg('count').valuescust = cust.merge(stat, on=['ORDER_PRIMARY_ID'], how='left')
cust.drop_duplicates('ORDER_PRIMARY_ID', keep='first', inplace=True)
cust[cust['ORDER_PRIMARY_ID']=='70798782D6C04A438360D80AFE4845C1']
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnull
2280170798782D6C04A438360D80AFE4845C1371099B389202107040001199903141309830.00.032
cust[cust['ORDER_PRIMARY_ID']=='5B94671A9390437E8E79A86D4B15A076']
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnull
186315B94671A9390437E8E79A86D4B15A076371098A226202107010001200001143723240.00.021
cust[cust['ORDER_PRIMARY_ID']=='479006B921F24969B60B68DAEDA91909']
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnull
14578479006B921F24969B60B68DAEDA9190937100209822020100200011984101321010320201002160120201006120022
cust[cust['ORDER_PRIMARY_ID']=='1772C1DEE3F141D8960BB9044CA4CF3A']
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnull
47531772C1DEE3F141D8960BB9044CA4CF3A371098A783202107050001196209164222260.00.032
cust[cust['ORDER_PRIMARY_ID']=='BD39A40287CE4E76B46D3BC85BFA1298']
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnull
38447BD39A40287CE4E76B46D3BC85BFA1298371099A428202105100004197102193702040.020210521183421

#前4位到市,前6位到县# cust_info['XZ_4'] = cust_info['XZQH'].apply(lambda x:x//100)
# cust_info['birth_date'] = pd.to_datetime(cust_info['BDATE'].astype(str), errors='coerce')# cust_info['birth_year'] = cust_info['birth_date'].dt.year.astype(int)# cust_info['birth_year'] = cust_info['birth_date'].dt.month.astype(int)# cust_info['birth_year'] = cust_info['birth_date'].dt.day.astype(int)# cust_info['birth_year'] = cust_info['birth_date'].dt.quarter.astype(int)#cust_info['birth_year'] = pd['birth_date'].dt.years#cust_info['birth_year'] = pd['birth_date'].dt.years
# cust_info['in_time_year'] = cust_info['IN_TIME'].apply(lambda x: x//10**8)# cust_info['in_time_month'] = cust_info['IN_TIME'].apply(lambda x: x//10**6 - (x//10**8)*10**2)# cust_info['in_time_day'] = cust_info['IN_TIME'].apply(lambda x: x//10**4 - (x//10**6)*10**2)# cust_info['in_time_hour'] = cust_info['IN_TIME'].apply(lambda x: x//10**2 - (x//10**4)*10**2)# cust_info['in_time_minute'] = cust_info['IN_TIME'].apply(lambda x: x - (x//10**2)*10**2)# cust_info['out_time_year'] = cust_info['OUT_TIME'].apply(lambda x: x//10**8)# cust_info['out_time_month'] = cust_info['OUT_TIME'].apply(lambda x: x//10**6 - (x//10**8)*10**2)# cust_info['out_time_day'] = cust_info['OUT_TIME'].apply(lambda x: x//10**4 - (x//10**6)*10**2)# cust_info['out_time_hour'] = cust_info['OUT_TIME'].apply(lambda x: x//10**2 - (x//10**4)*10**2)# cust_info['out_time_minute'] = cust_info['OUT_TIME'].apply(lambda x: x - (x//10**2)*10**2)
# cust_info['check_in_day'] = (cust_info['out_time_year'] - cust_info['in_time_year'])*365 + (cust_info['out_time_month'] - cust_info['in_time_month'])*30 + (cust_info['out_time_day'] - cust_info['in_time_day'])# cust_info['check_in_day'] = cust_info['check_in_day'].apply(lambda x: x if (x == np.nan) else x+1)
# cust_info['check_in_hour'] = (cust_info['out_time_year'] - cust_info['in_time_year'])*365*30*24 + (cust_info['out_time_month'] - cust_info['in_time_month'])*30*24 + (cust_info['out_time_day'] - cust_info['in_time_day'])*24 + (cust_info['out_time_hour'] - cust_info['in_time_hour'])# cust_info['check_in_hour'] = cust_info['check_in_hour'].apply(lambda x: x if (x == np.nan and x > 0) else x+24)
# cust_info.head()
# cust_info_1 = cust_info.groupby(['in_time_year', 'in_time_month'])['ORDER_PRIMARY_ID']
# 各个月份订单数量# cust_info_1.count()
cust['IN_TIME'].max()
'202110041413'
#cust_info['in_time'] = pd.to_datetime(map(cust_info['IN_TIME'].astype(int), str), errors='coerce')
# cust_info['IN_TIME'][2]

room_info = pd.read_csv(path + '网约房注册民宿.csv')
room_info.head(1)
CODEHOTELIDJYMJROOM_NUMBED_NUMFWLYCZLYCALLEDCITY_CODEBUR_CODESTA_CODESSXADDRESSMPHMJYQKFIRMDJSJBGSJSTATUSAUDITSTATUS
010017737108300020.01121【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽371000000000NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM001【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽32020-05-18 10:33:552020-07-15 10:23:58NaNNaN
room_info[(room_info['CALLED']==room_info['JYQK'])==False]
CODEHOTELIDJYMJROOM_NUMBED_NUMFWLYCZLYCALLEDCITY_CODEBUR_CODESTA_CODESSXADDRESSMPHMJYQKFIRMDJSJBGSJSTATUSAUDITSTATUS
room_info['CALLED'].nunique()
4232
room_info['CALLED'].value_counts()[:5]
高铁北站看海公寓                       24韩乐坊/不夜城美食街/乐天世纪城 豪华大床房 景驰公寓    20国际海水浴场精致海景大床房                  12九龙湾豪华一线海景loft复式                10高铁北站看海民宿                       10Name: CALLED, dtype: int64
room_info['HOTELID'].nunique()
5307
room_info['CODE'].nunique()
5307
room_info.info()
<class 'pandas.core.frame.DataFrame'>RangeIndex: 5307 entries, 0 to 5306Data columns (total 20 columns): #   Column       Non-Null Count  Dtype  ---  ------       --------------  -----   0   CODE         5307 non-null   object  1   HOTELID      5307 non-null   object  2   JYMJ         5307 non-null   float64 3   ROOM_NUM     5307 non-null   int64   4   BED_NUM      5307 non-null   int64   5   FWLY         5307 non-null   int64   6   CZLY         5307 non-null   int64   7   CALLED       5307 non-null   object  8   CITY_CODE    5307 non-null   int64   9   BUR_CODE     2658 non-null   float64 10  STA_CODE     2658 non-null   object  11  SSX          5307 non-null   int64   12  ADDRESS      5307 non-null   object  13  MPHM         5307 non-null   object  14  JYQK         5307 non-null   object  15  FIRM         5307 non-null   int64   16  DJSJ         5307 non-null   object  17  BGSJ         5307 non-null   object  18  STATUS       2658 non-null   float64 19  AUDITSTATUS  2658 non-null   float64dtypes: float64(4), int64(7), object(9)memory usage: 829.3+ KB
room_info.nunique()
CODE           5307HOTELID        5307JYMJ            173ROOM_NUM         11BED_NUM          17FWLY              1CZLY              4CALLED         4232CITY_CODE         1BUR_CODE          7STA_CODE         54SSX               5ADDRESS        4850MPHM            584JYQK           4232FIRM              2DJSJ           3753BGSJ           3294STATUS            1AUDITSTATUS       1dtype: int64
# ro = room_info[room_info['STATUS'].notnull()]
# ro[ro['STATUS']!=ro['AUDITSTATUS']]
room_info['STATUS'].value_counts()
1.0    2658Name: STATUS, dtype: int64
room_info.drop(['HOTELID', 'FWLY', 'CITY_CODE', 'AUDITSTATUS', 'JYQK'], axis=1, inplace=True)
room_info.rename(columns={'CODE':'HOTELID'}, inplace= True)
room_info['FIRM'].value_counts()
3     342910    1878Name: FIRM, dtype: int64
room_info.info()
<class 'pandas.core.frame.DataFrame'>RangeIndex: 5307 entries, 0 to 5306Data columns (total 15 columns): #   Column    Non-Null Count  Dtype  ---  ------    --------------  -----   0   HOTELID   5307 non-null   object  1   JYMJ      5307 non-null   float64 2   ROOM_NUM  5307 non-null   int64   3   BED_NUM   5307 non-null   int64   4   CZLY      5307 non-null   int64   5   CALLED    5307 non-null   object  6   BUR_CODE  2658 non-null   float64 7   STA_CODE  2658 non-null   object  8   SSX       5307 non-null   int64   9   ADDRESS   5307 non-null   object  10  MPHM      5307 non-null   object  11  FIRM      5307 non-null   int64   12  DJSJ      5307 non-null   object  13  BGSJ      5307 non-null   object  14  STATUS    2658 non-null   float64dtypes: float64(3), int64(5), object(7)memory usage: 622.0+ KB
room_info.nunique()
HOTELID     5307JYMJ         173ROOM_NUM      11BED_NUM       17CZLY           4CALLED      4232BUR_CODE       7STA_CODE      54SSX            5ADDRESS     4850MPHM         584FIRM           2DJSJ        3753BGSJ        3294STATUS         1dtype: int64
  • CODE-------- 民宿编码与HOTELID关联
  • JYMJ-------- 经营面积
  • ROOM_NUM---- 房间数量
  • BED_NUM----- 床位数
  • CZLY-------- 出租类型
  • CALLED------ 名称
  • BUR_CODE---- 所属分局代码
  • STA_CODE---- 所属派出所代码
  • SSX--------- 省市县
  • ADDRESS----- 地址
  • MPHM-------- 门牌号
  • FIRM-------- 平台厂商
  • DJSJ-------- 登记时间
  • BGSJ-------- 变更时间
  • STATUS------ 状态
room_info[:2]
HOTELIDJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMFIRMDJSJBGSJSTATUS
01001770.0111【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM00132020-05-18 10:33:552020-07-15 10:23:58NaN
11001810.0571【山海边公寓】瞻星伴月阁NaNNaN371083乳山市银滩 檀香丽湾小区2-4-603ROOM00132020-05-18 10:33:552020-07-15 10:23:58NaN

登记时间和变更时间无变化的只有两家

room_info[room_info['DJSJ']==room_info['BGSJ']]
HOTELIDJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMFIRMDJSJBGSJSTATUS
11502967530.0121春润庭院客栈豪华家庭套房NaNNaN371002环翠区环海路金海滩花园41号楼ROOM00132020-12-05 10:22:402020-12-05 10:22:40NaN
11562967720.0111哈工大旁春润庭院大床房NaNNaN371002环翠区环海路金海滩花园41号楼ROOM00132020-12-05 10:22:402020-12-05 10:22:40NaN
room_info['JYMJ'] = room_info['JYMJ'].apply(lambda x: np.nan if x==0 else x)
room的统计量特征
#登记时间和变更时间的差值room_use_col = ['HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'FIRM', 'STATUS']freq_col = ['CALLED', 'ADDRESS', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY', 'BUR_CODE', 'STA_CODE', 'SSX', 'MPHM', 'FIRM']for col in freq_col:    st = room_info[col].value_counts().reset_index()    st.columns = [col, col+'_freq']    room_info = room_info.merge(st, on=col, how='left')
#房间面积率和床位率\单间房床位量room_info['room_ratio'] = room_info['JYMJ'] / room_info['ROOM_NUM']room_info['bed_ratio'] = room_info['JYMJ'] / room_info['BED_NUM']room_info['room_bed'] = room_info['BED_NUM'] / room_info['ROOM_NUM']
room_info['DJSJ'] = pd.to_datetime(room_info['DJSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')room_info['BGSJ'] = pd.to_datetime(room_info['BGSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['DJ_date'] = room_info['DJSJ'].dt.dateroom_info['BG_date'] = room_info['BGSJ'].dt.date
room_info['DJ_gap'] = (room_info['BG_date'] - room_info['DJ_date']).dt.days
room_info[:2]
HOTELIDJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMFIRMDJSJBGSJSTATUSCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gap
0100177NaN111【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM00132020-05-18 10:33:552020-07-15 10:23:58NaN11NaN309115024727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.0
1100181NaN571【山海边公寓】瞻星伴月阁NaNNaN371083乳山市银滩 檀香丽湾小区2-4-603ROOM00132020-05-18 10:33:552020-07-15 10:23:58NaN11NaN49454727NaNNaN15025653429NaNNaN1.42020-05-182020-07-1558.0
room_info['DJ_gap'].nunique()
29
room_info['DJSJ'].min(), room_info['DJSJ'].max(), room_info['BGSJ'].min(), room_info['BGSJ'].max()
(Timestamp('2020-05-12 13:16:24'), Timestamp('2021-10-19 16:05:41'), Timestamp('2020-07-15 10:23:06'), Timestamp('2021-10-19 16:09:15'))
room = room_info.copy()
room['DJ_year'] = room['DJSJ'].dt.year.fillna(0).astype('int')room['DJ_month'] = room['DJSJ'].dt.month.fillna(0).astype('int')room['DJ_day'] = room['DJSJ'].dt.day.fillna(0).astype('int')room['DJ_quarter'] = room['DJSJ'].dt.quarter.fillna(0).astype('int')room['DJ_dayofweek'] = room['DJSJ'].dt.dayofweek.fillna(0).astype('int')room['DJ_dayofyear'] = room['DJSJ'].dt.dayofyear.fillna(0).astype('int')room['BG_year'] = room['DJSJ'].dt.year.fillna(0).astype('int')room['BG_month'] = room['DJSJ'].dt.month.fillna(0).astype('int')room['BG_day'] = room['DJSJ'].dt.day.fillna(0).astype('int')room['BG_quarter'] = room['DJSJ'].dt.quarter.fillna(0).astype('int')room['BG_dayofweek'] = room['DJSJ'].dt.dayofweek.fillna(0).astype('int')room['BG_dayofyear'] = room['DJSJ'].dt.dayofyear.fillna(0).astype('int')
room['BG_dayofweek'].value_counts()
1    14014     9990     7615     6672     5963     4886     395Name: BG_dayofweek, dtype: int64
  • #DJ_gap|DJ_year|DJ_month|DJ_day|DJ_quarter|DJ_dayofweek|DJ_dayofyear…
  • #freq特征
room[:3]
HOTELIDJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMFIRMDJSJBGSJSTATUSCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gapDJ_yearDJ_monthDJ_dayDJ_quarterDJ_dayofweekDJ_dayofyearBG_yearBG_monthBG_dayBG_quarterBG_dayofweekBG_dayofyear
0100177NaN111【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM00132020-05-18 10:33:552020-07-15 10:23:58NaN11NaN309115024727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.0202051820139202051820139
1100181NaN571【山海边公寓】瞻星伴月阁NaNNaN371083乳山市银滩 檀香丽湾小区2-4-603ROOM00132020-05-18 10:33:552020-07-15 10:23:58NaN11NaN49454727NaNNaN15025653429NaNNaN1.42020-05-182020-07-1558.0202051820139202051820139
2100193NaN221【山海边公寓】两居电梯洋房,背依藏龙山,面朝大海NaNNaN371083乳山市长江路檀香丽湾1-16-901ROOM00132020-05-18 10:33:552020-07-15 10:23:58NaN11NaN124922814727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.0202051820139202051820139
# room['JYMJ'] = room['JYMJ'].apply(lambda x: np.nan if x==0 else x)
plt.figure(figsize=(20, 8))sns.countplot(room['JYMJ'])
<AxesSubplot:xlabel='JYMJ', ylabel='count'>


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-O93ifdQO-1646128974748)(output_72_1.png)]

经营面积缺失超过一半,3000多,考虑删除

#出租类型room['CZLY'].value_counts()
1    47270     5042      719       5Name: CZLY, dtype: int64
#所属分局代码room['BUR_CODE'].value_counts()
3.710990e+11    9543.710980e+11    8523.710020e+11    6683.710820e+11    1083.710810e+11     563.710830e+11     163.710960e+11      4Name: BUR_CODE, dtype: int64
#所属派出所代码room['STA_CODE'].value_counts()[:5]
371000000006    505371000000007    433371098060000    321371099100000    258371000000001    242Name: STA_CODE, dtype: int64
#省市县room['SSX'].value_counts()
371002    4661371082     406371083     150371081      64371003      26Name: SSX, dtype: int64
#门牌号码room['MPHM'].value_counts()[:5]
ROOM001    2565无          1878ROOM002      57ROOM003      26ROOM004      17Name: MPHM, dtype: int64



# platform infomationorder_info = pd.read_csv(path + '网约平台旅客订单信息.csv') order1 = pd.read_csv(path1 + '网约平台旅客订单信息.csv')order = pd.concat([order_info, order1])
order.head(2)
ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
092352196BBDB7CC049421C85826AE07020B1392783372020080112002020080212001.596120e+111NaN20200730224152202007302241523
1923696C72F20539AD1447D86CD1A8E5EAEC63A2829322020080414002020080612001.596121e+111NaN20200730225524202007302255243
order[order['ORDER_PRIMARY_ID']=='3CDEDB5E03534D379687645675898CA4']
ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
3097044220273CDEDB5E03534D379687645675898CA41708982021080813582021081011582.021071e+113NaN20210712210153202107122101533
cust[cust['ORDER_PRIMARY_ID']=='3CDEDB5E03534D379687645675898CA4']
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnull
124213CDEDB5E03534D379687645675898CA4371002B268202107280001198710311102240.020210802084211
order['PRE_IN_TIME'] = order['PRE_IN_TIME'].astype('str')order['PRE_OUT_TIME'] = order['PRE_OUT_TIME'].astype('str')# order['ORDER_TIME'] = order['ORDER_TIME'].astype('str')order['INSERT_TIME'] = order['INSERT_TIME'].astype('str')order['MODIFY_TIME'] = order['MODIFY_TIME'].astype('str')order['CANCEL_TIME'] = order['CANCEL_TIME'].astype('str')
order['ORDER_TIME'] = order['ORDER_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])

##########################没有考虑包含取消时间的订单

order = order[(order['CANCEL_TIME']=='nan')]
order = order.sort_values(['ORDER_ID', 'MODIFY_TIME'])
order.drop_duplicates('ORDER_ID', keep='last', inplace=True)
order.shape
(29941, 11)
order[order['ORDER_ID']==4402846]
ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
301414402846BC0E8E3602434EA6A5F29A6F6FF422331001772021071113582021071211582021071110321nan20210711140042202107111400423
order.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 29941 entries, 2 to 18590Data columns (total 11 columns): #   Column            Non-Null Count  Dtype ---  ------            --------------  -----  0   ORDER_ID          29941 non-null  int64  1   ORDER_PRIMARY_ID  29941 non-null  object 2   HOTELID           29941 non-null  object 3   PRE_IN_TIME       29941 non-null  object 4   PRE_OUT_TIME      29941 non-null  object 5   ORDER_TIME        29941 non-null  object 6   STATUS            29941 non-null  int64  7   CANCEL_TIME       29941 non-null  object 8   INSERT_TIME       29941 non-null  object 9   MODIFY_TIME       29941 non-null  object 10  FIRM              29941 non-null  int64 dtypes: int64(3), object(8)memory usage: 2.7+ MB
order.nunique()
ORDER_ID            29941ORDER_PRIMARY_ID    29941HOTELID              4801PRE_IN_TIME          6875PRE_OUT_TIME          729ORDER_TIME          26818STATUS                  3CANCEL_TIME             1INSERT_TIME         25432MODIFY_TIME         25432FIRM                    2dtype: int64
order_info['FIRM'].value_counts()
3     3202910     8521Name: FIRM, dtype: int64
order['STATUS'].value_counts()
1    261372     32213      583Name: STATUS, dtype: int64
#插入时间不等于修改时间,就是进行过修改过的订单,最后发现为空,等于没有修改过order[order['INSERT_TIME']!=order['MODIFY_TIME']]
ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
#订单时间# order_info['ORDER_TIME'][2]
# order_info_cancel = order_info[~order_info['CANCEL_TIME'].isnull()]# order_info_cancel.head()
# order_info_cancel.shape
# order_info_cancel[order_info_cancel['CANCEL_TIME'] < order_info_cancel['PRE_IN_TIME']].shape
# cancel_index = order_info_cancel[order_info_cancel['CANCEL_TIME'] < order_info_cancel['PRE_IN_TIME']].index.tolist()
# order_info = order_info[~order_info.index.isin(cancel_index)]
# order_info.reset_index(inplace=True)
# order_info = order_info.drop(columns='index')
# order_info.tail()

合并数据

# order时间包含了cust时间df = pd.merge(order, cust, on='ORDER_PRIMARY_ID')

月份分为1、3、5、7、8、10、12:31天;2:28天;4、6、9、11:30天

df['IN_TIME'] = df['IN_TIME'].apply(lambda x: np.nan if x=='0.0' else x)df['OUT_TIME'] = df['OUT_TIME'].apply(lambda x: np.nan if x=='0.0' else x)df['IN_TIME'].fillna(df['PRE_IN_TIME'], inplace=True)df['OUT_TIME'].fillna(df['PRE_OUT_TIME'], inplace=True)
df['OUT_TIME'] = df.apply(lambda x: x['OUT_TIME'] if x['OUT_TIME'] >= x['PRE_OUT_TIME'] else x['PRE_OUT_TIME'], axis=1)df['IN_TIME'] = df.apply(lambda x: x['IN_TIME'] if x['IN_TIME'] >= x['PRE_IN_TIME'] else x['PRE_IN_TIME'], axis=1)
df.drop(['PRE_IN_TIME', 'PRE_OUT_TIME', 'CANCEL_TIME'], axis=1, inplace=True)
df.head(2)
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnull
0706648A4AACE06B518418C8A8CA1935DDD1C5A227185202007092241120200714171021202007141710213NaN2001040937100220200715130020200716120010
174864767D551AACFD049AE9CC2AB65E98706789483202007132109120201002101040202010021010403NaN1988102334120220201001140020201002101110
df.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 29941 entries, 0 to 29940Data columns (total 15 columns): #   Column             Non-Null Count  Dtype ---  ------             --------------  -----  0   ORDER_ID           29941 non-null  int64  1   ORDER_PRIMARY_ID   29941 non-null  object 2   HOTELID            29941 non-null  object 3   ORDER_TIME         29941 non-null  object 4   STATUS             29941 non-null  int64  5   INSERT_TIME        29941 non-null  object 6   MODIFY_TIME        29941 non-null  object 7   FIRM               29941 non-null  int64  8   GUEST_ID           3561 non-null   object 9   BDATE              29941 non-null  object 10  XZQH               29941 non-null  object 11  IN_TIME            29941 non-null  object 12  OUT_TIME           29941 non-null  object 13  guest_sum          29941 non-null  int64  14  guest_sum_notnull  29941 non-null  int64 dtypes: int64(5), object(10)memory usage: 3.7+ MB
df.nunique();
ORDER_ID             29941ORDER_PRIMARY_ID     29941HOTELID               4801ORDER_TIME           26818STATUS                   3INSERT_TIME          25432MODIFY_TIME          25432FIRM                     2GUEST_ID              3561BDATE                 9036XZQH                  2575IN_TIME               8953OUT_TIME               732guest_sum                3guest_sum_notnull        3dtype: int64
df['in_time'] = pd.to_datetime(df['IN_TIME'], errors='coerce', format='%Y%m%d%H%M')df['out_time'] = pd.to_datetime(df['OUT_TIME'], errors='coerce', format='%Y%m%d%H%M')
df = df[~df['out_time'].isnull()]
df.head(2)
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnullin_timeout_time
0706648A4AACE06B518418C8A8CA1935DDD1C5A227185202007092241120200714171021202007141710213NaN20010409371002202007151300202007161200102020-07-15 13:00:002020-07-16 12:00:00
174864767D551AACFD049AE9CC2AB65E98706789483202007132109120201002101040202010021010403NaN19881023341202202010011400202010021011102020-10-01 14:00:002020-10-02 10:11:00
# df['in_time_year'] = df['in_time'].dt.year.fillna(0).astype('int')                 #入住时间的年份# df['out_time_year'] = df['out_time'].dt.year.fillna(0).astype('int')               #退房时间的年份# df['in_time_month'] = df['in_time'].dt.month.fillna(0).astype('int')               #入住时间的月份# df['out_time_month'] = df['out_time'].dt.month.fillna(0).astype('int')             #退房时间的月份# df['in_time_day'] = df['in_time'].dt.day.fillna(0).astype('int')                   #入住时间的日期# df['out_time_day'] = df['out_time'].dt.day.fillna(0).astype('int')                 #退房时间的日期# df['in_time_hour'] = df['in_time'].dt.hour.fillna(0).astype('int')                 #入住时间的小时# df['out_time_hour'] = df['out_time'].dt.hour.fillna(0).astype('int')               #退房时间的小时# df['in_time_day'] = df['in_time'].dt.day.fillna(0).astype('int')# df['out_time_day'] = df['out_time'].dt.day.fillna(0).astype('int')# df['in_time_day'] = df['in_time'].dt.day.fillna(0).astype('int')# df['out_time_day'] = df['out_time'].dt.day.fillna(0).astype('int')
#2月份没有# df['day_gap'] = df['out_time_day'] - df['in_time_day'] + (df['out_time_month'] - df['in_time_month']).apply(lambda x: x*30 if x>0 else 0)
# df[df['in_time_month']==12]['day_gap'] = df[df['in_time_month']==12]['day_gap'].apply(lambda x: x+31)# df['day_gap'] = df.apply(lambda x: x['day_gap']+31 if x['in_time_month']==12 and x['out_time_month']==1 else x['day_gap'], axis=1)
# month_ = [1, 3, 5, 7, 8, 10]# df['day_gap'] = df.apply(lambda x: x['day_gap']+1 if x['in_time_month'] in month_ and x['out_time_month']>x['in_time_month'] else x['day_gap'], axis=1)
# df['day_gap'].sum()
49261

#后移一天
df[‘lock_time’][0] + pd.Timedelta(days=1)
#前移一天
df[‘lock_time’][0] + pd.Timedelta(days=-1)
#df[‘in_time’] - timedelta(days=1) #将day_gap列为0的数据全部往后推迟一天;;;;;;;还有钟点房

# df_0 = df[(df['day_gap']==0)]# df_0['date'] = df_0['IN_TIME']# 这里边包含了钟点房和当天入住并退房的。
dfs = []for idx, group in df.groupby(['ORDER_PRIMARY_ID']):    stat = pd.DataFrame()    stat['datetime'] = pd.date_range(start=group['IN_TIME'].values[0], end=group['OUT_TIME'].values[0], freq='D', normalize=False, closed=None)    stat['ORDER_PRIMARY_ID'] = group['ORDER_PRIMARY_ID'].values[0]    dfs.append(stat)
df_date = pd.concat(dfs).reset_index(drop=True)
df1 = df.merge(df_date, on=['ORDER_PRIMARY_ID'], how='left')
df1.shape
(60973, 27)
df1[(df1['out_time']!=df1['datetime']) & (df1['day_gap']!=0)].shape
(50058, 27)
df1[:2]
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnullin_timeout_timein_time_yearout_time_yearin_time_monthout_time_monthin_time_dayout_time_dayin_time_hourout_time_hourday_gapdatetime
0706648A4AACE06B518418C8A8CA1935DDD1C5A227185202007092241120200714171021202007141710213NaN20010409371002202007151300202007161200102020-07-15 13:00:002020-07-16 12:00:0020202020771516131212020-07-15 13:00:00
174864767D551AACFD049AE9CC2AB65E98706789483202007132109120201002101040202010021010403NaN19881023341202202010011400202010021011102020-10-01 14:00:002020-10-02 10:11:0020202020101012141012020-10-01 14:00:00
df1['in_date'] = df1['in_time'].dt.datedf1['out_date'] = df1['out_time'].dt.datedf1['date'] = df1['datetime'].dt.date
df2 = df1[(df1['out_date']!=df1['date']) | ((df1['out_date']==df1['date']) & (df1['in_date']==df1['out_date']))]
df2.shape
(49547, 30)

stat = pd.DataFrame()
stat[‘date’] = pd.date_range(
start=group[‘IN_TIME’].values[0],#开始时间
end=group[‘OUT_TIME’].values[0],#截止时间
periods=4,#总长度
freq=‘D’,#时间间隔
tz=None,#时区
normalize=False,#是否标准化到midnight
name=None,#date名称
closed=None,#首尾是否在内
**kwargs,
)

df3 = df2.merge(room, on=['HOTELID', 'FIRM'], how='left')
df3.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 49547 entries, 0 to 49546Data columns (total 72 columns): #   Column             Non-Null Count  Dtype         ---  ------             --------------  -----          0   ORDER_ID           49547 non-null  int64          1   ORDER_PRIMARY_ID   49547 non-null  object         2   HOTELID            49547 non-null  object         3   ORDER_TIME         49547 non-null  object         4   STATUS_x           49547 non-null  int64          5   INSERT_TIME        49547 non-null  object         6   MODIFY_TIME        49547 non-null  object         7   FIRM               49547 non-null  int64          8   GUEST_ID           5963 non-null   object         9   BDATE              49547 non-null  object         10  XZQH               49547 non-null  object         11  IN_TIME            49547 non-null  object         12  OUT_TIME           49547 non-null  object         13  guest_sum          49547 non-null  int64          14  guest_sum_notnull  49547 non-null  int64          15  in_time            49547 non-null  datetime64[ns] 16  out_time           49547 non-null  datetime64[ns] 17  in_time_year       49547 non-null  int32          18  out_time_year      49547 non-null  int32          19  in_time_month      49547 non-null  int32          20  out_time_month     49547 non-null  int32          21  in_time_day        49547 non-null  int32          22  out_time_day       49547 non-null  int32          23  in_time_hour       49547 non-null  int32          24  out_time_hour      49547 non-null  int32          25  day_gap            49547 non-null  int64          26  datetime           49547 non-null  datetime64[ns] 27  in_date            49547 non-null  object         28  out_date           49547 non-null  object         29  date               49547 non-null  object         30  JYMJ               14248 non-null  float64        31  ROOM_NUM           49547 non-null  int64          32  BED_NUM            49547 non-null  int64          33  CZLY               49547 non-null  int64          34  CALLED             49547 non-null  object         35  BUR_CODE           22916 non-null  float64        36  STA_CODE           22916 non-null  object         37  SSX                49547 non-null  int64          38  ADDRESS            49547 non-null  object         39  MPHM               49547 non-null  object         40  DJSJ               49497 non-null  datetime64[ns] 41  BGSJ               49547 non-null  datetime64[ns] 42  STATUS_y           22916 non-null  float64        43  CALLED_freq        49547 non-null  int64          44  ADDRESS_freq       49547 non-null  int64          45  JYMJ_freq          14248 non-null  float64        46  ROOM_NUM_freq      49547 non-null  int64          47  BED_NUM_freq       49547 non-null  int64          48  CZLY_freq          49547 non-null  int64          49  BUR_CODE_freq      22916 non-null  float64        50  STA_CODE_freq      22916 non-null  float64        51  SSX_freq           49547 non-null  int64          52  MPHM_freq          49547 non-null  int64          53  FIRM_freq          49547 non-null  int64          54  room_ratio         14248 non-null  float64        55  bed_ratio          14248 non-null  float64        56  room_bed           49547 non-null  float64        57  DJ_date            49497 non-null  object         58  BG_date            49547 non-null  object         59  DJ_gap             49497 non-null  float64        60  DJ_year            49547 non-null  int32          61  DJ_month           49547 non-null  int32          62  DJ_day             49547 non-null  int32          63  DJ_quarter         49547 non-null  int32          64  DJ_dayofweek       49547 non-null  int32          65  DJ_dayofyear       49547 non-null  int32          66  BG_year            49547 non-null  int32          67  BG_month           49547 non-null  int32          68  BG_day             49547 non-null  int32          69  BG_quarter         49547 non-null  int32          70  BG_dayofweek       49547 non-null  int32          71  BG_dayofyear       49547 non-null  int32         dtypes: datetime64[ns](5), float64(10), int32(20), int64(18), object(19)memory usage: 23.8+ MB
df3.nunique()
ORDER_ID             29930ORDER_PRIMARY_ID     29930HOTELID               4801ORDER_TIME           26807STATUS_x                 3INSERT_TIME          25421MODIFY_TIME          25421FIRM                     2GUEST_ID              3561BDATE                 9035XZQH                  2575IN_TIME               8953OUT_TIME               730guest_sum                3guest_sum_notnull        3in_time               8953out_time               730in_time_year             2out_time_year            2in_time_month           12out_time_month          12in_time_day             31out_time_day            31in_time_hour            23out_time_hour           15day_gap                 33datetime             12076in_date                379out_date               387date                   408JYMJ                   172ROOM_NUM                11BED_NUM                 17CZLY                     4CALLED                3813BUR_CODE                 7STA_CODE                53SSX                      5ADDRESS               4377MPHM                   534DJSJ                  3358BGSJ                  2871STATUS_y                 1CALLED_freq             13ADDRESS_freq            10JYMJ_freq               40ROOM_NUM_freq           10BED_NUM_freq            15CZLY_freq                4BUR_CODE_freq            7STA_CODE_freq           27SSX_freq                 5MPHM_freq               13FIRM_freq                2room_ratio             198bed_ratio              249room_bed                34DJ_date                365BG_date                321DJ_gap                  29DJ_year                  3DJ_month                13DJ_day                  32DJ_quarter               5DJ_dayofweek             7DJ_dayofyear           322BG_year                  3BG_month                13BG_day                  32BG_quarter               5BG_dayofweek             7BG_dayofyear           322dtype: int64
df3 = df3.sort_values(by=['HOTELID', 'date'])
df3.head(2)
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUS_xINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnullin_timeout_timein_time_yearout_time_yearin_time_monthout_time_monthin_time_dayout_time_dayin_time_hourout_time_hourday_gapdatetimein_dateout_datedateJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMDJSJBGSJSTATUS_yCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gapDJ_yearDJ_monthDJ_dayDJ_quarterDJ_dayofweekDJ_dayofyearBG_yearBG_monthBG_dayBG_quarterBG_dayofweekBG_dayofyear
231584402846BC0E8E3602434EA6A5F29A6F6FF42233100177202107111032120210711140042202107111400423NaN19731004370602202107111358202107121158102021-07-11 13:58:002021-07-12 11:58:0020212021771112131112021-07-11 13:58:002021-07-112021-07-122021-07-11NaN111【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM0012020-05-18 10:33:552020-07-15 10:23:58NaN11NaN309115024727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.0202051820139202051820139
25548445509801D7394D02B44376913536F6071151AD100177202107151446120210724140006202107241400063NaN19941001429006202107241358202107261158102021-07-24 13:58:002021-07-26 11:58:0020212021772426131122021-07-24 13:58:002021-07-242021-07-262021-07-24NaN111【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM0012020-05-18 10:33:552020-07-15 10:23:58NaN11NaN309115024727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.0202051820139202051820139
in_num = df3.groupby(['HOTELID', 'date'])['ORDER_ID'].count().reset_index()in_num.columns = ['HOTELID', 'date', 'in_hotel_num']df4 = df3.merge(in_num, on=['HOTELID', 'date'], how='left')
dd = df4[df4['HOTELID']=='227185']
dd.drop(['CALLED', 'ADDRESS'], axis=1, inplace=True)
plt.figure(figsize=(20, 8))plt.scatter(dd['date'], dd['in_hotel_num'])
<matplotlib.collections.PathCollection at 0x22da65e3640>


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-5g1bAE7v-1646128974752)(output_146_1.png)]

dd[:10]
ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUS_xINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnullin_timeout_timein_time_yearout_time_yearin_time_monthout_time_monthin_time_dayout_time_dayin_time_hourout_time_hourday_gapdatetimein_dateout_datedateJYMJROOM_NUMBED_NUMCZLYBUR_CODESTA_CODESSXMPHMDJSJBGSJSTATUS_yCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gapDJ_yearDJ_monthDJ_dayDJ_quarterDJ_dayofweekDJ_dayofyearBG_yearBG_monthBG_dayBG_quarterBG_dayofweekBG_dayofyearin_hotel_numunuse_room
3338706648A4AACE06B518418C8A8CA1935DDD1C5A227185202007092241120200714171021202007141710213NaN20010409371002202007151300202007161200102020-07-15 13:00:002020-07-16 12:00:0020202020771516131212020-07-15 13:00:002020-07-152020-07-162020-07-15NaN111NaNNaN371002ROOM0012020-07-10 15:35:222020-07-15 10:24:14NaN21NaN309115024727NaNNaN466125653429NaNNaN1.02020-07-102020-07-155.020207103419220207103419210
333985370973821DF4F47B401A84823D965A4A4F77227185202007232243120200723224348202007232243483NaN20000805371083202007251300202007261200102020-07-25 13:00:002020-07-26 12:00:0020202020772526131212020-07-25 13:00:002020-07-252020-07-262020-07-25NaN111NaNNaN371002ROOM0012020-07-10 15:35:222020-07-15 10:24:14NaN21NaN309115024727NaNNaN466125653429NaNNaN1.02020-07-102020-07-155.020207103419220207103419210
3340111105423FCC9D42D4D4379A17F4614597D055C227185202008162352120200816235228202008162352283NaN19950629452627202008171300202008181200102020-08-17 13:00:002020-08-18 12:00:0020202020881718131212020-08-17 13:00:002020-08-172020-08-182020-08-17NaN111NaNNaN371002ROOM0012020-07-10 15:35:222020-07-15 10:24:14NaN21NaN309115024727NaNNaN466125653429NaNNaN1.02020-07-102020-07-155.020207103419220207103419210
33411089505A96257531F3C40529329CE1DBE2CFE28227185202008142213120200814221323202008142213233NaN20011210371523202008181300202008201200102020-08-18 13:00:002020-08-20 12:00:0020202020881820131222020-08-18 13:00:002020-08-182020-08-202020-08-18NaN111NaNNaN371002ROOM0012020-07-10 15:35:222020-07-15 10:24:14NaN21NaN309115024727NaNNaN466125653429NaNNaN1.02020-07-102020-07-155.020207103419220207103419210
33421089505A96257531F3C40529329CE1DBE2CFE28227185202008142213120200814221323202008142213233NaN20011210371523202008181300202008201200102020-08-18 13:00:002020-08-20 12:00:0020202020881820131222020-08-19 13:00:002020-08-182020-08-202020-08-19NaN111NaNNaN371002ROOM0012020-07-10 15:35:222020-07-15 10:24:14NaN21NaN309115024727NaNNaN466125653429NaNNaN1.02020-07-102020-07-155.020207103419220207103419210
33431160405D7D528089DDA4804B7F7968F3E7539C0227185202008210828120200821082858202008210828583NaN19870304232301202008211300202008221200102020-08-21 13:00:002020-08-22 12:00:0020202020882122131212020-08-21 13:00:002020-08-212020-08-222020-08-21NaN111NaNNaN371002ROOM0012020-07-10 15:35:222020-07-15 10:24:14NaN21NaN309115024727NaNNaN466125653429NaNNaN1.02020-07-102020-07-155.020207103419220207103419210
3344118434774F501E396554E7CA9F2BCE9F33CD0CE227185202008230919120200823091959202008230919593NaN19981130120222202008231300202008241200102020-08-23 13:00:002020-08-24 12:00:0020202020882324131212020-08-23 13:00:002020-08-232020-08-242020-08-23NaN111NaNNaN371002ROOM0012020-07-10 15:35:222020-07-15 10:24:14NaN21NaN309115024727NaNNaN466125653429NaNNaN1.02020-07-102020-07-155.020207103419220207103419210
3345119878039460A2B095B47CC924CA5EFB2FAA18C227185202008241612120200824170812202008241708123NaN20000509371083202008241300202008251200102020-08-24 13:00:002020-08-25 12:00:0020202020882425131212020-08-24 13:00:002020-08-242020-08-252020-08-24NaN111NaNNaN371002ROOM0012020-07-10 15:35:222020-07-15 10:24:14NaN21NaN309115024727NaNNaN466125653429NaNNaN1.02020-07-102020-07-155.020207103419220207103419210
33461008589C50E6D8FCAFD4CAEB1BBE94FA1AF66A0227185202008072052120200807205244202008072052443NaN20000421371122202008251300202008271200102020-08-25 13:00:002020-08-27 12:00:0020202020882527131222020-08-25 13:00:002020-08-252020-08-272020-08-25NaN111NaNNaN371002ROOM0012020-07-10 15:35:222020-07-15 10:24:14NaN21NaN309115024727NaNNaN466125653429NaNNaN1.02020-07-102020-07-155.020207103419220207103419210
33471008589C50E6D8FCAFD4CAEB1BBE94FA1AF66A0227185202008072052120200807205244202008072052443NaN20000421371122202008251300202008271200102020-08-25 13:00:002020-08-27 12:00:0020202020882527131222020-08-26 13:00:002020-08-252020-08-272020-08-26NaN111NaNNaN371002ROOM0012020-07-10 15:35:222020-07-15 10:24:14NaN21NaN309115024727NaNNaN466125653429NaNNaN1.02020-07-102020-07-155.020207103419220207103419210

可以预测入住量

df4['unuse_room'] = df4['ROOM_NUM'] - df4['in_hotel_num']
df4['unuse_room'].value_counts()
 0     28340 1      9755 2      4830-1      3180-2      1073 3       667-3       414 4       330-4       191 5       159-5       114-6        93-7        80-9        70 6        56-8        54 7        47-10       33-11       24 8        23 11        8 9         6Name: unuse_room, dtype: int64
in_num['ORDER_ID'].value_counts()
1     430502      20123       4084       1405        436        208        127        1110        79         611        312        2Name: ORDER_ID, dtype: int64



room_info[room_info['CODE']=='329845']
CODEHOTELIDJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMFIRMDJSJBGSJ
3051329845371002B7400.0231温馨精致观海房/loft名宿/孙家疃油画小镇/临近海水浴场3.710020e+11371000000001371002环翠区孙家疃望海·山公馆公寓13-16_B304室---__-~ROOM00132021-08-03 09:58:052021-08-03 10:01:02
df_month = df.groupby(['in_time_year', 'in_time_month'])['ORDER_ID']df_month.count()
in_time_year  in_time_month2020          7                 182              8                 885              9                1571              10               2226              11               1179              12               22792021          1                1596              2                 137              3                   5              4                 420              5                4893              6                3856              7                8507              8                2069              9                  99              10                 25              11                  1Name: ORDER_ID, dtype: int64




df['IN_TIME'].min(), df['IN_TIME'].max()
('202007151300', '202111022358')
df.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 29930 entries, 0 to 29940Data columns (total 26 columns): #   Column             Non-Null Count  Dtype         ---  ------             --------------  -----          0   ORDER_ID           29930 non-null  int64          1   ORDER_PRIMARY_ID   29930 non-null  object         2   HOTELID            29930 non-null  object         3   ORDER_TIME         29930 non-null  object         4   STATUS             29930 non-null  int64          5   INSERT_TIME        29930 non-null  object         6   MODIFY_TIME        29930 non-null  object         7   FIRM               29930 non-null  int64          8   GUEST_ID           3561 non-null   object         9   BDATE              29930 non-null  object         10  XZQH               29930 non-null  object         11  IN_TIME            29930 non-null  object         12  OUT_TIME           29930 non-null  object         13  guest_sum          29930 non-null  int64          14  guest_sum_notnull  29930 non-null  int64          15  in_time            29930 non-null  datetime64[ns] 16  out_time           29930 non-null  datetime64[ns] 17  in_time_year       29930 non-null  int32          18  out_time_year      29930 non-null  int32          19  in_time_month      29930 non-null  int32          20  out_time_month     29930 non-null  int32          21  in_time_day        29930 non-null  int32          22  out_time_day       29930 non-null  int32          23  in_time_hour       29930 non-null  int32          24  out_time_hour      29930 non-null  int32          25  day_gap            29930 non-null  int64         dtypes: datetime64[ns](2), int32(8), int64(6), object(10)memory usage: 5.3+ MB
df.nunique()
ORDER_ID             29930ORDER_PRIMARY_ID     29930HOTELID               4801ORDER_TIME           26807STATUS                   3INSERT_TIME          25421MODIFY_TIME          25421FIRM                     2GUEST_ID              3561BDATE                 9035XZQH                  2575IN_TIME               8953OUT_TIME               730guest_sum                3guest_sum_notnull        3in_time               8953out_time               730in_time_year             2out_time_year            2in_time_month           12out_time_month          12in_time_day             31out_time_day            31in_time_hour            23out_time_hour           15day_gap                 33dtype: int64
df.rename(columns={'HOTELID':'CODE'}, inplace=True)
df = pd.merge(df, room_info, on=['CODE', 'FIRM'])
df.head(2)
ORDER_IDORDER_PRIMARY_IDCODEPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEin_hourin_minutein_timeout_hourout_minuteout_timein_datetimeout_datetimeHOTELIDJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMDJSJBGSJ
092352196BBDB7CC049421C85826AE07020B139278337202008011200202008021200159612011149.01nan20200730224152202007302241523NaN199308043208210.00.00.00.00.00.0NaTNaT37100A16200.0121金海滩临海落地窗公寓,北欧式装修NaNNaN371002环翠区高区金海滩街道新浪屿花园8号楼110411042020-07-06 17:59:352020-07-15 10:24:21
17612998923C26CC35F42C9A7C0A08BCC22179C278337202007221200202007251200202007150017.01nan20200715001752202007150017523NaN198106281201010.00.00.00.00.00.0NaTNaT37100A16200.0121金海滩临海落地窗公寓,北欧式装修NaNNaN371002环翠区高区金海滩街道新浪屿花园8号楼110411042020-07-06 17:59:352020-07-15 10:24:21
df.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 40043 entries, 0 to 40042Data columns (total 37 columns): #   Column            Non-Null Count  Dtype         ---  ------            --------------  -----          0   ORDER_ID          40043 non-null  int64          1   ORDER_PRIMARY_ID  40043 non-null  object         2   CODE              40043 non-null  object         3   PRE_IN_TIME       40043 non-null  object         4   PRE_OUT_TIME      40043 non-null  object         5   ORDER_TIME        40043 non-null  object         6   STATUS            40043 non-null  int64          7   CANCEL_TIME       40043 non-null  object         8   INSERT_TIME       40043 non-null  object         9   MODIFY_TIME       40043 non-null  object         10  FIRM              40043 non-null  int64          11  GUEST_ID          5131 non-null   object         12  BDATE             40043 non-null  object         13  XZQH              40043 non-null  object         14  IN_TIME           40043 non-null  object         15  OUT_TIME          40043 non-null  object         16  in_hour           40043 non-null  object         17  in_minute         40043 non-null  object         18  in_time           40043 non-null  object         19  out_hour          40043 non-null  object         20  out_minute        40043 non-null  object         21  out_time          40043 non-null  object         22  in_datetime       4913 non-null   datetime64[ns] 23  out_datetime      644 non-null    datetime64[ns] 24  HOTELID           40043 non-null  object         25  JYMJ              40043 non-null  float64        26  ROOM_NUM          40043 non-null  int64          27  BED_NUM           40043 non-null  int64          28  CZLY              40043 non-null  int64          29  CALLED            40043 non-null  object         30  BUR_CODE          20204 non-null  float64        31  STA_CODE          20204 non-null  object         32  SSX               40043 non-null  int64          33  ADDRESS           40043 non-null  object         34  MPHM              40043 non-null  object         35  DJSJ              40043 non-null  object         36  BGSJ              40043 non-null  object        dtypes: datetime64[ns](2), float64(2), int64(7), object(26)memory usage: 11.6+ MB
df.nunique()
ORDER_ID            29941ORDER_PRIMARY_ID    39901CODE                 4801PRE_IN_TIME          6932PRE_OUT_TIME          741ORDER_TIME          26892STATUS                  3CANCEL_TIME             1INSERT_TIME         35319MODIFY_TIME         35319FIRM                    2GUEST_ID             3675BDATE                9064XZQH                 2578IN_TIME              3380OUT_TIME              235in_hour                24in_minute              61in_time               264out_hour               15out_minute             25out_time              211in_datetime           263out_datetime          210HOTELID              4801JYMJ                  173ROOM_NUM               11BED_NUM                17CZLY                    4CALLED               3813BUR_CODE                7STA_CODE               53SSX                     5ADDRESS              4377MPHM                  534DJSJ                 3363BGSJ                 2871dtype: int64
df.columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'CODE', 'PRE_IN_TIME', 'PRE_OUT_TIME',       'ORDER_TIME', 'STATUS', 'CANCEL_TIME', 'INSERT_TIME', 'MODIFY_TIME',       'FIRM', 'GUEST_ID', 'BDATE', 'XZQH', 'IN_TIME', 'OUT_TIME', 'in_hour',       'in_minute', 'in_time', 'out_hour', 'out_minute', 'out_time',       'in_datetime', 'out_datetime', 'HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM',       'CZLY', 'CALLED', 'BUR_CODE', 'STA_CODE', 'SSX', 'ADDRESS', 'MPHM',       'DJSJ', 'BGSJ'],      dtype='object')

删除HOTELID,CANCEL_TIME

df1 = df[['ORDER_ID', 'CODE', 'PRE_IN_TIME', 'PRE_OUT_TIME',       'ORDER_TIME', 'INSERT_TIME', 'MODIFY_TIME', 'IN_TIME', 'OUT_TIME', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'STATUS']]

0-撤销;1-预定;2-入住;3-离开

df1['STATUS'].value_counts()
1    344612     49213      661Name: STATUS, dtype: int64
df1.head(50)
ORDER_IDCODEPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMEINSERT_TIMEMODIFY_TIMEIN_TIMEOUT_TIMEJYMJROOM_NUMBED_NUMSTATUS
0923521278337202008011200202008021200159612011149.020200730224152202007302241520.00.00.0121
1761299278337202007221200202007251200202007150017.020200715001752202007150017520.00.00.0121
2943703278337202008091200202008121200202008012010.020200801201019202008012010190.00.00.0121
31012389278337202008141200202008171200202008080900.020200808090100202008080901000.00.00.0121
41351873278337202010051200202010061200202009091245.020200909124547202009091245470.00.00.0121
51481730278337202010051200202010071200202009221039.020200922103911202009221039112020100516242020100712000.0123
61516263278337202010011200202010041200202009251502.020200925150231202009251502310.00.00.0121
71539785278337202010041200202010051200202009271942.020200927194258202009271942580.00.00.0121
81568043278337202010041200202010051200202009301837.020200930183801202009301838010.00.00.0121
91830343278337202010311200202011011200202010301016.020201031120256202010311202560.00.00.0121
101830343278337202010311200202011011200202010301016.020201030101655202010301016550.00.00.0121
111836522278337202010301200202010311200202010301933.020201030193317202010301933170.00.00.0121
121850449278337202011011200202011021200202011010951.020201101121128202011011211280.00.00.0121
131850449278337202011011200202011021200202011010951.020201101095111202011010951110.00.00.0121
142035663278337202011210022202011211200202011210022.020201121002216202011210022160.00.00.0121
152363298278337202101021200202101041200202012230028.020210102120800202101021208000.00.00.0121
162363298278337202101021200202101041200202012230028.020201223002859202012230028590.00.00.0121
172465555278337202101021200202101031200202101011632.020210101163250202101011632500.00.00.0121
182465555278337202101021200202101031200202101011632.020210102120701202101021207010.00.00.0121
193356024278337202104291158202105051158202104121231.020210429120002202104291200020.00.00.0121
203576969278337202105061158202105091158202104291925.020210506120002202105061200020.00.00.0121
213599404278337202105051158202105061158202105020724.020210505120008202105051200080.00.00.0121
223599404278337202105051158202105061158202105020724.020210502072638202105020726380.00.00.0121
233665000278337202106121158202106151158202105081926.020210508192825202105081928250.00.00.0121
243686275278337202105102004202105111158202105102004.020210510201002202105102010020.00.00.0121
253750952278337202105161158202105171158202105161150.020210516120228202105161202280.00.00.0121
263777571278337202105181904202105191158202105181904.020210518190621202105181906210.00.00.0121
273787561278337202105211158202105231158202105191621.020210519162332202105191623320.00.00.0121
283787561278337202105211158202105231158202105191621.020210521120006202105211200060.00.00.0121
293941122278337202106022128202106031158202106022128.020210602214001202106022140010.00.00.0121
303941122278337202106022128202106031158202106022128.020210602213051202106022130510.00.00.0121
313988599278337202106101158202106111158202106070140.020210610120016202106101200160.00.00.0121
323988599278337202106101158202106111158202106070140.020210607014246202106070142460.00.00.0121
334300717278337202107061158202107071158202107031110.020210706120003202107061200030.00.00.0121
344308464278337202107051158202107061158202107032053.020210703205553202107032055530.00.00.0121
354308464278337202107051158202107061158202107032053.020210705120003202107051200030.00.00.0121
36923696282932202008041400202008061200159612092325.020200730225524202007302255240.00.00.0231
371065945282932202008131400202008141200202008122227.020200812222725202008122227250.00.00.0231
381370082282932202009181400202009191200202009110933.020200911093350202009110933500.00.00.0231
391370328282932202009191400202009211200202009111005.020200911100510202009111005100.00.00.0231
401550691282932202010011400202010021200202009282156.020200928215621202009282156210.00.00.0231
411612645282932202010081400202010091200202010061132.020201006113229202010061132290.00.00.0231
424497012282932202107221358202107291158202107181945.020210718194727202107181947270.00.00.0231
434505766282932202107201358202107221158202107191521.020210719152358202107191523580.00.00.0231
444505766282932202107201358202107221158202107191521.020210720140022202107201400220.00.00.0231
45706648227185202007151300202007161200202007092241.020200714171021202007141710210.00.00.0111
46853709227185202007251300202007261200202007232243.020200723224348202007232243480.00.00.0111
471008589227185202008251300202008271200202008072052.020200807205244202008072052440.00.00.0111
481089505227185202008181300202008201200202008142213.020200814221323202008142213230.00.00.0111
491111054227185202008171300202008181200202008162352.020200816235228202008162352280.00.00.0111
df['CODE'].value_counts()[:10]
B106535897    420315790        278296093        222296537        187282520        177B105273690    142298705        124298707        122289326        116294473        116Name: CODE, dtype: int64
df['HOTELID'].value_counts()[:10]
37100A3821    420371098A808    27837100A9707    222371098B022    18737100A1898    17737100A3565    142371099A016    124371099A013    12237100A9501    11637100A8976    116Name: HOTELID, dtype: int64


df_g = df.groupby(['CODE', 'PRE_IN_TIME', 'PRE_OUT_TIME'])['ORDER_ID'].count().reset_index()
df_g.head()
CODEPRE_IN_TIMEPRE_OUT_TIMEORDER_ID
01001772021071113582021071211582
11001772021072413582021072611582
21001812021062713582021062911582
31001812021071913582021072011581
41001812021072213582021072311582

ORDER_ID有重复,MODIFY_TIME修改过,按照最后一次的修改确定时间

df1[df1['CODE']=='100177']
ORDER_IDCODEPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMEINSERT_TIMEMODIFY_TIMEIN_TIMEOUT_TIMEJYMJROOM_NUMBED_NUMSTATUS
369374402846100177202107111358202107121158202107111032.020210711103458202107111034580.00.00.0111
369384402846100177202107111358202107121158202107111032.020210711140042202107111400420.00.00.0111
369394455098100177202107241358202107261158202107151446.020210724140006202107241400060.00.00.0111
369404455098100177202107241358202107261158202107151446.020210715144850202107151448500.00.00.0111
df1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40043 entries, 0 to 40042
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ORDER_ID      40043 non-null  int64  
 1   CODE          40043 non-null  object 
 2   PRE_IN_TIME   40043 non-null  object 
 3   PRE_OUT_TIME  40043 non-null  object 
 4   ORDER_TIME    40043 non-null  object 
 5   INSERT_TIME   40043 non-null  object 
 6   MODIFY_TIME   40043 non-null  object 
 7   IN_TIME       40043 non-null  object 
 8   OUT_TIME      40043 non-null  object 
 9   JYMJ          40043 non-null  float64
 10  ROOM_NUM      40043 non-null  int64  
 11  BED_NUM       40043 non-null  int64  
 12  STATUS        40043 non-null  int64  
dtypes: float64(1), int64(4), object(8)
memory usage: 4.3+ MB












  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值