【无标题】22222

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import os
import re
import gc
import warnings


warnings.filterwarnings('ignore')

plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
path = 'traina/'  
path1 = 'trainb/'

入住信息

cust_info = pd.read_csv(path + '网约房平台入住人表.csv')
cust_info1 = pd.read_csv(path1 + '网约房平台入住人表.csv')
cust = pd.concat([cust_info, cust_info1])
cust.head(2)
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIME
007C5BF73B18B44B0877DEED007F8771DNaN19800627222405NaNNaN
13525D57CAE104A078E4962B2B89377B0371099C301202107090001199510253705232.021071e+112.021071e+11
gc.collect()
89
cust['BDATE'] = cust['BDATE'].astype('str')
cust['XZQH'] = cust['XZQH'].astype('str')
cust['IN_TIME'] = cust['IN_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
cust['OUT_TIME'] = cust['OUT_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])

cust.head(2)

ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIME
007C5BF73B18B44B0877DEED007F8771DNaN198006272224050.00.0
13525D57CAE104A078E4962B2B89377B0371099C30120210709000119951025370523202107092103202107111158
cust.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44030 entries, 0 to 3343
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ORDER_PRIMARY_ID  44030 non-null  object
 1   GUEST_ID          5158 non-null   object
 2   BDATE             44030 non-null  object
 3   XZQH              44030 non-null  object
 4   IN_TIME           44030 non-null  object
 5   OUT_TIME          44030 non-null  object
dtypes: object(6)
memory usage: 2.4+ MB
cust.nunique()

ORDER_PRIMARY_ID    43883
GUEST_ID             3698
BDATE                9503
XZQH                 2659
IN_TIME              3396
OUT_TIME              236
dtype: int64
cust['ORDER_PRIMARY_ID'].value_counts()[5:10]

03158D5CA62E42339D697612EA347FB3    2
0A99D74F1E0248D68A729CB79FC640E6    2
1FE3E448558347D89C56B0AFCEC8ACFB    2
864E417B7BDE4C7B9449B7E087E9F21E    2
3DEFA75566934CA5929D0C47F0B95FDE    2
Name: ORDER_PRIMARY_ID, dtype: int64
cust.sort_values(['ORDER_PRIMARY_ID', 'GUEST_ID'], inplace=True)

stat = cust.groupby(['ORDER_PRIMARY_ID'])['BDATE'].count().reset_index()
stat.columns = ['ORDER_PRIMARY_ID', 'guest_sum']
stat['guest_sum_notnull'] = cust.groupby(['ORDER_PRIMARY_ID'])['GUEST_ID'].agg('count').values
cust = cust.merge(stat, on=['ORDER_PRIMARY_ID'], how='left')

cust.drop_duplicates('ORDER_PRIMARY_ID', keep='first', inplace=True)

cust[cust['ORDER_PRIMARY_ID']=='70798782D6C04A438360D80AFE4845C1']

ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnull
2280170798782D6C04A438360D80AFE4845C1371099B389202107040001199903141309830.00.032
cust['IN_TIME'].max(), cust['IN_TIME'].min()

('202110041413', '0.0')

订单信息

# platform infomation
order_info = pd.read_csv(path + '网约平台旅客订单信息.csv') 
order1 = pd.read_csv(path1 + '网约平台旅客订单信息.csv')
order = pd.concat([order_info, order1])
order.head(2)

ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
092352196BBDB7CC049421C85826AE07020B1392783372020080112002020080212001.596120e+111NaN20200730224152202007302241523
1923696C72F20539AD1447D86CD1A8E5EAEC63A2829322020080414002020080612001.596121e+111NaN20200730225524202007302255243
order[order['ORDER_PRIMARY_ID']=='3CDEDB5E03534D379687645675898CA4']

ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
3097044220273CDEDB5E03534D379687645675898CA41708982021080813582021081011582.021071e+113NaN20210712210153202107122101533
order['PRE_IN_TIME'] = order['PRE_IN_TIME'].astype('str')
order['PRE_OUT_TIME'] = order['PRE_OUT_TIME'].astype('str')
# order['ORDER_TIME'] = order['ORDER_TIME'].astype('str')
order['INSERT_TIME'] = order['INSERT_TIME'].astype('str')
order['MODIFY_TIME'] = order['MODIFY_TIME'].astype('str')
order['CANCEL_TIME'] = order['CANCEL_TIME'].astype('str')

order['ORDER_TIME'] = order['ORDER_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])

##########################没有考虑包含取消时间的订单

order = order[(order['CANCEL_TIME']=='nan')]

order = order.sort_values(['ORDER_ID', 'MODIFY_TIME'])

order.drop_duplicates('ORDER_ID', keep='last', inplace=True)

order.shape

(29941, 11)
order[order['ORDER_ID']==4402846]

ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
301414402846BC0E8E3602434EA6A5F29A6F6FF422331001772021071113582021071211582021071110321nan20210711140042202107111400423
order.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 2 to 18590
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ORDER_ID          29941 non-null  int64 
 1   ORDER_PRIMARY_ID  29941 non-null  object
 2   HOTELID           29941 non-null  object
 3   PRE_IN_TIME       29941 non-null  object
 4   PRE_OUT_TIME      29941 non-null  object
 5   ORDER_TIME        29941 non-null  object
 6   STATUS            29941 non-null  int64 
 7   CANCEL_TIME       29941 non-null  object
 8   INSERT_TIME       29941 non-null  object
 9   MODIFY_TIME       29941 non-null  object
 10  FIRM              29941 non-null  int64 
dtypes: int64(3), object(8)
memory usage: 2.7+ MB
order.nunique()

ORDER_ID            29941
ORDER_PRIMARY_ID    29941
HOTELID              4801
PRE_IN_TIME          6875
PRE_OUT_TIME          729
ORDER_TIME          26818
STATUS                  3
CANCEL_TIME             1
INSERT_TIME         25432
MODIFY_TIME         25432
FIRM                    2
dtype: int64
order_info['FIRM'].value_counts()

3     32029
10     8521
Name: FIRM, dtype: int64
order['STATUS'].value_counts()

1    26137
2     3221
3      583
Name: STATUS, dtype: int64
#插入时间不等于修改时间,就是进行过修改过的订单,最后发现为空,等于没有修改过
order[order['INSERT_TIME']!=order['MODIFY_TIME']]

ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM

合并数据order和cust

# order时间包含了cust时间
df = pd.merge(order, cust, on='ORDER_PRIMARY_ID')

月份分为1、3、5、7、8、10、12:31天;2:28天;4、6、9、11:30天

df['IN_TIME'] = df['IN_TIME'].apply(lambda x: np.nan if x=='0.0' else x)
df['OUT_TIME'] = df['OUT_TIME'].apply(lambda x: np.nan if x=='0.0' else x)
df['IN_TIME'].fillna(df['PRE_IN_TIME'], inplace=True)
df['OUT_TIME'].fillna(df['PRE_OUT_TIME'], inplace=True)

df['OUT_TIME'] = df.apply(lambda x: x['OUT_TIME'] if x['OUT_TIME'] >= x['PRE_OUT_TIME'] else x['PRE_OUT_TIME'], axis=1)
df['IN_TIME'] = df.apply(lambda x: x['IN_TIME'] if x['IN_TIME'] >= x['PRE_IN_TIME'] else x['PRE_IN_TIME'], axis=1)

df.drop(['PRE_IN_TIME', 'PRE_OUT_TIME', 'CANCEL_TIME'], axis=1, inplace=True)

df.head(2)

ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnull
0706648A4AACE06B518418C8A8CA1935DDD1C5A227185202007092241120200714171021202007141710213NaN2001040937100220200715130020200716120010
174864767D551AACFD049AE9CC2AB65E98706789483202007132109120201002101040202010021010403NaN1988102334120220201001140020201002101110
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 0 to 29940
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ORDER_ID           29941 non-null  int64 
 1   ORDER_PRIMARY_ID   29941 non-null  object
 2   HOTELID            29941 non-null  object
 3   ORDER_TIME         29941 non-null  object
 4   STATUS             29941 non-null  int64 
 5   INSERT_TIME        29941 non-null  object
 6   MODIFY_TIME        29941 non-null  object
 7   FIRM               29941 non-null  int64 
 8   GUEST_ID           3561 non-null   object
 9   BDATE              29941 non-null  object
 10  XZQH               29941 non-null  object
 11  IN_TIME            29941 non-null  object
 12  OUT_TIME           29941 non-null  object
 13  guest_sum          29941 non-null  int64 
 14  guest_sum_notnull  29941 non-null  int64 
dtypes: int64(5), object(10)
memory usage: 3.7+ MB
df.nunique();

df['in_time'] = pd.to_datetime(df['IN_TIME'], errors='coerce', format='%Y%m%d%H%M')
df['out_time'] = pd.to_datetime(df['OUT_TIME'], errors='coerce', format='%Y%m%d%H%M')

df = df[~df['out_time'].isnull()]

df.head(2)

ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnullin_timeout_time
0706648A4AACE06B518418C8A8CA1935DDD1C5A227185202007092241120200714171021202007141710213NaN20010409371002202007151300202007161200102020-07-15 13:00:002020-07-16 12:00:00
174864767D551AACFD049AE9CC2AB65E98706789483202007132109120201002101040202010021010403NaN19881023341202202010011400202010021011102020-10-01 14:00:002020-10-02 10:11:00
# df['in_time_year'] = df['in_time'].dt.year.fillna(0).astype('int')                 #入住时间的年份
# df['out_time_year'] = df['out_time'].dt.year.fillna(0).astype('int')               #退房时间的年份
# df['in_time_month'] = df['in_time'].dt.month.fillna(0).astype('int')               #入住时间的月份
# df['out_time_month'] = df['out_time'].dt.month.fillna(0).astype('int')             #退房时间的月份
# df['in_time_day'] = df['in_time'].dt.day.fillna(0).astype('int')                   #入住时间的日期
# df['out_time_day'] = df['out_time'].dt.day.fillna(0).astype('int')                 #退房时间的日期
# df['in_time_hour'] = df['in_time'].dt.hour.fillna(0).astype('int')                 #入住时间的小时
# df['out_time_hour'] = df['out_time'].dt.hour.fillna(0).astype('int')               #退房时间的小时

#2月份没有
# df['day_gap'] = df['out_time_day'] - df['in_time_day'] + (df['out_time_month'] - df['in_time_month']).apply(lambda x: x*30 if x>0 else 0)

# df['day_gap'] = df.apply(lambda x: x['day_gap']+31 if x['in_time_month']==12 and x['out_time_month']==1 else x['day_gap'], axis=1)

# month_ = [1, 3, 5, 7, 8, 10]
# df['day_gap'] = df.apply(lambda x: x['day_gap']+1 if x['in_time_month'] in month_ and x['out_time_month']>x['in_time_month'] else x['day_gap'], axis=1)

# df['day_gap'].sum()

#后移一天
df[‘lock_time’][0] + pd.Timedelta(days=1)
#前移一天
df[‘lock_time’][0] + pd.Timedelta(days=-1)
#df[‘in_time’] - timedelta(days=1) #将day_gap列为0的数据全部往后推迟一天;;;;;;;还有钟点房

# df_0 = df[(df['day_gap']==0)]
# df_0['date'] = df_0['IN_TIME']
# 这里边包含了钟点房和当天入住并退房的。

dfs = []
for idx, group in df.groupby(['ORDER_PRIMARY_ID']):
    stat1 = pd.DataFrame()
    stat1['datetime'] = pd.date_range(start=group['IN_TIME'].values[0], end=group['OUT_TIME'].values[0], freq='D', normalize=False, closed=None)
    stat1['ORDER_PRIMARY_ID'] = group['ORDER_PRIMARY_ID'].values[0]
    dfs.append(stat1)

df_date = pd.concat(dfs).reset_index(drop=True)
df1 = df.merge(df_date, on=['ORDER_PRIMARY_ID'], how='left')

df1.shape

(60973, 18)
# df1[(df1['out_time']!=df1['datetime']) & (df1['day_gap']!=0)].shape

# df1[df1['day_gap']==0][:2]

df1['in_date'] = df1['in_time'].dt.date
df1['out_date'] = df1['out_time'].dt.date
df1['date'] = df1['datetime'].dt.date

df2 = df1[(df1['out_date']!=df1['date']) | ((df1['out_date']==df1['date']) & (df1['in_date']==df1['out_date']))]

df2.shape

(49547, 21)
# len(df1[(df1['out_date']!=df1['date'])]), len(df1[df1['day_gap']==0]), len(df1[(df1['out_date']==df1['date']) & (df1['day_gap']==0)])

# pd.date_range(start=group['IN_TIME'].values[0], end=group['OUT_TIME'].values[0], freq='D', normalize=False, closed=None)

df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49547 entries, 0 to 60971
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ORDER_ID           49547 non-null  int64         
 1   ORDER_PRIMARY_ID   49547 non-null  object        
 2   HOTELID            49547 non-null  object        
 3   ORDER_TIME         49547 non-null  object        
 4   STATUS             49547 non-null  int64         
 5   INSERT_TIME        49547 non-null  object        
 6   MODIFY_TIME        49547 non-null  object        
 7   FIRM               49547 non-null  int64         
 8   GUEST_ID           5963 non-null   object        
 9   BDATE              49547 non-null  object        
 10  XZQH               49547 non-null  object        
 11  IN_TIME            49547 non-null  object        
 12  OUT_TIME           49547 non-null  object        
 13  guest_sum          49547 non-null  int64         
 14  guest_sum_notnull  49547 non-null  int64         
 15  in_time            49547 non-null  datetime64[ns]
 16  out_time           49547 non-null  datetime64[ns]
 17  datetime           49547 non-null  datetime64[ns]
 18  in_date            49547 non-null  object        
 19  out_date           49547 non-null  object        
 20  date               49547 non-null  object        
dtypes: datetime64[ns](3), int64(5), object(13)
memory usage: 8.3+ MB

room_info = pd.read_csv(path + '网约房注册民宿.csv')
room_info.head(1)

CODEHOTELIDJYMJROOM_NUMBED_NUMFWLYCZLYCALLEDCITY_CODEBUR_CODESTA_CODESSXADDRESSMPHMJYQKFIRMDJSJBGSJSTATUSAUDITSTATUS
010017737108300020.01121【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽371000000000NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM001【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽32020-05-18 10:33:552020-07-15 10:23:58NaNNaN
room_info['STATUS'].value_counts()

1.0    2658
Name: STATUS, dtype: int64
room_info.drop(['HOTELID', 'FWLY', 'CITY_CODE', 'STATUS','AUDITSTATUS', 'JYQK'], axis=1, inplace=True)

room_info.rename(columns={'CODE':'HOTELID'}, inplace= True)

room_info['FIRM'].value_counts()

3     3429
10    1878
Name: FIRM, dtype: int64
room_info.nunique()

HOTELID     5307
JYMJ         173
ROOM_NUM      11
BED_NUM       17
CZLY           4
CALLED      4232
BUR_CODE       7
STA_CODE      54
SSX            5
ADDRESS     4850
MPHM         584
FIRM           2
DJSJ        3753
BGSJ        3294
dtype: int64

登记时间和变更时间无变化的只有两家

#经营面积为0的修改为空值
room_info['JYMJ'] = room_info['JYMJ'].apply(lambda x: np.nan if x==0 else x)

#登记时间和变更时间的差值
room_use_col = ['HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'FIRM', 'STATUS']
freq_col = ['CALLED', 'ADDRESS', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY', 'BUR_CODE', 'STA_CODE', 'SSX', 'MPHM', 'FIRM']
for col in freq_col:
    st = room_info[col].value_counts().reset_index()
    st.columns = [col, col+'_freq']
    room_info = room_info.merge(st, on=col, how='left')

#房间面积率和床位率\单间房床位量
room_info['room_ratio'] = room_info['JYMJ'] / room_info['ROOM_NUM']
room_info['bed_ratio'] = room_info['JYMJ'] / room_info['BED_NUM']
room_info['room_bed'] = room_info['BED_NUM'] / room_info['ROOM_NUM']

room_info['DJSJ'] = pd.to_datetime(room_info['DJSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['BGSJ'] = pd.to_datetime(room_info['BGSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

room_info['DJ_date'] = room_info['DJSJ'].dt.date
room_info['BG_date'] = room_info['BGSJ'].dt.date

#等级时间和变更时间的时间差
room_info['DJ_gap'] = (room_info['BG_date'] - room_info['DJ_date']).dt.days

room_info['DJSJ'].min(), room_info['DJSJ'].max(), room_info['BGSJ'].min(), room_info['BGSJ'].max()

(Timestamp('2020-05-12 13:16:24'),
 Timestamp('2021-10-19 16:05:41'),
 Timestamp('2020-07-15 10:23:06'),
 Timestamp('2021-10-19 16:09:15'))
# room['BG_dayofweek'].value_counts()

plt.figure(figsize=(20, 8))
sns.countplot(room_info['JYMJ'])

<AxesSubplot:xlabel='JYMJ', ylabel='count'>


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ayRq9a5M-1646146611632)(output_79_1.png)]

经营面积缺失超过一半,3000多,考虑删除

合并room到df中

df3 = df2.merge(room_info, on=['HOTELID', 'FIRM'], how='left')

df3 = df3.sort_values(by=['HOTELID', 'date'])

#增加民宿单日的总订单量
in_num = df3.groupby(['HOTELID', 'date'])['ORDER_ID'].count().reset_index()
in_num.columns = ['HOTELID', 'date', 'in_hotel_num']
df4 = df3.merge(in_num, on=['HOTELID', 'date'], how='left')

df4.head(2)

ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnullin_timeout_timedatetimein_dateout_datedateJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMDJSJBGSJCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gapin_hotel_num
04402846BC0E8E3602434EA6A5F29A6F6FF42233100177202107111032120210711140042202107111400423NaN19731004370602202107111358202107121158102021-07-11 13:58:002021-07-12 11:58:002021-07-11 13:58:002021-07-112021-07-122021-07-11NaN111【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM0012020-05-18 10:33:552020-07-15 10:23:5811NaN309115024727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.01
1445509801D7394D02B44376913536F6071151AD100177202107151446120210724140006202107241400063NaN19941001429006202107241358202107261158102021-07-24 13:58:002021-07-26 11:58:002021-07-24 13:58:002021-07-242021-07-262021-07-24NaN111【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM0012020-05-18 10:33:552020-07-15 10:23:5811NaN309115024727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.01
df4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49547 entries, 0 to 49546
Data columns (total 51 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ORDER_ID           49547 non-null  int64         
 1   ORDER_PRIMARY_ID   49547 non-null  object        
 2   HOTELID            49547 non-null  object        
 3   ORDER_TIME         49547 non-null  object        
 4   STATUS             49547 non-null  int64         
 5   INSERT_TIME        49547 non-null  object        
 6   MODIFY_TIME        49547 non-null  object        
 7   FIRM               49547 non-null  int64         
 8   GUEST_ID           5963 non-null   object        
 9   BDATE              49547 non-null  object        
 10  XZQH               49547 non-null  object        
 11  IN_TIME            49547 non-null  object        
 12  OUT_TIME           49547 non-null  object        
 13  guest_sum          49547 non-null  int64         
 14  guest_sum_notnull  49547 non-null  int64         
 15  in_time            49547 non-null  datetime64[ns]
 16  out_time           49547 non-null  datetime64[ns]
 17  datetime           49547 non-null  datetime64[ns]
 18  in_date            49547 non-null  object        
 19  out_date           49547 non-null  object        
 20  date               49547 non-null  object        
 21  JYMJ               14248 non-null  float64       
 22  ROOM_NUM           49547 non-null  int64         
 23  BED_NUM            49547 non-null  int64         
 24  CZLY               49547 non-null  int64         
 25  CALLED             49547 non-null  object        
 26  BUR_CODE           22916 non-null  float64       
 27  STA_CODE           22916 non-null  object        
 28  SSX                49547 non-null  int64         
 29  ADDRESS            49547 non-null  object        
 30  MPHM               49547 non-null  object        
 31  DJSJ               49497 non-null  datetime64[ns]
 32  BGSJ               49547 non-null  datetime64[ns]
 33  CALLED_freq        49547 non-null  int64         
 34  ADDRESS_freq       49547 non-null  int64         
 35  JYMJ_freq          14248 non-null  float64       
 36  ROOM_NUM_freq      49547 non-null  int64         
 37  BED_NUM_freq       49547 non-null  int64         
 38  CZLY_freq          49547 non-null  int64         
 39  BUR_CODE_freq      22916 non-null  float64       
 40  STA_CODE_freq      22916 non-null  float64       
 41  SSX_freq           49547 non-null  int64         
 42  MPHM_freq          49547 non-null  int64         
 43  FIRM_freq          49547 non-null  int64         
 44  room_ratio         14248 non-null  float64       
 45  bed_ratio          14248 non-null  float64       
 46  room_bed           49547 non-null  float64       
 47  DJ_date            49497 non-null  object        
 48  BG_date            49547 non-null  object        
 49  DJ_gap             49497 non-null  float64       
 50  in_hotel_num       49547 non-null  int64         
dtypes: datetime64[ns](5), float64(9), int64(18), object(19)
memory usage: 19.7+ MB
df4['date'].min(), df4['date'].max()

(datetime.date(2020, 7, 15), datetime.date(2021, 11, 2))
df4['in_year'] = df4['datetime'].dt.year.fillna(0).astype('int')
df4['in_month'] = df4['datetime'].dt.month.fillna(0).astype('int')
df4['in_day'] = df4['datetime'].dt.day.fillna(0).astype('int')
df4['in_quarter'] = df4['datetime'].dt.quarter.fillna(0).astype('int')
df4['in_dayofweek'] = df4['datetime'].dt.dayofweek.fillna(0).astype('int')
df4['in_dayofyear'] = df4['datetime'].dt.dayofyear.fillna(0).astype('int')
df4['in_is_wknd'] = df4['datetime'].dt.dayofweek // 5                 #是否周末

df4[:2]

ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnullin_timeout_timedatetimein_dateout_datedateJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMDJSJBGSJCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gapin_hotel_numin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearrecord_is_wknd
04402846BC0E8E3602434EA6A5F29A6F6FF42233100177202107111032120210711140042202107111400423NaN19731004370602202107111358202107121158102021-07-11 13:58:002021-07-12 11:58:002021-07-11 13:58:002021-07-112021-07-122021-07-11NaN111【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM0012020-05-18 10:33:552020-07-15 10:23:5811NaN309115024727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.012021711361921
1445509801D7394D02B44376913536F6071151AD100177202107151446120210724140006202107241400063NaN19941001429006202107241358202107261158102021-07-24 13:58:002021-07-26 11:58:002021-07-24 13:58:002021-07-242021-07-262021-07-24NaN111【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM0012020-05-18 10:33:552020-07-15 10:23:5811NaN309115024727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.012021724352051
df4['date1'] = df4['date'].astype('str')

#总数据量49547
# 周末数据量15755
use_month = [5, 6, 7, 8, 9]
df_1 = df4[(df4['in_is_wknd']==1) & (df4['in_month'].isin(use_month))]

df_1.shape

(11583, 59)
df_1['in_month'].value_counts()

7    4391
5    2520
6    2192
8    1656
9     824
Name: in_month, dtype: int64
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14', 
           '2021-09-19', '2021-09-20', '2021-09-21']

df4[df4['date']=='2021-05-03']

ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEguest_sumguest_sum_notnullin_timeout_timedatetimein_dateout_datedateJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMDJSJBGSJCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gapin_hotel_numin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearrecord_is_wkndin_is_wknd
df_2 = df4[df4['date1'].isin(holiday)]

df_2['holiday'] = 1

df_1['holiday'] = 0

df5 = pd.concat([df_2, df_1])

df5.shape

(14662, 61)
df5.drop(['CALLED', 'ADDRESS', 'INSERT_TIME', 'MODIFY_TIME', 'IN_TIME', 'OUT_TIME', 'guest_sum_notnull'], axis=1, inplace=True)

df5[:3]

ORDER_IDORDER_PRIMARY_IDHOTELIDORDER_TIMESTATUSFIRMGUEST_IDBDATEXZQHguest_sumin_timeout_timedatetimein_dateout_datedateJYMJROOM_NUMBED_NUMCZLYBUR_CODESTA_CODESSXMPHMDJSJBGSJCALLED_freqADDRESS_freqJYMJ_freqROOM_NUM_freqBED_NUM_freqCZLY_freqBUR_CODE_freqSTA_CODE_freqSSX_freqMPHM_freqFIRM_freqroom_ratiobed_ratioroom_bedDJ_dateBG_dateDJ_gapin_hotel_numin_yearin_monthin_dayin_quarterin_dayofweekin_dayofyearrecord_is_wkndin_is_wknddate1holiday
93597092CFD17F6BB142485DB3DEC6B5E2D0B66410019320210501204613NaN1977100813282612021-05-01 20:46:002021-05-02 11:58:002021-05-01 20:46:002021-05-012021-05-022021-05-01NaN221NaNNaN371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811NaN124922814727NaNNaN15025653429NaNNaN1.02020-05-182020-07-1558.0120215125121112021-05-011
1136018375C96E986C1F04691A3EB07962BCC7B0010019520210502143513NaN1995100937020312021-05-02 14:35:002021-05-03 11:58:002021-05-02 14:35:002021-05-022021-05-032021-05-02NaN231NaNNaN371083ROOM0012020-05-18 10:33:552020-07-15 10:23:5811NaN12498954727NaNNaN15025653429NaNNaN1.52020-05-182020-07-1558.0120215226122112021-05-021
533314117C0C5516462D3487C895DD654B836F8551003720210408213013NaN1973031312010312021-04-30 13:58:002021-05-03 11:58:002021-05-01 13:58:002021-04-302021-05-032021-05-01NaN121NaNNaN371002ROOM0012020-05-18 10:22:132020-07-15 10:23:5822NaN309122814727NaNNaN466125653429NaNNaN2.02020-05-182020-07-1558.0120215125121112021-05-011
drop_col = df5.columns.tolist()

drop_col.remove('holiday')

df5.drop_duplicates(drop_col, keep='first', inplace=True)

df5.shape

(12708, 54)

dd = df_1.groupby(['in_year', 'in_month'])['ORDER_ID'].count().reset_index()

dd

in_yearin_monthORDER_IDym
02020760202007
120208511202008
220209680202009
32020101487202010
4202011665202011
5202012904202012
620211999202101
72021271202102
8202132202103
9202144202104
10202152520202105
11202162192202106
12202174331202107
13202181145202108
1420219144202109
1520211040202110
dd['ym'] = dd['in_year']*100+dd['in_month']
dd['ym'].value_counts()
202106    1
202008    1
202110    1
202109    1
202012    1
202011    1
202010    1
202009    1
202007    1
202104    1
202102    1
202101    1
202103    1
202108    1
202107    1
202105    1
202111    1
Name: ym, dtype: int64
plt.figure(figsize=(15,8))
plt.scatter(dd.index, dd['ORDER_ID'])
<matplotlib.collections.PathCollection at 0x13562173a00>


[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-EHvuTP3g-1646146611636)(output_112_1.png)]










  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值