import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import warnings
import seaborn as sns
import scipy.stats as st
import gc
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 100)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
path = 'traina/'
path1 = 'trainb/'
cust_info = pd.read_csv(path + '网约房平台入住人表.csv')
cust_info1 = pd.read_csv(path1 + '网约房平台入住人表.csv')
cust = pd.concat([cust_info, cust_info1])
cust.head(2)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>ORDER_PRIMARY_ID</th>
<th>GUEST_ID</th>
<th>BDATE</th>
<th>XZQH</th>
<th>IN_TIME</th>
<th>OUT_TIME</th>
</tr>
<tr>
<th>0</th>
<td>07C5BF73B18B44B0877DEED007F8771D</td>
<td>NaN</td>
<td>19800627</td>
<td>222405</td>
<td>NaN</td>
<td>NaN</td>
</tr>
<tr>
<th>1</th>
<td>3525D57CAE104A078E4962B2B89377B0</td>
<td>371099C301202107090001</td>
<td>19951025</td>
<td>370523</td>
<td>2.021071e+11</td>
<td>2.021071e+11</td>
</tr>
gc.collect()
119
cust['BDATE'] = cust['BDATE'].astype('str')
cust['XZQH'] = cust['XZQH'].astype('str')
cust['IN_TIME'] = cust['IN_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
cust['OUT_TIME'] = cust['OUT_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
cust.head(3)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>ORDER_PRIMARY_ID</th>
<th>GUEST_ID</th>
<th>BDATE</th>
<th>XZQH</th>
<th>IN_TIME</th>
<th>OUT_TIME</th>
</tr>
<tr>
<th>0</th>
<td>07C5BF73B18B44B0877DEED007F8771D</td>
<td>NaN</td>
<td>19800627</td>
<td>222405</td>
<td>0.0</td>
<td>0.0</td>
</tr>
<tr>
<th>1</th>
<td>3525D57CAE104A078E4962B2B89377B0</td>
<td>371099C301202107090001</td>
<td>19951025</td>
<td>370523</td>
<td>202107092103</td>
<td>202107111158</td>
</tr>
<tr>
<th>2</th>
<td>A1414E7D71E34E1DA743AC08BCF49151</td>
<td>3710021822202008240001</td>
<td>19860311</td>
<td>370781</td>
<td>202008241401</td>
<td>202008251200</td>
</tr>
cust.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 44030 entries, 0 to 3343
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_PRIMARY_ID 44030 non-null object
1 GUEST_ID 5158 non-null object
2 BDATE 44030 non-null object
3 XZQH 44030 non-null object
4 IN_TIME 44030 non-null object
5 OUT_TIME 44030 non-null object
dtypes: object(6)
memory usage: 2.4+ MB
cust.nunique()
ORDER_PRIMARY_ID 43883
GUEST_ID 3698
BDATE 9503
XZQH 2659
IN_TIME 3396
OUT_TIME 236
dtype: int64
cust.sort_values(['ORDER_PRIMARY_ID', 'GUEST_ID'], inplace=True)
stat = cust.groupby(['ORDER_PRIMARY_ID'])['BDATE'].count().reset_index()
stat.columns = ['ORDER_PRIMARY_ID', 'guest_sum']
stat['guest_sum_notnull'] = cust.groupby(['ORDER_PRIMARY_ID'])['GUEST_ID'].agg('count').values
cust = cust.merge(stat, on=['ORDER_PRIMARY_ID'], how='left')
cust.drop_duplicates('ORDER_PRIMARY_ID', keep='first', inplace=True)
romm信息
room_info = pd.read_csv(path + '网约房注册民宿.csv')
room_info.head(1)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>CODE</th>
<th>HOTELID</th>
<th>JYMJ</th>
<th>ROOM_NUM</th>
<th>BED_NUM</th>
<th>FWLY</th>
<th>CZLY</th>
<th>CALLED</th>
<th>CITY_CODE</th>
<th>BUR_CODE</th>
<th>STA_CODE</th>
<th>SSX</th>
<th>ADDRESS</th>
<th>MPHM</th>
<th>JYQK</th>
<th>FIRM</th>
<th>DJSJ</th>
<th>BGSJ</th>
<th>STATUS</th>
<th>AUDITSTATUS</th>
</tr>
<tr>
<th>0</th>
<td>100177</td>
<td>3710830002</td>
<td>0.0</td>
<td>1</td>
<td>1</td>
<td>2</td>
<td>1</td>
<td>【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽</td>
<td>371000000000</td>
<td>NaN</td>
<td>NaN</td>
<td>371083</td>
<td>乳山市长江路 银泰海景花园 55-301</td>
<td>ROOM001</td>
<td>【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽</td>
<td>3</td>
<td>2020-05-18 10:33:55</td>
<td>2020-07-15 10:23:58</td>
<td>NaN</td>
<td>NaN</td>
</tr>
room_info['CALLED'].nunique()
4232
room_info.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5307 entries, 0 to 5306
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CODE 5307 non-null object
1 HOTELID 5307 non-null object
2 JYMJ 5307 non-null float64
3 ROOM_NUM 5307 non-null int64
4 BED_NUM 5307 non-null int64
5 FWLY 5307 non-null int64
6 CZLY 5307 non-null int64
7 CALLED 5307 non-null object
8 CITY_CODE 5307 non-null int64
9 BUR_CODE 2658 non-null float64
10 STA_CODE 2658 non-null object
11 SSX 5307 non-null int64
12 ADDRESS 5307 non-null object
13 MPHM 5307 non-null object
14 JYQK 5307 non-null object
15 FIRM 5307 non-null int64
16 DJSJ 5307 non-null object
17 BGSJ 5307 non-null object
18 STATUS 2658 non-null float64
19 AUDITSTATUS 2658 non-null float64
dtypes: float64(4), int64(7), object(9)
memory usage: 829.3+ KB
room_info.nunique()
CODE 5307
HOTELID 5307
JYMJ 173
ROOM_NUM 11
BED_NUM 17
FWLY 1
CZLY 4
CALLED 4232
CITY_CODE 1
BUR_CODE 7
STA_CODE 54
SSX 5
ADDRESS 4850
MPHM 584
JYQK 4232
FIRM 2
DJSJ 3753
BGSJ 3294
STATUS 1
AUDITSTATUS 1
dtype: int64
room_info['STATUS'].value_counts()
1.0 2658
Name: STATUS, dtype: int64
room_info.drop(['HOTELID', 'FWLY', 'CITY_CODE', 'STATUS', 'AUDITSTATUS', 'JYQK'], axis=1, inplace=True)
room_info.rename(columns={'CODE':'HOTELID'}, inplace= True)
room_info['FIRM'].value_counts()
3 3429
10 1878
Name: FIRM, dtype: int64
room_info.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5307 entries, 0 to 5306
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 HOTELID 5307 non-null object
1 JYMJ 5307 non-null float64
2 ROOM_NUM 5307 non-null int64
3 BED_NUM 5307 non-null int64
4 CZLY 5307 non-null int64
5 CALLED 5307 non-null object
6 BUR_CODE 2658 non-null float64
7 STA_CODE 2658 non-null object
8 SSX 5307 non-null int64
9 ADDRESS 5307 non-null object
10 MPHM 5307 non-null object
11 FIRM 5307 non-null int64
12 DJSJ 5307 non-null object
13 BGSJ 5307 non-null object
dtypes: float64(2), int64(5), object(7)
memory usage: 580.6+ KB
room_info.nunique()
HOTELID 5307
JYMJ 173
ROOM_NUM 11
BED_NUM 17
CZLY 4
CALLED 4232
BUR_CODE 7
STA_CODE 54
SSX 5
ADDRESS 4850
MPHM 584
FIRM 2
DJSJ 3753
BGSJ 3294
dtype: int64
- CODE-------- 民宿编码与HOTELID关联
- JYMJ-------- 经营面积
- ROOM_NUM---- 房间数量
- BED_NUM----- 床位数
- CZLY-------- 出租类型
- CALLED------ 名称
- BUR_CODE---- 所属分局代码
- STA_CODE---- 所属派出所代码
- SSX--------- 省市县
- ADDRESS----- 地址
- MPHM-------- 门牌号
- FIRM-------- 平台厂商
- DJSJ-------- 登记时间
- BGSJ-------- 变更时间
- STATUS------ 状态
登记时间和变更时间无变化的只有两家
room_info['JYMJ'] = room_info['JYMJ'].apply(lambda x: np.nan if x==0 else x)
room的统计量特征
#登记时间和变更时间的差值
room_use_col = ['HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'FIRM', 'STATUS']
freq_col = ['CALLED', 'ADDRESS', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY', 'BUR_CODE', 'STA_CODE', 'SSX', 'MPHM', 'FIRM']
for col in freq_col:
st = room_info[col].value_counts().reset_index()
st.columns = [col, col+'_freq']
room_info = room_info.merge(st, on=col, how='left')
#房间面积率和床位率\单间房床位量
room_info['room_ratio'] = room_info['JYMJ'] / room_info['ROOM_NUM']
room_info['bed_ratio'] = room_info['JYMJ'] / room_info['BED_NUM']
room_info['room_bed'] = room_info['BED_NUM'] / room_info['ROOM_NUM']
room_info['DJSJ'] = pd.to_datetime(room_info['DJSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['BGSJ'] = pd.to_datetime(room_info['BGSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['DJ_date'] = room_info['DJSJ'].dt.date
room_info['BG_date'] = room_info['BGSJ'].dt.date
room_info['DJ_gap'] = (room_info['BG_date'] - room_info['DJ_date']).dt.days
room_info[:2]
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>HOTELID</th>
<th>JYMJ</th>
<th>ROOM_NUM</th>
<th>BED_NUM</th>
<th>CZLY</th>
<th>CALLED</th>
<th>BUR_CODE</th>
<th>STA_CODE</th>
<th>SSX</th>
<th>ADDRESS</th>
<th>MPHM</th>
<th>FIRM</th>
<th>DJSJ</th>
<th>BGSJ</th>
<th>CALLED_freq</th>
<th>ADDRESS_freq</th>
<th>JYMJ_freq</th>
<th>ROOM_NUM_freq</th>
<th>BED_NUM_freq</th>
<th>CZLY_freq</th>
<th>BUR_CODE_freq</th>
<th>STA_CODE_freq</th>
<th>SSX_freq</th>
<th>MPHM_freq</th>
<th>FIRM_freq</th>
<th>room_ratio</th>
<th>bed_ratio</th>
<th>room_bed</th>
<th>DJ_date</th>
<th>BG_date</th>
<th>DJ_gap</th>
</tr>
<tr>
<th>0</th>
<td>100177</td>
<td>NaN</td>
<td>1</td>
<td>1</td>
<td>1</td>
<td>【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽</td>
<td>NaN</td>
<td>NaN</td>
<td>371083</td>
<td>乳山市长江路 银泰海景花园 55-301</td>
<td>ROOM001</td>
<td>3</td>
<td>2020-05-18 10:33:55</td>
<td>2020-07-15 10:23:58</td>
<td>1</td>
<td>1</td>
<td>NaN</td>
<td>3091</td>
<td>1502</td>
<td>4727</td>
<td>NaN</td>
<td>NaN</td>
<td>150</td>
<td>2565</td>
<td>3429</td>
<td>NaN</td>
<td>NaN</td>
<td>1.0</td>
<td>2020-05-18</td>
<td>2020-07-15</td>
<td>58.0</td>
</tr>
<tr>
<th>1</th>
<td>100181</td>
<td>NaN</td>
<td>5</td>
<td>7</td>
<td>1</td>
<td>【山海边公寓】瞻星伴月阁</td>
<td>NaN</td>
<td>NaN</td>
<td>371083</td>
<td>乳山市银滩 檀香丽湾小区2-4-603</td>
<td>ROOM001</td>
<td>3</td>
<td>2020-05-18 10:33:55</td>
<td>2020-07-15 10:23:58</td>
<td>1</td>
<td>1</td>
<td>NaN</td>
<td>49</td>
<td>45</td>
<td>4727</td>
<td>NaN</td>
<td>NaN</td>
<td>150</td>
<td>2565</td>
<td>3429</td>
<td>NaN</td>
<td>NaN</td>
<td>1.4</td>
<td>2020-05-18</td>
<td>2020-07-15</td>
<td>58.0</td>
</tr>
room_info['DJ_gap'].nunique()
29
room_info['DJSJ'].min(), room_info['DJSJ'].max(), room_info['BGSJ'].min(), room_info['BGSJ'].max()
(Timestamp('2020-05-12 13:16:24'),
Timestamp('2021-10-19 16:05:41'),
Timestamp('2020-07-15 10:23:06'),
Timestamp('2021-10-19 16:09:15'))
room = room_info.copy()
# room['DJ_year'] = room['DJSJ'].dt.year.fillna(0).astype('int')
# room['DJ_month'] = room['DJSJ'].dt.month.fillna(0).astype('int')
# room['DJ_day'] = room['DJSJ'].dt.day.fillna(0).astype('int')
# room['DJ_quarter'] = room['DJSJ'].dt.quarter.fillna(0).astype('int')
# room['DJ_dayofweek'] = room['DJSJ'].dt.dayofweek.fillna(0).astype('int')
# room['DJ_dayofyear'] = room['DJSJ'].dt.dayofyear.fillna(0).astype('int')
room_info['BG_year'] = room_info['DJSJ'].dt.year.fillna(0).astype('int')
room_info['BG_month'] = room_info['DJSJ'].dt.month.fillna(0).astype('int')
room_info['BG_day'] = room_info['DJSJ'].dt.day.fillna(0).astype('int')
room_info['BG_quarter'] = room_info['DJSJ'].dt.quarter.fillna(0).astype('int')
room_info['BG_dayofweek'] = room_info['DJSJ'].dt.dayofweek.fillna(0).astype('int')
room_info['BG_dayofyear'] = room_info['DJSJ'].dt.dayofyear.fillna(0).astype('int')
room_info.drop(['CALLED', 'ADDRESS', 'DJSJ', 'BGSJ', 'MPHM', 'BUR_CODE', 'STA_CODE', 'SSX'], axis=1, inplace=True)
- #DJ_gap|DJ_year|DJ_month|DJ_day|DJ_quarter|DJ_dayofweek|DJ_dayofyear…
- #freq特征
plt.figure(figsize=(20, 8))
sns.countplot(room['JYMJ'])
<AxesSubplot:xlabel='JYMJ', ylabel='count'>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-969fe4rr-1646214149649)(output_42_1.png)]
经营面积缺失超过一半,3000多,考虑删除
#所属分局代码
room['BUR_CODE'].value_counts()
3.710990e+11 954
3.710980e+11 852
3.710020e+11 668
3.710820e+11 108
3.710810e+11 56
3.710830e+11 16
3.710960e+11 4
Name: BUR_CODE, dtype: int64
#所属派出所代码
room['STA_CODE'].value_counts()[:5]
371000000006 505
371000000007 433
371098060000 321
371099100000 258
371000000001 242
Name: STA_CODE, dtype: int64
#省市县
room['SSX'].value_counts()
371002 4661
371082 406
371083 150
371081 64
371003 26
Name: SSX, dtype: int64
#门牌号码
room['MPHM'].value_counts()[:5]
ROOM001 2565
无 1878
ROOM002 57
ROOM003 26
ROOM004 17
Name: MPHM, dtype: int64
订单信息
# platform infomation
order_info = pd.read_csv(path + '网约平台旅客订单信息.csv')
order1 = pd.read_csv(path1 + '网约平台旅客订单信息.csv')
order = pd.concat([order_info, order1])
order.head(2)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>ORDER_ID</th>
<th>ORDER_PRIMARY_ID</th>
<th>HOTELID</th>
<th>PRE_IN_TIME</th>
<th>PRE_OUT_TIME</th>
<th>ORDER_TIME</th>
<th>STATUS</th>
<th>CANCEL_TIME</th>
<th>INSERT_TIME</th>
<th>MODIFY_TIME</th>
<th>FIRM</th>
</tr>
<tr>
<th>0</th>
<td>923521</td>
<td>96BBDB7CC049421C85826AE07020B139</td>
<td>278337</td>
<td>202008011200</td>
<td>202008021200</td>
<td>1.596120e+11</td>
<td>1</td>
<td>NaN</td>
<td>20200730224152</td>
<td>20200730224152</td>
<td>3</td>
</tr>
<tr>
<th>1</th>
<td>923696</td>
<td>C72F20539AD1447D86CD1A8E5EAEC63A</td>
<td>282932</td>
<td>202008041400</td>
<td>202008061200</td>
<td>1.596121e+11</td>
<td>1</td>
<td>NaN</td>
<td>20200730225524</td>
<td>20200730225524</td>
<td>3</td>
</tr>
order['PRE_IN_TIME'] = order['PRE_IN_TIME'].astype('str')
order['PRE_OUT_TIME'] = order['PRE_OUT_TIME'].astype('str')
# order['ORDER_TIME'] = order['ORDER_TIME'].astype('str')
order['INSERT_TIME'] = order['INSERT_TIME'].astype('str')
order['MODIFY_TIME'] = order['MODIFY_TIME'].astype('str')
order['CANCEL_TIME'] = order['CANCEL_TIME'].astype('str')
order['ORDER_TIME'] = order['ORDER_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
##########################没有考虑包含取消时间的订单
order = order[(order['CANCEL_TIME']=='nan')]
order = order.sort_values(['ORDER_ID', 'MODIFY_TIME'])
order.drop_duplicates('ORDER_ID', keep='last', inplace=True)
order.shape
(29941, 11)
order.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 2 to 18590
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 29941 non-null int64
1 ORDER_PRIMARY_ID 29941 non-null object
2 HOTELID 29941 non-null object
3 PRE_IN_TIME 29941 non-null object
4 PRE_OUT_TIME 29941 non-null object
5 ORDER_TIME 29941 non-null object
6 STATUS 29941 non-null int64
7 CANCEL_TIME 29941 non-null object
8 INSERT_TIME 29941 non-null object
9 MODIFY_TIME 29941 non-null object
10 FIRM 29941 non-null int64
dtypes: int64(3), object(8)
memory usage: 2.7+ MB
order.nunique()
ORDER_ID 29941
ORDER_PRIMARY_ID 29941
HOTELID 4801
PRE_IN_TIME 6875
PRE_OUT_TIME 729
ORDER_TIME 26818
STATUS 3
CANCEL_TIME 1
INSERT_TIME 25432
MODIFY_TIME 25432
FIRM 2
dtype: int64
order_info['FIRM'].value_counts()
3 32029
10 8521
Name: FIRM, dtype: int64
############统计量特征
order['STATUS'].value_counts()
1 26137
2 3221
3 583
Name: STATUS, dtype: int64
#插入时间不等于修改时间,就是进行过修改过的订单,最后发现为空,等于没有修改过
order[order['INSERT_TIME']!=order['MODIFY_TIME']]
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>ORDER_ID</th>
<th>ORDER_PRIMARY_ID</th>
<th>HOTELID</th>
<th>PRE_IN_TIME</th>
<th>PRE_OUT_TIME</th>
<th>ORDER_TIME</th>
<th>STATUS</th>
<th>CANCEL_TIME</th>
<th>INSERT_TIME</th>
<th>MODIFY_TIME</th>
<th>FIRM</th>
</tr>
del order['CANCEL_TIME']
合并数据
# order时间包含了cust时间
df = pd.merge(order, cust, on='ORDER_PRIMARY_ID')
月份分为1、3、5、7、8、10、12:31天;2:28天;4、6、9、11:30天
df['IN_TIME'] = df['IN_TIME'].apply(lambda x: np.nan if x=='0.0' else x)
df['OUT_TIME'] = df['OUT_TIME'].apply(lambda x: np.nan if x=='0.0' else x)
df['IN_TIME'].fillna(df['PRE_IN_TIME'], inplace=True)
df['OUT_TIME'].fillna(df['PRE_OUT_TIME'], inplace=True)
df['OUT_TIME'] = df.apply(lambda x: x['OUT_TIME'] if x['OUT_TIME'] >= x['PRE_OUT_TIME'] else x['PRE_OUT_TIME'], axis=1)
df['IN_TIME'] = df.apply(lambda x: x['IN_TIME'] if x['IN_TIME'] >= x['PRE_IN_TIME'] else x['PRE_IN_TIME'], axis=1)
df.drop(['PRE_IN_TIME', 'PRE_OUT_TIME'], axis=1, inplace=True)
df.head(2)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>ORDER_ID</th>
<th>ORDER_PRIMARY_ID</th>
<th>HOTELID</th>
<th>ORDER_TIME</th>
<th>STATUS</th>
<th>INSERT_TIME</th>
<th>MODIFY_TIME</th>
<th>FIRM</th>
<th>GUEST_ID</th>
<th>BDATE</th>
<th>XZQH</th>
<th>IN_TIME</th>
<th>OUT_TIME</th>
<th>guest_sum</th>
<th>guest_sum_notnull</th>
</tr>
<tr>
<th>0</th>
<td>706648</td>
<td>A4AACE06B518418C8A8CA1935DDD1C5A</td>
<td>227185</td>
<td>202007092241</td>
<td>1</td>
<td>20200714171021</td>
<td>20200714171021</td>
<td>3</td>
<td>NaN</td>
<td>20010409</td>
<td>371002</td>
<td>202007151300</td>
<td>202007161200</td>
<td>1</td>
<td>0</td>
</tr>
<tr>
<th>1</th>
<td>748647</td>
<td>67D551AACFD049AE9CC2AB65E9870678</td>
<td>9483</td>
<td>202007132109</td>
<td>1</td>
<td>20201002101040</td>
<td>20201002101040</td>
<td>3</td>
<td>NaN</td>
<td>19881023</td>
<td>341202</td>
<td>202010011400</td>
<td>202010021011</td>
<td>1</td>
<td>0</td>
</tr>
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 29941 entries, 0 to 29940
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 29941 non-null int64
1 ORDER_PRIMARY_ID 29941 non-null object
2 HOTELID 29941 non-null object
3 ORDER_TIME 29941 non-null object
4 STATUS 29941 non-null int64
5 INSERT_TIME 29941 non-null object
6 MODIFY_TIME 29941 non-null object
7 FIRM 29941 non-null int64
8 GUEST_ID 3561 non-null object
9 BDATE 29941 non-null object
10 XZQH 29941 non-null object
11 IN_TIME 29941 non-null object
12 OUT_TIME 29941 non-null object
13 guest_sum 29941 non-null int64
14 guest_sum_notnull 29941 non-null int64
dtypes: int64(5), object(10)
memory usage: 3.7+ MB
df.nunique();
df['in_time'] = pd.to_datetime(df['IN_TIME'], errors='coerce', format='%Y%m%d%H%M')
df['out_time'] = pd.to_datetime(df['OUT_TIME'], errors='coerce', format='%Y%m%d%H%M')
df = df[~df['out_time'].isnull()]
df.head(2)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>ORDER_ID</th>
<th>ORDER_PRIMARY_ID</th>
<th>HOTELID</th>
<th>ORDER_TIME</th>
<th>STATUS</th>
<th>INSERT_TIME</th>
<th>MODIFY_TIME</th>
<th>FIRM</th>
<th>GUEST_ID</th>
<th>BDATE</th>
<th>XZQH</th>
<th>IN_TIME</th>
<th>OUT_TIME</th>
<th>guest_sum</th>
<th>guest_sum_notnull</th>
<th>in_time</th>
<th>out_time</th>
</tr>
<tr>
<th>0</th>
<td>706648</td>
<td>A4AACE06B518418C8A8CA1935DDD1C5A</td>
<td>227185</td>
<td>202007092241</td>
<td>1</td>
<td>20200714171021</td>
<td>20200714171021</td>
<td>3</td>
<td>NaN</td>
<td>20010409</td>
<td>371002</td>
<td>202007151300</td>
<td>202007161200</td>
<td>1</td>
<td>0</td>
<td>2020-07-15 13:00:00</td>
<td>2020-07-16 12:00:00</td>
</tr>
<tr>
<th>1</th>
<td>748647</td>
<td>67D551AACFD049AE9CC2AB65E9870678</td>
<td>9483</td>
<td>202007132109</td>
<td>1</td>
<td>20201002101040</td>
<td>20201002101040</td>
<td>3</td>
<td>NaN</td>
<td>19881023</td>
<td>341202</td>
<td>202010011400</td>
<td>202010021011</td>
<td>1</td>
<td>0</td>
<td>2020-10-01 14:00:00</td>
<td>2020-10-02 10:11:00</td>
</tr>
# df['in_time_year'] = df['in_time'].dt.year.fillna(0).astype('int') #入住时间的年份
# df['out_time_year'] = df['out_time'].dt.year.fillna(0).astype('int') #退房时间的年份
# df['in_time_month'] = df['in_time'].dt.month.fillna(0).astype('int') #入住时间的月份
# df['out_time_month'] = df['out_time'].dt.month.fillna(0).astype('int') #退房时间的月份
# df['in_time_day'] = df['in_time'].dt.day.fillna(0).astype('int') #入住时间的日期
# df['out_time_day'] = df['out_time'].dt.day.fillna(0).astype('int') #退房时间的日期
# df['in_time_hour'] = df['in_time'].dt.hour.fillna(0).astype('int') #入住时间的小时
# df['out_time_hour'] = df['out_time'].dt.hour.fillna(0).astype('int') #退房时间的小时
#2月份没有
# df['day_gap'] = df['out_time_day'] - df['in_time_day'] + (df['out_time_month'] - df['in_time_month']).apply(lambda x: x*30 if x>0 else 0)
# df[df['in_time_month']==12]['day_gap'] = df[df['in_time_month']==12]['day_gap'].apply(lambda x: x+31)
# df['day_gap'] = df.apply(lambda x: x['day_gap']+31 if x['in_time_month']==12 and x['out_time_month']==1 else x['day_gap'], axis=1)
# month_ = [1, 3, 5, 7, 8, 10]
# df['day_gap'] = df.apply(lambda x: x['day_gap']+1 if x['in_time_month'] in month_ and x['out_time_month']>x['in_time_month'] else x['day_gap'], axis=1)
#后移一天
df[‘lock_time’][0] + pd.Timedelta(days=1)
#前移一天
df[‘lock_time’][0] + pd.Timedelta(days=-1)
#df[‘in_time’] - timedelta(days=1) #将day_gap列为0的数据全部往后推迟一天;;;;;;;还有钟点房
# df_0 = df[(df['day_gap']==0)]
# df_0['date'] = df_0['IN_TIME']
# 这里边包含了钟点房和当天入住并退房的。
dfs = []
for idx, group in df.groupby(['ORDER_PRIMARY_ID']):
stat = pd.DataFrame()
stat['datetime'] = pd.date_range(start=group['IN_TIME'].values[0], end=group['OUT_TIME'].values[0], freq='D', normalize=False, closed=None)
stat['ORDER_PRIMARY_ID'] = group['ORDER_PRIMARY_ID'].values[0]
dfs.append(stat)
df_date = pd.concat(dfs).reset_index(drop=True)
df_date[:3]
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>datetime</th>
<th>ORDER_PRIMARY_ID</th>
</tr>
<tr>
<th>0</th>
<td>2021-06-30 13:58:00</td>
<td>00003FC18B254E86803C00F4BBA382E4</td>
</tr>
<tr>
<th>1</th>
<td>2021-07-01 13:58:00</td>
<td>00003FC18B254E86803C00F4BBA382E4</td>
</tr>
<tr>
<th>2</th>
<td>2021-07-02 13:58:00</td>
<td>00003FC18B254E86803C00F4BBA382E4</td>
</tr>
dfs1 = []
for idx, group in df.groupby(['ORDER_PRIMARY_ID']):
stat = pd.DataFrame()
stat['datetime'] = pd.date_range(start=group['IN_TIME'].values[0], end=group['OUT_TIME'].values[0], freq='D', normalize=False, closed=None)
stat['ORDER_PRIMARY_ID'] = group['ORDER_PRIMARY_ID'].values[0]
dfs.append(stat)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 60973 entries, 0 to 60972
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 60973 non-null int64
1 ORDER_PRIMARY_ID 60973 non-null object
2 HOTELID 60973 non-null object
3 ORDER_TIME 60973 non-null object
4 STATUS 60973 non-null int64
5 INSERT_TIME 60973 non-null object
6 MODIFY_TIME 60973 non-null object
7 FIRM 60973 non-null int64
8 GUEST_ID 6181 non-null object
9 BDATE 60973 non-null object
10 XZQH 60973 non-null object
11 IN_TIME 60973 non-null object
12 OUT_TIME 60973 non-null object
13 guest_sum 60973 non-null int64
14 guest_sum_notnull 60973 non-null int64
15 in_time 60973 non-null datetime64[ns]
16 out_time 60973 non-null datetime64[ns]
17 datetime 60973 non-null datetime64[ns]
18 in_date 60973 non-null object
19 out_date 60973 non-null object
20 date 60973 non-null object
dtypes: datetime64[ns](3), int64(5), object(13)
memory usage: 10.2+ MB
df1 = df.merge(df_date, on=['ORDER_PRIMARY_ID'], how='left')
df1.shape
(60973, 18)
df1['in_date'] = df1['in_time'].dt.date
df1['out_date'] = df1['out_time'].dt.date
df1['date'] = df1['datetime'].dt.date
df1[:2]
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>ORDER_ID</th>
<th>ORDER_PRIMARY_ID</th>
<th>HOTELID</th>
<th>ORDER_TIME</th>
<th>STATUS</th>
<th>INSERT_TIME</th>
<th>MODIFY_TIME</th>
<th>FIRM</th>
<th>GUEST_ID</th>
<th>BDATE</th>
<th>XZQH</th>
<th>IN_TIME</th>
<th>OUT_TIME</th>
<th>guest_sum</th>
<th>guest_sum_notnull</th>
<th>in_time</th>
<th>out_time</th>
<th>datetime</th>
<th>in_date</th>
<th>out_date</th>
<th>date</th>
</tr>
<tr>
<th>0</th>
<td>706648</td>
<td>A4AACE06B518418C8A8CA1935DDD1C5A</td>
<td>227185</td>
<td>202007092241</td>
<td>1</td>
<td>20200714171021</td>
<td>20200714171021</td>
<td>3</td>
<td>NaN</td>
<td>20010409</td>
<td>371002</td>
<td>202007151300</td>
<td>202007161200</td>
<td>1</td>
<td>0</td>
<td>2020-07-15 13:00:00</td>
<td>2020-07-16 12:00:00</td>
<td>2020-07-15 13:00:00</td>
<td>2020-07-15</td>
<td>2020-07-16</td>
<td>2020-07-15</td>
</tr>
<tr>
<th>1</th>
<td>748647</td>
<td>67D551AACFD049AE9CC2AB65E9870678</td>
<td>9483</td>
<td>202007132109</td>
<td>1</td>
<td>20201002101040</td>
<td>20201002101040</td>
<td>3</td>
<td>NaN</td>
<td>19881023</td>
<td>341202</td>
<td>202010011400</td>
<td>202010021011</td>
<td>1</td>
<td>0</td>
<td>2020-10-01 14:00:00</td>
<td>2020-10-02 10:11:00</td>
<td>2020-10-01 14:00:00</td>
<td>2020-10-01</td>
<td>2020-10-02</td>
<td>2020-10-01</td>
</tr>
#去掉日期中所有入住超过一天的最后一天
df2 = df1[(df1['out_date']!=df1['date']) | ((df1['out_date']==df1['date']) & (df1['in_date']==df1['out_date']))]
df2.shape
(49547, 21)
stat = pd.DataFrame()
stat[‘date’] = pd.date_range(
start=group['IN_TIME'].values[0],#开始时间
end=group['OUT_TIME'].values[0],#截止时间
periods=4,#总长度
freq='D',#时间间隔
tz=None,#时区
normalize=False,#是否标准化到midnight
name=None,#date名称
closed=None,#首尾是否在内
**kwargs,
)
gc.collect()
161
test
test = pd.read_csv('testb/submit_example_2.csv')
test.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>HOTELID</th>
<th>DATE</th>
<th>ROOM_EMPTY</th>
</tr>
<tr>
<th>0</th>
<td>303760</td>
<td>2021-09-04</td>
<td>0.1</td>
</tr>
<tr>
<th>1</th>
<td>303760</td>
<td>2021-09-05</td>
<td>0.1</td>
</tr>
<tr>
<th>2</th>
<td>303760</td>
<td>2021-09-11</td>
<td>0.1</td>
</tr>
<tr>
<th>3</th>
<td>303760</td>
<td>2021-09-12</td>
<td>0.1</td>
</tr>
<tr>
<th>4</th>
<td>303760</td>
<td>2021-09-19</td>
<td>0.1</td>
</tr>
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3311 entries, 0 to 3310
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 HOTELID 3311 non-null object
1 DATE 3311 non-null object
2 ROOM_EMPTY 3311 non-null float64
dtypes: float64(1), object(2)
memory usage: 77.7+ KB
#增加特征:民宿单日的总订单量
in_num = df2.groupby(['HOTELID', 'date'])['ORDER_ID'].count().reset_index()
in_num.columns = ['HOTELID', 'date', 'in_hotel_num']
df2 = df2.merge(in_num, on=['HOTELID', 'date'], how='left')
test['type'] = 'test'
df2['type'] = 'train'
df2['DATE'] = df2['date'].astype('str')
df2 = df2[df2['DATE'] < '2021-09-01']
df2['in_year'] = df2['datetime'].dt.year.fillna(0).astype('int')
df2['in_month'] = df2['datetime'].dt.month.fillna(0).astype('int')
df2['in_day'] = df2['datetime'].dt.day.fillna(0).astype('int')
df2['in_quarter'] = df2['datetime'].dt.quarter.fillna(0).astype('int')
df2['in_dayofweek'] = df2['datetime'].dt.dayofweek.fillna(0).astype('int')
df2['in_dayofyear'] = df2['datetime'].dt.dayofyear.fillna(0).astype('int')
df2['in_weekofyear'] = df2['datetime'].dt.weekofyear.fillna(0).astype('int')
df2['in_is_wknd'] = df2['datetime'].dt.dayofweek // 5 #是否周末
#总数据量49547
# 周末数据量15755
use_month = [5, 6, 7, 8, 9]
df_1 = df2[(df2['in_is_wknd']==1) & (df2['in_month'].isin(use_month))]
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14',
'2021-09-19', '2021-09-20', '2021-09-21']
df_2 = df2[df2['DATE'].isin(holiday)]
# 节假日为1
df_2['holiday'] = 1
df_1['holiday'] = 0
df3 = pd.concat([df_2, df_1])
df3.shape
(14482, 33)
drop_col = df3.columns.tolist()
drop_col.remove('holiday')
df3.drop_duplicates(drop_col, keep='first', inplace=True)
df3.shape
(12543, 33)
- 挑选周末和节假日(5、6、7、8、9月份)
test['datetime'] = pd.to_datetime(test['DATE'], errors='coerce', format='%Y-%m-%d')
test['in_year'] = test['datetime'].dt.year.fillna(0).astype('int')
test['in_month'] = test['datetime'].dt.month.fillna(0).astype('int')
test['in_day'] = test['datetime'].dt.day.fillna(0).astype('int')
test['in_quarter'] = test['datetime'].dt.quarter.fillna(0).astype('int')
test['in_dayofweek'] = test['datetime'].dt.dayofweek.fillna(0).astype('int')
test['in_dayofyear'] = test['datetime'].dt.dayofyear.fillna(0).astype('int')
test['in_weekofyear'] = test['datetime'].dt.weekofyear.fillna(0).astype('int')
test['in_is_wknd'] = test['datetime'].dt.dayofweek // 5 #是否周末
test[:3]
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>HOTELID</th>
<th>DATE</th>
<th>ROOM_EMPTY</th>
<th>type</th>
<th>datetime</th>
<th>in_year</th>
<th>in_month</th>
<th>in_day</th>
<th>in_quarter</th>
<th>in_dayofweek</th>
<th>in_dayofyear</th>
<th>in_weekofyear</th>
<th>in_is_wknd</th>
</tr>
<tr>
<th>0</th>
<td>303760</td>
<td>2021-09-04</td>
<td>0.1</td>
<td>test</td>
<td>2021-09-04</td>
<td>2021</td>
<td>9</td>
<td>4</td>
<td>3</td>
<td>5</td>
<td>247</td>
<td>35</td>
<td>1</td>
</tr>
<tr>
<th>1</th>
<td>303760</td>
<td>2021-09-05</td>
<td>0.1</td>
<td>test</td>
<td>2021-09-05</td>
<td>2021</td>
<td>9</td>
<td>5</td>
<td>3</td>
<td>6</td>
<td>248</td>
<td>35</td>
<td>1</td>
</tr>
<tr>
<th>2</th>
<td>303760</td>
<td>2021-09-11</td>
<td>0.1</td>
<td>test</td>
<td>2021-09-11</td>
<td>2021</td>
<td>9</td>
<td>11</td>
<td>3</td>
<td>5</td>
<td>254</td>
<td>36</td>
<td>1</td>
</tr>
test['holiday'] = 0
test['holiday'] = test.apply(lambda x: x['holiday']+1 if x['DATE'] in holiday else x['holiday'], axis=1)
test.head()
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>HOTELID</th>
<th>DATE</th>
<th>ROOM_EMPTY</th>
<th>type</th>
<th>datetime</th>
<th>in_year</th>
<th>in_month</th>
<th>in_day</th>
<th>in_quarter</th>
<th>in_dayofweek</th>
<th>in_dayofyear</th>
<th>in_weekofyear</th>
<th>in_is_wknd</th>
<th>holiday</th>
</tr>
<tr>
<th>0</th>
<td>303760</td>
<td>2021-09-04</td>
<td>0.1</td>
<td>test</td>
<td>2021-09-04</td>
<td>2021</td>
<td>9</td>
<td>4</td>
<td>3</td>
<td>5</td>
<td>247</td>
<td>35</td>
<td>1</td>
<td>0</td>
</tr>
<tr>
<th>1</th>
<td>303760</td>
<td>2021-09-05</td>
<td>0.1</td>
<td>test</td>
<td>2021-09-05</td>
<td>2021</td>
<td>9</td>
<td>5</td>
<td>3</td>
<td>6</td>
<td>248</td>
<td>35</td>
<td>1</td>
<td>0</td>
</tr>
<tr>
<th>2</th>
<td>303760</td>
<td>2021-09-11</td>
<td>0.1</td>
<td>test</td>
<td>2021-09-11</td>
<td>2021</td>
<td>9</td>
<td>11</td>
<td>3</td>
<td>5</td>
<td>254</td>
<td>36</td>
<td>1</td>
<td>0</td>
</tr>
<tr>
<th>3</th>
<td>303760</td>
<td>2021-09-12</td>
<td>0.1</td>
<td>test</td>
<td>2021-09-12</td>
<td>2021</td>
<td>9</td>
<td>12</td>
<td>3</td>
<td>6</td>
<td>255</td>
<td>36</td>
<td>1</td>
<td>0</td>
</tr>
<tr>
<th>4</th>
<td>303760</td>
<td>2021-09-19</td>
<td>0.1</td>
<td>test</td>
<td>2021-09-19</td>
<td>2021</td>
<td>9</td>
<td>19</td>
<td>3</td>
<td>6</td>
<td>262</td>
<td>37</td>
<td>1</td>
<td>1</td>
</tr>
del test['datetime'], df3['datetime'], test['ROOM_EMPTY']
df3.drop(['IN_TIME', 'OUT_TIME', 'in_time', 'out_time', 'in_date', 'out_date', 'date'], axis=1, inplace=True)
df4 = df3.merge(test, on=['HOTELID', 'DATE', 'type', 'in_year', 'in_month', 'in_day', 'in_quarter', 'in_dayofweek',
'in_dayofyear', 'in_weekofyear', 'in_is_wknd', 'holiday'], how='outer')
df4.shape
(15854, 25)
df4[df4['type']=='test'].shape
(3311, 25)
dd = df4[df4['DATE']>'2021-09-01']
dd['type'].value_counts()
test 3311
Name: type, dtype: int64
df4.head(3)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>ORDER_ID</th>
<th>ORDER_PRIMARY_ID</th>
<th>HOTELID</th>
<th>ORDER_TIME</th>
<th>STATUS</th>
<th>INSERT_TIME</th>
<th>MODIFY_TIME</th>
<th>FIRM</th>
<th>GUEST_ID</th>
<th>BDATE</th>
<th>XZQH</th>
<th>guest_sum</th>
<th>guest_sum_notnull</th>
<th>in_hotel_num</th>
<th>type</th>
<th>DATE</th>
<th>in_year</th>
<th>in_month</th>
<th>in_day</th>
<th>in_quarter</th>
<th>in_dayofweek</th>
<th>in_dayofyear</th>
<th>in_weekofyear</th>
<th>in_is_wknd</th>
<th>holiday</th>
</tr>
<tr>
<th>0</th>
<td>2676555.0</td>
<td>9B22CA8BB9124D6AA82D4B48D7573124</td>
<td>278976</td>
<td>202102012238</td>
<td>1.0</td>
<td>20210201223924</td>
<td>20210201223924</td>
<td>3.0</td>
<td>NaN</td>
<td>19991020</td>
<td>360782</td>
<td>1.0</td>
<td>0.0</td>
<td>1.0</td>
<td>train</td>
<td>2021-05-01</td>
<td>2021</td>
<td>5</td>
<td>1</td>
<td>2</td>
<td>5</td>
<td>121</td>
<td>17</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>1</th>
<td>2676555.0</td>
<td>9B22CA8BB9124D6AA82D4B48D7573124</td>
<td>278976</td>
<td>202102012238</td>
<td>1.0</td>
<td>20210201223924</td>
<td>20210201223924</td>
<td>3.0</td>
<td>NaN</td>
<td>19991020</td>
<td>360782</td>
<td>1.0</td>
<td>0.0</td>
<td>1.0</td>
<td>train</td>
<td>2021-05-02</td>
<td>2021</td>
<td>5</td>
<td>2</td>
<td>2</td>
<td>6</td>
<td>122</td>
<td>17</td>
<td>1</td>
<td>1</td>
</tr>
<tr>
<th>2</th>
<td>2676555.0</td>
<td>9B22CA8BB9124D6AA82D4B48D7573124</td>
<td>278976</td>
<td>202102012238</td>
<td>1.0</td>
<td>20210201223924</td>
<td>20210201223924</td>
<td>3.0</td>
<td>NaN</td>
<td>19991020</td>
<td>360782</td>
<td>1.0</td>
<td>0.0</td>
<td>1.0</td>
<td>train</td>
<td>2021-05-03</td>
<td>2021</td>
<td>5</td>
<td>3</td>
<td>2</td>
<td>0</td>
<td>123</td>
<td>18</td>
<td>0</td>
<td>1</td>
</tr>
df4.nunique()
ORDER_ID 9779
ORDER_PRIMARY_ID 9779
HOTELID 3179
ORDER_TIME 9181
STATUS 3
INSERT_TIME 8011
MODIFY_TIME 8011
FIRM 2
GUEST_ID 1468
BDATE 5635
XZQH 1909
guest_sum 3
guest_sum_notnull 3
in_hotel_num 11
type 2
DATE 69
in_year 2
in_month 5
in_day 31
in_quarter 2
in_dayofweek 5
in_dayofyear 67
in_weekofyear 23
in_is_wknd 2
holiday 2
dtype: int64
df4.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15854 entries, 0 to 15853
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 12543 non-null float64
1 ORDER_PRIMARY_ID 12543 non-null object
2 HOTELID 15854 non-null object
3 ORDER_TIME 12543 non-null object
4 STATUS 12543 non-null float64
5 INSERT_TIME 12543 non-null object
6 MODIFY_TIME 12543 non-null object
7 FIRM 12543 non-null float64
8 GUEST_ID 1877 non-null object
9 BDATE 12543 non-null object
10 XZQH 12543 non-null object
11 guest_sum 12543 non-null float64
12 guest_sum_notnull 12543 non-null float64
13 in_hotel_num 12543 non-null float64
14 type 15854 non-null object
15 DATE 15854 non-null object
16 in_year 15854 non-null int64
17 in_month 15854 non-null int64
18 in_day 15854 non-null int64
19 in_quarter 15854 non-null int64
20 in_dayofweek 15854 non-null int64
21 in_dayofyear 15854 non-null int64
22 in_weekofyear 15854 non-null int64
23 in_is_wknd 15854 non-null int64
24 holiday 15854 non-null int64
dtypes: float64(6), int64(9), object(10)
memory usage: 3.1+ MB
合并room进去
df = df4.merge(room_info, on=['HOTELID', 'FIRM'], how='left')
df = df.sort_values(by=['HOTELID', 'DATE'])
df.head(2)
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
<tr style="text-align: right;">
<th></th>
<th>ORDER_ID</th>
<th>ORDER_PRIMARY_ID</th>
<th>HOTELID</th>
<th>ORDER_TIME</th>
<th>STATUS</th>
<th>INSERT_TIME</th>
<th>MODIFY_TIME</th>
<th>FIRM</th>
<th>GUEST_ID</th>
<th>BDATE</th>
<th>XZQH</th>
<th>guest_sum</th>
<th>guest_sum_notnull</th>
<th>in_hotel_num</th>
<th>type</th>
<th>DATE</th>
<th>in_year</th>
<th>in_month</th>
<th>in_day</th>
<th>in_quarter</th>
<th>in_dayofweek</th>
<th>in_dayofyear</th>
<th>in_weekofyear</th>
<th>in_is_wknd</th>
<th>holiday</th>
<th>JYMJ</th>
<th>ROOM_NUM</th>
<th>BED_NUM</th>
<th>CZLY</th>
<th>CALLED_freq</th>
<th>ADDRESS_freq</th>
<th>JYMJ_freq</th>
<th>ROOM_NUM_freq</th>
<th>BED_NUM_freq</th>
<th>CZLY_freq</th>
<th>BUR_CODE_freq</th>
<th>STA_CODE_freq</th>
<th>SSX_freq</th>
<th>MPHM_freq</th>
<th>FIRM_freq</th>
<th>room_ratio</th>
<th>bed_ratio</th>
<th>room_bed</th>
<th>DJ_date</th>
<th>BG_date</th>
<th>DJ_gap</th>
<th>BG_year</th>
<th>BG_month</th>
<th>BG_day</th>
<th>BG_quarter</th>
<th>BG_dayofweek</th>
<th>BG_dayofyear</th>
</tr>
<tr>
<th>7905</th>
<td>4402846.0</td>
<td>BC0E8E3602434EA6A5F29A6F6FF42233</td>
<td>100177</td>
<td>202107111032</td>
<td>1.0</td>
<td>20210711140042</td>
<td>20210711140042</td>
<td>3.0</td>
<td>NaN</td>
<td>19731004</td>
<td>370602</td>
<td>1.0</td>
<td>0.0</td>
<td>1.0</td>
<td>train</td>
<td>2021-07-11</td>
<td>2021</td>
<td>7</td>
<td>11</td>
<td>3</td>
<td>6</td>
<td>192</td>
<td>27</td>
<td>1</td>
<td>0</td>
<td>NaN</td>
<td>1.0</td>
<td>1.0</td>
<td>1.0</td>
<td>1.0</td>
<td>1.0</td>
<td>NaN</td>
<td>3091.0</td>
<td>1502.0</td>
<td>4727.0</td>
<td>NaN</td>
<td>NaN</td>
<td>150.0</td>
<td>2565.0</td>
<td>3429.0</td>
<td>NaN</td>
<td>NaN</td>
<td>1.0</td>
<td>2020-05-18</td>
<td>2020-07-15</td>
<td>58.0</td>
<td>2020.0</td>
<td>5.0</td>
<td>18.0</td>
<td>2.0</td>
<td>0.0</td>
<td>139.0</td>
</tr>
<tr>
<th>8455</th>
<td>4455098.0</td>
<td>01D7394D02B44376913536F6071151AD</td>
<td>100177</td>
<td>202107151446</td>
<td>1.0</td>
<td>20210724140006</td>
<td>20210724140006</td>
<td>3.0</td>
<td>NaN</td>
<td>19941001</td>
<td>429006</td>
<td>1.0</td>
<td>0.0</td>
<td>1.0</td>
<td>train</td>
<td>2021-07-24</td>
<td>2021</td>
<td>7</td>
<td>24</td>
<td>3</td>
<td>5</td>
<td>205</td>
<td>29</td>
<td>1</td>
<td>0</td>
<td>NaN</td>
<td>1.0</td>
<td>1.0</td>
<td>1.0</td>
<td>1.0</td>
<td>1.0</td>
<td>NaN</td>
<td>3091.0</td>
<td>1502.0</td>
<td>4727.0</td>
<td>NaN</td>
<td>NaN</td>
<td>150.0</td>
<td>2565.0</td>
<td>3429.0</td>
<td>NaN</td>
<td>NaN</td>
<td>1.0</td>
<td>2020-05-18</td>
<td>2020-07-15</td>
<td>58.0</td>
<td>2020.0</td>
<td>5.0</td>
<td>18.0</td>
<td>2.0</td>
<td>0.0</td>
<td>139.0</td>
</tr>
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15854 entries, 7905 to 12436
Data columns (total 52 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 12543 non-null float64
1 ORDER_PRIMARY_ID 12543 non-null object
2 HOTELID 15854 non-null object
3 ORDER_TIME 12543 non-null object
4 STATUS 12543 non-null float64
5 INSERT_TIME 12543 non-null object
6 MODIFY_TIME 12543 non-null object
7 FIRM 12543 non-null float64
8 GUEST_ID 1877 non-null object
9 BDATE 12543 non-null object
10 XZQH 12543 non-null object
11 guest_sum 12543 non-null float64
12 guest_sum_notnull 12543 non-null float64
13 in_hotel_num 12543 non-null float64
14 type 15854 non-null object
15 DATE 15854 non-null object
16 in_year 15854 non-null int64
17 in_month 15854 non-null int64
18 in_day 15854 non-null int64
19 in_quarter 15854 non-null int64
20 in_dayofweek 15854 non-null int64
21 in_dayofyear 15854 non-null int64
22 in_weekofyear 15854 non-null int64
23 in_is_wknd 15854 non-null int64
24 holiday 15854 non-null int64
25 JYMJ 2119 non-null float64
26 ROOM_NUM 12543 non-null float64
27 BED_NUM 12543 non-null float64
28 CZLY 12543 non-null float64
29 CALLED_freq 12543 non-null float64
30 ADDRESS_freq 12543 non-null float64
31 JYMJ_freq 2119 non-null float64
32 ROOM_NUM_freq 12543 non-null float64
33 BED_NUM_freq 12543 non-null float64
34 CZLY_freq 12543 non-null float64
35 BUR_CODE_freq 7314 non-null float64
36 STA_CODE_freq 7314 non-null float64
37 SSX_freq 12543 non-null float64
38 MPHM_freq 12543 non-null float64
39 FIRM_freq 12543 non-null float64
40 room_ratio 2119 non-null float64
41 bed_ratio 2119 non-null float64
42 room_bed 12543 non-null float64
43 DJ_date 12527 non-null object
44 BG_date 12543 non-null object
45 DJ_gap 12527 non-null float64
46 BG_year 12543 non-null float64
47 BG_month 12543 non-null float64
48 BG_day 12543 non-null float64
49 BG_quarter 12543 non-null float64
50 BG_dayofweek 12543 non-null float64
51 BG_dayofyear 12543 non-null float64
dtypes: float64(31), int64(9), object(12)
memory usage: 6.4+ MB
gc.collect()
17
df['DATE'].min(), df['DATE'].max()
('2020-07-18', '2021-09-21')
df = df.sort_values(['HOTELID', 'DATE'])
def qty_shift(df, val):
#昨天,上周,上个月, 去年
df['last_1_qty'] = df.groupby('HOTELID')[val].shift(1).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_2_qty'] = df.groupby('HOTELID')[val].shift(2).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_3_qty'] = df.groupby('HOTELID')[val].shift(3).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_4_qty'] = df.groupby('HOTELID')[val].shift(4).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_5_qty'] = df.groupby('HOTELID')[val].shift(5).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_6_qty'] = df.groupby('HOTELID')[val].shift(6).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_7_qty'] = df.groupby('HOTELID')[val].shift(7).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_8_qty'] = df.groupby('HOTELID')[val].shift(8).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_10_qty'] = df.groupby('HOTELID')[val].shift(10).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_12_qty'] = df.groupby('HOTELID')[val].shift(12).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_15_qty'] = df.groupby('HOTELID')[val].shift(15).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_20_qty'] = df.groupby('HOTELID')[val].shift(20).fillna(method='ffill').reset_index().sort_index().set_index('index')
keys = ['in_hotel_num','guest_sum', 'guest_sum_notnull', 'in_year', 'in_month', 'in_dayofyear', 'in_quarter']
for key in keys:
print(key)
qty_shift(df, key)
in_hotel_num
guest_sum
guest_sum_notnull
in_year
in_month
in_dayofyear
in_quarter
#昨天,上周,上个月, 去年
df['yesterday_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(1).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_2_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(2).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_3_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(3).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_4_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(4).fillna(method='ffill').reset_index().sort_index().set_index('index')
df['last_52_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(52).fillna(method='ffill').reset_index().sort_index().set_index('index')
def qty_rolling(df, window, val, keys):
df['qty_rolling'+str(window)+'_mean'] = df.groupby(keys)[val].transform(
lambda x: x.shift(1).rolling(window=window, min_periods=3, win_type="triang").mean()).values.tolist()
df['qty_rolling'+str(window)+'_max'] = df.groupby(keys)[val].transform(
lambda x: x.shift(1).rolling(window=window, min_periods=3).max()).values.tolist()
df['qty_rolling'+str(window)+'_sum'] = df.groupby(keys)[val].transform(
lambda x: x.shift(1).rolling(window=window, min_periods=3).sum()).values.tolist()
# df['qty_rolling'+str(window)+'_std'] = df.groupby(keys)[val].transform(
# lambda x: x.shift(1).rolling(window=window, min_periods=3, win_type="triang").std()).values.tolist()
# df['qty_rolling'+str(window)+'_skew'] = df.groupby(keys)[val].transform(
# lambda x: x.shift(1).rolling(window=window, min_periods=3).skew()).values.tolist()
# df['qty_rolling'+str(window)+'_kurt'] = df.groupby(keys)[val].transform(
# lambda x: x.shift(1).rolling(window=window, min_periods=3).kurt()).values.tolist()
#df['qty_rolling'+str(window)+'_quantile'] = df.groupby(keys)[val].transform(
#lambda x: x.rolling(window=window, min_periods=3).quantile()).values.tolist()
# df['qty_rolling'+str(window)+'_corr'] = df.groupby(keys)[val].transform(
# lambda x: x.shift(1).rolling(window=window, min_periods=3).corr()).values.tolist()
return df
gc.collect()
173
# 滚动7天和14天
keys = ['HOTELID']
# df = qty_rolling(df, 2, 'in_hotel_num', keys)
df = qty_rolling(df, 3, 'in_hotel_num', keys)
df = qty_rolling(df, 4, 'in_hotel_num', keys)
df = qty_rolling(df, 6, 'in_hotel_num', keys)
def qty_ewm(df, alpha, val, keys):
df['qty_ewm'+'_mean'] = df.groupby(keys)[val].transform(lambda x: x.shift(1).ewm(alpha=alpha).mean()).values.tolist()
df['qty_ewm'+'_std'] = df.groupby(keys)[val].transform(lambda x: x.shift(1).ewm(alpha=alpha).std()).values.tolist()
df['qty_ewm'+'_corr'] = df.groupby(keys)[val].transform(lambda x: x.shift(1).ewm(alpha=alpha).corr()).values.tolist()
return df
df = qty_ewm(df, 0.95, 'in_hotel_num', keys)
df['ORDER_ID'] = df['ORDER_ID'].astype('str')
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15854 entries, 7905 to 12436
Data columns (total 78 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 15854 non-null object
1 ORDER_PRIMARY_ID 12543 non-null object
2 HOTELID 15854 non-null object
3 ORDER_TIME 12543 non-null object
4 STATUS 12543 non-null float64
5 INSERT_TIME 12543 non-null object
6 MODIFY_TIME 12543 non-null object
7 FIRM 12543 non-null float64
8 GUEST_ID 1877 non-null object
9 BDATE 12543 non-null object
10 XZQH 12543 non-null object
11 guest_sum 12543 non-null float64
12 guest_sum_notnull 12543 non-null float64
13 in_hotel_num 12543 non-null float64
14 type 15854 non-null object
15 DATE 15854 non-null object
16 in_year 15854 non-null int64
17 in_month 15854 non-null int64
18 in_day 15854 non-null int64
19 in_quarter 15854 non-null int64
20 in_dayofweek 15854 non-null int64
21 in_dayofyear 15854 non-null int64
22 in_weekofyear 15854 non-null int64
23 in_is_wknd 15854 non-null int64
24 holiday 15854 non-null int64
25 JYMJ 2119 non-null float64
26 ROOM_NUM 12543 non-null float64
27 BED_NUM 12543 non-null float64
28 CZLY 12543 non-null float64
29 CALLED_freq 12543 non-null float64
30 ADDRESS_freq 12543 non-null float64
31 JYMJ_freq 2119 non-null float64
32 ROOM_NUM_freq 12543 non-null float64
33 BED_NUM_freq 12543 non-null float64
34 CZLY_freq 12543 non-null float64
35 BUR_CODE_freq 7314 non-null float64
36 STA_CODE_freq 7314 non-null float64
37 SSX_freq 12543 non-null float64
38 MPHM_freq 12543 non-null float64
39 FIRM_freq 12543 non-null float64
40 room_ratio 2119 non-null float64
41 bed_ratio 2119 non-null float64
42 room_bed 12543 non-null float64
43 DJ_date 12527 non-null object
44 BG_date 12543 non-null object
45 DJ_gap 12527 non-null float64
46 BG_year 12543 non-null float64
47 BG_month 12543 non-null float64
48 BG_day 12543 non-null float64
49 BG_quarter 12543 non-null float64
50 BG_dayofweek 12543 non-null float64
51 BG_dayofyear 12543 non-null float64
52 yesterday_qty 15853 non-null float64
53 last_2_qty 15852 non-null float64
54 last_3_qty 15843 non-null float64
55 last_4_qty 15842 non-null float64
56 last_52_qty 12226 non-null float64
57 last_1_qty 15853 non-null float64
58 last_5_qty 15841 non-null float64
59 last_6_qty 15840 non-null float64
60 last_7_qty 15839 non-null float64
61 last_8_qty 15838 non-null float64
62 last_10_qty 15825 non-null float64
63 last_12_qty 15823 non-null float64
64 last_15_qty 15820 non-null float64
65 last_20_qty 15770 non-null float64
66 qty_rolling3_mean 6053 non-null float64
67 qty_rolling3_max 6053 non-null float64
68 qty_rolling3_sum 6053 non-null float64
69 qty_rolling4_mean 6393 non-null float64
70 qty_rolling4_max 6393 non-null float64
71 qty_rolling4_sum 6393 non-null float64
72 qty_rolling6_mean 7073 non-null float64
73 qty_rolling6_max 7073 non-null float64
74 qty_rolling6_sum 7073 non-null float64
75 qty_ewm_mean 12381 non-null float64
76 qty_ewm_std 9928 non-null float64
77 qty_ewm_corr 1624 non-null float64
dtypes: float64(56), int64(9), object(13)
memory usage: 9.6+ MB
df.select_dtypes(include='object').columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'INSERT_TIME',
'MODIFY_TIME', 'GUEST_ID', 'BDATE', 'XZQH', 'type', 'DATE', 'DJ_date',
'BG_date'],
dtype='object')
可以预测入住量
# df['ORDER_PRIMARY_ID'] = df['ORDER_PRIMARY_ID'].astype('category')
# df['ORDER_ID'] = df['ORDER_ID'].astype('category')
# df['HOTELID'] = df['HOTELID'].astype('category')
# df['ORDER_TIME'] = df['ORDER_TIME'].astype('category')
# df['INSERT_TIME'] = df['INSERT_TIME'].astype('category')
# df['MODIFY_TIME'] = df['MODIFY_TIME'].astype('category')
# df['GUEST_ID'] = df['GUEST_ID'].astype('category')
# df['BDATE'] = df['BDATE'].astype('category')
# df['XZQH'] = df['XZQH'].astype('category')
# df['DATE'] = df['DATE'].astype('category')
# df['DJ_date'] = df['DJ_date'].astype('category')
# df['BG_date'] = df['BG_date'].astype('category')
col_list = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
col_list.remove('in_hotel_num')
# col_list.remove('ORDER_ID')
# col_list.remove('date_start')
# col_list.remove('unit')
# col_list.remove('geography')
# col_list.remove('product')
# col_list.remove('ts')
# col_list.remove('monthofyear')
# col_list.remove('last_monthofyear')
col_list
['STATUS',
'FIRM',
'guest_sum',
'guest_sum_notnull',
'in_year',
'in_month',
'in_day',
'in_quarter',
'in_dayofweek',
'in_dayofyear',
'in_weekofyear',
'in_is_wknd',
'holiday',
'JYMJ',
'ROOM_NUM',
'BED_NUM',
'CZLY',
'CALLED_freq',
'ADDRESS_freq',
'JYMJ_freq',
'ROOM_NUM_freq',
'BED_NUM_freq',
'CZLY_freq',
'BUR_CODE_freq',
'STA_CODE_freq',
'SSX_freq',
'MPHM_freq',
'FIRM_freq',
'room_ratio',
'bed_ratio',
'room_bed',
'DJ_gap',
'BG_year',
'BG_month',
'BG_day',
'BG_quarter',
'BG_dayofweek',
'BG_dayofyear',
'yesterday_qty',
'last_2_qty',
'last_3_qty',
'last_4_qty',
'last_52_qty',
'last_1_qty',
'last_5_qty',
'last_6_qty',
'last_7_qty',
'last_8_qty',
'last_10_qty',
'last_12_qty',
'last_15_qty',
'last_20_qty',
'qty_rolling3_mean',
'qty_rolling3_max',
'qty_rolling3_sum',
'qty_rolling4_mean',
'qty_rolling4_max',
'qty_rolling4_sum',
'qty_rolling6_mean',
'qty_rolling6_max',
'qty_rolling6_sum',
'qty_ewm_mean',
'qty_ewm_std',
'qty_ewm_corr']
used_features = col_list
cate_cols = ['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'INSERT_TIME',
'MODIFY_TIME', 'GUEST_ID', 'BDATE', 'XZQH', 'DATE', 'DJ_date', 'BG_date']
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import gc
cate_cols = []
drop_col = ['in_hotel_num', 'type']
train = df[df['type'] == 'train']
labels = np.array(train['in_hotel_num'].values.tolist())
train.drop(drop_col, axis=1, inplace=True)
train = train[used_features]
test = df[df['type'] == 'test']
test_label = test['in_hotel_num'].values.tolist()
test.drop(drop_col, axis=1, inplace=True)
test = test[used_features]
# used_features = importance_fea
ts_folds = TimeSeriesSplit(n_splits = 5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'
params = {
'objective': 'regression',
'boosting': 'gbdt',
'learning_rate': 0.001,
'num_leaves': 2 ** 5,
'bagging_fraction': 0.95,
'bagging_freq': 1,
'bagging_seed': 66,
'feature_fraction': 0.7,
'feature_fraction_seed': 66,
'max_bin': 100,
'max_depth': 10,
'metric': {'l2', 'l1'},
'verbose': -1
}
for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
if fold_n in [0, 1, 2, 3]:
continue
print('Training with validation')
trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
categorical_feature=cate_cols)
val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
categorical_feature=cate_cols)
clf = lgb.train(params, trn_data, num_boost_round=N_round, valid_sets=[trn_data, val_data], verbose_eval=Verbose,
early_stopping_rounds=Early_Stopping_Rounds)
val = clf.predict(train.iloc[valid_index])
mae_ = mean_absolute_error(labels[valid_index], val)
print('MAE: {}'.format(mae_))
print("ReTraining on all data")
gc.enable()
del trn_data, val_data
gc.collect()
Best_iteration = clf.best_iteration
print("Best_iteration: ", Best_iteration)
trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)
clf = lgb.train(params, trn_data, num_boost_round=int(Best_iteration * 1.2))
#valid_sets=[trn_data], verbose_eval=Verbose)
#pred = clf.predict(test[used_features])
Training with validation
Training until validation scores don't improve for 100 rounds
[500] training's l2: 0.289185 training's l1: 0.194349 valid_1's l2: 0.483504 valid_1's l1: 0.352138
[1000] training's l2: 0.15383 training's l1: 0.140402 valid_1's l2: 0.337205 valid_1's l1: 0.291079
[1500] training's l2: 0.100852 training's l1: 0.107452 valid_1's l2: 0.27767 valid_1's l1: 0.254093
[2000] training's l2: 0.0783474 training's l1: 0.087144 valid_1's l2: 0.251739 valid_1's l1: 0.231417
[2500] training's l2: 0.0675106 training's l1: 0.0745483 valid_1's l2: 0.242417 valid_1's l1: 0.217438
[3000] training's l2: 0.0613521 training's l1: 0.0667518 valid_1's l2: 0.241078 valid_1's l1: 0.209322
Early stopping, best iteration is:
[2923] training's l2: 0.0621631 training's l1: 0.0677411 valid_1's l2: 0.240825 valid_1's l1: 0.210172
MAE: 0.21017189645174383
ReTraining on all data
Best_iteration: 2923
pred = clf.predict(test)
# mae_test = mean_absolute_error(test_label, pred)
# mse_test = mean_squared_error(test_label, pred)
# print('测试集MAE: {}'.format(mae_test))
# print('测试集MSE: {}'.format(mse_test))
pred.min()
1.0238228580778739
0-撤销;1-预定;2-入住;3-离开