【无标题】房价的预测

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import warnings
import seaborn as sns
import scipy.stats as st
import gc

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('max_colwidth', 100)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
path = 'traina/'  
path1 = 'trainb/'
cust_info = pd.read_csv(path + '网约房平台入住人表.csv')
cust_info1 = pd.read_csv(path1 + '网约房平台入住人表.csv')
cust = pd.concat([cust_info, cust_info1])
cust.head(2)
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIME
007C5BF73B18B44B0877DEED007F8771DNaN19800627222405NaNNaN
13525D57CAE104A078E4962B2B89377B0371099C301202107090001199510253705232.021071e+112.021071e+11
gc.collect()
133
cust['BDATE'] = cust['BDATE'].astype('str')cust['XZQH'] = cust['XZQH'].astype('str')cust['IN_TIME'] = cust['IN_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])cust['OUT_TIME'] = cust['OUT_TIME'].fillna(0).astype('str').apply(lambda x: x[:12])
cust.head(3)
ORDER_PRIMARY_IDGUEST_IDBDATEXZQHIN_TIMEOUT_TIME
007C5BF73B18B44B0877DEED007F8771DNaN198006272224050.00.0
13525D57CAE104A078E4962B2B89377B0371099C30120210709000119951025370523202107092103202107111158
2A1414E7D71E34E1DA743AC08BCF49151371002182220200824000119860311370781202008241401202008251200
cust['in_hour'] = cust['IN_TIME'].apply(lambda x:x[-4:-2])cust['in_minute'] = cust['IN_TIME'].apply(lambda x:x[-2:])cust['in_time'] = cust['IN_TIME'].apply(lambda x:x[:8])cust['out_hour'] = cust['OUT_TIME'].apply(lambda x:x[-4:-2])cust['out_minute'] = cust['OUT_TIME'].apply(lambda x:x[-2:])cust['out_time'] = cust['OUT_TIME'].apply(lambda x:x[:8])
cust['in_datetime'] = pd.to_datetime(cust['in_time'], errors='coerce')cust['out_datetime'] = pd.to_datetime(cust['out_time'], errors='coerce')

cust.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 44030 entries, 0 to 3343Data columns (total 14 columns): #   Column            Non-Null Count  Dtype         ---  ------            --------------  -----          0   ORDER_PRIMARY_ID  44030 non-null  object         1   GUEST_ID          5158 non-null   object         2   BDATE             44030 non-null  object         3   XZQH              44030 non-null  object         4   IN_TIME           44030 non-null  object         5   OUT_TIME          44030 non-null  object         6   in_hour           44030 non-null  object         7   in_minute         44030 non-null  object         8   in_time           44030 non-null  object         9   out_hour          44030 non-null  object         10  out_minute        44030 non-null  object         11  out_time          44030 non-null  object         12  in_datetime       4933 non-null   datetime64[ns] 13  out_datetime      649 non-null    datetime64[ns]dtypes: datetime64[ns](2), object(12)memory usage: 5.0+ MB
cust.nunique()
ORDER_PRIMARY_ID    43883GUEST_ID             3698BDATE                9503XZQH                 2659IN_TIME              3396OUT_TIME              236in_hour                24in_minute              61in_time               264out_hour               15out_minute             25out_time              212in_datetime           263out_datetime          211dtype: int64
#前4位到市,前6位到县# cust_info['XZ_4'] = cust_info['XZQH'].apply(lambda x:x//100)
# cust_info['birth_date'] = pd.to_datetime(cust_info['BDATE'].astype(str), errors='coerce')
# cust_info['birth_year'] = cust_info['birth_date'].dt.year.astype(int)# cust_info['birth_year'] = cust_info['birth_date'].dt.month.astype(int)# cust_info['birth_year'] = cust_info['birth_date'].dt.day.astype(int)# cust_info['birth_year'] = cust_info['birth_date'].dt.quarter.astype(int)#cust_info['birth_year'] = pd['birth_date'].dt.years#cust_info['birth_year'] = pd['birth_date'].dt.years
# cust_info['in_time_year'] = cust_info['IN_TIME'].apply(lambda x: x//10**8)
# cust_info['in_time_month'] = cust_info['IN_TIME'].apply(lambda x: x//10**6 - (x//10**8)*10**2)
# cust_info['in_time_day'] = cust_info['IN_TIME'].apply(lambda x: x//10**4 - (x//10**6)*10**2)
# cust_info['in_time_hour'] = cust_info['IN_TIME'].apply(lambda x: x//10**2 - (x//10**4)*10**2)
# cust_info['in_time_minute'] = cust_info['IN_TIME'].apply(lambda x: x - (x//10**2)*10**2)
# cust_info['out_time_year'] = cust_info['OUT_TIME'].apply(lambda x: x//10**8)# cust_info['out_time_month'] = cust_info['OUT_TIME'].apply(lambda x: x//10**6 - (x//10**8)*10**2)# cust_info['out_time_day'] = cust_info['OUT_TIME'].apply(lambda x: x//10**4 - (x//10**6)*10**2)# cust_info['out_time_hour'] = cust_info['OUT_TIME'].apply(lambda x: x//10**2 - (x//10**4)*10**2)# cust_info['out_time_minute'] = cust_info['OUT_TIME'].apply(lambda x: x - (x//10**2)*10**2)
# cust_info.head()
# cust_info['check_in_day'] = (cust_info['out_time_year'] - cust_info['in_time_year'])*365 + (cust_info['out_time_month'] - cust_info['in_time_month'])*30 + (cust_info['out_time_day'] - cust_info['in_time_day'])
# cust_info['check_in_hour'] = (cust_info['out_time_year'] - cust_info['in_time_year'])*365*30*24 + (cust_info['out_time_month'] - cust_info['in_time_month'])*30*24 + (cust_info['out_time_day'] - cust_info['in_time_day'])*24 + (cust_info['out_time_hour'] - cust_info['in_time_hour'])
# cust_info['check_in_hour'] = cust_info['check_in_hour'].apply(lambda x: x if (x == np.nan and x > 0) else x+24)
# cust_info['check_in_day'] = cust_info['check_in_day'].apply(lambda x: x if (x == np.nan) else x+1)
# cust_info.head()
# cust_info_1 = cust_info.groupby(['in_time_year', 'in_time_month'])['ORDER_PRIMARY_ID']
# 各个月份订单数量# cust_info_1.count()
cust['IN_TIME'].max()
'202110041413'
#cust_info['in_time'] = pd.to_datetime(map(cust_info['IN_TIME'].astype(int), str), errors='coerce')
# cust_info['IN_TIME'][2]



room_info = pd.read_csv(path + '网约房注册民宿.csv')
room_info.head(2)
CODEHOTELIDJYMJROOM_NUMBED_NUMFWLYCZLYCALLEDCITY_CODEBUR_CODESTA_CODESSXADDRESSMPHMJYQKFIRMDJSJBGSJSTATUSAUDITSTATUS
010017737108300020.01121【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽371000000000NaNNaN371083乳山市长江路 银泰海景花园 55-301ROOM001【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽32020-05-18 10:33:552020-07-15 10:23:58NaNNaN
110018137108300030.05721【山海边公寓】瞻星伴月阁371000000000NaNNaN371083乳山市银滩 檀香丽湾小区2-4-603ROOM001【山海边公寓】瞻星伴月阁32020-05-18 10:33:552020-07-15 10:23:58NaNNaN
room_info[(room_info['CALLED']==room_info['JYQK'])==False]
CODEHOTELIDJYMJROOM_NUMBED_NUMFWLYCZLYCALLEDCITY_CODEBUR_CODESTA_CODESSXADDRESSMPHMJYQKFIRMDJSJBGSJSTATUSAUDITSTATUS
room_info['CALLED'].nunique()
4232
room_info['CALLED'].value_counts()[:50]
高铁北站看海公寓                                                       24韩乐坊/不夜城美食街/乐天世纪城 豪华大床房 景驰公寓                                    20国际海水浴场精致海景大床房                                                  12高铁北站看海民宿                                                       10九龙湾豪华一线海景loft复式                                                10超大山景海景房、近威海北站、小石岛赶海公园、国际海水浴场、威高海洋馆、景区中心                         9【海居优选】海景房 威海北站海洋公园店 可住2~4人,知名经典云集。                              8海之约民宿                                                           7威海北站民宿                                                          7【朵笆】纵览山海/山东大学/海水浴场/密码开门/落地大窗/望海一居室                              7豪华海景家庭四人间                                                       7观海一居室                                                           7简约风格,观山看海,近高铁站。                                                 7【海居优选】山景房,高铁北站海水浴场店,可住3人,景点云集。                                  7【朵笆】紧临山东大学/海水浴场/落地飘窗/密码锁/实木大床 榻榻米一居                             6【温莎】观海大床房近/山大/国际海水浴场/金海滩海水浴场/小石岛/高铁北站/威高海洋馆/刘公岛/幸福门/东城夜市        6【遇见?AMOR】国际海水浴场/山东大学/温馨大床房                                      5简约风格,观山看海,近高铁站                                                  5【暖居】国际海水浴场 海信海景壹号公寓 赶海 大白帆 火炬八街 林海公园 距海滩6米 情侣海景房 可做饭 亲子家庭房      5康业现代城优质房源                                                       5【捷诚民宿】简约风格,观山看海,近高铁站。                                           5【温莎】温馨一室亲子房房近/山大/国际海水浴场/金海滩海水浴场/小石岛/高铁北站/威高海洋馆/刘公岛/幸福门/东城夜市     4【橙途】威海站/华夏城/舒适一居                                                4【已消毒】(地暖开放)月租环翠楼市中心智能门锁 小清新公寓 免费停车场                             4海之星温馨家庭复式公寓                                                     4荣成市那香海/成山头/鸡鸣岛/西霞口野生动物园港西陌野民宿                                   4北站温馨民宿,简约风。                                                     4威海寨子大润发 北欧风格 全景落地窗大床公寓 免费停车场                                    4【橙途】威高广场/幸福门/临海时尚一居                                             4一线观海大床房                                                         4威海浩然日租房普通大床                                                     4威海市中心两居可做饭,近幸福门/刘公岛/海水浴场/韩国城/滨海公园,新鲜海鲜,白马南居懂你的舒适生活家             4一线观海简约整洁单间,内有独立卫生间                                              4国际海水浴场一线观海大床房(已消毒)                                              3韩乐坊不夜城精装公寓                                                      3国际海水浴场两卧室四人套房                                                   3山海相依复式民宿/近海水浴场/日式清新风格/可住4人                                      3孙家疃看海公寓,近沙滩 家庭房                                                 3威海浩然日租房单人房                                                      3一线海景房                                                           3海里的民宿                                                           3国际海景城 观海阳台 一线海景 空调双床可做饭                                         3韩乐坊豪华舒适大床房                                                      3外滩临海市房屋,出门十分钟到达葡萄滩。交通便利,风景优美。当地人热情好客                            3威海九龙湾海边齐鲁商城南田园小院民宿客栈                                            3【捷诚民宿】简约风格,观山看海,近高铁站                                            3【已消毒】寨子大润发 智能门锁 投影浪漫大床公寓 免费停车场                                  3悦客之家~欢迎你!                                                       3荣成自在澜湾海景房大床房可以做饭观海                                              3九龙湾豪华一线海景Loft公寓                                                 3Name: CALLED, dtype: int64
room_info['HOTELID'].nunique()
5307
room_info['CODE'].nunique()
5307
room_info.info()
<class 'pandas.core.frame.DataFrame'>RangeIndex: 5307 entries, 0 to 5306Data columns (total 20 columns): #   Column       Non-Null Count  Dtype  ---  ------       --------------  -----   0   CODE         5307 non-null   object  1   HOTELID      5307 non-null   object  2   JYMJ         5307 non-null   float64 3   ROOM_NUM     5307 non-null   int64   4   BED_NUM      5307 non-null   int64   5   FWLY         5307 non-null   int64   6   CZLY         5307 non-null   int64   7   CALLED       5307 non-null   object  8   CITY_CODE    5307 non-null   int64   9   BUR_CODE     2658 non-null   float64 10  STA_CODE     2658 non-null   object  11  SSX          5307 non-null   int64   12  ADDRESS      5307 non-null   object  13  MPHM         5307 non-null   object  14  JYQK         5307 non-null   object  15  FIRM         5307 non-null   int64   16  DJSJ         5307 non-null   object  17  BGSJ         5307 non-null   object  18  STATUS       2658 non-null   float64 19  AUDITSTATUS  2658 non-null   float64dtypes: float64(4), int64(7), object(9)memory usage: 829.3+ KB
room_info.nunique()
CODE           5307HOTELID        5307JYMJ            173ROOM_NUM         11BED_NUM          17FWLY              1CZLY              4CALLED         4232CITY_CODE         1BUR_CODE          7STA_CODE         54SSX               5ADDRESS        4850MPHM            584JYQK           4232FIRM              2DJSJ           3753BGSJ           3294STATUS            1AUDITSTATUS       1dtype: int64
room_info.drop(['FWLY', 'CITY_CODE', 'STATUS', 'AUDITSTATUS', 'JYQK'], axis=1, inplace=True)
room_info['FIRM'].value_counts()
3     342910    1878Name: FIRM, dtype: int64


# platform infomationorder_info = pd.read_csv(path + '网约平台旅客订单信息.csv') order1 = pd.read_csv(path1 + '网约平台旅客订单信息.csv')order = pd.concat([order_info, order1])
order.head()
ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
092352196BBDB7CC049421C85826AE07020B1392783372020080112002020080212001.596120e+111NaN20200730224152202007302241523
1923696C72F20539AD1447D86CD1A8E5EAEC63A2829322020080414002020080612001.596121e+111NaN20200730225524202007302255243
2706648A4AACE06B518418C8A8CA1935DDD1C5A2271852020071513002020071612002.020071e+111NaN20200714171021202007141710213
374864767D551AACFD049AE9CC2AB65E987067894832020100114002020100210112.020071e+111NaN20201002101040202010021010403
475745543F69E6DFE004103BC4741C355D2A7932758342020071514002020071612002.020071e+112NaN20200714181929202007141819293
order['PRE_IN_TIME'] = order['PRE_IN_TIME'].astype('str')order['PRE_OUT_TIME'] = order['PRE_OUT_TIME'].astype('str')order['ORDER_TIME'] = order['ORDER_TIME'].astype('str')order['INSERT_TIME'] = order['INSERT_TIME'].astype('str')order['MODIFY_TIME'] = order['MODIFY_TIME'].astype('str')order['CANCEL_TIME'] = order['CANCEL_TIME'].astype('str')
# order[~(order['CANCEL_TIME']=='nan')][:10]
order = order[(order['CANCEL_TIME']=='nan')]

order.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 39901 entries, 0 to 3332Data columns (total 11 columns): #   Column            Non-Null Count  Dtype ---  ------            --------------  -----  0   ORDER_ID          39901 non-null  int64  1   ORDER_PRIMARY_ID  39901 non-null  object 2   HOTELID           39901 non-null  object 3   PRE_IN_TIME       39901 non-null  object 4   PRE_OUT_TIME      39901 non-null  object 5   ORDER_TIME        39901 non-null  object 6   STATUS            39901 non-null  int64  7   CANCEL_TIME       39901 non-null  object 8   INSERT_TIME       39901 non-null  object 9   MODIFY_TIME       39901 non-null  object 10  FIRM              39901 non-null  int64 dtypes: int64(3), object(8)memory usage: 3.7+ MB
order.nunique()
ORDER_ID            29941ORDER_PRIMARY_ID    39901HOTELID              4801PRE_IN_TIME          6932PRE_OUT_TIME          741ORDER_TIME          26892STATUS                  3CANCEL_TIME             1INSERT_TIME         35319MODIFY_TIME         35319FIRM                    2dtype: int64
order_info['FIRM'].value_counts()
3     3202910     8521Name: FIRM, dtype: int64
# del order_info['FIRM']
order['STATUS'].value_counts()
1    344592     48333      609Name: STATUS, dtype: int64
#插入时间#order_info
#修改时间
#插入时间不等于修改时间,就是进行过修改过的订单,最后发现为空,等于没有修改过order[order['INSERT_TIME']!=order['MODIFY_TIME']]
ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRM
#订单时间# order_info['ORDER_TIME'][2]
# order_info_cancel = order_info[~order_info['CANCEL_TIME'].isnull()]
# order_info_cancel.head()
# order_info_cancel.shape
# order_info_cancel[order_info_cancel['CANCEL_TIME'] < order_info_cancel['PRE_IN_TIME']].shape
# cancel_index = order_info_cancel[order_info_cancel['CANCEL_TIME'] < order_info_cancel['PRE_IN_TIME']].index.tolist()
# order_info = order_info[~order_info.index.isin(cancel_index)]
# order_info.reset_index(inplace=True)
# order_info = order_info.drop(columns='index')
# order_info.tail()

合并数据

# order时间包含了cust时间df = pd.merge(order, cust, on='ORDER_PRIMARY_ID')
df.head()
ORDER_IDORDER_PRIMARY_IDHOTELIDPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEin_hourin_minutein_timeout_hourout_minuteout_timein_datetimeout_datetime
092352196BBDB7CC049421C85826AE07020B139278337202008011200202008021200159612011149.01nan20200730224152202007302241523NaN199308043208210.00.00.00.00.00.0NaTNaT
1923696C72F20539AD1447D86CD1A8E5EAEC63A282932202008041400202008061200159612092325.01nan20200730225524202007302255243NaN199706143703020.00.00.00.00.00.0NaTNaT
2706648A4AACE06B518418C8A8CA1935DDD1C5A227185202007151300202007161200202007092241.01nan20200714171021202007141710213NaN200104093710020.00.00.00.00.00.0NaTNaT
374864767D551AACFD049AE9CC2AB65E98706789483202010011400202010021011202007132109.01nan20201002101040202010021010403NaN198810233412020.00.00.00.00.00.0NaTNaT
475745543F69E6DFE004103BC4741C355D2A793275834202007151400202007161200202007141819.02nan202007141819292020071418192933710021366202007150001199202163707840.00.00.00.00.00.0NaTNaT
df.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 40043 entries, 0 to 40042Data columns (total 24 columns): #   Column            Non-Null Count  Dtype         ---  ------            --------------  -----          0   ORDER_ID          40043 non-null  int64          1   ORDER_PRIMARY_ID  40043 non-null  object         2   HOTELID           40043 non-null  object         3   PRE_IN_TIME       40043 non-null  object         4   PRE_OUT_TIME      40043 non-null  object         5   ORDER_TIME        40043 non-null  object         6   STATUS            40043 non-null  int64          7   CANCEL_TIME       40043 non-null  object         8   INSERT_TIME       40043 non-null  object         9   MODIFY_TIME       40043 non-null  object         10  FIRM              40043 non-null  int64          11  GUEST_ID          5131 non-null   object         12  BDATE             40043 non-null  object         13  XZQH              40043 non-null  object         14  IN_TIME           40043 non-null  object         15  OUT_TIME          40043 non-null  object         16  in_hour           40043 non-null  object         17  in_minute         40043 non-null  object         18  in_time           40043 non-null  object         19  out_hour          40043 non-null  object         20  out_minute        40043 non-null  object         21  out_time          40043 non-null  object         22  in_datetime       4913 non-null   datetime64[ns] 23  out_datetime      644 non-null    datetime64[ns]dtypes: datetime64[ns](2), int64(3), object(19)memory usage: 7.6+ MB
df.rename(columns={'HOTELID':'CODE'}, inplace=True)
df = pd.merge(df, room_info, on=['CODE', 'FIRM'])
df.head(2)
ORDER_IDORDER_PRIMARY_IDCODEPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMESTATUSCANCEL_TIMEINSERT_TIMEMODIFY_TIMEFIRMGUEST_IDBDATEXZQHIN_TIMEOUT_TIMEin_hourin_minutein_timeout_hourout_minuteout_timein_datetimeout_datetimeHOTELIDJYMJROOM_NUMBED_NUMCZLYCALLEDBUR_CODESTA_CODESSXADDRESSMPHMDJSJBGSJ
092352196BBDB7CC049421C85826AE07020B139278337202008011200202008021200159612011149.01nan20200730224152202007302241523NaN199308043208210.00.00.00.00.00.0NaTNaT37100A16200.0121金海滩临海落地窗公寓,北欧式装修NaNNaN371002环翠区高区金海滩街道新浪屿花园8号楼110411042020-07-06 17:59:352020-07-15 10:24:21
17612998923C26CC35F42C9A7C0A08BCC22179C278337202007221200202007251200202007150017.01nan20200715001752202007150017523NaN198106281201010.00.00.00.00.00.0NaTNaT37100A16200.0121金海滩临海落地窗公寓,北欧式装修NaNNaN371002环翠区高区金海滩街道新浪屿花园8号楼110411042020-07-06 17:59:352020-07-15 10:24:21
df.info()
<class 'pandas.core.frame.DataFrame'>Int64Index: 40043 entries, 0 to 40042Data columns (total 37 columns): #   Column            Non-Null Count  Dtype         ---  ------            --------------  -----          0   ORDER_ID          40043 non-null  int64          1   ORDER_PRIMARY_ID  40043 non-null  object         2   CODE              40043 non-null  object         3   PRE_IN_TIME       40043 non-null  object         4   PRE_OUT_TIME      40043 non-null  object         5   ORDER_TIME        40043 non-null  object         6   STATUS            40043 non-null  int64          7   CANCEL_TIME       40043 non-null  object         8   INSERT_TIME       40043 non-null  object         9   MODIFY_TIME       40043 non-null  object         10  FIRM              40043 non-null  int64          11  GUEST_ID          5131 non-null   object         12  BDATE             40043 non-null  object         13  XZQH              40043 non-null  object         14  IN_TIME           40043 non-null  object         15  OUT_TIME          40043 non-null  object         16  in_hour           40043 non-null  object         17  in_minute         40043 non-null  object         18  in_time           40043 non-null  object         19  out_hour          40043 non-null  object         20  out_minute        40043 non-null  object         21  out_time          40043 non-null  object         22  in_datetime       4913 non-null   datetime64[ns] 23  out_datetime      644 non-null    datetime64[ns] 24  HOTELID           40043 non-null  object         25  JYMJ              40043 non-null  float64        26  ROOM_NUM          40043 non-null  int64          27  BED_NUM           40043 non-null  int64          28  CZLY              40043 non-null  int64          29  CALLED            40043 non-null  object         30  BUR_CODE          20204 non-null  float64        31  STA_CODE          20204 non-null  object         32  SSX               40043 non-null  int64          33  ADDRESS           40043 non-null  object         34  MPHM              40043 non-null  object         35  DJSJ              40043 non-null  object         36  BGSJ              40043 non-null  object        dtypes: datetime64[ns](2), float64(2), int64(7), object(26)memory usage: 11.6+ MB
df.nunique()
ORDER_ID            29941ORDER_PRIMARY_ID    39901CODE                 4801PRE_IN_TIME          6932PRE_OUT_TIME          741ORDER_TIME          26892STATUS                  3CANCEL_TIME             1INSERT_TIME         35319MODIFY_TIME         35319FIRM                    2GUEST_ID             3675BDATE                9064XZQH                 2578IN_TIME              3380OUT_TIME              235in_hour                24in_minute              61in_time               264out_hour               15out_minute             25out_time              211in_datetime           263out_datetime          210HOTELID              4801JYMJ                  173ROOM_NUM               11BED_NUM                17CZLY                    4CALLED               3813BUR_CODE                7STA_CODE               53SSX                     5ADDRESS              4377MPHM                  534DJSJ                 3363BGSJ                 2871dtype: int64
df.columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'CODE', 'PRE_IN_TIME', 'PRE_OUT_TIME',       'ORDER_TIME', 'STATUS', 'CANCEL_TIME', 'INSERT_TIME', 'MODIFY_TIME',       'FIRM', 'GUEST_ID', 'BDATE', 'XZQH', 'IN_TIME', 'OUT_TIME', 'in_hour',       'in_minute', 'in_time', 'out_hour', 'out_minute', 'out_time',       'in_datetime', 'out_datetime', 'HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM',       'CZLY', 'CALLED', 'BUR_CODE', 'STA_CODE', 'SSX', 'ADDRESS', 'MPHM',       'DJSJ', 'BGSJ'],      dtype='object')

删除HOTELID,CANCEL_TIME

df1 = df[['ORDER_ID', 'CODE', 'PRE_IN_TIME', 'PRE_OUT_TIME',       'ORDER_TIME', 'INSERT_TIME', 'MODIFY_TIME', 'IN_TIME', 'OUT_TIME', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'STATUS']]

0-撤销;1-预定;2-入住;3-离开

df1['STATUS'].value_counts()
1    344612     49213      661Name: STATUS, dtype: int64
df1.head(50)
ORDER_IDCODEPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMEINSERT_TIMEMODIFY_TIMEIN_TIMEOUT_TIMEJYMJROOM_NUMBED_NUMSTATUS
0923521278337202008011200202008021200159612011149.020200730224152202007302241520.00.00.0121
1761299278337202007221200202007251200202007150017.020200715001752202007150017520.00.00.0121
2943703278337202008091200202008121200202008012010.020200801201019202008012010190.00.00.0121
31012389278337202008141200202008171200202008080900.020200808090100202008080901000.00.00.0121
41351873278337202010051200202010061200202009091245.020200909124547202009091245470.00.00.0121
51481730278337202010051200202010071200202009221039.020200922103911202009221039112020100516242020100712000.0123
61516263278337202010011200202010041200202009251502.020200925150231202009251502310.00.00.0121
71539785278337202010041200202010051200202009271942.020200927194258202009271942580.00.00.0121
81568043278337202010041200202010051200202009301837.020200930183801202009301838010.00.00.0121
91830343278337202010311200202011011200202010301016.020201031120256202010311202560.00.00.0121
101830343278337202010311200202011011200202010301016.020201030101655202010301016550.00.00.0121
111836522278337202010301200202010311200202010301933.020201030193317202010301933170.00.00.0121
121850449278337202011011200202011021200202011010951.020201101121128202011011211280.00.00.0121
131850449278337202011011200202011021200202011010951.020201101095111202011010951110.00.00.0121
142035663278337202011210022202011211200202011210022.020201121002216202011210022160.00.00.0121
152363298278337202101021200202101041200202012230028.020210102120800202101021208000.00.00.0121
162363298278337202101021200202101041200202012230028.020201223002859202012230028590.00.00.0121
172465555278337202101021200202101031200202101011632.020210101163250202101011632500.00.00.0121
182465555278337202101021200202101031200202101011632.020210102120701202101021207010.00.00.0121
193356024278337202104291158202105051158202104121231.020210429120002202104291200020.00.00.0121
203576969278337202105061158202105091158202104291925.020210506120002202105061200020.00.00.0121
213599404278337202105051158202105061158202105020724.020210505120008202105051200080.00.00.0121
223599404278337202105051158202105061158202105020724.020210502072638202105020726380.00.00.0121
233665000278337202106121158202106151158202105081926.020210508192825202105081928250.00.00.0121
243686275278337202105102004202105111158202105102004.020210510201002202105102010020.00.00.0121
253750952278337202105161158202105171158202105161150.020210516120228202105161202280.00.00.0121
263777571278337202105181904202105191158202105181904.020210518190621202105181906210.00.00.0121
273787561278337202105211158202105231158202105191621.020210519162332202105191623320.00.00.0121
283787561278337202105211158202105231158202105191621.020210521120006202105211200060.00.00.0121
293941122278337202106022128202106031158202106022128.020210602214001202106022140010.00.00.0121
303941122278337202106022128202106031158202106022128.020210602213051202106022130510.00.00.0121
313988599278337202106101158202106111158202106070140.020210610120016202106101200160.00.00.0121
323988599278337202106101158202106111158202106070140.020210607014246202106070142460.00.00.0121
334300717278337202107061158202107071158202107031110.020210706120003202107061200030.00.00.0121
344308464278337202107051158202107061158202107032053.020210703205553202107032055530.00.00.0121
354308464278337202107051158202107061158202107032053.020210705120003202107051200030.00.00.0121
36923696282932202008041400202008061200159612092325.020200730225524202007302255240.00.00.0231
371065945282932202008131400202008141200202008122227.020200812222725202008122227250.00.00.0231
381370082282932202009181400202009191200202009110933.020200911093350202009110933500.00.00.0231
391370328282932202009191400202009211200202009111005.020200911100510202009111005100.00.00.0231
401550691282932202010011400202010021200202009282156.020200928215621202009282156210.00.00.0231
411612645282932202010081400202010091200202010061132.020201006113229202010061132290.00.00.0231
424497012282932202107221358202107291158202107181945.020210718194727202107181947270.00.00.0231
434505766282932202107201358202107221158202107191521.020210719152358202107191523580.00.00.0231
444505766282932202107201358202107221158202107191521.020210720140022202107201400220.00.00.0231
45706648227185202007151300202007161200202007092241.020200714171021202007141710210.00.00.0111
46853709227185202007251300202007261200202007232243.020200723224348202007232243480.00.00.0111
471008589227185202008251300202008271200202008072052.020200807205244202008072052440.00.00.0111
481089505227185202008181300202008201200202008142213.020200814221323202008142213230.00.00.0111
491111054227185202008171300202008181200202008162352.020200816235228202008162352280.00.00.0111
df['CODE'].value_counts()[:10]
B106535897    420315790        278296093        222296537        187282520        177B105273690    142298705        124298707        122289326        116294473        116Name: CODE, dtype: int64
df['HOTELID'].value_counts()[:10]
37100A3821    420371098A808    27837100A9707    222371098B022    18737100A1898    17737100A3565    142371099A016    124371099A013    12237100A9501    11637100A8976    116Name: HOTELID, dtype: int64


df_g = df.groupby(['CODE', 'PRE_IN_TIME', 'PRE_OUT_TIME'])['ORDER_ID'].count().reset_index()
df_g.head()
CODEPRE_IN_TIMEPRE_OUT_TIMEORDER_ID
01001772021071113582021071211582
11001772021072413582021072611582
21001812021062713582021062911582
31001812021071913582021072011581
41001812021072213582021072311582

ORDER_ID有重复,MODIFY_TIME修改过,按照最后一次的修改确定时间

df1[df1['CODE']=='100177']
ORDER_IDCODEPRE_IN_TIMEPRE_OUT_TIMEORDER_TIMEINSERT_TIMEMODIFY_TIMEIN_TIMEOUT_TIMEJYMJROOM_NUMBED_NUMSTATUS
369374402846100177202107111358202107121158202107111032.020210711103458202107111034580.00.00.0111
369384402846100177202107111358202107121158202107111032.020210711140042202107111400420.00.00.0111
369394455098100177202107241358202107261158202107151446.020210724140006202107241400060.00.00.0111
369404455098100177202107241358202107261158202107151446.020210715144850202107151448500.00.00.0111
df1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 40043 entries, 0 to 40042
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ORDER_ID      40043 non-null  int64  
 1   CODE          40043 non-null  object 
 2   PRE_IN_TIME   40043 non-null  object 
 3   PRE_OUT_TIME  40043 non-null  object 
 4   ORDER_TIME    40043 non-null  object 
 5   INSERT_TIME   40043 non-null  object 
 6   MODIFY_TIME   40043 non-null  object 
 7   IN_TIME       40043 non-null  object 
 8   OUT_TIME      40043 non-null  object 
 9   JYMJ          40043 non-null  float64
 10  ROOM_NUM      40043 non-null  int64  
 11  BED_NUM       40043 non-null  int64  
 12  STATUS        40043 non-null  int64  
dtypes: float64(1), int64(4), object(8)
memory usage: 4.3+ MB












  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值