利用ipython notebook --numpy,pandas构建特征

工具:ipython notebook
数据来源:滴滴算法大赛
目标:统计预测时间的前三个时间片按照区域(district_id)、每分钟(time_piece)的需求(demand)、供应缺口(gap),及以10分钟为滑动窗口的统计值,再加上平均值、方差(多考虑了是否工作日)
亮点:一步代码,一步数据演示

import pandas as pd
import numpy as np
import gc
def trans_time_slice(time):
    """Return the time piece immediately preceding `time`.

    `time` has the form 'YYYY-MM-DD-n' where n is a 10-minute slice in 1..144.
    Slice n > 1 maps to slice n-1 of the same day; slice 1 maps to slice 144
    of the previous calendar day.

    Fixes two defects of the original string-arithmetic version:
    - day 11, slice 1 produced day '010' (the zero-pad condition was `> 11`
      instead of `> 10`);
    - day 1 of a month produced day '00' instead of rolling the month/year
      back (handled here via datetime subtraction).
    """
    from datetime import datetime, timedelta
    date_part, piece = time.rsplit('-', 1)
    piece = int(piece)
    if piece > 1:
        return '%s-%d' % (date_part, piece - 1)
    # Slice 1: step back one calendar day and wrap to the last slice (144).
    prev_day = datetime.strptime(date_part, '%Y-%m-%d') - timedelta(days=1)
    return prev_day.strftime('%Y-%m-%d') + '-144'
# Load the cleaned per-order records; header=None because the CSV carries no
# header row, so column names are supplied explicitly.
# driver_id is presumably null for unanswered orders -- TODO confirm against
# the cleaning step that produced this file.
order_data = pd.read_csv(
    './clear_data/add_minute_order_data.csv',
    header=None,
    names=['order_id', 'driver_id', 'passenger_id', 'start_district_id',
           'dest_district_id', 'price', 'time_piece', 'minute'],
    # `str` replaces the alias `np.str`, which was removed in NumPy 1.24.
    dtype={'order_id': str, 'driver_id': str, 'passenger_id': str,
           'start_district_id': np.uint8, 'dest_district_id': np.uint8,
           'price': np.float16, 'time_piece': str, 'minute': np.uint8})
# print() form works on both Python 2 and 3 (original used the py2 statement).
print(order_data[:1])
                           order_id                         driver_id  \
0  97ebd0c6680f7c0535dbfdead6e51b4b  dd65fa250fca2833a3a8c16d2cf0457c

                       passenger_id  start_district_id  dest_district_id  \
0  ed180d7daf639d936f1aeae4f7fb482f                 23                47

   price     time_piece  minute
0   24.0  2016-01-01-82       8
# Keep only time pieces that have all three preceding pieces, i.e. drop
# slices 1-3 of the first day (2016-01-01).
# NOTE: the original plain string comparison (time_piece > '2016-01-01-3')
# was wrong: lexicographically it also dropped slices 10-19 and 100-144 of
# 2016-01-01. Compare the date part and the numeric slice separately instead.
date_part = order_data['time_piece'].str[:10]
slice_part = order_data['time_piece'].str[11:].astype(int)
order_data = order_data[(date_part > '2016-01-01') | (slice_part > 3)]
# Per (district, time_piece, minute) counts: order_id counts every row
# (demand); driver_id counts only non-null entries -- presumably answered
# orders (supply), so gap = unanswered orders. Verify driver_id is truly
# null (not the string 'NULL') for unanswered orders.
# Double-bracket column selection: the old tuple form ['a','b'] after
# groupby is removed in modern pandas.
account_order_data_minute = order_data.groupby(
    ['start_district_id', 'time_piece', 'minute'],
    as_index=False)[['order_id', 'driver_id']].count()
account_order_data_minute = account_order_data_minute.rename(
    columns={'start_district_id': 'district_id',
             'order_id': 'demand', 'driver_id': 'supply'})
account_order_data_minute['gap'] = (account_order_data_minute['demand']
                                    - account_order_data_minute['supply'])
account_order_data_minute = account_order_data_minute.drop(['supply'], axis=1)
account_order_data_minute[:1]
   district_id     time_piece  minute  demand  gap
0            1  2016-01-01-30       1       1    0
# Base frame for the rolling statistics: one row per unique
# (district_id, time_piece) pair; every feature below is merged onto it.
data_roll_10 = (account_order_data_minute
                .loc[:, ['district_id', 'time_piece']]
                .drop_duplicates())
data_roll_10[:1]
   district_id     time_piece
0            1  2016-01-01-30
# Derive the three preceding time pieces: time_piece_1 is the piece right
# before time_piece, time_piece_2 the one before that, and so on.
# As is clear here, trans_time_slice is the function that maps a time piece
# to its immediate predecessor.
previous_col = 'time_piece'
for step in range(1, 4):
    current_col = 'time_piece_%d' % step
    data_roll_10[current_col] = data_roll_10[previous_col].apply(trans_time_slice)
    previous_col = current_col
data_roll_10[:1]
   district_id     time_piece   time_piece_1   time_piece_2   time_piece_3
0            1  2016-01-01-30  2016-01-01-29  2016-01-01-28  2016-01-01-27
# Attach per-minute demand/gap for the 30 minutes preceding the target piece
# (3 time pieces x 10 minutes each).

# Constant minute columns 1..10 serve as join keys for the merges below.
for m in range(1, 11):
    data_roll_10['minute_%d' % m] = m

# For each of the 30 minute positions, relabel the per-minute count table so
# its key columns line up with one (time_piece_k, minute_j) pair, then
# left-merge; missing minutes mean no orders, hence fillna(0).
for pos in range(30):
    piece_idx = pos // 10 + 1      # 1..3: which preceding time piece
    minute_idx = pos % 10 + 1      # 1..10: minute inside that piece
    account_order_data_minute.columns = [
        'district_id',
        'time_piece_%d' % piece_idx,
        'minute_%d' % minute_idx,
        'demand_%d_%d' % (piece_idx, minute_idx),
        'gap_%d_%d' % (piece_idx, minute_idx)]
    data_roll_10 = pd.merge(
        left=data_roll_10, right=account_order_data_minute, how='left',
        on=['district_id',
            'time_piece_%d' % piece_idx,
            'minute_%d' % minute_idx],
        sort=False).fillna(0)

# The minute_* and time_piece_* columns were only needed as join keys.
data_roll_10 = data_roll_10.drop(['minute_%d' % m for m in range(1, 11)], axis=1)
data_roll_10 = data_roll_10.drop(['time_piece_1', 'time_piece_2', 'time_piece_3'],
                                 axis=1)
data_roll_10[:3]
district_idtime_piecedemand_1_1gap_1_1demand_1_2gap_1_2demand_1_3gap_1_3demand_1_4gap_1_4demand_3_6gap_3_6demand_3_7gap_3_7demand_3_8gap_3_8demand_3_9gap_3_9demand_3_10gap_3_10
012016-01-01-300.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0
112016-01-01-311.00.04.04.00.00.04.01.00.00.00.00.00.00.00.00.00.00.0
212016-01-01-327.05.03.03.03.02.03.02.00.00.00.00.00.00.00.00.00.00.0

3 rows × 62 columns

# 10-minute sliding-window sums over the flattened 30-minute history.
# Minute positions are numbered 0..29; position p lives in column
# demand_{p//10+1}_{p%10+1} (and the matching gap_ column). Window w
# (1..21) sums positions w-1 .. w+8, replacing the original hand-advanced
# slice/minute counters with direct index arithmetic.
for w in range(1, 22):
    dsum_col = 'demand_sum10_%d' % w
    gsum_col = 'gap_sum10_%d' % w
    for k, pos in enumerate(range(w - 1, w + 9)):
        dcol = 'demand_%d_%d' % (pos // 10 + 1, pos % 10 + 1)
        gcol = 'gap_%d_%d' % (pos // 10 + 1, pos % 10 + 1)
        if k == 0:
            # First position seeds the window column.
            data_roll_10[dsum_col] = data_roll_10[dcol]
            data_roll_10[gsum_col] = data_roll_10[gcol]
        else:
            data_roll_10[dsum_col] += data_roll_10[dcol]
            data_roll_10[gsum_col] += data_roll_10[gcol]

# The raw per-minute columns were only inputs to the window sums.
for pos in range(30):
    piece_idx = pos // 10 + 1
    minute_idx = pos % 10 + 1
    data_roll_10 = data_roll_10.drop(
        ['demand_%d_%d' % (piece_idx, minute_idx),
         'gap_%d_%d' % (piece_idx, minute_idx)], axis=1)
data_roll_10[:1]
district_idtime_piecedemand_sum10_1gap_sum10_1demand_sum10_2gap_sum10_2demand_sum10_3gap_sum10_3demand_sum10_4gap_sum10_4demand_sum10_17gap_sum10_17demand_sum10_18gap_sum10_18demand_sum10_19gap_sum10_19demand_sum10_20gap_sum10_20demand_sum10_21gap_sum10_21
012016-01-01-300.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0

1 rows × 44 columns

# Add time-of-day slice and a working-day flag.
from datetime import datetime
# Slice number: the text after the last '-' of 'YYYY-MM-DD-slice'.
data_roll_10['slice'] = data_roll_10['time_piece'].apply(
    lambda x: x.rsplit('-', 1)[1])
# Weekday 1 (Mon) .. 7 (Sun) of the date part.
data_roll_10['week_info'] = data_roll_10['time_piece'].apply(
    lambda x: datetime.strptime(x.rsplit('-', 1)[0], '%Y-%m-%d').weekday() + 1)
# NOTE(review): despite its name, isWeekday is 1 for Saturday/Sunday and for
# the 2016-01-01 holiday, 0 for working days. The name is kept because later
# groupby/merge steps key on it.
data_roll_10['isWeekday'] = (data_roll_10['week_info'] >= 6).astype(np.uint8)
data_roll_10.loc[data_roll_10['time_piece'].str[:10] == '2016-01-01',
                 'isWeekday'] = 1
data_roll_10['isWeekday'] = data_roll_10['isWeekday'].astype(np.uint8)
data_roll_10 = data_roll_10.drop(['week_info'], axis=1)
data_roll_10[:1]
district_idtime_piecedemand_sum10_1gap_sum10_1demand_sum10_2gap_sum10_2demand_sum10_3gap_sum10_3demand_sum10_4gap_sum10_4demand_sum10_18gap_sum10_18demand_sum10_19gap_sum10_19demand_sum10_20gap_sum10_20demand_sum10_21gap_sum10_21sliceisWeekday
012016-01-01-300.00.00.00.00.00.00.00.00.00.00.00.00.00.00.00.0301

1 rows × 46 columns

# time_piece itself is not aggregated; everything else feeds the group stats.
data_roll_10_tmp = data_roll_10.drop(['time_piece'], axis=1)

# Historical mean of every window sum per (district, day-type, slice).
data_roll_10_avg = (data_roll_10_tmp
                    .groupby(['district_id', 'isWeekday', 'slice'])
                    .mean()
                    .reset_index())
# Suffix every column after the three group keys with '_avg'.
data_roll_10_avg_name = list(data_roll_10_avg.columns)
data_roll_10_avg_name[3:] = [c + '_avg' for c in data_roll_10_avg_name[3:]]
data_roll_10_avg.columns = data_roll_10_avg_name

# Historical standard deviation, same grouping and naming scheme.
data_roll_10_std = (data_roll_10_tmp
                    .groupby(['district_id', 'isWeekday', 'slice'])
                    .std()
                    .reset_index())
data_roll_10_std_name = list(data_roll_10_std.columns)
data_roll_10_std_name[3:] = [c + '_std' for c in data_roll_10_std_name[3:]]
data_roll_10_std.columns = data_roll_10_std_name
data_roll_10_avg[:1]
district_idisWeekdayslicedemand_sum10_1_avggap_sum10_1_avgdemand_sum10_2_avggap_sum10_2_avgdemand_sum10_3_avggap_sum10_3_avgdemand_sum10_4_avgdemand_sum10_17_avggap_sum10_17_avgdemand_sum10_18_avggap_sum10_18_avgdemand_sum10_19_avggap_sum10_19_avgdemand_sum10_20_avggap_sum10_20_avgdemand_sum10_21_avggap_sum10_21_avg
010154.8571433.21428655.5714293.35714356.2857143.35714357.35714360.1428571.92857160.7142862.35714360.4285712.42857161.2857142.42857163.6428572.357143

1 rows × 45 columns

data_roll_10_std[:1]
district_idisWeekdayslicedemand_sum10_1_stdgap_sum10_1_stddemand_sum10_2_stdgap_sum10_2_stddemand_sum10_3_stdgap_sum10_3_stddemand_sum10_4_stddemand_sum10_17_stdgap_sum10_17_stddemand_sum10_18_stdgap_sum10_18_stddemand_sum10_19_stdgap_sum10_19_stddemand_sum10_20_stdgap_sum10_20_stddemand_sum10_21_stdgap_sum10_21_std
010117.8664151.5776618.5874821.73680319.6485611.73680320.22035219.7439662.12907720.0208682.23975119.7122152.50274620.3069852.20887322.0579912.437121

1 rows × 45 columns

# Join the historical mean / std features back onto every row; -1 marks
# (district, day-type, slice) combinations with no history.
merge_keys = ['district_id', 'isWeekday', 'slice']
data_roll_10 = pd.merge(left=data_roll_10, right=data_roll_10_avg,
                        how='left', on=merge_keys, sort=False).fillna(-1)
data_roll_10 = pd.merge(left=data_roll_10, right=data_roll_10_std,
                        how='left', on=merge_keys, sort=False).fillna(-1)
# The keys were only needed for the join.
data_roll_10 = data_roll_10.drop(['isWeekday', 'slice'], axis=1)
# Free the intermediates before the next big step.
del data_roll_10_avg, data_roll_10_std, data_roll_10_tmp
del data_roll_10_avg_name, data_roll_10_std_name
gc.collect()
79
# Done: 10-minute sliding-window demand/gap sums over the three preceding
# time pieces, plus their historical avg and std statistics.
data_roll_10.head(2)
district_idtime_piecedemand_sum10_1gap_sum10_1demand_sum10_2gap_sum10_2demand_sum10_3gap_sum10_3demand_sum10_4gap_sum10_4demand_sum10_17_stdgap_sum10_17_stddemand_sum10_18_stdgap_sum10_18_stddemand_sum10_19_stdgap_sum10_19_stddemand_sum10_20_stdgap_sum10_20_stddemand_sum10_21_stdgap_sum10_21_std
012016-01-01-300.00.00.00.00.00.00.00.04.5773771.1338934.7559491.1338934.5721731.1338934.3369951.1338934.4721361.133893
112016-01-01-3125.013.024.013.020.09.020.09.04.8107020.3779644.3588990.3779644.5721730.3779644.6136440.3779644.5355740.000000

2 rows × 128 columns

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值