数据挖掘实战:特征工程 Python 实战

特征工程是数据科学和机器学习中的重要环节,对机器学习模型的性能和 EDA(exploratory data analysis,探索性数据分析)的质量有重要影响。本文介绍几种常用的特征工程技巧。

详见 notebook

什么是特征工程

  • 使用领域知识来创造特征,使得机器学习算法发挥作用
  • 从原始数据提取特征以转换为算法需要的格式
  • 需要领域知识、数学和编程技巧

数据集

本文使用贷款违约预测数据和送货平台数据,可直接参考 notebook

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore') # 关闭警告


#load loan datasets
loan_demographics = pd.read_csv('traindemographics.csv')
loan_prev = pd.read_csv('trainprevloans.csv')
loan_perf = pd.read_csv('trainperf.csv')
#load logistics dataset
sendy_data = pd.read_csv('sendy_logistics.csv')
# 贷款统计信息
# customerid (顾客id)
# birthdate (顾客生日)
# bank_account_type (银行账号类型)
# longitude_gps(经度)
# latitude_gps (纬度)
# bank_name_clients (银行名称)
# bank_branch_clients (支行,非必填,所以很多缺失值)
# employment_status_clients (职业状态)
# level_of_education_clients (教育程度)
loan_demographics.head().append(loan_demographics.tail()) 
(输出:loan_demographics 的前 5 行与后 5 行,包含 customerid、birthdate、bank_account_type、longitude_gps、latitude_gps、bank_name_clients、bank_branch_clients、employment_status_clients、level_of_education_clients 共 9 列,表格从略)
# trainperf.csv 顾客连续贷款记录,需根据顾客以前贷款信息和其自身信息预测是否会违约
# customerid (顾客id)
# systemloanid (贷款id. 每个顾客每次贷款有一个id)
# loannumber (需要预测的贷款数)
# approveddate (贷款批准日期)
# creationdate (贷款申请日期)
# loanamount (贷款金额)
# totaldue (总需还款金额,贷款金额+利息+其他费用)
# termdays (贷款期限)
# referredby (担保人id,空则没担保)
# good_bad_flag (good = 按时结清贷款; bad = 未按时结清贷款) - 是要预测的目标值
loan_perf.head().append(loan_perf.tail())
(输出:loan_perf 的前 5 行与后 5 行,包含 customerid、systemloanid、loannumber、approveddate、creationdate、loanamount、totaldue、termdays、referredby、good_bad_flag 共 10 列,表格从略)
# trainprevloans.csv: 本数据集是顾客在此次贷款之前的所有贷款记录.
# customerid (顾客id)
# systemloanid (贷款id,每个顾客每次贷款都有)
# loannumber (需要预测的贷款数)
# approveddate (贷款审批通过日期)
# creationdate (贷款申请日期)
# loanamount (贷款金额)
# totaldue (总需还款金额,贷款金额+利息+其他费用) 
# termdays (贷款期限)
# closeddate (贷款还清日期)
# referredby (担保人id,若空则无担保)
# firstduedate (贷款期限超30天的首次应付款日期)
# firstrepaiddate (实际首次付款日期)
loan_prev.head().append(loan_prev.tail())
(输出:loan_prev 的前 5 行与后 5 行,包含 customerid、systemloanid、loannumber、approveddate、creationdate、loanamount、totaldue、termdays、closeddate、referredby、firstduedate、firstrepaiddate 共 12 列,表格从略)
# 送货数据
sendy_data.head().append(sendy_data.tail())
(输出:sendy_data 的前 5 行与后 5 行,10 rows × 35 columns,表格从略)

这里主要涉及三类特征:数值特征、类别特征和日期特征。以下特征工程技巧没有固定顺序,也不一定适用于所有场景,需根据实际情况选择。

缺失值处理

缺失值的出现可能是因为顾客没有提供、录入出错留空,或者本身难以测量。缺失值可能对模型有重大影响,这里列出几种处理方式。
处理方式取决于特征类型。

类别特征缺失值处理方法

常用方法有:众数填充、顺序填充(用前一个/后一个值填充,示例见下文)、编码填充。
在贷款统计信息 loan_demographics 中,有三个类别特征包含缺失值:bank_branch_clients、employment_status_clients、level_of_education_clients。

## 查看缺失值情况
loan_demographics.isna().sum()
customerid                       0
birthdate                        0
bank_account_type                0
longitude_gps                    0
latitude_gps                     0
bank_name_clients                0
bank_branch_clients           4295
employment_status_clients      648
level_of_education_clients    3759
dtype: int64
## 查看类别分布
loan_demographics['employment_status_clients'].value_counts()
Permanent        3146
Self-Employed     348
Student           142
Unemployed         57
Retired             4
Contract            1
Name: employment_status_clients, dtype: int64

可以看到在职状态中 Permanent 出现得最多,因此可以用这个值填充缺失值。

## 众数填充 (Permanent)
loan_demographics['employment_status_clients'] = loan_demographics['employment_status_clients'].fillna(value='Permanent')
loan_demographics.isna().sum()
customerid                       0
birthdate                        0
bank_account_type                0
longitude_gps                    0
latitude_gps                     0
bank_name_clients                0
bank_branch_clients           4295
employment_status_clients        0
level_of_education_clients    3759
dtype: int64
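顺序填充(前向/后向填充)的一个简单示意如下,这里以 level_of_education_clients 为例;为不影响后文对该列的处理,结果只存入新变量:

## 顺序填充示意:ffill 用前一条记录的值填充,bfill 处理开头仍然缺失的位置(演示用,不覆盖原列)
edu_filled = loan_demographics['level_of_education_clients'].fillna(method='ffill').fillna(method='bfill')
edu_filled.isna().sum()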

数值特征缺失值处理

1、用均值、众数、中位数填充
2、时序填充(前向/后向填充或插值,示例见下文)
3、用机器学习模型预测并填充缺失值
Sendy 物流数据集中有两个包含缺失值的数值特征:Temperature 和 Precipitation in millimeters,以此为例:

sendy_data.head().append(sendy_data.tail())
(输出:sendy_data 的前 5 行与后 5 行,10 rows × 29 columns,可以看到 Temperature 和 Precipitation in millimeters 两列存在缺失值,表格从略)

# 注意:实际只需选用其中一种填充方式;众数可能有多个,因此取下标 0
mean_df = round(sendy_data['Temperature'].mean())
mode_df = round(sendy_data['Temperature'].mode()[0])
median_df = round(sendy_data['Temperature'].median())

#Fill with mean
print("Filling with mean value of {}".format(mean_df))
sendy_data['Temperature'] = sendy_data['Temperature'].fillna(mean_df)

#Fill with mode
print("Filling with modal value of {}".format(mode_df))
sendy_data['Temperature'] = sendy_data['Temperature'].fillna(mode_df)

#Fill with median
print("Filling with median value of {}".format(median_df))
sendy_data['Temperature'] = sendy_data['Temperature'].fillna(median_df)
Filling with mean value of 23.0
Filling with modal value of 23.0
Filling with median value of 23.0
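上面列出的时序填充(方法 2)可以用前向/后向填充或插值实现,下面以仍含缺失值的 Precipitation in millimeters 为例给出一个不覆盖原列的简单示意(只有当数据按时间顺序排列时才有意义):

## 时序填充示意(演示用,结果存入新变量,不修改原数据)
precip_ffill = sendy_data['Precipitation in millimeters'].fillna(method='ffill')         # 用前一个观测值填充
precip_interp = sendy_data['Precipitation in millimeters'].interpolate(method='linear')  # 相邻观测值之间线性插值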

使用模型填充缺失值

这里对 Precipitation in millimeters 特征做模型填充:先找出与该特征相关的其他特征加以利用。
Seaborn 热图有助于找到相关特征。

plt.figure(figsize = (15,10))
sns.heatmap(sendy_data.corr())
<matplotlib.axes._subplots.AxesSubplot at 0x15c46e85cc0>

(图:sendy_data 数值特征相关性热图)

从上图可以看出,大部分特征与 Precipitation in millimeters 的相关性都不高,这里选用最后三个特征:Destination Lat、Destination Long、Time from Pickup to Arrival。

from sklearn.linear_model import LinearRegression

lr = LinearRegression()

to_train = ['Precipitation in millimeters', 'Destination Lat', 'Destination Long', 'Time from Pickup to Arrival']
temp_df = sendy_data[to_train]

#Split dataset with missing values and no missing values as test and train set respectively.
x_train = temp_df[temp_df['Precipitation in millimeters'].notnull()].drop(columns='Precipitation in millimeters')
y_train = temp_df[temp_df['Precipitation in millimeters'].notnull()]['Precipitation in millimeters']
x_test = temp_df[temp_df['Precipitation in millimeters'].isnull()].drop(columns='Precipitation in millimeters')

#Fit a simple linear model to the dataset
lr.fit(x_train, y_train)
pred = lr.predict(x_test)

#print fill values
print(np.round(pred, 5))

#Perform filling
sendy_data.loc[sendy_data['Precipitation in millimeters'].isnull(), 'Precipitation in millimeters'] = pred  # 用 .loc 赋值,避免链式索引
[8.03056 8.18609 7.86142 ... 8.46164 7.45028 8.60717]

也可以用 sklearn 的 IterativeImputer(需先从 sklearn.experimental 启用)来自动填充

missing_indx = list(sendy_data['Temperature'][sendy_data['Temperature'].isna()].index)
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# 使用带随机森林的IterativeImputer
imp = IterativeImputer(RandomForestRegressor(n_estimators=5), max_iter=5, random_state=1)
to_train = ['Temperature', 'Destination Lat', 'Destination Long','Time from Pickup to Arrival']

#填充
sendy_data[to_train] = pd.DataFrame(imp.fit_transform(sendy_data[to_train]), columns=to_train)

sendy_data['Temperature'][missing_indx].head(10)
2     23.70
8     26.06
14    21.66
15    20.42
16    21.46
32    23.84
42    22.32
46    27.18
49    22.34
53    21.36
Name: Temperature, dtype: float64

类别特征处理

类别特征只能取有限个值,如某地点/事物的热门程度,或 app 的评分等级(1,2,3,4,5)。
本文中,loan_demographics 的教育程度 level_of_education_clients 是类别特征,取值有:Secondary、Graduate、Post-Graduate、Primary。
大多数机器学习算法不能直接处理类别特征的原始形式,必须先转为数值形式,此过程称为编码。
编码有多种方式,具体取决于类别特征的类型。

类别特征类型

1、有序类别特征:取值有自然顺序,如评分等级 (1,2,3,4,5)。
2、无序类别特征:取值无先后之分,例如食物(大米、意面、通心粉)。
有序特征编码:若类别数较少,如教育程度 level_of_education_clients,可以人工映射编码。

loan_demographics['level_of_education_clients'].unique()
array([nan, 'Secondary', 'Graduate', 'Post-Graduate', 'Primary'],
      dtype=object)
#use a simple map function
map_education = {"Primary" : 1, "Secondary": 2, "Graduate": 3, "Post-Graduate": 4}
    
loan_demographics['level_of_education_clients'] = loan_demographics['level_of_education_clients'].map(map_education)
loan_demographics['level_of_education_clients'].value_counts()
3.0    420
2.0     89
4.0     68
1.0     10
Name: level_of_education_clients, dtype: int64

若需自动编码,可以用 category_encoders 库,它提供了大量编码方式。

!pip install category_encoders
Collecting category_encoders
  Downloading category_encoders-2.1.0-py2.py3-none-any.whl (100 kB)
Requirement already satisfied: numpy>=1.11.3 in c:\anaconda3\envs\mytf\lib\site-packages (from category_encoders) (1.18.1)
Collecting patsy>=0.4.1
  Downloading patsy-0.5.1-py2.py3-none-any.whl (231 kB)
Requirement already satisfied: scipy>=0.19.0 in c:\anaconda3\envs\mytf\lib\site-packages (from category_encoders) (1.4.1)
Requirement already satisfied: scikit-learn>=0.20.0 in c:\anaconda3\envs\mytf\lib\site-packages (from category_encoders) (0.22.2.post1)
Requirement already satisfied: pandas>=0.21.1 in c:\anaconda3\envs\mytf\lib\site-packages (from category_encoders) (1.0.2)
Collecting statsmodels>=0.6.1
  Downloading statsmodels-0.11.1-cp37-none-win_amd64.whl (8.2 MB)
Requirement already satisfied: six in c:\anaconda3\envs\mytf\lib\site-packages (from patsy>=0.4.1->category_encoders) (1.14.0)
Requirement already satisfied: joblib>=0.11 in c:\anaconda3\envs\mytf\lib\site-packages (from scikit-learn>=0.20.0->category_encoders) (0.14.1)
Requirement already satisfied: python-dateutil>=2.6.1 in c:\anaconda3\envs\mytf\lib\site-packages (from pandas>=0.21.1->category_encoders) (2.8.1)
Requirement already satisfied: pytz>=2017.2 in c:\anaconda3\envs\mytf\lib\site-packages (from pandas>=0.21.1->category_encoders) (2019.3)
Installing collected packages: patsy, statsmodels, category-encoders
Successfully installed category-encoders-2.1.0 patsy-0.5.1 statsmodels-0.11.1

有序特征若类别数较多,可用标签编码(Label Encoding),为每个类别分配一个唯一的整数。
这里以类别数较多的 bank_name_clients 和 bank_branch_clients 两个特征为例。

cat_cols = loan_demographics.select_dtypes(include='object').columns # object 为类别特征
for col in cat_cols:
    print("Number of classes in {}".format(col))
    print(loan_demographics[col].nunique())
    print('--------------------------')
Number of classes in customerid
4334
--------------------------
Number of classes in birthdate
3297
--------------------------
Number of classes in bank_account_type
3
--------------------------
Number of classes in bank_name_clients
18
--------------------------
Number of classes in bank_branch_clients
45
--------------------------
Number of classes in employment_status_clients
6
--------------------------
loan_demographics.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4346 entries, 0 to 4345
Data columns (total 9 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customerid                  4346 non-null   object 
 1   birthdate                   4346 non-null   object 
 2   bank_account_type           4346 non-null   object 
 3   longitude_gps               4346 non-null   float64
 4   latitude_gps                4346 non-null   float64
 5   bank_name_clients           4346 non-null   object 
 6   bank_branch_clients         51 non-null     object 
 7   employment_status_clients   4346 non-null   object 
 8   level_of_education_clients  587 non-null    float64
dtypes: float64(3), object(6)
memory usage: 305.7+ KB

category_encoders 库的 OrdinalEncoder 可用于标签编码

import category_encoders as ce

#Label encoding
cat_cols = ['bank_name_clients', 'bank_branch_clients']
encoder = ce.OrdinalEncoder(cols=cat_cols)
loan_demographics = encoder.fit_transform(loan_demographics)
loan_demographics.bank_name_clients.value_counts()
1     1598
11     590
4      432
8      353
9      301
10     282
5      150
13     145
6      121
3      103
7      102
2       55
18      28
12      23
15      22
16      19
14      15
17       7
Name: bank_name_clients, dtype: int64

独热编码(One-Hot Encoding)

用二进制的 0/1 表示类别,每个类别创建一个新特征;当类别取值较多时会生成大量稀疏列,效率降低。

cats = ['bank_account_type','level_of_education_clients']
one_hot_enc = ce.OneHotEncoder(cols=cats)
loan_demographics = one_hot_enc.fit_transform(loan_demographics)
loan_demographics.head().T
(输出:独热编码后的 loan_demographics.head().T,bank_account_type 被拆分为 bank_account_type_1~3,level_of_education_clients 被拆分为 level_of_education_clients_1~5,表格从略)

哈希编码

哈希编码(Hashing Encoding)是一种快速、节省空间的编码方式:对类别值施加哈希函数,将结果映射到固定数量的列上,在类别取值很多时比较高效。

cat_cols = ['bank_name_clients', 'bank_branch_clients']
hash_enc = ce.HashingEncoder(cols=cat_cols, n_components=10)
loan_demographics = hash_enc.fit_transform(loan_demographics)
loan_demographics.head()
(输出:哈希编码后的 loan_demographics.head(),表格从略)

更多编码详见:https://contrib.scikit-learn.org/categorical-encoding/
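例如,category_encoders 还提供目标编码(TargetEncoder)等方法,下面用一小段人工构造的数据演示其用法(数据与列名均为虚构,仅作示意,并非上文的贷款数据):

import pandas as pd
import category_encoders as ce

## 目标编码示意:用目标变量在各类别下的统计量替换类别值
demo = pd.DataFrame({'bank': ['GT Bank', 'UBA', 'GT Bank', 'Fidelity', 'UBA', 'GT Bank'],
                     'default': [1, 0, 0, 0, 1, 1]})
target_enc = ce.TargetEncoder(cols=['bank'])
demo_encoded = target_enc.fit_transform(demo[['bank']], demo['default'])
print(demo_encoded)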

数值/连续特征的处理

数值/连续特征是数据集中最常见的特征形式。对此类特征的工程往往基于领域知识,可做以下处理:
对数变换:让分布更接近正态、更集中,可让大部分算法效果更好。
注意:若对目标变量做了对数变换,在做模型解释或还原预测值时需要做指数(逆)变换,示例见下文。
对数变换常用于有偏的特征,无论左偏还是右偏,偏度可通过图像观察。
这里对 sendy 数据集的 Distance (KM) 特征做分析,它是右偏的。

sns.distplot(sendy_data['Distance (KM)'])
plt.title("Histogram of Distance (KM)")
plt.show()

(图:Distance (KM) 的直方图,呈右偏分布)

sendy_data['Distance (KM)'] =  np.log1p(sendy_data['Distance (KM)']) # 先加 1 再取对数(log1p),避免对 0 取对数
sns.distplot(sendy_data['Distance (KM)'])
plt.title("Log-Transformed plot of Distance (KM)")
plt.show()

(图:对数变换后的 Distance (KM) 分布)
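如前文所述,若对目标变量做了对数变换,解释模型或还原预测值时需要做逆变换;np.log1p 对应的逆变换是 np.expm1,示意如下(不覆盖原列):

## 逆变换示意:把对数尺度的 Distance (KM) 还原回原始尺度
original_distance = np.expm1(sendy_data['Distance (KM)'])
original_distance.head()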

使用领域知识构造特征

当对数据所在领域有一定了解时,可以据此创建一些对模型有帮助的特征,示例如下:
1、利息 interest_elapsed:总需还款金额 totaldue 与贷款金额 loanamount 之差

loan_prev['interest_elapsed'] = loan_prev['totaldue'] - loan_prev['loanamount']
loan_prev['interest_elapsed']
0        3000.0
1        3000.0
2        3800.0
3        1500.0
4        1500.0
          ...  
18178    3000.0
18179    4400.0
18180    3000.0
18181    3000.0
18182    3000.0
Name: interest_elapsed, Length: 18183, dtype: float64
loan_prev
(输出:新增 interest_elapsed 列后的 loan_prev,18183 rows × 13 columns,表格从略)

2、贷款次数 loan count:按顾客 id 分组,统计 loannumber 的个数

loan_prev.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18183 entries, 0 to 18182
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerid        18183 non-null  object 
 1   systemloanid      18183 non-null  int64  
 2   loannumber        18183 non-null  int64  
 3   approveddate      18183 non-null  object 
 4   creationdate      18183 non-null  object 
 5   loanamount        18183 non-null  float64
 6   totaldue          18183 non-null  float64
 7   termdays          18183 non-null  int64  
 8   closeddate        18183 non-null  object 
 9   referredby        1026 non-null   object 
 10  firstduedate      18183 non-null  object 
 11  firstrepaiddate   18183 non-null  object 
 12  interest_elapsed  18183 non-null  float64
dtypes: float64(3), int64(3), object(7)
memory usage: 1.8+ MB
#Groupby customer id and calculate their total loans taken
loannumber_count = loan_prev.groupby(by='customerid').agg(['count'])[['loannumber']].reset_index()

#merge back to dataset on customer_id
loan_prev = loan_prev.merge(right=loannumber_count, how='left', on='customerid')
loan_prev.head()
(输出:合并后的 loan_prev.head(),新增列 (loannumber, count) 表示每位顾客的历史贷款次数,表格从略)
loan_prev.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18183 entries, 0 to 18182
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customerid           18183 non-null  object 
 1   systemloanid         18183 non-null  int64  
 2   loannumber           18183 non-null  int64  
 3   approveddate         18183 non-null  object 
 4   creationdate         18183 non-null  object 
 5   loanamount           18183 non-null  float64
 6   totaldue             18183 non-null  float64
 7   termdays             18183 non-null  int64  
 8   closeddate           18183 non-null  object 
 9   referredby           1026 non-null   object 
 10  firstduedate         18183 non-null  object 
 11  firstrepaiddate      18183 non-null  object 
 12  (loannumber, count)  18183 non-null  int64  
dtypes: float64(2), int64(4), object(7)
memory usage: 1.9+ MB
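上面 agg(['count']) 合并后得到的列名是元组 (loannumber, count);若希望得到更直观的列名,也可以用命名聚合,下面是一个效果等价的可选写法(仅作示意,不再重复合并):

## 命名聚合示意:直接生成名为 loan_count 的列
loannumber_count_named = (loan_prev.groupby('customerid')
                                   .agg(loan_count=('loannumber', 'count'))
                                   .reset_index())
loannumber_count_named.head()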

3、速度Speed:发货距离Distance (KM)与时间Time from Pickup to Arrival之比

#create feature speed in sendy dataset
sendy_data['speed'] = sendy_data['Distance (KM)'] / sendy_data['Time from Pickup to Arrival']
sendy_data.head().T
(输出:新增 speed 列后的 sendy_data.head().T,表格从略)

多项式(交叉)特征

多项式特征在原有特征之间构造组合项,有助于捕捉特征之间的交互关系,减少模型偏差(只要不过拟合)。
可以通过特征之间的加、乘、除等运算来创建交叉特征。
这里用 sklearn 的 PolynomialFeatures,从 loan_prev 数据集的 loannumber、totaldue、termdays 三列创建特征。

#Use Sklearn Polynomial Features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures()
to_cross = ['loannumber', 'totaldue', 'termdays']
crossed_feats = poly.fit_transform(loan_prev[to_cross].values)

#Convert to Pandas DataFrame and merge to original dataset
crossed_feats = pd.DataFrame(crossed_feats)
loan_prev = pd.concat([loan_prev, crossed_feats], axis=1)

loan_prev.head().T
(输出:loan_prev.head().T,新增第 0~9 列,分别对应常数项 1、三个原始特征及其全部二次交叉项,表格从略)
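PolynomialFeatures 默认生成的列名只是 0~9 这样的数字,不便解读;可以用 get_feature_names 查看每一列对应的特征组合(本文环境的 sklearn 0.22 提供该方法,新版本中已改名为 get_feature_names_out):

## 查看每个多项式特征列对应的原始特征组合
print(poly.get_feature_names(to_cross))
## 输出形如 ['1', 'loannumber', 'totaldue', 'termdays', 'loannumber^2', 'loannumber totaldue', ...]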

特征标准化

标准化/缩放把数值特征调整到相近的取值范围,同时不破坏原有的相对大小关系和信息。
特征缩放对基于距离的模型(如 KNN)十分重要,也有助于提高神经网络的训练速度。
sklearn 提供的常用缩放器:
StandardScaler:减去均值并缩放到单位方差。
RobustScaler:使用对异常值鲁棒的统计量(中位数和四分位距)来缩放特征。
MinMaxScaler:将特征缩放到指定范围(范围可自定义)。
注意:不要用测试/验证集去拟合(fit)缩放器,应只在训练集上拟合,再对测试集做 transform(本节末尾给出示例)。
sklearn 的缩放器对缺失值不鲁棒,因此使用前要先做缺失值填充。

from sklearn.preprocessing import StandardScaler

feats = ['loannumber', 'totaldue', 'termdays']
sc = StandardScaler()
sc_data = sc.fit_transform(loan_prev[feats])
sc_data
array([[-0.67377132, -0.62877649,  0.30213166],
       [ 1.48047238, -0.62877649,  0.30213166],
       [ 1.17272328,  0.40432506,  0.30213166],
       ...,
       [-0.05827312, -0.62877649,  0.30213166],
       [-0.98152042, -0.62877649,  0.30213166],
       [-0.67377132, -0.62877649,  0.30213166]])
sc_data.shape
(18183, 3)
from sklearn.preprocessing import RobustScaler
robsc = RobustScaler()

rb_data = robsc.fit_transform(loan_prev[feats])
rb_data
array([[-0.25      ,  0.        ,  0.        ],
       [ 1.5       ,  0.        ,  0.        ],
       [ 1.25      ,  0.83076923,  0.        ],
       ...,
       [ 0.25      ,  0.        ,  0.        ],
       [-0.5       ,  0.        ,  0.        ],
       [-0.25      ,  0.        ,  0.        ]])
from sklearn.preprocessing import MinMaxScaler

minsc = MinMaxScaler(feature_range=(0,2))
minmax_data = minsc.fit_transform(loan_prev[feats])
minmax_data
array([[0.08      , 0.29543697, 0.4       ],
       [0.64      , 0.29543697, 0.4       ],
       [0.56      , 0.6295437 , 0.4       ],
       ...,
       [0.24      , 0.29543697, 0.4       ],
       [0.        , 0.29543697, 0.4       ],
       [0.08      , 0.29543697, 0.4       ]])
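上面为了演示方便直接在整个数据集上调用了 fit_transform;实际建模时应只在训练集上拟合缩放器,再对验证/测试集做 transform,下面是一个简单示意(数据划分方式仅为演示假设):

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

feats = ['loannumber', 'totaldue', 'termdays']
train_part, valid_part = train_test_split(loan_prev[feats], test_size=0.2, random_state=0)

sc_demo = StandardScaler()
train_scaled = sc_demo.fit_transform(train_part)   # 仅用训练集估计均值和方差
valid_scaled = sc_demo.transform(valid_part)       # 验证集只做 transform,避免信息泄露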

日期特征处理

对日期/时间特征可以做很多处理,例如计算时间差。
下面对 loan_perf 数据集的申请时间 creationdate 和审批通过时间 approveddate 计算时间差。

#First convert to pandas datetime format
loan_perf['approveddate'] = pd.to_datetime(loan_perf['approveddate'])
loan_perf['creationdate'] = pd.to_datetime(loan_perf['creationdate'])
loan_perf['date_elapsed_in_secs'] = (loan_perf['approveddate'] - loan_perf['creationdate']) / np.timedelta64(1,'s') # 's' 可换成 'h'、'm' 等单位
loan_perf.head()
(输出:loan_perf.head(),新增 date_elapsed_in_secs 列,本例中约为 3600 秒左右,表格从略)

以下展示如何从日期中提取天、周、小时等特征:

#First convert to pandas datetime format
loan_perf['approveddate'] = pd.to_datetime(loan_perf['approveddate'])

#use pandas built in functions
loan_perf['approved_day'] = loan_perf['approveddate'].dt.day
loan_perf['approved_week'] = loan_perf['approveddate'].dt.week
loan_perf['approved_hour'] = loan_perf['approveddate'].dt.hour

Pandas 的日期属性(dt 访问器)还有很多可以尝试。
还可以把小时映射为一天中的时段(morning、afternoon、evening):

def map_hours(x):
    if x in [0,1,2,3,4,5,6,7,8,9,10,11,12]:
        return 'morning'
    elif x in [13,14,15,16]:
        return 'afternoon'
    else:
        return 'evening'
    
loan_perf['period_of_day'] = loan_perf['approved_hour'].map(map_hours)
loan_perf.head()
(输出:loan_perf.head(),新增 approved_day、approved_week、approved_hour、period_of_day 列,表格从略)

datasist库还有更多时间处理函数,详见:https://towardsdatascience.com/https-medium-com-risingdeveloper-easy-data-analysis-visualization-and-modeling-using-datasist-part1-8b26526dbe01

地理位置特征处理

数据集中的经纬度、地址等都是地理位置特征。
对经纬度特征可以做很多处理,例如用 GeoJSON 或 Geopy 这类库把坐标反查为地图上的地址(反向地理编码,示例见下文)。
但这类方法通常很慢,本文主要展示几种更简单、更快的处理方法。
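作为对比,下面给出用 Geopy 做反向地理编码(坐标转地址)的最小示意;需要安装 geopy 并能访问 Nominatim 在线服务,user_agent 为演示用的自定义名称,速度明显慢于后面的纯数值特征:

## pip install geopy
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent='feature-engineering-demo')
lat, lon = sendy_data.loc[0, 'Pickup Lat'], sendy_data.loc[0, 'Pickup Long']
location = geolocator.reverse((lat, lon))   # 每次请求都要访问在线服务,速度较慢
print(location.address)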
以下示例来自于Kaggle.
曼哈顿距离: 两点间的水平距离和垂直距离之和

#曼哈顿距离
def manhattan_distance(lat1, lng1, lat2, lng2):
    a = np.abs(lat2 -lat1)
    b = np.abs(lng1 - lng2)
    return a + b
sendy_data['manhattan_dist'] = manhattan_distance(sendy_data['Pickup Lat'].values, sendy_data['Pickup Long'].values,
                                               sendy_data['Destination Lat'].values, sendy_data['Destination Long'].values)
sendy_data.head()
(输出:新增 manhattan_dist 列后的 sendy_data.head(),5 rows × 31 columns,表格从略)

Haversine 距离:两点在球面上的大圆距离,在导航中很重要。

#Haversine distance
def haversine_array(lat1, lng1, lat2, lng2):
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    AVG_EARTH_RADIUS = 6371  # in km
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h
sendy_data['haversine_dist'] = haversine_array(sendy_data['Pickup Lat'].values, sendy_data['Pickup Long'].values,
                                                   sendy_data['Destination Lat'].values, sendy_data['Destination Long'].values)
sendy_data.head()
(输出:新增 haversine_dist 列后的 sendy_data.head(),5 rows × 32 columns,表格从略)

Bearing(方位角):从起点指向终点的方向角,通常用 0 到 360 度表示;下面的实现基于 np.arctan2,返回值在 -180 到 180 度之间,需要时可再换算到 0~360 度。
#Bearing
def bearing_array(lat1, lng1, lat2, lng2):
    AVG_EARTH_RADIUS = 6371  # in km
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))
sendy_data['bearing'] = bearing_array(sendy_data['Pickup Lat'].values, sendy_data['Pickup Long'].values,
                                                   sendy_data['Destination Lat'].values, sendy_data['Destination Long'].values)
sendy_data.head()
(输出:新增 bearing 列后的 sendy_data.head(),5 rows × 33 columns,表格从略)

中心点:计算两点经纬度之间的中点

#Get center of lat and longitude
sendy_data['center_latitude'] = (sendy_data['Pickup Lat'].values + sendy_data['Destination Lat'].values) / 2
sendy_data['center_longitude'] = (sendy_data['Pickup Long'].values + sendy_data['Destination Long'].values) / 2
sendy_data.head()
(输出:新增 center_latitude、center_longitude 列后的 sendy_data.head(),5 rows × 35 columns,表格从略)

datasist 库还提供更多此类处理函数
