import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Seaborn其实是在matplotlib的基础上进行了更高级的API封装,从而使得作图更加容易,在大多数情况下使用seaborn就能做出很具有吸引力的图。
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
# Data file paths.
train_data_file = "./data/zhengqi_train.txt"
test_data_file = "./data/zhengqi_test.txt"
# Load the data.
# sep: field delimiter; the default is ','
train_data = pd.read_csv(train_data_file, sep = '\t', encoding = 'utf-8')
test_data = pd.read_csv(test_data_file, sep = '\t', encoding = 'utf-8')
# Inspect the training set:
# 39 columns, 2888 rows, no missing values, all float64.
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2888 entries, 0 to 2887
Data columns (total 39 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 V0 2888 non-null float64
1 V1 2888 non-null float64
2 V2 2888 non-null float64
3 V3 2888 non-null float64
4 V4 2888 non-null float64
5 V5 2888 non-null float64
6 V6 2888 non-null float64
7 V7 2888 non-null float64
8 V8 2888 non-null float64
9 V9 2888 non-null float64
10 V10 2888 non-null float64
11 V11 2888 non-null float64
12 V12 2888 non-null float64
13 V13 2888 non-null float64
14 V14 2888 non-null float64
15 V15 2888 non-null float64
16 V16 2888 non-null float64
17 V17 2888 non-null float64
18 V18 2888 non-null float64
19 V19 2888 non-null float64
20 V20 2888 non-null float64
21 V21 2888 non-null float64
22 V22 2888 non-null float64
23 V23 2888 non-null float64
24 V24 2888 non-null float64
25 V25 2888 non-null float64
26 V26 2888 non-null float64
27 V27 2888 non-null float64
28 V28 2888 non-null float64
29 V29 2888 non-null float64
30 V30 2888 non-null float64
31 V31 2888 non-null float64
32 V32 2888 non-null float64
33 V33 2888 non-null float64
34 V34 2888 non-null float64
35 V35 2888 non-null float64
36 V36 2888 non-null float64
37 V37 2888 non-null float64
38 target 2888 non-null float64
dtypes: float64(39)
memory usage: 880.1 KB
# Inspect the test set (the original comment said "training set / 39 columns"):
# 38 columns (no 'target'), 1925 rows, no missing values, all float64.
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1925 entries, 0 to 1924
Data columns (total 38 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 V0 1925 non-null float64
1 V1 1925 non-null float64
2 V2 1925 non-null float64
3 V3 1925 non-null float64
4 V4 1925 non-null float64
5 V5 1925 non-null float64
6 V6 1925 non-null float64
7 V7 1925 non-null float64
8 V8 1925 non-null float64
9 V9 1925 non-null float64
10 V10 1925 non-null float64
11 V11 1925 non-null float64
12 V12 1925 non-null float64
13 V13 1925 non-null float64
14 V14 1925 non-null float64
15 V15 1925 non-null float64
16 V16 1925 non-null float64
17 V17 1925 non-null float64
18 V18 1925 non-null float64
19 V19 1925 non-null float64
20 V20 1925 non-null float64
21 V21 1925 non-null float64
22 V22 1925 non-null float64
23 V23 1925 non-null float64
24 V24 1925 non-null float64
25 V25 1925 non-null float64
26 V26 1925 non-null float64
27 V27 1925 non-null float64
28 V28 1925 non-null float64
29 V29 1925 non-null float64
30 V30 1925 non-null float64
31 V31 1925 non-null float64
32 V32 1925 non-null float64
33 V33 1925 non-null float64
34 V34 1925 non-null float64
35 V35 1925 non-null float64
36 V36 1925 non-null float64
37 V37 1925 non-null float64
dtypes: float64(38)
memory usage: 571.6 KB
# Summary statistics of the training set:
# count: number of non-null values in each column
# unique: number of distinct values (shown for non-numeric columns only)
# std: standard deviation
# min / max: extremes
# 25% / 50% / 75%: quartiles (lower quartile, median, upper quartile)
# mean: average
train_data.describe(include='all') # include='all' summarises every column; without it only numeric columns are described
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | ... | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 |
mean | 0.123048 | 0.056068 | 0.289720 | -0.067790 | 0.012921 | -0.558565 | 0.182892 | 0.116155 | 0.177856 | -0.169452 | ... | 0.097648 | 0.055477 | 0.127791 | 0.020806 | 0.007801 | 0.006715 | 0.197764 | 0.030658 | -0.130330 | 0.126353 |
std | 0.928031 | 0.941515 | 0.911236 | 0.970298 | 0.888377 | 0.517957 | 0.918054 | 0.955116 | 0.895444 | 0.953813 | ... | 1.061200 | 0.901934 | 0.873028 | 0.902584 | 1.006995 | 1.003291 | 0.985675 | 0.970812 | 1.017196 | 0.983966 |
min | -4.335000 | -5.122000 | -3.420000 | -3.956000 | -4.742000 | -2.182000 | -4.576000 | -5.048000 | -4.692000 | -12.891000 | ... | -2.912000 | -4.507000 | -5.859000 | -4.053000 | -4.627000 | -4.789000 | -5.695000 | -2.608000 | -3.630000 | -3.044000 |
25% | -0.297000 | -0.226250 | -0.313000 | -0.652250 | -0.385000 | -0.853000 | -0.310000 | -0.295000 | -0.159000 | -0.390000 | ... | -0.664000 | -0.283000 | -0.170250 | -0.407250 | -0.499000 | -0.290000 | -0.202500 | -0.413000 | -0.798250 | -0.350250 |
50% | 0.359000 | 0.272500 | 0.386000 | -0.044500 | 0.110000 | -0.466000 | 0.388000 | 0.344000 | 0.362000 | 0.042000 | ... | -0.023000 | 0.053500 | 0.299500 | 0.039000 | -0.040000 | 0.160000 | 0.364000 | 0.137000 | -0.185500 | 0.313000 |
75% | 0.726000 | 0.599000 | 0.918250 | 0.624000 | 0.550250 | -0.154000 | 0.831250 | 0.782250 | 0.726000 | 0.042000 | ... | 0.745250 | 0.488000 | 0.635000 | 0.557000 | 0.462000 | 0.273000 | 0.602000 | 0.644250 | 0.495250 | 0.793250 |
max | 2.121000 | 1.918000 | 2.828000 | 2.457000 | 2.689000 | 0.489000 | 1.895000 | 1.918000 | 2.245000 | 1.335000 | ... | 4.580000 | 2.689000 | 2.013000 | 2.395000 | 5.465000 | 5.110000 | 2.324000 | 5.238000 | 3.000000 | 2.538000 |
8 rows × 39 columns
# Summary statistics of the test set (numeric columns).
test_data.describe()
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V28 | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | ... | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 |
mean | -0.184404 | -0.083912 | -0.434762 | 0.101671 | -0.019172 | 0.838049 | -0.274092 | -0.173971 | -0.266709 | 0.255114 | ... | -0.206871 | -0.146463 | -0.083215 | -0.191729 | -0.030782 | -0.011433 | -0.009985 | -0.296895 | -0.046270 | 0.195735 |
std | 1.073333 | 1.076670 | 0.969541 | 1.034925 | 1.147286 | 0.963043 | 1.054119 | 1.040101 | 1.085916 | 1.014394 | ... | 1.064140 | 0.880593 | 1.126414 | 1.138454 | 1.130228 | 0.989732 | 0.995213 | 0.946896 | 1.040854 | 0.940599 |
min | -4.814000 | -5.488000 | -4.283000 | -3.276000 | -4.921000 | -1.168000 | -5.649000 | -5.625000 | -6.059000 | -6.784000 | ... | -2.435000 | -2.413000 | -4.507000 | -7.698000 | -4.057000 | -4.627000 | -4.789000 | -7.477000 | -2.608000 | -3.346000 |
25% | -0.664000 | -0.451000 | -0.978000 | -0.644000 | -0.497000 | 0.122000 | -0.732000 | -0.509000 | -0.775000 | -0.390000 | ... | -0.453000 | -0.818000 | -0.339000 | -0.476000 | -0.472000 | -0.460000 | -0.290000 | -0.349000 | -0.593000 | -0.432000 |
50% | 0.065000 | 0.195000 | -0.267000 | 0.220000 | 0.118000 | 0.437000 | -0.082000 | 0.018000 | -0.004000 | 0.401000 | ... | -0.445000 | -0.199000 | 0.010000 | 0.100000 | 0.155000 | -0.040000 | 0.160000 | -0.270000 | 0.083000 | 0.152000 |
75% | 0.549000 | 0.589000 | 0.278000 | 0.793000 | 0.610000 | 1.928000 | 0.457000 | 0.515000 | 0.482000 | 0.904000 | ... | -0.434000 | 0.468000 | 0.447000 | 0.471000 | 0.627000 | 0.419000 | 0.273000 | 0.364000 | 0.651000 | 0.797000 |
max | 2.100000 | 2.120000 | 1.946000 | 2.603000 | 4.475000 | 3.176000 | 1.528000 | 1.394000 | 2.408000 | 1.766000 | ... | 4.656000 | 3.022000 | 3.139000 | 1.428000 | 2.299000 | 5.465000 | 5.110000 | 1.671000 | 2.861000 | 3.021000 |
8 rows × 38 columns
# First 5 rows of the training set.
train_data.head()
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.566 | 0.016 | -0.143 | 0.407 | 0.452 | -0.901 | -1.812 | -2.360 | -0.436 | -2.114 | ... | 0.136 | 0.109 | -0.615 | 0.327 | -4.627 | -4.789 | -5.101 | -2.608 | -3.508 | 0.175 |
1 | 0.968 | 0.437 | 0.066 | 0.566 | 0.194 | -0.893 | -1.566 | -2.360 | 0.332 | -2.114 | ... | -0.128 | 0.124 | 0.032 | 0.600 | -0.843 | 0.160 | 0.364 | -0.335 | -0.730 | 0.676 |
2 | 1.013 | 0.568 | 0.235 | 0.370 | 0.112 | -0.797 | -1.367 | -2.360 | 0.396 | -2.114 | ... | -0.009 | 0.361 | 0.277 | -0.116 | -0.843 | 0.160 | 0.364 | 0.765 | -0.589 | 0.633 |
3 | 0.733 | 0.368 | 0.283 | 0.165 | 0.599 | -0.679 | -1.200 | -2.086 | 0.403 | -2.114 | ... | 0.015 | 0.417 | 0.279 | 0.603 | -0.843 | -0.065 | 0.364 | 0.333 | -0.112 | 0.206 |
4 | 0.684 | 0.638 | 0.260 | 0.209 | 0.337 | -0.454 | -1.073 | -2.086 | 0.314 | -2.114 | ... | 0.183 | 1.078 | 0.328 | 0.418 | -0.843 | -0.215 | 0.364 | -0.280 | -0.028 | 0.384 |
5 rows × 39 columns
# First 5 rows of the test set.
test_data.head()
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V28 | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.368 | 0.380 | -0.225 | -0.049 | 0.379 | 0.092 | 0.550 | 0.551 | 0.244 | 0.904 | ... | -0.449 | 0.047 | 0.057 | -0.042 | 0.847 | 0.534 | -0.009 | -0.190 | -0.567 | 0.388 |
1 | 0.148 | 0.489 | -0.247 | -0.049 | 0.122 | -0.201 | 0.487 | 0.493 | -0.127 | 0.904 | ... | -0.443 | 0.047 | 0.560 | 0.176 | 0.551 | 0.046 | -0.220 | 0.008 | -0.294 | 0.104 |
2 | -0.166 | -0.062 | -0.311 | 0.046 | -0.055 | 0.063 | 0.485 | 0.493 | -0.227 | 0.904 | ... | -0.458 | -0.398 | 0.101 | 0.199 | 0.634 | 0.017 | -0.234 | 0.008 | 0.373 | 0.569 |
3 | 0.102 | 0.294 | -0.259 | 0.051 | -0.183 | 0.148 | 0.474 | 0.504 | 0.010 | 0.904 | ... | -0.456 | -0.398 | 1.007 | 0.137 | 1.042 | -0.040 | -0.290 | 0.008 | -0.666 | 0.391 |
4 | 0.300 | 0.428 | 0.208 | 0.051 | -0.033 | 0.116 | 0.408 | 0.497 | 0.155 | 0.904 | ... | -0.458 | -0.776 | 0.291 | 0.370 | 0.181 | -0.040 | -0.290 | 0.008 | -0.140 | -0.497 |
5 rows × 38 columns
# Box plot of feature V0.
# seaborn.boxplot styling options used below:
#   orient='v'        vertical boxes ('h' would be horizontal)
#   showfliers        whether outlier points are drawn
#   fliersize         marker size for outliers (default 5)
#   flierprops        marker style for outliers
#   showcaps          whether the horizontal cap lines are drawn
#   capprops / whiskerprops  dashed red caps / whiskers
#   notch=True        notched box
#   color / boxprops  box fill and outline style
#   showmeans + meanline + meanprops  draw the mean as a dashed red line
fig = plt.figure(figsize=(6, 4))  # figure width and height
sns.boxplot(train_data['V0'],
            orient='v',
            showfliers=True,
            fliersize=4,
            width=0.5,
            flierprops={
                'marker': 'o',
                'markerfacecolor': 'red',
                # BUG FIX: was 'write', which is not a valid matplotlib
                # color name and raises ValueError; 'white' was intended.
                'color': 'white'
            },
            showcaps=True,
            capprops={
                'linestyle': '--',
                'color': 'red'
            },
            whiskerprops={
                'linestyle': '--',
                'color': 'red'
            },
            notch=True,            # notched box
            color='white',         # box not filled with the default palette
            boxprops={'color': 'red',        # box outline
                      'facecolor': 'pink'},  # box fill colour
            showmeans=True,        # show the mean on the plot
            # meanprops = {'marker':'D','markerfacecolor':'red'},  # mean marker style
            meanline=True,         # draw the mean as a line
            meanprops={'linestyle': '--', 'color': 'red'},  # mean-line style
            )
# One box plot per feature in a 7x8 grid of subplots.
column = train_data.columns.tolist()[:39]  # column headers
fig = plt.figure(figsize=(80, 60), dpi=75)
for idx, name in enumerate(column[:38], start=1):
    plt.subplot(7, 8, idx)  # 7 rows x 8 columns of subplots
    sns.boxplot(train_data[name], orient='v', width=0.5)
    plt.ylabel(name, fontsize=36)
plt.show()
# Detect outliers from a model's prediction residuals.
def find_outliers(model, X, y, sigma=3):
    """Return the index of samples whose standardized residual exceeds sigma.

    Fits ``model`` on (X, y) if it is not fitted yet, computes residuals
    y - model.predict(X), standardizes them, prints fit statistics and
    draws three diagnostic panels (saved to 'outliers.png').

    Parameters
    ----------
    model : estimator with fit/predict/score (sklearn-style)
    X, y  : features and target; y must be a pandas Series
    sigma : z-score threshold for flagging an outlier (default 3)
    """
    # Predict y; if the model has not been fitted yet, fit it first (EAFP).
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    except Exception:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)
    # Residuals between the true values and the predictions.
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    # Standardized residuals; |z| > sigma marks an outlier.
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index
    # Print and plot the results.
    print('R2=', model.score(X, y))
    print("mse=", mean_squared_error(y, y_pred))
    print('------------------------------------------')
    print('mean of residuals:', mean_resid)
    print('std of residuals: ', std_resid)
    print('------------------------------------------')
    print(len(outliers), 'outliers:')   # fixed typo: was 'outliners:'
    print(outliers.tolist())
    plt.figure(figsize=(15, 5))
    # Panel 1: predicted vs true values.
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred')
    # Panel 2: residuals vs true values.
    ax_132 = plt.subplot(1, 3, 2)
    # BUG FIX: the accepted points were plotted as (y, y_pred) although this
    # axis is labelled 'y-y_pred'; plot the residuals, matching the outliers.
    plt.plot(y, y - y_pred, '.')
    plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y-y_pred')
    # Panel 3: histogram of the z statistic, outliers overlaid in red.
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('z')
    plt.savefig('outliers.png')
    return outliers
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
# Features are every column except the last; the last column is 'target'.
X_train = train_data.iloc[:,0:-1]
y_train = train_data.iloc[:,-1]
# Flag outliers of a Ridge regression fit on the whole training set.
outliers = find_outliers(Ridge(),X_train, y_train)
R2= 0.8890858938210386
mse= 0.10734857773123635
------------------------------------------
mean of residuals: 6.558311911393757e-17
std of residuals: 0.32769766731934985
------------------------------------------
31 outliners:
[321, 348, 376, 777, 884, 1145, 1164, 1310, 1458, 1466, 1484, 1523, 1704, 1874, 1879, 1979, 2002, 2279, 2528, 2620, 2645, 2647, 2667, 2668, 2669, 2696, 2767, 2769, 2807, 2842, 2863]
# Histogram and Q-Q plot for V0.
plt.figure(figsize=(10, 5))
ax = plt.subplot(1,2,1)
# Histogram with a fitted normal density overlaid.
sns.distplot(train_data['V0'], fit = stats.norm)
ax = plt.subplot(1,2,2)
# Q-Q plot against the normal distribution.
res = stats.probplot(train_data['V0'], plot=plt)
# Histogram + Q-Q plot for every training column, two panels per feature.
train_cols = 6
train_rows = len(train_data.columns)
plt.figure(figsize=(4 * train_cols, 4 * train_rows))
for pos, col in enumerate(train_data.columns):
    # Left panel: distribution with a fitted normal curve.
    ax = plt.subplot(train_rows, train_cols, 2 * pos + 1)
    sns.distplot(train_data[col], fit=stats.norm)
    # Right panel: normal Q-Q plot.
    ax = plt.subplot(train_rows, train_cols, 2 * pos + 2)
    res = stats.probplot(train_data[col], plot=plt)
plt.tight_layout()
plt.show()
# Kernel density of V0: training vs test distribution on one axis.
plt.figure(figsize=(8,4), dpi=150)
ax = sns.kdeplot(train_data['V0'], color="Red", shade = True)
ax = sns.kdeplot(test_data['V0'], color='Blue', shade = True)
ax.set_xlabel('V0')
ax.legend(["train", "test"])
# Compare the train vs test KDE for every feature column.
dist_cols = 6
dist_rows = len(test_data.columns)
plt.figure(figsize=(4 * dist_cols, 4 * dist_rows))
for i, col in enumerate(test_data.columns, start=1):
    ax = plt.subplot(dist_rows, dist_cols, i)
    ax = sns.kdeplot(train_data[col], color='Red', shade=True)
    ax = sns.kdeplot(test_data[col], color='Blue', shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
plt.show()
# Linear-regression relationship between V0 and target.
fcols = 2
frows = 1
plt.figure(figsize=(8,4),dpi = 150 )
ax = plt.subplot(1,2,1)
# Scatter of V0 vs target with a fitted regression line.
sns.regplot(x='V0', y='target', data=train_data,ax=ax,
scatter_kws ={'marker':'.','s':3,'alpha':0.3},
line_kws={'color':'k'})
plt.xlabel('V0')
plt.ylabel('target')
ax = plt.subplot(1,2,2)
# Distribution of V0 itself.
sns.distplot(train_data['V0'].dropna())
plt.xlabel('V0')
plt.show()
# Regression plot + histogram of every feature against target.
fcols = 6
frows = len(test_data.columns)
plt.figure(figsize=(8 * fcols, 4 * frows))
i = 0
for col in test_data.columns:
    # Left panel: scatter vs target with a fitted regression line.
    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.regplot(x=col, y='target', data=train_data, ax=ax,
                scatter_kws={'marker': '.', 's': 3, 'alpha': 0.3},
                line_kws={'color': 'k'})
    plt.xlabel(col)
    plt.ylabel('target')
    # Right panel: distribution of the feature.
    i += 1
    ax = plt.subplot(frows, fcols, i)
    # BUG FIX: was sns.displot, a figure-level function that opens its own
    # figure and ignores the subplot grid; the axes-level sns.distplot
    # (as used for the single-feature V0 cell) was intended.
    sns.distplot(train_data[col].dropna())
    plt.xlabel(col)
# Correlation between the feature variables.
# Limit pandas display so the correlation matrix prints compactly.
pd.set_option('display.max_columns', 10)
# FIX: the canonical option key is 'display.max_rows' (plural); the
# truncated 'display.max_row' only worked through pandas' prefix matching.
pd.set_option('display.max_rows', 10)
# Drop the features whose train/test distributions differ, then correlate.
data_train1 = train_data.drop(['V5','V9','V11', 'V17','V22','V28'],axis=1)
train_corr = data_train1.corr()
train_corr
V0 | V1 | V2 | V3 | V4 | ... | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|
V0 | 1.000000 | 0.908607 | 0.463643 | 0.409576 | 0.781212 | ... | -0.019342 | 0.138933 | 0.231417 | -0.494076 | 0.873212 |
V1 | 0.908607 | 1.000000 | 0.506514 | 0.383924 | 0.657790 | ... | -0.029115 | 0.146329 | 0.235299 | -0.494043 | 0.871846 |
V2 | 0.463643 | 0.506514 | 1.000000 | 0.410148 | 0.057697 | ... | -0.025620 | 0.043648 | 0.316462 | -0.734956 | 0.638878 |
V3 | 0.409576 | 0.383924 | 0.410148 | 1.000000 | 0.315046 | ... | -0.031898 | 0.080034 | 0.324475 | -0.229613 | 0.512074 |
V4 | 0.781212 | 0.657790 | 0.057697 | 0.315046 | 1.000000 | ... | 0.028659 | 0.100010 | 0.113609 | -0.031054 | 0.603984 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
V34 | -0.019342 | -0.029115 | -0.025620 | -0.031898 | 0.028659 | ... | 1.000000 | 0.233616 | -0.019032 | -0.006854 | -0.006034 |
V35 | 0.138933 | 0.146329 | 0.043648 | 0.080034 | 0.100010 | ... | 0.233616 | 1.000000 | 0.025401 | -0.077991 | 0.140294 |
V36 | 0.231417 | 0.235299 | 0.316462 | 0.324475 | 0.113609 | ... | -0.019032 | 0.025401 | 1.000000 | -0.039478 | 0.319309 |
V37 | -0.494076 | -0.494043 | -0.734956 | -0.229613 | -0.031054 | ... | -0.006854 | -0.077991 | -0.039478 | 1.000000 | -0.565795 |
target | 0.873212 | 0.871846 | 0.638878 | 0.512074 | 0.603984 | ... | -0.006034 | 0.140294 | 0.319309 | -0.565795 | 1.000000 |
33 rows × 33 columns
# Draw the correlation heatmap.
ax = plt.subplots(figsize=(30,30)) # enlarge the canvas
# NOTE(review): correlation values lie in [-1, 1], so vmax=8 effectively
# disables the upper colour limit — vmax=0.8 (or 1) was probably intended;
# confirm before relying on the colour scale.
ax = sns.heatmap(train_corr, vmax=8,square=True,annot=True) # draw the heatmap
# Heatmap of the K features most correlated with target.
k = 10  # take the 10 largest correlations
cols = train_corr.nlargest(k, 'target')['target'].index
# (Removed an unused `cm = np.corrcoef(train_data[cols].values.T)` — it
# duplicated the .corr() computed below and was never referenced.)
hm = plt.subplots(figsize=(10, 10))
hm = sns.heatmap(train_data[cols].corr(), annot=True, square=True)
plt.show()
# Keep features whose |correlation| with target exceeds the threshold
# and draw the correlation heatmap of just those columns.
threshold = 0.5
corrmat = train_data.corr()
corrmat
top_corr_features = corrmat.index[corrmat["target"].abs() > threshold]
top_corr_features
plt.figure(figsize=(10, 10))
g = sns.heatmap(train_data[top_corr_features].corr(),
                cmap="RdYlGn",
                annot=True)
# Use a correlation threshold to pick features to remove.
threshold=0.5
# Absolute correlation matrix of the filtered training data.
corr_matrix = data_train1.corr().abs()
corr_matrix
# Columns weakly correlated with target — candidates to drop.
drop_col=corr_matrix[corr_matrix["target"]<threshold].index
drop_col
# data_all.drop(drop_col,axis=1,inplace=True)
Index(['V6', 'V7', 'V10', 'V13', 'V14', 'V15', 'V18', 'V19', 'V20', 'V21',
'V23', 'V24', 'V25', 'V26', 'V29', 'V30', 'V32', 'V33', 'V34', 'V35',
'V36'],
dtype='object')
# Box-Cox transform.
# When the continuous response is not normally distributed, the Box-Cox
# transform helps a linear model satisfy linearity, normality, independence
# and homoscedasticity without losing information.
# It can also reduce unobservable error and the correlation between
# predictors, which benefits fitting and feature-correlation analysis.
# The data must be normalised before the transform.
# Normalising the merged train+test data keeps both sets on the same scale;
# fine for offline analysis — in production, normalise with training
# statistics only.
drop_columns = ['V5','V9','V11','V17','V22','V28']
# Merge training and test features (the target column is excluded).
train_x = train_data.drop(['target'],axis=1)
# data_all = pd.concat([train_data,test_data],axis=0,ignore_index=True)
# NOTE(review): without ignore_index the row index repeats across the two
# sets (0..2887 then 0..1924) — confirm downstream code never relies on
# unique labels.
data_all = pd.concat([train_x,test_data])
data_all
V0 | V1 | V2 | V3 | V4 | ... | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.566 | 0.016 | -0.143 | 0.407 | 0.452 | ... | -4.627 | -4.789 | -5.101 | -2.608 | -3.508 |
1 | 0.968 | 0.437 | 0.066 | 0.566 | 0.194 | ... | -0.843 | 0.160 | 0.364 | -0.335 | -0.730 |
2 | 1.013 | 0.568 | 0.235 | 0.370 | 0.112 | ... | -0.843 | 0.160 | 0.364 | 0.765 | -0.589 |
3 | 0.733 | 0.368 | 0.283 | 0.165 | 0.599 | ... | -0.843 | -0.065 | 0.364 | 0.333 | -0.112 |
4 | 0.684 | 0.638 | 0.260 | 0.209 | 0.337 | ... | -0.843 | -0.215 | 0.364 | -0.280 | -0.028 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1920 | -1.362 | -1.553 | -3.096 | -0.444 | 0.381 | ... | -1.187 | -0.852 | -2.131 | -2.564 | 0.597 |
1921 | -2.698 | -3.452 | -3.620 | -1.066 | -1.385 | ... | -1.187 | -0.852 | -2.131 | -2.564 | 1.215 |
1922 | -2.615 | -3.564 | -3.402 | -0.422 | -1.272 | ... | -1.851 | -1.548 | -1.537 | -2.544 | 1.612 |
1923 | -2.661 | -3.646 | -3.271 | -0.699 | -1.270 | ... | -1.645 | -1.471 | -1.537 | -2.549 | 1.431 |
1924 | -2.321 | -3.037 | -3.214 | -1.594 | -0.910 | ... | -1.703 | -1.471 | -1.537 | -1.123 | 1.988 |
4813 rows × 38 columns
# Drop the features whose train/test distributions differ.
data_all.drop(drop_columns,axis=1,inplace=True)
data_all.head()
V0 | V1 | V2 | V3 | V4 | ... | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.566 | 0.016 | -0.143 | 0.407 | 0.452 | ... | -4.627 | -4.789 | -5.101 | -2.608 | -3.508 |
1 | 0.968 | 0.437 | 0.066 | 0.566 | 0.194 | ... | -0.843 | 0.160 | 0.364 | -0.335 | -0.730 |
2 | 1.013 | 0.568 | 0.235 | 0.370 | 0.112 | ... | -0.843 | 0.160 | 0.364 | 0.765 | -0.589 |
3 | 0.733 | 0.368 | 0.283 | 0.165 | 0.599 | ... | -0.843 | -0.065 | 0.364 | 0.333 | -0.112 |
4 | 0.684 | 0.638 | 0.260 | 0.209 | 0.337 | ... | -0.843 | -0.215 | 0.364 | -0.280 | -0.028 |
5 rows × 32 columns
# Min-max normalise every column of the merged data.
cols_numeric=list(data_all.columns)
cols_numeric
def scale_minmax(col):
    """Rescale a column linearly onto the [0, 1] range."""
    lo = col.min()
    return (col - lo) / (col.max() - lo)
# Apply min-max scaling column-wise, then check the summary statistics.
data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax,axis=0)
data_all[cols_numeric].describe()
V0 | V1 | V2 | V3 | V4 | ... | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | ... | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 |
mean | 0.694172 | 0.721357 | 0.602300 | 0.603139 | 0.523743 | ... | 0.458493 | 0.483790 | 0.762873 | 0.332385 | 0.545795 |
std | 0.144198 | 0.131443 | 0.140628 | 0.152462 | 0.106430 | ... | 0.099095 | 0.101020 | 0.102037 | 0.127456 | 0.150356 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.626676 | 0.679416 | 0.514414 | 0.503888 | 0.478182 | ... | 0.409037 | 0.454490 | 0.727273 | 0.270584 | 0.445647 |
50% | 0.729488 | 0.752497 | 0.617072 | 0.614270 | 0.535866 | ... | 0.454518 | 0.499949 | 0.800020 | 0.347056 | 0.539317 |
75% | 0.790195 | 0.799553 | 0.700464 | 0.710474 | 0.585036 | ... | 0.500000 | 0.511365 | 0.800020 | 0.414861 | 0.643061 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 32 columns
# Min-max scale the same feature columns of the training set alone.
# (Removed the redundant double selection: train_data[cols_numeric]
# already contains exactly those columns, so re-indexing with
# [cols_numeric] before apply() was a no-op.)
train_data_process = train_data[cols_numeric].apply(scale_minmax, axis=0)
# First 13 feature names — the left half for the Box-Cox panel figures.
cols_numeric_left=cols_numeric[0:13]
cols_numeric_left
['V0',
'V1',
'V2',
'V3',
'V4',
'V6',
'V7',
'V8',
'V10',
'V12',
'V13',
'V14',
'V15']
# Remaining feature names — the right half.
cols_numeric_right=cols_numeric[13:]
cols_numeric_right
['V16',
'V18',
'V19',
'V20',
'V21',
'V23',
'V24',
'V25',
'V26',
'V27',
'V29',
'V30',
'V31',
'V32',
'V33',
'V34',
'V35',
'V36',
'V37']
# Re-attach the (unscaled) target column to the scaled training features.
train_data_process = pd.concat([train_data_process,train_data['target']],axis=1)
train_data_process
V0 | V1 | V2 | V3 | V4 | ... | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.759139 | 0.729830 | 0.524488 | 0.680337 | 0.698964 | ... | 0.000000 | 0.074074 | 0.000000 | 0.018401 | 0.175 |
1 | 0.821406 | 0.789631 | 0.557939 | 0.705130 | 0.664244 | ... | 0.499949 | 0.755580 | 0.289702 | 0.437406 | 0.676 |
2 | 0.828377 | 0.808239 | 0.584987 | 0.674567 | 0.653210 | ... | 0.499949 | 0.755580 | 0.429901 | 0.458673 | 0.633 |
3 | 0.785006 | 0.779830 | 0.592670 | 0.642601 | 0.718746 | ... | 0.477220 | 0.755580 | 0.374841 | 0.530618 | 0.206 |
4 | 0.777416 | 0.818182 | 0.588988 | 0.649462 | 0.683488 | ... | 0.462067 | 0.755580 | 0.296712 | 0.543288 | 0.384 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2883 | 0.700898 | 0.724006 | 0.525288 | 0.641977 | 0.718880 | ... | 0.481059 | 0.666667 | 0.405812 | 0.650980 | 0.235 |
2884 | 0.750000 | 0.806676 | 0.594750 | 0.645408 | 0.709460 | ... | 0.534094 | 0.666667 | 0.254015 | 0.490196 | 1.042 |
2885 | 0.610440 | 0.625142 | 0.469750 | 0.629970 | 0.656439 | ... | 0.534094 | 0.666667 | 0.453607 | 0.660332 | 0.005 |
2886 | 0.637546 | 0.687500 | 0.492318 | 0.609231 | 0.698560 | ... | 0.545409 | 0.681506 | 0.294035 | 0.631222 | 0.350 |
2887 | 0.728470 | 0.781534 | 0.511364 | 0.609231 | 0.689140 | ... | 0.482877 | 0.686495 | 0.260133 | 0.606033 | 0.417 |
2888 rows × 33 columns
# For each feature in the left half draw six panels: the original
# distribution / Q-Q plot / scatter vs target, then the same three
# after a Box-Cox transform.
fcols = 6
frows = len(cols_numeric_left)
plt.figure(figsize=(4 * fcols, 4 * frows))
for row, var in enumerate(cols_numeric_left):
    dat = train_data_process[[var, 'target']].dropna()
    base = row * fcols  # first subplot slot of this feature's row
    # 1) original distribution with a fitted normal curve
    plt.subplot(frows, fcols, base + 1)
    sns.distplot(dat[var], fit=stats.norm)
    plt.title(var + 'Original')
    plt.xlabel('')
    # 2) original Q-Q plot, skewness shown in the title
    plt.subplot(frows, fcols, base + 2)
    _ = stats.probplot(dat[var], plot=plt)
    plt.title('skew' + '{:.4f}'.format(stats.skew(dat[var])))
    plt.xlabel('')
    plt.ylabel('')
    # 3) original scatter against target with its correlation coefficient
    plt.subplot(frows, fcols, base + 3)
    plt.plot(dat[var], dat['target'], '.', alpha=0.5)
    plt.title('corr=' +
              '{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))
    # 4) Box-Cox transformed distribution (+1 shift keeps values positive,
    #    as boxcox requires; data is already min-max scaled to [0, 1])
    plt.subplot(frows, fcols, base + 4)
    trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(var + 'Tramsformed')
    plt.xlabel('')
    # 5) transformed Q-Q plot
    plt.subplot(frows, fcols, base + 5)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')
    # 6) transformed scatter against target
    plt.subplot(frows, fcols, base + 6)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' +
              '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))