Data Analysis: Data Exploration in Practice (Industrial Steam Prediction)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Seaborn is a higher-level API built on top of matplotlib. It makes plotting easier, and in most cases seaborn alone produces attractive figures.
import seaborn as sns
from scipy import stats

import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# data file paths
train_data_file = "./data/zhengqi_train.txt"
test_data_file = "./data/zhengqi_test.txt"

# read the data
# sep: field separator; the default is ','
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
# inspect the training set
# 39 columns, 2888 rows, no missing values, all columns are float64
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2888 entries, 0 to 2887
Data columns (total 39 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V0      2888 non-null   float64
 1   V1      2888 non-null   float64
 2   V2      2888 non-null   float64
 3   V3      2888 non-null   float64
 4   V4      2888 non-null   float64
 5   V5      2888 non-null   float64
 6   V6      2888 non-null   float64
 7   V7      2888 non-null   float64
 8   V8      2888 non-null   float64
 9   V9      2888 non-null   float64
 10  V10     2888 non-null   float64
 11  V11     2888 non-null   float64
 12  V12     2888 non-null   float64
 13  V13     2888 non-null   float64
 14  V14     2888 non-null   float64
 15  V15     2888 non-null   float64
 16  V16     2888 non-null   float64
 17  V17     2888 non-null   float64
 18  V18     2888 non-null   float64
 19  V19     2888 non-null   float64
 20  V20     2888 non-null   float64
 21  V21     2888 non-null   float64
 22  V22     2888 non-null   float64
 23  V23     2888 non-null   float64
 24  V24     2888 non-null   float64
 25  V25     2888 non-null   float64
 26  V26     2888 non-null   float64
 27  V27     2888 non-null   float64
 28  V28     2888 non-null   float64
 29  V29     2888 non-null   float64
 30  V30     2888 non-null   float64
 31  V31     2888 non-null   float64
 32  V32     2888 non-null   float64
 33  V33     2888 non-null   float64
 34  V34     2888 non-null   float64
 35  V35     2888 non-null   float64
 36  V36     2888 non-null   float64
 37  V37     2888 non-null   float64
 38  target  2888 non-null   float64
dtypes: float64(39)
memory usage: 880.1 KB
# inspect the test set
# 38 columns, 1925 rows, no missing values, all columns are float64
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1925 entries, 0 to 1924
Data columns (total 38 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V0      1925 non-null   float64
 1   V1      1925 non-null   float64
 2   V2      1925 non-null   float64
 3   V3      1925 non-null   float64
 4   V4      1925 non-null   float64
 5   V5      1925 non-null   float64
 6   V6      1925 non-null   float64
 7   V7      1925 non-null   float64
 8   V8      1925 non-null   float64
 9   V9      1925 non-null   float64
 10  V10     1925 non-null   float64
 11  V11     1925 non-null   float64
 12  V12     1925 non-null   float64
 13  V13     1925 non-null   float64
 14  V14     1925 non-null   float64
 15  V15     1925 non-null   float64
 16  V16     1925 non-null   float64
 17  V17     1925 non-null   float64
 18  V18     1925 non-null   float64
 19  V19     1925 non-null   float64
 20  V20     1925 non-null   float64
 21  V21     1925 non-null   float64
 22  V22     1925 non-null   float64
 23  V23     1925 non-null   float64
 24  V24     1925 non-null   float64
 25  V25     1925 non-null   float64
 26  V26     1925 non-null   float64
 27  V27     1925 non-null   float64
 28  V28     1925 non-null   float64
 29  V29     1925 non-null   float64
 30  V30     1925 non-null   float64
 31  V31     1925 non-null   float64
 32  V32     1925 non-null   float64
 33  V33     1925 non-null   float64
 34  V34     1925 non-null   float64
 35  V35     1925 non-null   float64
 36  V36     1925 non-null   float64
 37  V37     1925 non-null   float64
dtypes: float64(38)
memory usage: 571.6 KB
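As a quick sanity check (a minimal sketch I've added, not part of the original notebook), we can confirm that the two files share the same 38 feature columns and really contain no missing values:

# Sketch: verify column alignment and missingness (assumes train_data / test_data from above)
feature_cols = [c for c in train_data.columns if c != 'target']
print(feature_cols == list(test_data.columns))                          # expect True
print(train_data.isnull().sum().sum(), test_data.isnull().sum().sum())  # expect 0 0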
# summary statistics of the training set
# count: number of valid (non-null) values in each column
# unique: number of distinct values (only shown for non-numeric columns)
# mean: mean
# std: standard deviation
# min: minimum
# 25%: first quartile
# 50%: median
# 75%: third quartile
# max: maximum
train_data.describe(include='all')  # include='all' summarizes every column; without it only numeric columns are described
             V0           V1           V2           V3           V4           V5           V6           V7           V8           V9  ...          V29          V30          V31          V32          V33          V34          V35          V36          V37       target
count  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000  ...  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000  2888.000000
mean   0.123048  0.056068  0.289720  -0.067790  0.012921  -0.558565  0.182892  0.116155  0.177856  -0.169452  ...  0.097648  0.055477  0.127791  0.020806  0.007801  0.006715  0.197764  0.030658  -0.130330  0.126353
std    0.928031  0.941515  0.911236  0.970298  0.888377  0.517957  0.918054  0.955116  0.895444  0.953813  ...  1.061200  0.901934  0.873028  0.902584  1.006995  1.003291  0.985675  0.970812  1.017196  0.983966
min    -4.335000  -5.122000  -3.420000  -3.956000  -4.742000  -2.182000  -4.576000  -5.048000  -4.692000  -12.891000  ...  -2.912000  -4.507000  -5.859000  -4.053000  -4.627000  -4.789000  -5.695000  -2.608000  -3.630000  -3.044000
25%    -0.297000  -0.226250  -0.313000  -0.652250  -0.385000  -0.853000  -0.310000  -0.295000  -0.159000  -0.390000  ...  -0.664000  -0.283000  -0.170250  -0.407250  -0.499000  -0.290000  -0.202500  -0.413000  -0.798250  -0.350250
50%    0.359000  0.272500  0.386000  -0.044500  0.110000  -0.466000  0.388000  0.344000  0.362000  0.042000  ...  -0.023000  0.053500  0.299500  0.039000  -0.040000  0.160000  0.364000  0.137000  -0.185500  0.313000
75%    0.726000  0.599000  0.918250  0.624000  0.550250  -0.154000  0.831250  0.782250  0.726000  0.042000  ...  0.745250  0.488000  0.635000  0.557000  0.462000  0.273000  0.602000  0.644250  0.495250  0.793250
max    2.121000  1.918000  2.828000  2.457000  2.689000  0.489000  1.895000  1.918000  2.245000  1.335000  ...  4.580000  2.689000  2.013000  2.395000  5.465000  5.110000  2.324000  5.238000  3.000000  2.538000

8 rows × 39 columns

test_data.describe()
             V0           V1           V2           V3           V4           V5           V6           V7           V8           V9  ...          V28          V29          V30          V31          V32          V33          V34          V35          V36          V37
count  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000  ...  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000  1925.000000
mean   -0.184404  -0.083912  -0.434762  0.101671  -0.019172  0.838049  -0.274092  -0.173971  -0.266709  0.255114  ...  -0.206871  -0.146463  -0.083215  -0.191729  -0.030782  -0.011433  -0.009985  -0.296895  -0.046270  0.195735
std    1.073333  1.076670  0.969541  1.034925  1.147286  0.963043  1.054119  1.040101  1.085916  1.014394  ...  1.064140  0.880593  1.126414  1.138454  1.130228  0.989732  0.995213  0.946896  1.040854  0.940599
min    -4.814000  -5.488000  -4.283000  -3.276000  -4.921000  -1.168000  -5.649000  -5.625000  -6.059000  -6.784000  ...  -2.435000  -2.413000  -4.507000  -7.698000  -4.057000  -4.627000  -4.789000  -7.477000  -2.608000  -3.346000
25%    -0.664000  -0.451000  -0.978000  -0.644000  -0.497000  0.122000  -0.732000  -0.509000  -0.775000  -0.390000  ...  -0.453000  -0.818000  -0.339000  -0.476000  -0.472000  -0.460000  -0.290000  -0.349000  -0.593000  -0.432000
50%    0.065000  0.195000  -0.267000  0.220000  0.118000  0.437000  -0.082000  0.018000  -0.004000  0.401000  ...  -0.445000  -0.199000  0.010000  0.100000  0.155000  -0.040000  0.160000  -0.270000  0.083000  0.152000
75%    0.549000  0.589000  0.278000  0.793000  0.610000  1.928000  0.457000  0.515000  0.482000  0.904000  ...  -0.434000  0.468000  0.447000  0.471000  0.627000  0.419000  0.273000  0.364000  0.651000  0.797000
max    2.100000  2.120000  1.946000  2.603000  4.475000  3.176000  1.528000  1.394000  2.408000  1.766000  ...  4.656000  3.022000  3.139000  1.428000  2.299000  5.465000  5.110000  1.671000  2.861000  3.021000

8 rows × 38 columns

# first five rows of the training set
train_data.head()
      V0     V1     V2     V3     V4     V5     V6     V7     V8     V9  ...    V29    V30    V31    V32    V33    V34    V35    V36    V37  target
0  0.566  0.016  -0.143  0.407  0.452  -0.901  -1.812  -2.360  -0.436  -2.114  ...  0.136  0.109  -0.615  0.327  -4.627  -4.789  -5.101  -2.608  -3.508  0.175
1  0.968  0.437  0.066  0.566  0.194  -0.893  -1.566  -2.360  0.332  -2.114  ...  -0.128  0.124  0.032  0.600  -0.843  0.160  0.364  -0.335  -0.730  0.676
2  1.013  0.568  0.235  0.370  0.112  -0.797  -1.367  -2.360  0.396  -2.114  ...  -0.009  0.361  0.277  -0.116  -0.843  0.160  0.364  0.765  -0.589  0.633
3  0.733  0.368  0.283  0.165  0.599  -0.679  -1.200  -2.086  0.403  -2.114  ...  0.015  0.417  0.279  0.603  -0.843  -0.065  0.364  0.333  -0.112  0.206
4  0.684  0.638  0.260  0.209  0.337  -0.454  -1.073  -2.086  0.314  -2.114  ...  0.183  1.078  0.328  0.418  -0.843  -0.215  0.364  -0.280  -0.028  0.384

5 rows × 39 columns

test_data.head()
      V0     V1     V2     V3     V4     V5     V6     V7     V8     V9  ...    V28    V29    V30    V31    V32    V33    V34    V35    V36    V37
0  0.368  0.380  -0.225  -0.049  0.379  0.092  0.550  0.551  0.244  0.904  ...  -0.449  0.047  0.057  -0.042  0.847  0.534  -0.009  -0.190  -0.567  0.388
1  0.148  0.489  -0.247  -0.049  0.122  -0.201  0.487  0.493  -0.127  0.904  ...  -0.443  0.047  0.560  0.176  0.551  0.046  -0.220  0.008  -0.294  0.104
2  -0.166  -0.062  -0.311  0.046  -0.055  0.063  0.485  0.493  -0.227  0.904  ...  -0.458  -0.398  0.101  0.199  0.634  0.017  -0.234  0.008  0.373  0.569
3  0.102  0.294  -0.259  0.051  -0.183  0.148  0.474  0.504  0.010  0.904  ...  -0.456  -0.398  1.007  0.137  1.042  -0.040  -0.290  0.008  -0.666  0.391
4  0.300  0.428  0.208  0.051  -0.033  0.116  0.408  0.497  0.155  0.904  ...  -0.458  -0.776  0.291  0.370  0.181  -0.040  -0.290  0.008  -0.140  -0.497

5 rows × 38 columns

# Box plot
# orient='v': draw the box vertically (the default 'h' draws it horizontally)
# showfliers=False: hide outlier points
# fliersize=15: outlier marker size (default is 5)
# flierprops = {'marker': 'o',            # outlier marker shape
#               'markerfacecolor': 'red', # marker fill color
#               'color': 'black',         # marker edge color
#              },

# showcaps=False: hide the caps at the ends of the whiskers
# capprops={'linestyle': '--', 'color': 'red'}: cap line properties
# whiskerprops={'linestyle': '--', 'color': 'red'}: whisker line properties
# notch=True: draw a notched box
# color='white': leave the box unfilled

# boxprops = {'color': 'red',      # box edge color
#             'facecolor': 'pink'  # box fill color
#            },                    # box properties

# showmeans=True: show the mean in the box plot
# meanprops = {'marker': 'D', 'markerfacecolor': 'red'}: mean marker properties
# meanline=True: draw the mean as a line
# meanprops = {'linestyle': '--', 'color': 'red'}: mean line properties

fig = plt.figure(figsize=(6, 4))  # set the figure width and height

sns.boxplot(train_data['V0'],
            orient='v',
            showfliers=True,
            fliersize=4,
            width=0.5,
            flierprops={
                'marker': 'o',
                'markerfacecolor': 'red',
                'color': 'black'
            },
            showcaps=True,
            capprops={
                'linestyle': '--',
                'color': 'red'
            },
            whiskerprops={
                'linestyle': '--',
                'color': 'red'
            },
            notch=True,                    # notched box
            color='white',                 # unfilled box
            boxprops={'color': 'red',      # box edge color
                      'facecolor': 'pink'  # box fill color
                      },
            showmeans=True,                # show the mean
            # meanprops={'marker': 'D', 'markerfacecolor': 'red'},  # mean marker properties
            meanline=True,                 # draw the mean as a line
            meanprops={'linestyle': '--', 'color': 'red'}           # mean line properties
            )


[Figure: box plot of V0]

column = train_data.columns.tolist()[:39]  # column names
fig = plt.figure(figsize=(80, 60), dpi=75)
for i in range(38):
    plt.subplot(7, 8, i + 1)  # grid of 7 rows x 8 columns of subplots
    sns.boxplot(train_data[column[i]], orient='v', width=0.5)  # box plot
    plt.ylabel(column[i], fontsize=36)
plt.show()


[Figure: box plots of all 38 features]
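The box plots can be complemented with numbers. Below is a minimal sketch I've added that applies the same 1.5 × IQR rule the whiskers use and counts potential outliers per column:

# Sketch: count values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for every column
Q1 = train_data.quantile(0.25)
Q3 = train_data.quantile(0.75)
IQR = Q3 - Q1
outlier_counts = ((train_data < Q1 - 1.5 * IQR) | (train_data > Q3 + 1.5 * IQR)).sum()
print(outlier_counts.sort_values(ascending=False).head(10))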

# Detect outliers based on a model's predictions
def find_outliers(model, X, y, sigma=3):
    # predict y with the model
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    # if prediction fails, fit the model first
    except:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)

    # residuals between the true y and the predictions
    resid = y - y_pred
    mean_resid = resid.mean()  # mean of the residuals
    std_resid = resid.std()    # standard deviation of the residuals

    # z-score of each residual; |z| > sigma marks an outlier
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index

    # print and plot the results
    print('R2=', model.score(X, y))
    print('mse=', mean_squared_error(y, y_pred))
    print('------------------------------------------')

    print('mean of residuals:', mean_resid)
    print('std of residuals: ', std_resid)
    print('------------------------------------------')

    print(len(outliers), 'outliers:')
    print(outliers.tolist())

    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred')

    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y - y_pred, '.')
    plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y - y_pred')

    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('z')
    plt.savefig('outliers.png')

    return outliers
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
X_train = train_data.iloc[:,0:-1]
y_train = train_data.iloc[:,-1]
outliers = find_outliers(Ridge(),X_train, y_train)
R2= 0.8890858938210386
mse= 0.10734857773123635
------------------------------------------
mean of residuals: 6.558311911393757e-17
std of residuals:  0.32769766731934985
------------------------------------------
31 outliers:
[321, 348, 376, 777, 884, 1145, 1164, 1310, 1458, 1466, 1484, 1523, 1704, 1874, 1879, 1979, 2002, 2279, 2528, 2620, 2645, 2647, 2667, 2668, 2669, 2696, 2767, 2769, 2807, 2842, 2863]

[Figure: predicted vs. true target, residuals, and z-score histogram with detected outliers highlighted]
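If we decide to discard these samples before modelling, a minimal follow-up sketch (my addition, reusing the outliers index returned above):

# Sketch: drop the detected outlier rows from the training data
X_train_clean = X_train.drop(index=outliers)
y_train_clean = y_train.drop(index=outliers)
print(X_train_clean.shape, y_train_clean.shape)  # expect 2888 - 31 = 2857 rows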

# Histogram and Q-Q plot of V0
plt.figure(figsize=(10, 5))

ax = plt.subplot(1,2,1)
sns.distplot(train_data['V0'], fit = stats.norm)
ax = plt.subplot(1,2,2)
res = stats.probplot(train_data['V0'], plot=plt)


[Figure: histogram and Q-Q plot of V0]

train_cols = 6
train_rows = len(train_data.columns)
plt.figure(figsize=(4*train_cols,4*train_rows))


i = 0
for col in train_data.columns:
    i+=1
    ax =plt.subplot(train_rows,train_cols,i)
    sns.distplot(train_data[col], fit=stats.norm)
    
    i+=1
    ax = plt.subplot(train_rows, train_cols,i)
    res = stats.probplot(train_data[col], plot=plt)
plt.tight_layout()
plt.show()


[Figure: histograms and Q-Q plots of all columns]
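The Q-Q plots can also be summarized numerically. A small sketch I've added that ranks columns by how far their skewness and kurtosis are from zero:

# Sketch: rank columns by absolute skewness and kurtosis
print(train_data.skew().abs().sort_values(ascending=False).head(10))
print(train_data.kurt().abs().sort_values(ascending=False).head(10))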

# KDE plots comparing the V0 distribution in the training and test sets
plt.figure(figsize=(8, 4), dpi=150)
ax = sns.kdeplot(train_data['V0'], color="Red", shade=True)
ax = sns.kdeplot(test_data['V0'], color='Blue', shade=True)
ax.set_xlabel('V0')
ax.legend(["train", "test"])

[Figure: train vs. test KDE of V0]

dist_cols = 6
dist_rows = len(test_data.columns)
plt.figure(figsize=(4*dist_cols,4*dist_rows))


i = 1
for col in test_data.columns:
    ax = plt.subplot(dist_rows,dist_cols,i)
    ax = sns.kdeplot(train_data[col], color='Red', shade = True)
    ax = sns.kdeplot(test_data[col], color='Blue', shade = True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
    i+=1
plt.show()


[Figure: train vs. test KDEs of all features]
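The visual comparison can be backed up with a two-sample Kolmogorov-Smirnov test. A minimal sketch I've added that flags the features whose train/test distributions differ most; this is one way to justify dropping V5, V9, V11, V17, V22 and V28 later on:

# Sketch: KS test per feature; small p-values mean the train and test distributions differ
ks_pvalues = {}
for col in test_data.columns:
    stat, p_value = stats.ks_2samp(train_data[col], test_data[col])
    ks_pvalues[col] = p_value
print(pd.Series(ks_pvalues).sort_values().head(10))  # most dissimilar features first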

# Linear-regression relationship between V0 and target
fcols = 2
frows = 1
plt.figure(figsize=(8,4),dpi = 150 )

ax = plt.subplot(1,2,1)
sns.regplot(x='V0', y='target', data=train_data,ax=ax,
           scatter_kws ={'marker':'.','s':3,'alpha':0.3},
           line_kws={'color':'k'})
plt.xlabel('V0')
plt.ylabel('target')

ax = plt.subplot(1,2,2)
sns.distplot(train_data['V0'].dropna())
plt.xlabel('V0')
plt.show()


[Figure: regression of target on V0 and the distribution of V0]

fcols = 6
frows = len(test_data.columns)
plt.figure(figsize=(8*fcols,4*frows))

i=0
for col in test_data.columns:
    i+=1
    ax = plt.subplot(frows,fcols,i)
    sns.regplot(x=col, y='target', data=train_data,ax=ax,
           scatter_kws ={'marker':'.','s':3,'alpha':0.3},
           line_kws={'color':'k'});
    plt.xlabel(col)
    plt.ylabel('target')
 
    i+=1
    ax = plt.subplot(frows,fcols,i)
    sns.distplot(train_data[col].dropna())
    plt.xlabel(col)
    


# Examine the correlations between the feature variables

# compute the correlation coefficients
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)
data_train1 = train_data.drop(['V5', 'V9', 'V11', 'V17', 'V22', 'V28'], axis=1)  # drop the features whose train/test distributions differ
train_corr = data_train1.corr()
train_corr
              V0         V1         V2         V3         V4  ...        V34        V35        V36        V37     target
V0      1.000000  0.908607  0.463643  0.409576  0.781212  ...  -0.019342  0.138933  0.231417  -0.494076  0.873212
V1      0.908607  1.000000  0.506514  0.383924  0.657790  ...  -0.029115  0.146329  0.235299  -0.494043  0.871846
V2      0.463643  0.506514  1.000000  0.410148  0.057697  ...  -0.025620  0.043648  0.316462  -0.734956  0.638878
V3      0.409576  0.383924  0.410148  1.000000  0.315046  ...  -0.031898  0.080034  0.324475  -0.229613  0.512074
V4      0.781212  0.657790  0.057697  0.315046  1.000000  ...  0.028659  0.100010  0.113609  -0.031054  0.603984
...          ...       ...       ...       ...       ...  ...       ...       ...       ...       ...       ...
V34     -0.019342  -0.029115  -0.025620  -0.031898  0.028659  ...  1.000000  0.233616  -0.019032  -0.006854  -0.006034
V35     0.138933  0.146329  0.043648  0.080034  0.100010  ...  0.233616  1.000000  0.025401  -0.077991  0.140294
V36     0.231417  0.235299  0.316462  0.324475  0.113609  ...  -0.019032  0.025401  1.000000  -0.039478  0.319309
V37     -0.494076  -0.494043  -0.734956  -0.229613  -0.031054  ...  -0.006854  -0.077991  -0.039478  1.000000  -0.565795
target  0.873212  0.871846  0.638878  0.512074  0.603984  ...  -0.006034  0.140294  0.319309  -0.565795  1.000000

33 rows × 33 columns
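Before drawing the heat map, the correlations with target can also be read off directly; a small sketch I've added:

# Sketch: features ranked by absolute correlation with target
print(train_corr['target'].drop('target').abs().sort_values(ascending=False).head(10))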

# Correlation heat map

fig, ax = plt.subplots(figsize=(30, 30))  # enlarge the canvas
ax = sns.heatmap(train_corr, vmax=0.8, square=True, annot=True)  # draw the heat map (vmax capped below 1 since correlations never exceed 1)


[Figure: correlation heat map of the retained features]

# Find the K features most strongly correlated with target

k = 10  # take the 10 largest correlations
cols = train_corr.nlargest(k, 'target')['target'].index

cm = np.corrcoef(train_data[cols].values.T)
fig, ax = plt.subplots(figsize=(10, 10))
hm = sns.heatmap(train_data[cols].corr(), annot=True, square=True)
plt.show()


[Figure: heat map of the 10 features most correlated with target]

# Find the features whose correlation coefficient with target exceeds 0.5
threshold = 0.5

corrmat = train_data.corr()
corrmat
top_corr_features = corrmat.index[abs(corrmat["target"]) > threshold]
top_corr_features
plt.figure(figsize=(10, 10))

g = sns.heatmap(train_data[top_corr_features].corr(),
                annot=True,
                cmap="RdYlGn")

[Figure: heat map of the features with |correlation with target| > 0.5]

# Remove features using a correlation-coefficient threshold

threshold = 0.5

# absolute correlation matrix
corr_matrix = data_train1.corr().abs()
corr_matrix
drop_col = corr_matrix[corr_matrix["target"] < threshold].index
drop_col
# data_all.drop(drop_col, axis=1, inplace=True)
Index(['V6', 'V7', 'V10', 'V13', 'V14', 'V15', 'V18', 'V19', 'V20', 'V21',
       'V23', 'V24', 'V25', 'V26', 'V29', 'V30', 'V32', 'V33', 'V34', 'V35',
       'V36'],
      dtype='object')
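The notebook keeps this drop commented out; if we did want to apply the filter, a minimal sketch (my addition):

# Sketch: keep only the features with |corr(feature, target)| >= threshold
selected = corr_matrix.index[corr_matrix["target"] >= threshold].drop("target")
train_selected = data_train1[list(selected) + ["target"]]
print(train_selected.shape)  # rows unchanged, fewer columns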
# Box-Cox transform
# When the continuous response variable is not normally distributed, a Box-Cox transform can be applied. It helps a linear regression model satisfy linearity, normality, independence and homoscedasticity without losing information.
# It can also reduce, to some extent, unobservable error and the correlation between predictors,
# which benefits fitting linear models and analysing feature relationships.
# The data should be normalized before the transform is applied.
# For normalization, the training and test data are merged so that both are scaled consistently. This is fine for offline analysis and modelling; for online deployment only the training-data normalization should be used.
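For reference, the Box-Cox transform with parameter lam maps a positive x to (x**lam - 1) / lam when lam != 0 and to log(x) when lam == 0; scipy.stats.boxcox estimates lam by maximum likelihood. A tiny sketch of the definition (my addition):

# Sketch: the Box-Cox transform itself (x must be strictly positive)
def boxcox_manual(x, lam):
    return np.log(x) if lam == 0 else (x ** lam - 1) / lam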
drop_columns = ['V5', 'V9', 'V11', 'V17', 'V22', 'V28']
# merge the training and test data

train_x = train_data.drop(['target'], axis=1)

# data_all = pd.concat([train_data, test_data], axis=0, ignore_index=True)
data_all = pd.concat([train_x, test_data])
data_all
         V0     V1     V2     V3     V4  ...    V33    V34    V35    V36    V37
0     0.566  0.016  -0.143  0.407  0.452  ...  -4.627  -4.789  -5.101  -2.608  -3.508
1     0.968  0.437  0.066  0.566  0.194  ...  -0.843  0.160  0.364  -0.335  -0.730
2     1.013  0.568  0.235  0.370  0.112  ...  -0.843  0.160  0.364  0.765  -0.589
3     0.733  0.368  0.283  0.165  0.599  ...  -0.843  -0.065  0.364  0.333  -0.112
4     0.684  0.638  0.260  0.209  0.337  ...  -0.843  -0.215  0.364  -0.280  -0.028
...     ...    ...     ...    ...    ...  ...     ...     ...     ...     ...     ...
1920  -1.362  -1.553  -3.096  -0.444  0.381  ...  -1.187  -0.852  -2.131  -2.564  0.597
1921  -2.698  -3.452  -3.620  -1.066  -1.385  ...  -1.187  -0.852  -2.131  -2.564  1.215
1922  -2.615  -3.564  -3.402  -0.422  -1.272  ...  -1.851  -1.548  -1.537  -2.544  1.612
1923  -2.661  -3.646  -3.271  -0.699  -1.270  ...  -1.645  -1.471  -1.537  -2.549  1.431
1924  -2.321  -3.037  -3.214  -1.594  -0.910  ...  -1.703  -1.471  -1.537  -1.123  1.988

4813 rows × 38 columns

data_all.drop(drop_columns,axis=1,inplace=True)
data_all.head()
      V0     V1     V2     V3     V4  ...    V33    V34    V35    V36    V37
0  0.566  0.016  -0.143  0.407  0.452  ...  -4.627  -4.789  -5.101  -2.608  -3.508
1  0.968  0.437  0.066  0.566  0.194  ...  -0.843  0.160  0.364  -0.335  -0.730
2  1.013  0.568  0.235  0.370  0.112  ...  -0.843  0.160  0.364  0.765  -0.589
3  0.733  0.368  0.283  0.165  0.599  ...  -0.843  -0.065  0.364  0.333  -0.112
4  0.684  0.638  0.260  0.209  0.337  ...  -0.843  -0.215  0.364  -0.280  -0.028

5 rows × 32 columns
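After the merged frame has been processed, it can be split back into its train and test parts by row count; a minimal sketch (my addition, relying on the concatenation order above):

# Sketch: recover the train / test halves of the merged frame
n_train = len(train_data)
train_part = data_all.iloc[:n_train].copy()
test_part = data_all.iloc[n_train:].copy()
print(train_part.shape, test_part.shape)  # expect (2888, 32) and (1925, 32)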

# Min-max normalize every column of the merged data
cols_numeric=list(data_all.columns)
cols_numeric


def scale_minmax(col):
    return (col-col.min())/(col.max()-col.min())

data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax,axis=0)
data_all[cols_numeric].describe()

             V0           V1           V2           V3           V4  ...          V33          V34          V35          V36          V37
count  4813.000000  4813.000000  4813.000000  4813.000000  4813.000000  ...  4813.000000  4813.000000  4813.000000  4813.000000  4813.000000
mean   0.694172  0.721357  0.602300  0.603139  0.523743  ...  0.458493  0.483790  0.762873  0.332385  0.545795
std    0.144198  0.131443  0.140628  0.152462  0.106430  ...  0.099095  0.101020  0.102037  0.127456  0.150356
min    0.000000  0.000000  0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000  0.000000  0.000000
25%    0.626676  0.679416  0.514414  0.503888  0.478182  ...  0.409037  0.454490  0.727273  0.270584  0.445647
50%    0.729488  0.752497  0.617072  0.614270  0.535866  ...  0.454518  0.499949  0.800020  0.347056  0.539317
75%    0.790195  0.799553  0.700464  0.710474  0.585036  ...  0.500000  0.511365  0.800020  0.414861  0.643061
max    1.000000  1.000000  1.000000  1.000000  1.000000  ...  1.000000  1.000000  1.000000  1.000000  1.000000

8 rows × 32 columns
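The Box-Cox notes above point out that for online deployment the scaling statistics should come from the training data alone. A minimal sketch of that variant with scikit-learn's MinMaxScaler (my addition; the notebook itself scales the merged data):

# Sketch: fit the scaler on the training features only, then reuse it on the test set
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train_x[cols_numeric])
test_scaled = scaler.transform(test_data[cols_numeric])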

train_data_process = train_data[cols_numeric]
train_data_process = train_data_process[cols_numeric].apply(scale_minmax,axis=0)
cols_numeric_left=cols_numeric[0:13]
cols_numeric_left
['V0',
 'V1',
 'V2',
 'V3',
 'V4',
 'V6',
 'V7',
 'V8',
 'V10',
 'V12',
 'V13',
 'V14',
 'V15']
cols_numeric_right=cols_numeric[13:]
cols_numeric_right
['V16',
 'V18',
 'V19',
 'V20',
 'V21',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V29',
 'V30',
 'V31',
 'V32',
 'V33',
 'V34',
 'V35',
 'V36',
 'V37']
train_data_process = pd.concat([train_data_process,train_data['target']],axis=1)
train_data_process
            V0        V1        V2        V3        V4  ...       V34       V35       V36       V37  target
0     0.759139  0.729830  0.524488  0.680337  0.698964  ...  0.000000  0.074074  0.000000  0.018401  0.175
1     0.821406  0.789631  0.557939  0.705130  0.664244  ...  0.499949  0.755580  0.289702  0.437406  0.676
2     0.828377  0.808239  0.584987  0.674567  0.653210  ...  0.499949  0.755580  0.429901  0.458673  0.633
3     0.785006  0.779830  0.592670  0.642601  0.718746  ...  0.477220  0.755580  0.374841  0.530618  0.206
4     0.777416  0.818182  0.588988  0.649462  0.683488  ...  0.462067  0.755580  0.296712  0.543288  0.384
...        ...       ...       ...       ...       ...  ...       ...       ...       ...       ...     ...
2883  0.700898  0.724006  0.525288  0.641977  0.718880  ...  0.481059  0.666667  0.405812  0.650980  0.235
2884  0.750000  0.806676  0.594750  0.645408  0.709460  ...  0.534094  0.666667  0.254015  0.490196  1.042
2885  0.610440  0.625142  0.469750  0.629970  0.656439  ...  0.534094  0.666667  0.453607  0.660332  0.005
2886  0.637546  0.687500  0.492318  0.609231  0.698560  ...  0.545409  0.681506  0.294035  0.631222  0.350
2887  0.728470  0.781534  0.511364  0.609231  0.689140  ...  0.482877  0.686495  0.260133  0.606033  0.417

2888 rows × 33 columns

fcols = 6
frows = len(cols_numeric_left)
plt.figure(figsize=(4*fcols, 4*frows))

i=0

for var in cols_numeric_left:
    dat = train_data_process[[var,'target']].dropna()
    i+=1
    plt.subplot(frows,fcols,i)
    sns.distplot(dat[var], fit=stats.norm)
    plt.title(var + ' Original')
    plt.xlabel('')
    
    i+=1
    plt.subplot(frows,fcols,i)
    _=stats.probplot(dat[var],plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[var])))
    plt.xlabel('')
    plt.ylabel('')
    
    i+=1
    plt.subplot(frows,fcols,i)
    plt.plot(dat[var],dat['target'],'.',alpha=0.5)
    plt.title('corr='+
             '{:.2f}'.format(np.corrcoef(dat[var],dat['target'])[0][1]))
    
    i+=1
    plt.subplot(frows,fcols,i)
    trans_var,lambda_var = stats.boxcox(dat[var].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var,fit=stats.norm)
    plt.title(var + ' Transformed')
    plt.xlabel('')
    
    i+=1
    plt.subplot(frows,fcols,i)
    _=stats.probplot(trans_var,plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')
    
    i+=1
    plt.subplot(frows,fcols,i)
    plt.plot(trans_var,dat['target'],'.',alpha=0.5)
    plt.title('corr='+
             '{:.2f}'.format(np.corrcoef(trans_var,dat['target'])[0][1]))
    


[Figure: per-feature original vs. Box-Cox transformed distributions, Q-Q plots, and correlation with target]
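The loop above only visualizes the effect of the transform. To actually apply it to every feature, a minimal sketch (my addition, using the same +1 shift to keep the min-max-scaled values strictly positive):

# Sketch: Box-Cox transform every normalized feature column of the merged data
for col in cols_numeric:
    data_all[col], lam = stats.boxcox(data_all[col] + 1)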
