import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Seaborn其实是在matplotlib的基础上进行了更高级的API封装,从而使得作图更加容易,在大多数情况下使用seaborn就能做出很具有吸引力的图。
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
# Data file paths.
train_data_file = "./data/zhengqi_train.txt"
test_data_file = "./data/zhengqi_test.txt"
# Load the data.
# sep: field delimiter; the default is ','
train_data = pd.read_csv(train_data_file, sep = '\t', encoding = 'utf-8')
test_data = pd.read_csv(test_data_file, sep = '\t', encoding = 'utf-8')
# Inspect the training set:
# 39 columns, 2888 rows, no missing values, all float64.
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2888 entries, 0 to 2887
Data columns (total 39 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 V0 2888 non-null float64
1 V1 2888 non-null float64
2 V2 2888 non-null float64
3 V3 2888 non-null float64
4 V4 2888 non-null float64
5 V5 2888 non-null float64
6 V6 2888 non-null float64
7 V7 2888 non-null float64
8 V8 2888 non-null float64
9 V9 2888 non-null float64
10 V10 2888 non-null float64
11 V11 2888 non-null float64
12 V12 2888 non-null float64
13 V13 2888 non-null float64
14 V14 2888 non-null float64
15 V15 2888 non-null float64
16 V16 2888 non-null float64
17 V17 2888 non-null float64
18 V18 2888 non-null float64
19 V19 2888 non-null float64
20 V20 2888 non-null float64
21 V21 2888 non-null float64
22 V22 2888 non-null float64
23 V23 2888 non-null float64
24 V24 2888 non-null float64
25 V25 2888 non-null float64
26 V26 2888 non-null float64
27 V27 2888 non-null float64
28 V28 2888 non-null float64
29 V29 2888 non-null float64
30 V30 2888 non-null float64
31 V31 2888 non-null float64
32 V32 2888 non-null float64
33 V33 2888 non-null float64
34 V34 2888 non-null float64
35 V35 2888 non-null float64
36 V36 2888 non-null float64
37 V37 2888 non-null float64
38 target 2888 non-null float64
dtypes: float64(39)
memory usage: 880.1 KB
# Inspect the test set (the original comment said "training set / 39 columns"):
# 38 columns (no 'target'), 1925 rows, no missing values, all float64.
test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1925 entries, 0 to 1924
Data columns (total 38 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 V0 1925 non-null float64
1 V1 1925 non-null float64
2 V2 1925 non-null float64
3 V3 1925 non-null float64
4 V4 1925 non-null float64
5 V5 1925 non-null float64
6 V6 1925 non-null float64
7 V7 1925 non-null float64
8 V8 1925 non-null float64
9 V9 1925 non-null float64
10 V10 1925 non-null float64
11 V11 1925 non-null float64
12 V12 1925 non-null float64
13 V13 1925 non-null float64
14 V14 1925 non-null float64
15 V15 1925 non-null float64
16 V16 1925 non-null float64
17 V17 1925 non-null float64
18 V18 1925 non-null float64
19 V19 1925 non-null float64
20 V20 1925 non-null float64
21 V21 1925 non-null float64
22 V22 1925 non-null float64
23 V23 1925 non-null float64
24 V24 1925 non-null float64
25 V25 1925 non-null float64
26 V26 1925 non-null float64
27 V27 1925 non-null float64
28 V28 1925 non-null float64
29 V29 1925 non-null float64
30 V30 1925 non-null float64
31 V31 1925 non-null float64
32 V32 1925 non-null float64
33 V33 1925 non-null float64
34 V34 1925 non-null float64
35 V35 1925 non-null float64
36 V36 1925 non-null float64
37 V37 1925 non-null float64
dtypes: float64(38)
memory usage: 571.6 KB
# Summary statistics of the training set:
# count: number of non-null values in each column
# unique: number of distinct values (shown for non-numeric columns only)
# std: standard deviation
# min / max: extremes
# 25% / 50% / 75%: quartiles (lower quartile, median, upper quartile)
# mean: average
train_data.describe(include='all') # include='all' summarises every column; without it only numeric columns are described
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | ... | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 |
mean | 0.123048 | 0.056068 | 0.289720 | -0.067790 | 0.012921 | -0.558565 | 0.182892 | 0.116155 | 0.177856 | -0.169452 | ... | 0.097648 | 0.055477 | 0.127791 | 0.020806 | 0.007801 | 0.006715 | 0.197764 | 0.030658 | -0.130330 | 0.126353 |
std | 0.928031 | 0.941515 | 0.911236 | 0.970298 | 0.888377 | 0.517957 | 0.918054 | 0.955116 | 0.895444 | 0.953813 | ... | 1.061200 | 0.901934 | 0.873028 | 0.902584 | 1.006995 | 1.003291 | 0.985675 | 0.970812 | 1.017196 | 0.983966 |
min | -4.335000 | -5.122000 | -3.420000 | -3.956000 | -4.742000 | -2.182000 | -4.576000 | -5.048000 | -4.692000 | -12.891000 | ... | -2.912000 | -4.507000 | -5.859000 | -4.053000 | -4.627000 | -4.789000 | -5.695000 | -2.608000 | -3.630000 | -3.044000 |
25% | -0.297000 | -0.226250 | -0.313000 | -0.652250 | -0.385000 | -0.853000 | -0.310000 | -0.295000 | -0.159000 | -0.390000 | ... | -0.664000 | -0.283000 | -0.170250 | -0.407250 | -0.499000 | -0.290000 | -0.202500 | -0.413000 | -0.798250 | -0.350250 |
50% | 0.359000 | 0.272500 | 0.386000 | -0.044500 | 0.110000 | -0.466000 | 0.388000 | 0.344000 | 0.362000 | 0.042000 | ... | -0.023000 | 0.053500 | 0.299500 | 0.039000 | -0.040000 | 0.160000 | 0.364000 | 0.137000 | -0.185500 | 0.313000 |
75% | 0.726000 | 0.599000 | 0.918250 | 0.624000 | 0.550250 | -0.154000 | 0.831250 | 0.782250 | 0.726000 | 0.042000 | ... | 0.745250 | 0.488000 | 0.635000 | 0.557000 | 0.462000 | 0.273000 | 0.602000 | 0.644250 | 0.495250 | 0.793250 |
max | 2.121000 | 1.918000 | 2.828000 | 2.457000 | 2.689000 | 0.489000 | 1.895000 | 1.918000 | 2.245000 | 1.335000 | ... | 4.580000 | 2.689000 | 2.013000 | 2.395000 | 5.465000 | 5.110000 | 2.324000 | 5.238000 | 3.000000 | 2.538000 |
8 rows × 39 columns
# Summary statistics of the test set (numeric columns).
test_data.describe()
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V28 | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | ... | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 | 1925.000000 |
mean | -0.184404 | -0.083912 | -0.434762 | 0.101671 | -0.019172 | 0.838049 | -0.274092 | -0.173971 | -0.266709 | 0.255114 | ... | -0.206871 | -0.146463 | -0.083215 | -0.191729 | -0.030782 | -0.011433 | -0.009985 | -0.296895 | -0.046270 | 0.195735 |
std | 1.073333 | 1.076670 | 0.969541 | 1.034925 | 1.147286 | 0.963043 | 1.054119 | 1.040101 | 1.085916 | 1.014394 | ... | 1.064140 | 0.880593 | 1.126414 | 1.138454 | 1.130228 | 0.989732 | 0.995213 | 0.946896 | 1.040854 | 0.940599 |
min | -4.814000 | -5.488000 | -4.283000 | -3.276000 | -4.921000 | -1.168000 | -5.649000 | -5.625000 | -6.059000 | -6.784000 | ... | -2.435000 | -2.413000 | -4.507000 | -7.698000 | -4.057000 | -4.627000 | -4.789000 | -7.477000 | -2.608000 | -3.346000 |
25% | -0.664000 | -0.451000 | -0.978000 | -0.644000 | -0.497000 | 0.122000 | -0.732000 | -0.509000 | -0.775000 | -0.390000 | ... | -0.453000 | -0.818000 | -0.339000 | -0.476000 | -0.472000 | -0.460000 | -0.290000 | -0.349000 | -0.593000 | -0.432000 |
50% | 0.065000 | 0.195000 | -0.267000 | 0.220000 | 0.118000 | 0.437000 | -0.082000 | 0.018000 | -0.004000 | 0.401000 | ... | -0.445000 | -0.199000 | 0.010000 | 0.100000 | 0.155000 | -0.040000 | 0.160000 | -0.270000 | 0.083000 | 0.152000 |
75% | 0.549000 | 0.589000 | 0.278000 | 0.793000 | 0.610000 | 1.928000 | 0.457000 | 0.515000 | 0.482000 | 0.904000 | ... | -0.434000 | 0.468000 | 0.447000 | 0.471000 | 0.627000 | 0.419000 | 0.273000 | 0.364000 | 0.651000 | 0.797000 |
max | 2.100000 | 2.120000 | 1.946000 | 2.603000 | 4.475000 | 3.176000 | 1.528000 | 1.394000 | 2.408000 | 1.766000 | ... | 4.656000 | 3.022000 | 3.139000 | 1.428000 | 2.299000 | 5.465000 | 5.110000 | 1.671000 | 2.861000 | 3.021000 |
8 rows × 38 columns
# First 5 rows of the training set.
train_data.head()
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.566 | 0.016 | -0.143 | 0.407 | 0.452 | -0.901 | -1.812 | -2.360 | -0.436 | -2.114 | ... | 0.136 | 0.109 | -0.615 | 0.327 | -4.627 | -4.789 | -5.101 | -2.608 | -3.508 | 0.175 |
1 | 0.968 | 0.437 | 0.066 | 0.566 | 0.194 | -0.893 | -1.566 | -2.360 | 0.332 | -2.114 | ... | -0.128 | 0.124 | 0.032 | 0.600 | -0.843 | 0.160 | 0.364 | -0.335 | -0.730 | 0.676 |
2 | 1.013 | 0.568 | 0.235 | 0.370 | 0.112 | -0.797 | -1.367 | -2.360 | 0.396 | -2.114 | ... | -0.009 | 0.361 | 0.277 | -0.116 | -0.843 | 0.160 | 0.364 | 0.765 | -0.589 | 0.633 |
3 | 0.733 | 0.368 | 0.283 | 0.165 | 0.599 | -0.679 | -1.200 | -2.086 | 0.403 | -2.114 | ... | 0.015 | 0.417 | 0.279 | 0.603 | -0.843 | -0.065 | 0.364 | 0.333 | -0.112 | 0.206 |
4 | 0.684 | 0.638 | 0.260 | 0.209 | 0.337 | -0.454 | -1.073 | -2.086 | 0.314 | -2.114 | ... | 0.183 | 1.078 | 0.328 | 0.418 | -0.843 | -0.215 | 0.364 | -0.280 | -0.028 | 0.384 |
5 rows × 39 columns
# First 5 rows of the test set.
test_data.head()
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V28 | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.368 | 0.380 | -0.225 | -0.049 | 0.379 | 0.092 | 0.550 | 0.551 | 0.244 | 0.904 | ... | -0.449 | 0.047 | 0.057 | -0.042 | 0.847 | 0.534 | -0.009 | -0.190 | -0.567 | 0.388 |
1 | 0.148 | 0.489 | -0.247 | -0.049 | 0.122 | -0.201 | 0.487 | 0.493 | -0.127 | 0.904 | ... | -0.443 | 0.047 | 0.560 | 0.176 | 0.551 | 0.046 | -0.220 | 0.008 | -0.294 | 0.104 |
2 | -0.166 | -0.062 | -0.311 | 0.046 | -0.055 | 0.063 | 0.485 | 0.493 | -0.227 | 0.904 | ... | -0.458 | -0.398 | 0.101 | 0.199 | 0.634 | 0.017 | -0.234 | 0.008 | 0.373 | 0.569 |
3 | 0.102 | 0.294 | -0.259 | 0.051 | -0.183 | 0.148 | 0.474 | 0.504 | 0.010 | 0.904 | ... | -0.456 | -0.398 | 1.007 | 0.137 | 1.042 | -0.040 | -0.290 | 0.008 | -0.666 | 0.391 |
4 | 0.300 | 0.428 | 0.208 | 0.051 | -0.033 | 0.116 | 0.408 | 0.497 | 0.155 | 0.904 | ... | -0.458 | -0.776 | 0.291 | 0.370 | 0.181 | -0.040 | -0.290 | 0.008 | -0.140 | -0.497 |
5 rows × 38 columns
# Box plot of feature V0.
# seaborn.boxplot styling options used below:
#   orient='v'        vertical boxes ('h' would be horizontal)
#   showfliers        whether outlier points are drawn
#   fliersize         marker size for outliers (default 5)
#   flierprops        marker style for outliers
#   showcaps          whether the horizontal cap lines are drawn
#   capprops / whiskerprops  dashed red caps / whiskers
#   notch=True        notched box
#   color / boxprops  box fill and outline style
#   showmeans + meanline + meanprops  draw the mean as a dashed red line
fig = plt.figure(figsize=(6, 4))  # figure width and height
sns.boxplot(train_data['V0'],
            orient='v',
            showfliers=True,
            fliersize=4,
            width=0.5,
            flierprops={
                'marker': 'o',
                'markerfacecolor': 'red',
                # BUG FIX: was 'write', which is not a valid matplotlib
                # color name and raises ValueError; 'white' was intended.
                'color': 'white'
            },
            showcaps=True,
            capprops={
                'linestyle': '--',
                'color': 'red'
            },
            whiskerprops={
                'linestyle': '--',
                'color': 'red'
            },
            notch=True,            # notched box
            color='white',         # box not filled with the default palette
            boxprops={'color': 'red',        # box outline
                      'facecolor': 'pink'},  # box fill colour
            showmeans=True,        # show the mean on the plot
            # meanprops = {'marker':'D','markerfacecolor':'red'},  # mean marker style
            meanline=True,         # draw the mean as a line
            meanprops={'linestyle': '--', 'color': 'red'},  # mean-line style
            )
# One box plot per feature in a 7x8 grid of subplots.
column = train_data.columns.tolist()[:39]  # column headers
fig = plt.figure(figsize=(80, 60), dpi=75)
for idx, name in enumerate(column[:38], start=1):
    plt.subplot(7, 8, idx)  # 7 rows x 8 columns of subplots
    sns.boxplot(train_data[name], orient='v', width=0.5)
    plt.ylabel(name, fontsize=36)
plt.show()
# Detect outliers from a model's prediction residuals.
def find_outliers(model, X, y, sigma=3):
    """Return the index of samples whose standardized residual exceeds sigma.

    Fits ``model`` on (X, y) if it is not fitted yet, computes residuals
    y - model.predict(X), standardizes them, prints fit statistics and
    draws three diagnostic panels (saved to 'outliers.png').

    Parameters
    ----------
    model : estimator with fit/predict/score (sklearn-style)
    X, y  : features and target; y must be a pandas Series
    sigma : z-score threshold for flagging an outlier (default 3)
    """
    # Predict y; if the model has not been fitted yet, fit it first (EAFP).
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    except Exception:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)
    # Residuals between the true values and the predictions.
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    # Standardized residuals; |z| > sigma marks an outlier.
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index
    # Print and plot the results.
    print('R2=', model.score(X, y))
    print("mse=", mean_squared_error(y, y_pred))
    print('------------------------------------------')
    print('mean of residuals:', mean_resid)
    print('std of residuals: ', std_resid)
    print('------------------------------------------')
    print(len(outliers), 'outliers:')   # fixed typo: was 'outliners:'
    print(outliers.tolist())
    plt.figure(figsize=(15, 5))
    # Panel 1: predicted vs true values.
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred')
    # Panel 2: residuals vs true values.
    ax_132 = plt.subplot(1, 3, 2)
    # BUG FIX: the accepted points were plotted as (y, y_pred) although this
    # axis is labelled 'y-y_pred'; plot the residuals, matching the outliers.
    plt.plot(y, y - y_pred, '.')
    plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y-y_pred')
    # Panel 3: histogram of the z statistic, outliers overlaid in red.
    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('z')
    plt.savefig('outliers.png')
    return outliers
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
# Features are every column except the last; the last column is 'target'.
X_train = train_data.iloc[:,0:-1]
y_train = train_data.iloc[:,-1]
# Flag outliers of a Ridge regression fit on the whole training set.
outliers = find_outliers(Ridge(),X_train, y_train)
R2= 0.8890858938210386
mse= 0.10734857773123635
------------------------------------------
mean of residuals: 6.558311911393757e-17
std of residuals: 0.32769766731934985
------------------------------------------
31 outliners:
[321, 348, 376, 777, 884, 1145, 1164, 1310, 1458, 1466, 1484, 1523, 1704, 1874, 1879, 1979, 2002, 2279, 2528, 2620, 2645, 2647, 2667, 2668, 2669, 2696, 2767, 2769, 2807, 2842, 2863]
# Histogram and Q-Q plot for V0.
plt.figure(figsize=(10, 5))
ax = plt.subplot(1,2,1)
# Histogram with a fitted normal density overlaid.
sns.distplot(train_data['V0'], fit = stats.norm)
ax = plt.subplot(1,2,2)
# Q-Q plot against the normal distribution.
res = stats.probplot(train_data['V0'], plot=plt)
# Histogram + Q-Q plot for every training column, two panels per feature.
train_cols = 6
train_rows = len(train_data.columns)
plt.figure(figsize=(4 * train_cols, 4 * train_rows))
for pos, col in enumerate(train_data.columns):
    # Left panel: distribution with a fitted normal curve.
    ax = plt.subplot(train_rows, train_cols, 2 * pos + 1)
    sns.distplot(train_data[col], fit=stats.norm)
    # Right panel: normal Q-Q plot.
    ax = plt.subplot(train_rows, train_cols, 2 * pos + 2)
    res = stats.probplot(train_data[col], plot=plt)
plt.tight_layout()
plt.show()
# Kernel density of V0: training vs test distribution on one axis.
plt.figure(figsize=(8,4), dpi=150)
ax = sns.kdeplot(train_data['V0'], color="Red", shade = True)
ax = sns.kdeplot(test_data['V0'], color='Blue', shade = True)
ax.set_xlabel('V0')
ax.legend(["train", "test"])
# Compare the train vs test KDE for every feature column.
dist_cols = 6
dist_rows = len(test_data.columns)
plt.figure(figsize=(4 * dist_cols, 4 * dist_rows))
for i, col in enumerate(test_data.columns, start=1):
    ax = plt.subplot(dist_rows, dist_cols, i)
    ax = sns.kdeplot(train_data[col], color='Red', shade=True)
    ax = sns.kdeplot(test_data[col], color='Blue', shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
plt.show()
# Linear-regression relationship between V0 and target.
fcols = 2
frows = 1
plt.figure(figsize=(8,4),dpi = 150 )
ax = plt.subplot(1,2,1)
# Scatter of V0 vs target with a fitted regression line.
sns.regplot(x='V0', y='target', data=train_data,ax=ax,
scatter_kws ={'marker':'.','s':3,'alpha':0.3},
line_kws={'color':'k'})
plt.xlabel('V0')
plt.ylabel('target')
ax = plt.subplot(1,2,2)
# Distribution of V0 itself.
sns.distplot(train_data['V0'].dropna())
plt.xlabel('V0')
plt.show()
# Regression plot + histogram of every feature against target.
fcols = 6
frows = len(test_data.columns)
plt.figure(figsize=(8 * fcols, 4 * frows))
i = 0
for col in test_data.columns:
    # Left panel: scatter vs target with a fitted regression line.
    i += 1
    ax = plt.subplot(frows, fcols, i)
    sns.regplot(x=col, y='target', data=train_data, ax=ax,
                scatter_kws={'marker': '.', 's': 3, 'alpha': 0.3},
                line_kws={'color': 'k'})
    plt.xlabel(col)
    plt.ylabel('target')
    # Right panel: distribution of the feature.
    i += 1
    ax = plt.subplot(frows, fcols, i)
    # BUG FIX: was sns.displot, a figure-level function that opens its own
    # figure and ignores the subplot grid; the axes-level sns.distplot
    # (as used for the single-feature V0 cell) was intended.
    sns.distplot(train_data[col].dropna())
    plt.xlabel(col)
# Correlation between the feature variables.
# Limit pandas display so the correlation matrix prints compactly.
pd.set_option('display.max_columns', 10)
# FIX: the canonical option key is 'display.max_rows' (plural); the
# truncated 'display.max_row' only worked through pandas' prefix matching.
pd.set_option('display.max_rows', 10)
# Drop the features whose train/test distributions differ, then correlate.
data_train1 = train_data.drop(['V5','V9','V11', 'V17','V22','V28'],axis=1)
train_corr = data_train1.corr()
train_corr
V0 | V1 | V2 | V3 | V4 | ... | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|
V0 | 1.000000 | 0.908607 | 0.463643 | 0.409576 | 0.781212 | ... | -0.019342 | 0.138933 | 0.231417 | -0.494076 | 0.873212 |
V1 | 0.908607 | 1.000000 | 0.506514 | 0.383924 | 0.657790 | ... | -0.029115 | 0.146329 | 0.235299 | -0.494043 | 0.871846 |
V2 | 0.463643 | 0.506514 | 1.000000 | 0.410148 | 0.057697 | ... | -0.025620 | 0.043648 | 0.316462 | -0.734956 | 0.638878 |
V3 | 0.409576 | 0.383924 | 0.410148 | 1.000000 | 0.315046 | ... | -0.031898 | 0.080034 | 0.324475 | -0.229613 | 0.512074 |
V4 | 0.781212 | 0.657790 | 0.057697 | 0.315046 | 1.000000 | ... | 0.028659 | 0.100010 | 0.113609 | -0.031054 | 0.603984 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
V34 | -0.019342 | -0.029115 | -0.025620 | -0.031898 | 0.028659 | ... | 1.000000 | 0.233616 | -0.019032 | -0.006854 | -0.006034 |
V35 | 0.138933 | 0.146329 | 0.043648 | 0.080034 | 0.100010 | ... | 0.233616 | 1.000000 | 0.025401 | -0.077991 | 0.140294 |
V36 | 0.231417 | 0.235299 | 0.316462 | 0.324475 | 0.113609 | ... | -0.019032 | 0.025401 | 1.000000 | -0.039478 | 0.319309 |
V37 | -0.494076 | -0.494043 | -0.734956 | -0.229613 | -0.031054 | ... | -0.006854 | -0.077991 | -0.039478 | 1.000000 | -0.565795 |
target | 0.873212 | 0.871846 | 0.638878 | 0.512074 | 0.603984 | ... | -0.006034 | 0.140294 | 0.319309 | -0.565795 | 1.000000 |
33 rows × 33 columns
# Draw the correlation heatmap.
ax = plt.subplots(figsize=(30,30)) # enlarge the canvas
# NOTE(review): correlation values lie in [-1, 1], so vmax=8 effectively
# disables the upper colour limit — vmax=0.8 (or 1) was probably intended;
# confirm before relying on the colour scale.
ax = sns.heatmap(train_corr, vmax=8,square=True,annot=True) # draw the heatmap
# Heatmap of the K features most correlated with target.
k = 10  # take the 10 largest correlations
cols = train_corr.nlargest(k, 'target')['target'].index
# (Removed an unused `cm = np.corrcoef(train_data[cols].values.T)` — it
# duplicated the .corr() computed below and was never referenced.)
hm = plt.subplots(figsize=(10, 10))
hm = sns.heatmap(train_data[cols].corr(), annot=True, square=True)
plt.show()
# Keep features whose |correlation| with target exceeds the threshold
# and draw the correlation heatmap of just those columns.
threshold = 0.5
corrmat = train_data.corr()
corrmat
top_corr_features = corrmat.index[corrmat["target"].abs() > threshold]
top_corr_features
plt.figure(figsize=(10, 10))
g = sns.heatmap(train_data[top_corr_features].corr(),
                cmap="RdYlGn",
                annot=True)
# Use a correlation threshold to pick features to remove.
threshold=0.5
# Absolute correlation matrix of the filtered training data.
corr_matrix = data_train1.corr().abs()
corr_matrix
# Columns weakly correlated with target — candidates to drop.
drop_col=corr_matrix[corr_matrix["target"]<threshold].index
drop_col
# data_all.drop(drop_col,axis=1,inplace=True)
Index(['V6', 'V7', 'V10', 'V13', 'V14', 'V15', 'V18', 'V19', 'V20', 'V21',
'V23', 'V24', 'V25', 'V26', 'V29', 'V30', 'V32', 'V33', 'V34', 'V35',
'V36'],
dtype='object')
# Box-Cox transform.
# When the continuous response is not normally distributed, the Box-Cox
# transform helps a linear model satisfy linearity, normality, independence
# and homoscedasticity without losing information.
# It can also reduce unobservable error and the correlation between
# predictors, which benefits fitting and feature-correlation analysis.
# The data must be normalised before the transform.
# Normalising the merged train+test data keeps both sets on the same scale;
# fine for offline analysis — in production, normalise with training
# statistics only.
drop_columns = ['V5','V9','V11','V17','V22','V28']
# Merge training and test features (the target column is excluded).
train_x = train_data.drop(['target'],axis=1)
# data_all = pd.concat([train_data,test_data],axis=0,ignore_index=True)
# NOTE(review): without ignore_index the row index repeats across the two
# sets (0..2887 then 0..1924) — confirm downstream code never relies on
# unique labels.
data_all = pd.concat([train_x,test_data])
data_all
V0 | V1 | V2 | V3 | V4 | ... | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.566 | 0.016 | -0.143 | 0.407 | 0.452 | ... | -4.627 | -4.789 | -5.101 | -2.608 | -3.508 |
1 | 0.968 | 0.437 | 0.066 | 0.566 | 0.194 | ... | -0.843 | 0.160 | 0.364 | -0.335 | -0.730 |
2 | 1.013 | 0.568 | 0.235 | 0.370 | 0.112 | ... | -0.843 | 0.160 | 0.364 | 0.765 | -0.589 |
3 | 0.733 | 0.368 | 0.283 | 0.165 | 0.599 | ... | -0.843 | -0.065 | 0.364 | 0.333 | -0.112 |
4 | 0.684 | 0.638 | 0.260 | 0.209 | 0.337 | ... | -0.843 | -0.215 | 0.364 | -0.280 | -0.028 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1920 | -1.362 | -1.553 | -3.096 | -0.444 | 0.381 | ... | -1.187 | -0.852 | -2.131 | -2.564 | 0.597 |
1921 | -2.698 | -3.452 | -3.620 | -1.066 | -1.385 | ... | -1.187 | -0.852 | -2.131 | -2.564 | 1.215 |
1922 | -2.615 | -3.564 | -3.402 | -0.422 | -1.272 | ... | -1.851 | -1.548 | -1.537 | -2.544 | 1.612 |
1923 | -2.661 | -3.646 | -3.271 | -0.699 | -1.270 | ... | -1.645 | -1.471 | -1.537 | -2.549 | 1.431 |
1924 | -2.321 | -3.037 | -3.214 | -1.594 | -0.910 | ... | -1.703 | -1.471 | -1.537 | -1.123 | 1.988 |
4813 rows × 38 columns
# Drop the features whose train/test distributions differ.
data_all.drop(drop_columns,axis=1,inplace=True)
data_all.head()
V0 | V1 | V2 | V3 | V4 | ... | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.566 | 0.016 | -0.143 | 0.407 | 0.452 | ... | -4.627 | -4.789 | -5.101 | -2.608 | -3.508 |
1 | 0.968 | 0.437 | 0.066 | 0.566 | 0.194 | ... | -0.843 | 0.160 | 0.364 | -0.335 | -0.730 |
2 | 1.013 | 0.568 | 0.235 | 0.370 | 0.112 | ... | -0.843 | 0.160 | 0.364 | 0.765 | -0.589 |
3 | 0.733 | 0.368 | 0.283 | 0.165 | 0.599 | ... | -0.843 | -0.065 | 0.364 | 0.333 | -0.112 |
4 | 0.684 | 0.638 | 0.260 | 0.209 | 0.337 | ... | -0.843 | -0.215 | 0.364 | -0.280 | -0.028 |
5 rows × 32 columns
# Min-max normalise every column of the merged data.
cols_numeric=list(data_all.columns)
cols_numeric
def scale_minmax(col):
    """Rescale a column linearly onto the [0, 1] range."""
    lo = col.min()
    return (col - lo) / (col.max() - lo)
# Apply min-max scaling column-wise, then check the summary statistics.
data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax,axis=0)
data_all[cols_numeric].describe()
V0 | V1 | V2 | V3 | V4 | ... | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | ... | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 |
mean | 0.694172 | 0.721357 | 0.602300 | 0.603139 | 0.523743 | ... | 0.458493 | 0.483790 | 0.762873 | 0.332385 | 0.545795 |
std | 0.144198 | 0.131443 | 0.140628 | 0.152462 | 0.106430 | ... | 0.099095 | 0.101020 | 0.102037 | 0.127456 | 0.150356 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.626676 | 0.679416 | 0.514414 | 0.503888 | 0.478182 | ... | 0.409037 | 0.454490 | 0.727273 | 0.270584 | 0.445647 |
50% | 0.729488 | 0.752497 | 0.617072 | 0.614270 | 0.535866 | ... | 0.454518 | 0.499949 | 0.800020 | 0.347056 | 0.539317 |
75% | 0.790195 | 0.799553 | 0.700464 | 0.710474 | 0.585036 | ... | 0.500000 | 0.511365 | 0.800020 | 0.414861 | 0.643061 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 32 columns
# Min-max scale the same feature columns of the training set alone.
# (Removed the redundant double selection: train_data[cols_numeric]
# already contains exactly those columns, so re-indexing with
# [cols_numeric] before apply() was a no-op.)
train_data_process = train_data[cols_numeric].apply(scale_minmax, axis=0)
# First 13 feature names — the left half for the Box-Cox panel figures.
cols_numeric_left=cols_numeric[0:13]
cols_numeric_left
['V0',
'V1',
'V2',
'V3',
'V4',
'V6',
'V7',
'V8',
'V10',
'V12',
'V13',
'V14',
'V15']
# Remaining feature names — the right half.
cols_numeric_right=cols_numeric[13:]
cols_numeric_right
['V16',
'V18',
'V19',
'V20',
'V21',
'V23',
'V24',
'V25',
'V26',
'V27',
'V29',
'V30',
'V31',
'V32',
'V33',
'V34',
'V35',
'V36',
'V37']
# Re-attach the (unscaled) target column to the scaled training features.
train_data_process = pd.concat([train_data_process,train_data['target']],axis=1)
train_data_process
V0 | V1 | V2 | V3 | V4 | ... | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.759139 | 0.729830 | 0.524488 | 0.680337 | 0.698964 | ... | 0.000000 | 0.074074 | 0.000000 | 0.018401 | 0.175 |
1 | 0.821406 | 0.789631 | 0.557939 | 0.705130 | 0.664244 | ... | 0.499949 | 0.755580 | 0.289702 | 0.437406 | 0.676 |
2 | 0.828377 | 0.808239 | 0.584987 | 0.674567 | 0.653210 | ... | 0.499949 | 0.755580 | 0.429901 | 0.458673 | 0.633 |
3 | 0.785006 | 0.779830 | 0.592670 | 0.642601 | 0.718746 | ... | 0.477220 | 0.755580 | 0.374841 | 0.530618 | 0.206 |
4 | 0.777416 | 0.818182 | 0.588988 | 0.649462 | 0.683488 | ... | 0.462067 | 0.755580 | 0.296712 | 0.543288 | 0.384 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2883 | 0.700898 | 0.724006 | 0.525288 | 0.641977 | 0.718880 | ... | 0.481059 | 0.666667 | 0.405812 | 0.650980 | 0.235 |
2884 | 0.750000 | 0.806676 | 0.594750 | 0.645408 | 0.709460 | ... | 0.534094 | 0.666667 | 0.254015 | 0.490196 | 1.042 |
2885 | 0.610440 | 0.625142 | 0.469750 | 0.629970 | 0.656439 | ... | 0.534094 | 0.666667 | 0.453607 | 0.660332 | 0.005 |
2886 | 0.637546 | 0.687500 | 0.492318 | 0.609231 | 0.698560 | ... | 0.545409 | 0.681506 | 0.294035 | 0.631222 | 0.350 |
2887 | 0.728470 | 0.781534 | 0.511364 | 0.609231 | 0.689140 | ... | 0.482877 | 0.686495 | 0.260133 | 0.606033 | 0.417 |
2888 rows × 33 columns
# For each feature in the left half draw six panels: the original
# distribution / Q-Q plot / scatter vs target, then the same three
# after a Box-Cox transform.
fcols = 6
frows = len(cols_numeric_left)
plt.figure(figsize=(4 * fcols, 4 * frows))
for row, var in enumerate(cols_numeric_left):
    dat = train_data_process[[var, 'target']].dropna()
    base = row * fcols  # first subplot slot of this feature's row
    # 1) original distribution with a fitted normal curve
    plt.subplot(frows, fcols, base + 1)
    sns.distplot(dat[var], fit=stats.norm)
    plt.title(var + 'Original')
    plt.xlabel('')
    # 2) original Q-Q plot, skewness shown in the title
    plt.subplot(frows, fcols, base + 2)
    _ = stats.probplot(dat[var], plot=plt)
    plt.title('skew' + '{:.4f}'.format(stats.skew(dat[var])))
    plt.xlabel('')
    plt.ylabel('')
    # 3) original scatter against target with its correlation coefficient
    plt.subplot(frows, fcols, base + 3)
    plt.plot(dat[var], dat['target'], '.', alpha=0.5)
    plt.title('corr=' +
              '{:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))
    # 4) Box-Cox transformed distribution (+1 shift keeps values positive,
    #    as boxcox requires; data is already min-max scaled to [0, 1])
    plt.subplot(frows, fcols, base + 4)
    trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(var + 'Tramsformed')
    plt.xlabel('')
    # 5) transformed Q-Q plot
    plt.subplot(frows, fcols, base + 5)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')
    # 6) transformed scatter against target
    plt.subplot(frows, fcols, base + 6)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' +
              '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))