回归分析过程实例(练习)

By:HEHE

本实例是基于:混凝土抗压强度的回归分析

# 导包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os

1. 数据基本面分析

# Locate the dataset: the workbook is expected two directory levels above
# the current working directory.
path_dir = os.path.dirname(os.path.dirname(os.getcwd()))

# Join with os.path.join so the path is valid on every operating system
# (a hard-coded backslash separator only works on Windows).
path_data = os.path.join(path_dir, 'concrete_data.xls')

# Load the workbook into a DataFrame.
data = pd.read_excel(path_data)

# Preview the first rows.
data.head()
Cement (component 1)(kg in a m^3 mixture)Blast Furnace Slag (component 2)(kg in a m^3 mixture)Fly Ash (component 3)(kg in a m^3 mixture)Water (component 4)(kg in a m^3 mixture)Superplasticizer (component 5)(kg in a m^3 mixture)Coarse Aggregate (component 6)(kg in a m^3 mixture)Fine Aggregate (component 7)(kg in a m^3 mixture)Age (day)Concrete compressive strength(MPa, megapascals)
0540.00.00.0162.02.51040.0676.02879.986111
1540.00.00.0162.02.51055.0676.02861.887366
2332.5142.50.0228.00.0932.0594.027040.269535
3332.5142.50.0228.00.0932.0594.036541.052780
4198.6132.40.0192.00.0978.4825.536044.296075
# Replace the verbose source headers with short snake_case names.
# The assignment is positional: one name per column, target column last.
# NOTE(review): 'flay_ash' looks like a typo of 'fly_ash', but the name is
# used as a column key elsewhere in this file, so it is kept unchanged here.
short_names = [
    'cement_component',
    'furnace_slag',
    'flay_ash',
    'water_component',
    'superplasticizer',
    'coarse_aggregate',
    'fine_aggregate',
    'age',
    'concrete_strength',
]
data.columns = short_names
data.head()
cement_componentfurnace_slagflay_ashwater_componentsuperplasticizercoarse_aggregatefine_aggregateageconcrete_strength
0540.00.00.0162.02.51040.0676.02879.986111
1540.00.00.0162.02.51055.0676.02861.887366
2332.5142.50.0228.00.0932.0594.027040.269535
3332.5142.50.0228.00.0932.0594.036541.052780
4198.6132.40.0192.00.0978.4825.536044.296075
# Inspect the frame's structure: column dtypes and non-null counts.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
cement_component     1030 non-null float64
furnace_slag         1030 non-null float64
flay_ash             1030 non-null float64
water_component      1030 non-null float64
superplasticizer     1030 non-null float64
coarse_aggregate     1030 non-null float64
fine_aggregate       1030 non-null float64
age                  1030 non-null int64
concrete_strength    1030 non-null float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data.describe()
cement_componentfurnace_slagflay_ashwater_componentsuperplasticizercoarse_aggregatefine_aggregateageconcrete_strength
count1030.0000001030.0000001030.0000001030.0000001030.0000001030.0000001030.0000001030.0000001030.000000
mean281.16563173.89548554.187136181.5663596.203112972.918592773.57888345.66213635.817836
std104.50714286.27910463.99646921.3555675.97349277.75381880.17542763.16991216.705679
min102.0000000.0000000.000000121.7500000.000000801.000000594.0000001.0000002.331808
25%192.3750000.0000000.000000164.9000000.000000932.000000730.9500007.00000023.707115
50%272.90000022.0000000.000000185.0000006.350000968.000000779.51000028.00000034.442774
75%350.000000142.950000118.270000192.00000010.1600001029.400000824.00000056.00000046.136287
max540.000000359.400000200.100000247.00000032.2000001145.000000992.600000365.00000082.599225

数据基本面总结如下:

  1. 数据集共1030条数据,特征8个,目标为concrete_strength
  2. 数据集无缺失值,数据类型全为数值

2. EDA(数据探索性分析)

2.1 concrete_strength
# Plot the distribution of the target column with 20 bins, drawn in red.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases in favour
# of sns.histplot / sns.displot - confirm against the installed version.
sns.distplot(data['concrete_strength'], bins = 20, color = 'red')
<matplotlib.axes._subplots.AxesSubplot at 0x213da2c2080>

955639-20190328094732856-974532645.png

concrete_strength:数据分布接近正态分布,略微右偏

2.2 features
# One scatter panel per feature (every column except the last one) against
# the concrete strength, laid out on a 3x3 grid.
plt.figure(figsize=(15, 10.5))

for plot_count, feature in enumerate(data.columns[:-1], start=1):
    plt.subplot(3, 3, plot_count)
    plt.scatter(data[feature], data['concrete_strength'])
    # Axis label is the column name made human readable, e.g. "Fine Aggregate".
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')

plt.show()

955639-20190328094920804-321055377.png

# Pairwise correlation matrix of all columns, rendered as a square-celled
# heatmap whose colour scale is capped at 0.8.
corrmat = data.corr()
plt.figure(figsize=(9, 9))
sns.heatmap(corrmat, square=True, vmax=0.8)
<matplotlib.axes._subplots.AxesSubplot at 0x213ddc4e7b8>

955639-20190328094946022-1736893306.png

EDA总结:

  1. 各特征与目标之间的相关性都不强
  2. cement_component,water_component,superplasticizer,age 与目标的相关性相对高一些
  3. 由于特征数量不多,可以分别用这四个特征以及全部特征各尝试一遍
  4. 没有发现异常值
  5. 还没决定数据要不要标准化

3. model

实验内容:分别使用上面得到的特征,以及所有特征对混凝土强度做预测,同时使用不同的回归算法

from sklearn.model_selection import train_test_split
# 按数据集特征切割训练集测试集
def split_train_test(data, features=None, test_ratio=0.2, random_state=None):
    """Split ``data`` into train/test predictors and targets.

    Args:
        data: DataFrame holding a ``concrete_strength`` column (the regression
            target) together with the predictor columns.
        features: Column labels to use as predictors. ``None`` selects every
            column except ``concrete_strength``.
        test_ratio: Forwarded to ``train_test_split`` as ``test_size``.
            ``0`` means "no hold-out set": all rows are returned, shuffled,
            as the training part and the test parts are empty.
        random_state: Optional seed that makes the shuffle reproducible.

    Returns:
        Tuple ``(train_x, test_x, train_y, test_y)``.
    """
    y = data['concrete_strength']
    # Identity check: ``features != None`` is evaluated element-wise for
    # array-like selections (NumPy arrays, pandas Index) and the result cannot
    # be used as a condition.
    if features is not None:
        x = data[features]
    else:
        x = data.drop(['concrete_strength'], axis=1)

    if test_ratio == 0:
        # Current scikit-learn rejects test_size=0, so the "use everything for
        # training" case is handled here. Rows are still shuffled, and x and y
        # share one permutation so they stay aligned.
        order = np.random.RandomState(random_state).permutation(len(x))
        return x.iloc[order], x.iloc[:0], y.iloc[order], y.iloc[:0]

    train_x, test_x, train_y, test_y = train_test_split(
        x, y, test_size=test_ratio, random_state=random_state)
    return train_x, test_x, train_y, test_y
# Train / test split over all feature columns.
# test_ratio = 0 is forwarded as test_size, i.e. no hold-out rows are requested.
# NOTE(review): verify that the installed scikit-learn accepts test_size=0.
train_x, test_x, train_y, test_y = split_train_test(data, test_ratio = 0)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score
def data_cross_val(x, y, clfs, clfs_name, cv=5):
    """Print the mean cross-validated R2 score of each estimator.

    Args:
        x: Predictors passed to ``cross_val_score`` as ``X``.
        y: Targets passed to ``cross_val_score`` as ``y``.
        clfs: Iterable of estimators to evaluate.
        clfs_name: Display names, indexed in the same order as ``clfs``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        None. For every estimator two lines are printed: its display name and
        the mean of its per-fold R2 scores.
    """
    for idx, estimator in enumerate(clfs):
        fold_scores = cross_val_score(estimator=estimator, X=x, y=y,
                                      scoring='r2', cv=cv)
        mean_r2 = np.mean(fold_scores)
        print(clfs_name[idx])
        print('the R2 score: %f' % mean_r2)

3.1 所有特征做回归

# Candidate regressors, all with default hyper-parameters.
clfs = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    ElasticNet(),
    GradientBoostingRegressor(),
    SVR(),
]
# Display names are the estimator class names, in the same order as clfs.
clfs_name = [type(clf).__name__ for clf in clfs]
data_cross_val(train_x, train_y, clfs, clfs_name, cv=5)
LinearRegression
the R2 score: 0.604974
Ridge
the R2 score: 0.604974
Lasso
the R2 score: 0.605090
ElasticNet
the R2 score: 0.605220
GradientBoostingRegressor
the R2 score: 0.908837
SVR
the R2 score: 0.023249

结论:单一的回归器还是没有梯度提升机好,可以尝试用bagging和stacking的方式再实验一下,或者增加特征。

3.2 部分相关特征做回归

# Train / test split restricted to four selected predictors.
features = ['cement_component', 'water_component', 'superplasticizer', 'age']
train_x, test_x, train_y, test_y = split_train_test(data, features, test_ratio=0)

# Same candidate regressors as before, all with default hyper-parameters.
clfs = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    ElasticNet(),
    GradientBoostingRegressor(),
    SVR(),
]
# Display names are the estimator class names, in the same order as clfs.
clfs_name = [type(clf).__name__ for clf in clfs]
data_cross_val(train_x, train_y, clfs, clfs_name, cv=5)
LinearRegression
the R2 score: 0.485046
Ridge
the R2 score: 0.485045
Lasso
the R2 score: 0.484828
ElasticNet
the R2 score: 0.484840
GradientBoostingRegressor
the R2 score: 0.830816
SVR
the R2 score: 0.043992

总结:目前来看,只用部分相关特征做回归时,由于特征数目太少,效果还不如使用全部特征来得好

3.3 单线性回归

# Fit one LinearRegression per predictor and draw each fit in its own panel
# of a 2x3 grid, printing the R2 score obtained on the held-out split.
single_features = ['cement_component', 'flay_ash', 'water_component',
                   'superplasticizer', 'coarse_aggregate']

plt.figure(figsize=(15, 7))

for plot_count, feature in enumerate(single_features, start=1):
    # Two-column frame: the target plus the single predictor under test.
    data_tr = data[['concrete_strength', feature]]
    x_train, x_test, y_train, y_test = split_train_test(data_tr, [feature])

    # Fit on the training split, predict the held-out split.
    regr = LinearRegression()
    regr.fit(x_train, y_train)
    y_pred = regr.predict(x_test)

    # Held-out points in black, fitted line in blue.
    plt.subplot(2, 3, plot_count)
    plt.scatter(x_test, y_test, color='black')
    plt.plot(x_test, y_pred, color='blue', linewidth=3)
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')

    print(feature, r2_score(y_test, y_pred))

plt.show()
cement_component 0.24550132796330282
flay_ash 0.012228585601186226
water_component 0.09828887425075417
superplasticizer 0.11471267678235075
coarse_aggregate 0.02046823335033021

955639-20190328095017848-112600655.png

# Multi-feature linear regression on five predictors.
features = ['cement_component', 'flay_ash', 'water_component',
            'superplasticizer', 'coarse_aggregate']

# Keep only the rows in which no column holds a zero.
data_tr = data[(data != 0).all(axis=1)]

x_train, x_test, y_train, y_test = split_train_test(data_tr, features)

# Fit on the training split, predict the held-out split.
regr = LinearRegression()
regr.fit(x_train, y_train)
y_pred = regr.predict(x_test)

# Held-out targets (black dots) and predictions (blue line) by sample position.
sample_positions = range(len(y_test))
plt.scatter(sample_positions, y_test, color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s' % str(features))
print('R2 score: %f' % r2_score(y_test, y_pred))
print('Intercept: %f' % regr.intercept_)
print('Coefficients: %s' % str(regr.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.155569
Intercept: 84.481913
Coefficients: [ 0.04304209 -0.02577486 -0.1747249   0.15980663 -0.02633656]

955639-20190328095033880-451514415.png

# Candidate regularisation strengths: 0.1, 0.2, ... in steps of 0.1, below 5.
alphas = np.arange(0.1,5,0.1)

# Grid search over alpha for a Ridge regressor.
model = Ridge()
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))

# Fit the search on the training split, then predict the held-out split.
y_pred = cv.fit(x_train, y_train).predict(x_test)

plt.scatter(range(len(y_test)), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

# Report the parameters of the tuned Ridge model itself. Reading them from
# `regr` would print the plain LinearRegression fitted in an earlier cell.
best_model = cv.best_estimator_
print('Features: %s'%str(features))
print('R2 score: %f'%r2_score(y_test, y_pred))
print('Intercept: %f'%best_model.intercept_)
print('Coefficients: %s'%str(best_model.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.155562
Intercept: 84.481913
Coefficients: [ 0.04304209 -0.02577486 -0.1747249   0.15980663 -0.02633656]

955639-20190328095049248-580059011.png

# Grid search over alpha for a Lasso regressor.
model = Lasso()
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))

# Fit the search on the training split, then predict the held-out split.
y_pred = cv.fit(x_train, y_train).predict(x_test)

plt.scatter(range(len(y_test)), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

# Report the parameters of the tuned Lasso model itself. Reading them from
# `regr` would print the plain LinearRegression fitted in an earlier cell.
best_model = cv.best_estimator_
print('Features: %s'%str(features))
print('R2 score: %f'%r2_score(y_test, y_pred))
print('Intercept: %f'%best_model.intercept_)
print('Coefficients: %s'%str(best_model.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.151682
Intercept: 84.481913
Coefficients: [ 0.04304209 -0.02577486 -0.1747249   0.15980663 -0.02633656]

955639-20190328095106194-1457344672.png

# Grid search over alpha for an ElasticNet regressor.
model = ElasticNet()
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))

# Fit the search on the training split, then predict the held-out split.
y_pred = cv.fit(x_train, y_train).predict(x_test)

plt.scatter(range(len(y_test)), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

# Report the parameters of the tuned ElasticNet model itself. Reading them
# from `regr` would print the plain LinearRegression fitted in an earlier cell.
best_model = cv.best_estimator_
print('Features: %s'%str(features))
print('R2 score: %f'%r2_score(y_test, y_pred))
print('Intercept: %f'%best_model.intercept_)
print('Coefficients: %s'%str(best_model.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.151796
Intercept: 84.481913
Coefficients: [ 0.04304209 -0.02577486 -0.1747249   0.15980663 -0.02633656]

955639-20190328095120888-2144523795.png

# Fit one GradientBoostingRegressor per predictor and draw each fit in its
# own panel of a 2x3 grid, printing the R2 score on the held-out split.
single_features = ['cement_component', 'flay_ash', 'water_component',
                   'superplasticizer', 'coarse_aggregate']

plt.figure(figsize=(15, 7))

for plot_count, feature in enumerate(single_features, start=1):
    # Target plus the single predictor, without rows where either is zero.
    data_tr = data[['concrete_strength', feature]]
    data_tr = data_tr[(data_tr != 0).all(axis=1)]

    x_train, x_test, y_train, y_test = split_train_test(data_tr, [feature])

    # Gradient boosting regressor with default hyper-parameters.
    regr = GradientBoostingRegressor()
    regr.fit(x_train, y_train)
    y_pred = regr.predict(x_test)

    # Held-out points in black, model predictions in blue.
    plt.subplot(2, 3, plot_count)
    plt.scatter(x_test, y_test, color='black')
    plt.plot(x_test, y_pred, color='blue', linewidth=3)
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')

    print(feature, r2_score(y_test, y_pred))

plt.show()
cement_component 0.35248985320039705
flay_ash 0.17319875701989795
water_component 0.285023360910455
superplasticizer 0.19306275412216778
coarse_aggregate 0.17712532312647877

955639-20190328095140465-1926789993.png

# Rebuild the multi-feature split before fitting. Without this step the model
# is trained on the x_train / x_test left over from the last iteration of the
# preceding single-feature loop, even though `features` is what gets reported.
# Rows containing a zero in any column are dropped first.
data_tr = data[(data.T != 0).all()]
x_train, x_test, y_train, y_test = split_train_test(data_tr, features)

# Gradient boosting regressor with default hyper-parameters.
model = GradientBoostingRegressor()

y_pred = model.fit(x_train, y_train).predict(x_test)

# Held-out targets (black dots) and predictions (blue line) by sample position.
plt.scatter(range(len(y_test)), y_test,  color='black')
plt.plot(y_pred, color='blue',
         linewidth=3)


print('Features: %s'%str(features))
print('R2 score: %f'%r2_score(y_test, y_pred))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.177125

955639-20190328095158528-1568223554.png

# Fit one linear-kernel SVR per predictor and draw each fit in its own panel
# of a 2x3 grid, printing the R2 score on the held-out split.
single_features = ['cement_component', 'flay_ash', 'water_component',
                   'superplasticizer', 'coarse_aggregate']

plt.figure(figsize=(15, 7))

for plot_count, feature in enumerate(single_features, start=1):
    # Target plus the single predictor, without rows where either is zero.
    data_tr = data[['concrete_strength', feature]]
    data_tr = data_tr[(data_tr != 0).all(axis=1)]

    x_train, x_test, y_train, y_test = split_train_test(data_tr, [feature])

    # Support vector regressor with a linear kernel.
    regr = SVR(kernel='linear')
    regr.fit(x_train, y_train)
    y_pred = regr.predict(x_test)

    # Held-out points in black, model predictions in blue.
    plt.subplot(2, 3, plot_count)
    plt.scatter(x_test, y_test, color='black')
    plt.plot(x_test, y_pred, color='blue', linewidth=3)
    plt.xlabel(feature.replace('_', ' ').title())
    plt.ylabel('Concrete strength')

    print(feature, r2_score(y_test, y_pred))

plt.show()
cement_component 0.2054832593541437
flay_ash -0.044636249705873654
water_component 0.07749271320026574
superplasticizer 0.0671220299245393
coarse_aggregate 0.016036478490831563

955639-20190328095214256-1455113775.png

# Rebuild the multi-feature split before fitting. Without this step the model
# is trained on the x_train / x_test left over from the last iteration of the
# preceding single-feature loop, even though `features` is what gets reported.
# Rows containing a zero in any column are dropped first.
data_tr = data[(data.T != 0).all()]
x_train, x_test, y_train, y_test = split_train_test(data_tr, features)

# Support vector regressor with a linear kernel.
model = SVR(kernel='linear')

y_pred = model.fit(x_train, y_train).predict(x_test)

# Held-out targets (black dots) and predictions (blue line) by sample position.
plt.scatter(range(len(y_test)), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s'%str(features))
print('R2 score: %f'%r2_score(y_test, y_pred))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.016036

955639-20190328095231776-130773976.png

4. 使用 cement_component和 water_component预测concrete_strength

# Single predictor used for this prediction, and the new sample to score
# (one row, one column).
feature = 'cement_component'
cc_new_data = np.array([[213.5]])

# Target plus the predictor, without rows where either value is zero.
data_tr = data[['concrete_strength', feature]]
data_tr=data_tr[(data_tr.T != 0).all()]

x_train, x_test, y_train, y_test = split_train_test(data_tr, [feature])

# Gradient boosting regressor with default hyper-parameters, fitted on the
# training split.
regr = GradientBoostingRegressor()
regr.fit(x_train, y_train)

# predict() returns a one-element array; index it so that %f receives a
# scalar instead of relying on implicit array-to-scalar conversion, which
# NumPy has deprecated.
cs_pred = regr.predict(cc_new_data)
print('Predicted value of concrete strength: %f'%cs_pred[0])
Predicted value of concrete strength: 36.472380
# Single predictor used for this prediction, and the new sample to score
# (one row, one column).
feature = 'water_component'
wc_new_data = np.array([[200]])

# Target plus the predictor, without rows where either value is zero.
data_tr = data[['concrete_strength', feature]]
data_tr=data_tr[(data_tr.T != 0).all()]

x_train, x_test, y_train, y_test = split_train_test(data_tr, [feature])

# Gradient boosting regressor with default hyper-parameters, fitted on the
# training split.
regr = GradientBoostingRegressor()
regr.fit(x_train, y_train)

# predict() returns a one-element array; index it so that %f receives a
# scalar instead of relying on implicit array-to-scalar conversion, which
# NumPy has deprecated.
cs_pred = regr.predict(wc_new_data)
print('Predicted value of concrete strength: %f'%cs_pred[0])
Predicted value of concrete strength: 32.648425

转载于:https://www.cnblogs.com/llssx/p/10612940.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值