Data Preparation and Analysis
Reading the Data
import numpy as np
import seaborn as sns
import pandas as pd

# load the China Life (中国人寿) policy dataset and preview the first five rows
data = pd.read_excel("./中国人寿.xlsx")
data.head(5)
| | age | sex | bmi | children | smoker | region | charges |
|---|---|---|---|---|---|---|---|
| 0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
| 1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
| 2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
| 3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
| 4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
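Before plotting, it is worth a quick sanity check on the loaded frame: column dtypes, missing values, and basic statistics. A minimal sketch using the data frame loaded above:

# quick sanity checks on the raw data
data.info()                 # column dtypes and non-null counts
print(data.isnull().sum())  # missing values per column
data.describe()             # summary statistics of the numeric columns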
Analysis
We use seaborn to plot the density curve of charges grouped by each feature, and drop the features that do not separate the target.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(2, 2, figsize=(6, 4))
# KDE of charges, grouped by each candidate feature
sns.kdeplot(data=data, x='charges', fill=True, hue='smoker', ax=axes[0, 0])
sns.kdeplot(data=data, x='charges', fill=True, hue='children', ax=axes[0, 1])
sns.kdeplot(data=data, x='charges', fill=True, hue='sex', ax=axes[1, 0])
sns.kdeplot(data=data, x='charges', fill=True, hue='region', ax=axes[1, 1])
Analysis: the more the density curves for the different groups of a feature overlap, the less that feature separates the charges, so it adds little to the model and can be dropped. As the plots above show, sex and region are exactly such low-discrimination features; a quick numeric check below supports this.
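The same conclusion can be checked numerically; a small sketch comparing mean charges across groups (a large gap indicates an informative feature, near-equal means indicate a weak one):

# mean charges per group: smoker separates the target strongly, sex and region barely at all
print(data.groupby('smoker')['charges'].mean())
print(data.groupby('sex')['charges'].mean())
print(data.groupby('region')['charges'].mean())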
Dropping the Low-Discrimination Features
import warnings
warnings.filterwarnings("ignore")

# keep only the informative features
val_data = data[['age', 'bmi', 'children', 'smoker', 'charges']]
# encode the categorical smoker column: no -> 0, yes -> 1
val_data['smoker'] = data['smoker'].apply(lambda x: 0 if x == 'no' else 1)
X = val_data.iloc[:, :-1]
y = val_data[['charges']]
X
| | age | bmi | children | smoker |
|---|---|---|---|---|
| 0 | 19 | 27.900 | 0 | 1 |
| 1 | 18 | 33.770 | 1 | 0 |
| 2 | 28 | 33.000 | 3 | 0 |
| 3 | 33 | 22.705 | 0 | 0 |
| 4 | 32 | 28.880 | 0 | 0 |
| ... | ... | ... | ... | ... |
| 1333 | 50 | 30.970 | 3 | 0 |
| 1334 | 18 | 31.920 | 0 | 0 |
| 1335 | 18 | 36.850 | 0 | 0 |
| 1336 | 21 | 25.800 | 0 | 0 |
| 1337 | 61 | 29.070 | 0 | 1 |

1338 rows × 4 columns
Splitting the Training and Test Sets
# define a preprocessing class
class data_process:
    def __init__(self, X, y, train_partion=0.8):
        self.X = X
        self.y = y
        self.train_partion = train_partion

    # split into training and test sets
    def shuffle_data(self):
        # convert the data to numpy arrays first
        X = self.X.values
        y = self.y.values
        # shuffle the samples
        index = np.arange(X.shape[0])
        np.random.shuffle(index)
        # numpy fancy indexing: reorder the samples according to index
        X = X[index]
        y = y[index]
        # split into training and test sets by the given proportion
        margin = int(X.shape[0] * self.train_partion)
        X_train = X[:margin, :]
        y_train = y[:margin, :]
        X_test = X[margin:, :]
        y_test = y[margin:, :]
        return X_train, y_train, X_test, y_test

    # data normalization
    def data_normolize(self):
        # zero-mean standardization; axis=0 works column-wise
        X_train, y_train, X_test, y_test = self.shuffle_data()
        mean, std = X_train.mean(axis=0), X_train.std(axis=0)
        X_train_norml = (X_train - mean) / std
        # scale the test set with the training-set statistics to avoid leakage
        X_test_norml = (X_test - mean) / std
        return X_train_norml, y_train, X_test_norml, y_test

    def processing(self):
        X_train_norml, y_train, X_test_norml, y_test = self.data_normolize()
        return X_train_norml, y_train, X_test_norml, y_test
Get the processed data
data_pro = data_process(X,y,train_partion=0.8)
X_train,y_train,X_test,y_test = data_pro.processing()
#display(X_train.shape,y_train.shape,X_test.shape,y_test.shape)
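For reference, the same split-and-standardize preprocessing can be reproduced with sklearn utilities; this is only a sketch of an equivalent pipeline, not part of the original class, and it likewise scales the test set with the training-set statistics:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_tr, X_te, y_tr, y_te = train_test_split(X.values, y.values, train_size=0.8, random_state=42)
scaler = StandardScaler().fit(X_tr)   # fit the scaler on the training split only
X_tr_norm = scaler.transform(X_tr)
X_te_norm = scaler.transform(X_te)    # reuse the training statistics on the test split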
Training
We fit linear regression and its regularized variants, then repeat the experiments with polynomial feature expansion to see how much lifting the feature dimensionality helps.
# R2 score evaluates the model: the closer to 1, the better the fit
from sklearn.metrics import mean_squared_error, r2_score
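R² compares the residual sum of squares against the total sum of squares: R² = 1 − SS_res / SS_tot. A tiny worked sketch on made-up numbers (y_true and y_pred are purely illustrative) shows what r2_score computes:

y_true = np.array([3.0, 5.0, 7.0, 9.0])
y_pred = np.array([2.8, 5.3, 6.6, 9.4])
ss_res = np.sum((y_true - y_pred) ** 2)         # residual sum of squares
ss_tot = np.sum((y_true - y_true.mean()) ** 2)  # total sum of squares
print(1 - ss_res / ss_tot)                      # same value as r2_score(y_true, y_pred)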
Linear Regression
from sklearn.linear_model import LinearRegression
lnr = LinearRegression()
# train
lnr.fit(X_train, y_train)
# predict
lnr_pre = lnr.predict(X_test)
# score
lnr_score = r2_score(y_test, lnr_pre)
print("Linear Regression R2:", lnr_score)
Linear Regression R2: 0.7134052168884122
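Because the features were standardized, coefficient magnitudes give a rough sense of how much each feature drives the prediction; a small sketch inspecting the fitted lnr model above:

# coefficients on the standardized features; column order is age, bmi, children, smoker
print(lnr.coef_)
print(lnr.intercept_)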
Stochastic Gradient Descent
from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor()
# SGDRegressor expects a 1-D target, so flatten y_train
sgd.fit(X_train, y_train.ravel())
# predict
sgd_pre = sgd.predict(X_test)
# score
sgd_score = r2_score(y_test, sgd_pre)
print("SGD Regressor R2:", sgd_score)
SGD Regressor R2: 0.7133771647173909
Ridge Regression
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1, solver='sag')
ridge.fit(X_train, y_train)
ridge_pre = ridge.predict(X_test)
# score
ridge_score = r2_score(y_test, ridge_pre)
print("Ridge R2:", ridge_score)
Ridge R2: 0.7135328179378375
Lasso Regression
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.5)
lasso.fit(X_train, y_train)
lasso_pre = lasso.predict(X_test)
lasso_score = r2_score(y_test, lasso_pre)
print("Lasso R2:", lasso_score)
Lasso R2: 0.7134145997065494
Elastic-Net
from sklearn.linear_model import ElasticNet
ElasticNet_model = ElasticNet(alpha= 1, l1_ratio = 0.7)
ElasticNet_model.fit(X_train, y_train)
ElasticNet_pre = ElasticNet_model.predict(X_test)
ElasticNet_score = r2_score(y_test,ElasticNet_pre)
print("ElasticNet R2:",ElasticNet_score)
ElasticNet R2: 0.698820757089488
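Because the train/test split above is random and unseeded, single-split R² values fluctuate from run to run. A sketch using 5-fold cross-validation on the standardized training data gives a more stable comparison of the linear models:

from sklearn.model_selection import cross_val_score

for name, model in [("LinearRegression", LinearRegression()),
                    ("Ridge", Ridge(alpha=1)),
                    ("Lasso", Lasso(alpha=0.5)),
                    ("ElasticNet", ElasticNet(alpha=1, l1_ratio=0.7))]:
    scores = cross_val_score(model, X_train, y_train.ravel(), cv=5, scoring="r2")
    print(name, "mean CV R2:", scores.mean())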
Polynomial Feature Expansion
Degree-2 features: all pairwise products between features plus the square of each feature.
# PolynomialFeatures: polynomial expansion of the feature space
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
# start again from the un-normalized data
X_train, y_train, X_test, y_test = data_pro.shuffle_data()
# degree-2 expansion: squares of each feature plus pairwise products
poly = PolynomialFeatures(degree=2, interaction_only=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)
# standardize after the expansion, fitting the scaler on the training set only
standard = StandardScaler()
X_train_poly_norm = standard.fit_transform(X_train_poly)
X_test_poly_norm = standard.transform(X_test_poly)
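To see exactly which columns the degree-2 expansion creates (a bias column, the 4 original features, their squares, and all pairwise products, 15 columns in total), a quick sketch; get_feature_names_out assumes scikit-learn ≥ 1.0:

# names and shape of the expanded feature matrix
print(poly.get_feature_names_out(['age', 'bmi', 'children', 'smoker']))
print(X_train_poly.shape)   # (n_samples, 15) = 1 bias + 4 linear + 4 squares + 6 products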
Polynomial Features + Linear Regression
from sklearn.linear_model import LinearRegression
lnr_poly = LinearRegression()
# train
lnr_poly.fit(X_train_poly_norm, y_train)
# predict
lnr_poly_pre = lnr_poly.predict(X_test_poly_norm)
# score
lnr_score_ploy = r2_score(y_test, lnr_poly_pre)
print("Degree-2 Polynomial + Linear Regression R2:", lnr_score_ploy)
Degree-2 Polynomial + Linear Regression R2: 0.8743729177611113
Polynomial Features + Stochastic Gradient Descent
from sklearn.linear_model import SGDRegressor
sgd_ploy = SGDRegressor()
# train (flatten y_train to the 1-D shape SGDRegressor expects)
sgd_ploy.fit(X_train_poly_norm, y_train.ravel())
# predict
sgd_poly_pre = sgd_ploy.predict(X_test_poly_norm)
# score
sgd_score_ploy = r2_score(y_test, sgd_poly_pre)
print("Degree-2 Polynomial + SGD R2:", sgd_score_ploy)
Degree-2 Polynomial + SGD R2: 0.8706325203066192
Polynomial Features + Ridge Regression
from sklearn.linear_model import Ridge
ridge_ploy = Ridge(alpha=1, solver='sag')
ridge_ploy.fit(X_train_poly_norm, y_train)
ridge_pre_ploy = ridge_ploy.predict(X_test_poly_norm)
# score
ridge_score_ploy = r2_score(y_test, ridge_pre_ploy)
print("Degree-2 Polynomial + Ridge R2:", ridge_score_ploy)
Degree-2 Polynomial + Ridge R2: 0.8743792671047281
Polynomial Features + Lasso Regression
from sklearn.linear_model import Lasso
lasso_ploy = Lasso(alpha=0.5)
lasso_ploy.fit(X_train_poly_norm, y_train)
lasso_pre_ploy = lasso_ploy.predict(X_test_poly_norm)
# score
lasso_score_ploy = r2_score(y_test, lasso_pre_ploy)
print("Degree-2 Polynomial + Lasso R2:", lasso_score_ploy)
Degree-2 Polynomial + Lasso R2: 0.8747696868608776
Polynomial Features + Elastic-Net
from sklearn.linear_model import ElasticNet
ElasticNet_model_ploy = ElasticNet(alpha=1, l1_ratio=0.7)
ElasticNet_model_ploy.fit(X_train_poly_norm, y_train)
ElasticNet_pre_ploy = ElasticNet_model_ploy.predict(X_test_poly_norm)
# score
ElasticNet_score_ploy = r2_score(y_test, ElasticNet_pre_ploy)
print("Degree-2 Polynomial + Elastic-Net R2:", ElasticNet_score_ploy)
Degree-2 Polynomial + Elastic-Net R2: 0.8104941823304773
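Finally, a small wrap-up collecting the R² scores computed above makes the comparison easier to read (exact values vary with the random split):

results = {
    "LinearRegression": (lnr_score, lnr_score_ploy),
    "SGDRegressor": (sgd_score, sgd_score_ploy),
    "Ridge": (ridge_score, ridge_score_ploy),
    "Lasso": (lasso_score, lasso_score_ploy),
    "ElasticNet": (ElasticNet_score, ElasticNet_score_ploy),
}
for name, (raw_r2, poly_r2) in results.items():
    print(f"{name:>16s}  raw R2 = {raw_r2:.4f}   degree-2 R2 = {poly_r2:.4f}")

On this dataset the degree-2 expansion lifts R² from roughly 0.71 to about 0.87 for most models, i.e., the nonlinear and interaction terms carry real signal.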