# 获取数据
import numpy as np
import pandas as pd
# pandas读取数据集
import pandas as pd
# Load the training set. The context manager closes the file handle as soon
# as pandas has finished parsing instead of leaving it open for the rest of
# the session.
with open('C:/Users/86188/sklean-machine-learning/上机代码/医保支出预测/Train_Data.csv') as insurance:
    df = pd.read_csv(insurance)
# 从数据探索和可视化中获得洞见
# Quick overview of the training data.
# NOTE: head() and describe() are bare expressions; their result is only
# displayed when they are the last expression of a notebook cell.
# info() prints a summary of the frame (columns, dtypes, non-null counts).
df.info()
# Preview the first rows.
df.head()
# describe() reports the count, mean, standard deviation, minimum, maximum
# and quartiles of the numeric columns.
df.describe()
# Preprocessing (training set)
# Encode the categorical columns as integers. Series.map is the pandas idiom
# for dictionary-based value substitution; categories that are missing from
# the mapping become NaN and are then reported by the missing-value check
# below.
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df['smoker'] = df['smoker'].map({'yes': 1, 'no': 0})
df['region'] = df['region'].map({'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4})
# Feature matrix and regression target.
x = df[['age', 'sex', 'bmi', 'smoker', 'region', 'children']]
y = df['charges']
x
# Check every column for missing values. DataFrame.isna works for any dtype,
# whereas np.isnan raises TypeError as soon as a column is not numeric.
df.isna().any()
# Summary statistics of the target variable.
y.describe()
# 上面的结果描述了有关数值型变量的简单统计值,包括非缺失观测的个数(count)、平均值(mean)、标准差(std)、最小值(min)、下四分位数(25%)、中位数(50%)、上四分位数(75%)和最大值(max)。
# 数据探索
%matplotlib inline
import matplotlib.pyplot as plt
df.hist(bins=50, figsize=(20,15))
plt.show()
# Strip plot of the BMI values for each region.
import matplotlib.pyplot as plt
import seaborn as sns
# Select the KaiTi font before any text is created so the Chinese title can
# be rendered.
plt.rcParams['font.sans-serif'] = ['KaiTi']
sns.stripplot(data=df, x='region', y='bmi')
plt.title('不同地区医保支出BMI指数散点图')
# Show the figure explicitly, as the other plotting steps in this file do.
plt.show()
# Look for correlations between the columns.
corr_matrix = df.corr()
# Visualise the correlation matrix as an annotated heat map.
import seaborn as sns
import matplotlib.pyplot as plt
# Select the KaiTi font before the title is created so the Chinese text can
# be rendered.
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.figure(figsize=(12, 8))
# Reuse the matrix computed above rather than calling df.corr() a second time.
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='PuBu')
plt.title('各个特征中的相关性')
plt.show()
# Pairwise scatter plots of the feature columns.
from pandas.plotting import scatter_matrix
attributes = ['age', 'sex', 'bmi', 'smoker', 'region', 'children']
scatter_matrix(frame=df[attributes], figsize=(20, 15))
plt.show()
# 模型预测
# Preprocessing: hold out a validation split and standardise the features.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
scaler = StandardScaler()
# Learn the scaling statistics from the training split only ...
x_train = scaler.fit_transform(x_train)
# ... and reuse them for the held-out split. Calling fit_transform here would
# refit the scaler on x_test, so the two splits would be scaled with
# different statistics and `scaler` would no longer match the training data.
x_test = scaler.transform(x_test)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# Baseline random forest with default hyper-parameters.
random = RandomForestRegressor()
random.fit(x_train, y_train)
y_pred = random.predict(x_test)
y_pred
# NOTE: RandomForestRegressor.score returns the R^2 coefficient of
# determination on the held-out split, not a classification accuracy.
print("准确率:", random.score(x_test, y_test))
# Load the test set; the context manager closes the file handle once pandas
# has finished parsing.
with open('C:/Users/86188/sklean-machine-learning/上机代码/医保支出预测/Test_Data.csv') as insurance_Test:
    df1 = pd.read_csv(insurance_Test)
# Test set: apply the same categorical encoding that was used for the
# training set. Series.map turns categories missing from the mapping into NaN.
df1['sex'] = df1['sex'].map({'male': 0, 'female': 1})
df1['smoker'] = df1['smoker'].map({'yes': 1, 'no': 0})
df1['region'] = df1['region'].map({'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4})
x_test1 = df1[['age', 'sex', 'bmi', 'smoker', 'region', 'children']]
x_test1
# The forest was fitted on standardised features, so the test features must
# pass through the fitted scaler before prediction; feeding raw values would
# put every feature on a different scale from the one the trees were built on.
y_pred1 = random.predict(scaler.transform(x_test1))
y_pred1
# 调优
# Hyper-parameter tuning: 3-fold cross-validated grid search over the random
# forest's min_samples_split and n_estimators, scored by negative MSE.
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
tree_grid_parameter = {
    'min_samples_split': [3, 6, 9],
    'n_estimators': [10, 50, 100],
}
grid = GridSearchCV(
    RandomForestRegressor(),
    param_grid=tree_grid_parameter,
    cv=3,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid.fit(x_train, y_train)
# Report the best parameter combination found by the search.
print(grid.best_params_)
# 调优后预测当年所需医保支出(charges)
# The grid search was fitted on standardised features, so scale the test
# features with the fitted scaler before predicting.
grid_pred = grid.predict(scaler.transform(x_test1))
grid_pred
from sklearn.metrics import explained_variance_score
# NOTE: no ground-truth charges are involved here. The first argument is the
# baseline model's prediction (y_pred1), so the value measures how closely
# the tuned model agrees with the baseline model rather than accuracy.
report = explained_variance_score(y_pred1, grid_pred)
print("准确率:", report)