机器学习——医保预测

 获取数据

import numpy as np

import pandas as pd

# Load the training set with pandas.
import pandas as pd

# A context manager closes the file handle as soon as the CSV has been parsed;
# the original left the handle returned by open() open for the whole session.
with open('C:/Users/86188/sklean-machine-learning/上机代码/医保支出预测/Train_Data.csv') as insurance:
    df = pd.read_csv(insurance)

从数据探索和可视化中获得洞见

# Print column names, dtypes and non-null counts.
df.info()

# Show the first rows of the frame.
df.head()

# describe() reports the count, mean, standard deviation, minimum, maximum, etc.

df.describe()

# Preprocessing (training set): encode the categorical columns as integers.
# Series.map with a dict turns an unexpected category into NaN, keeping the
# column numeric, whereas ``.apply(dict.get)`` produced None and an object
# column. A bad value is therefore reported as missing by pandas' null checks.
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df['smoker'] = df['smoker'].map({'yes': 1, 'no': 0})
df['region'] = df['region'].map(
    {'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4}
)

# Feature matrix and regression target.
x = df[['age', 'sex', 'bmi', 'smoker', 'region', 'children']]
y = df['charges']

x

# Check each column for missing values. DataFrame.isna() also copes with
# non-numeric (object) columns, on which np.isnan raises TypeError.
df.isna().any()

# Descriptive statistics for y (the 'charges' column).
y.describe()

上面的结果描述了有关数值型变量的简单统计值,包括非缺失观测的个数(count)、平均值(mean)、标准差(std)、最小值(min)、下四分位数(25%)、中位数(50%)、上四分位数(75%)和最大值(max)。

# Data exploration

# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

import matplotlib.pyplot as plt

# One histogram per column of df, 50 bins each, on a 20x15 figure.
df.hist(bins=50, figsize=(20,15))

plt.show()

# Strip plot of policyholders' BMI by region.
import matplotlib.pyplot as plt
import seaborn as sns

# Select the KaiTi font before any text is created so that the Chinese title
# does not depend on when the figure happens to be rendered.
plt.rcParams['font.sans-serif'] = ['KaiTi']

sns.stripplot(data=df, x='region', y='bmi')
plt.title('不同地区医保支出BMI指数散点图')

# Show the figure explicitly, as the other plotting cells do.
plt.show()

# Look for correlations between the columns.
corr_matrix = df.corr()

# Visualise the correlation matrix as an annotated heat map.
import seaborn as sns
import matplotlib.pyplot as plt

# Configure the font before the title text is created.
plt.rcParams['font.sans-serif'] = ['KaiTi']

plt.figure(figsize=(12, 8))
# Reuse corr_matrix rather than computing df.corr() a second time.
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='PuBu')
plt.title('各个特征中的相关性')
plt.show()

# Older pandas releases exposed this helper as pandas.tools.plotting.scatter_matrix.
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the listed columns.
attributes = ['age', 'sex', 'bmi', 'smoker', 'region', 'children']
feature_frame = df.loc[:, attributes]
scatter_matrix(feature_frame, figsize=(20, 15))
plt.show()

模型预测

# Preprocessing: hold out a validation split and standardise the features.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

scaler = StandardScaler()
# Learn the scaling statistics from the training split only ...
x_train = scaler.fit_transform(x_train)
# ... and apply those same statistics to the held-out split. Calling
# fit_transform here (as the original did) re-fits the scaler on the test
# data, which leaks test information and scales the two splits differently.
x_test = scaler.transform(x_test)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# NOTE: this name shadows the stdlib ``random`` module; it is kept because
# later cells refer to the fitted model by this name.
random = RandomForestRegressor()
random.fit(x_train, y_train)

y_pred = random.predict(x_test)
y_pred

# For a regressor, score() returns the coefficient of determination (R^2),
# not a classification accuracy, despite the printed label.
print("准确率:",random.score(x_test,y_test))

# Load the external test set; the context manager closes the file handle
# once the CSV has been parsed (the original never closed it).
with open('C:/Users/86188/sklean-machine-learning/上机代码/医保支出预测/Test_Data.csv') as insurance_Test:
    df1 = pd.read_csv(insurance_Test)

# Test set: encode the categorical columns with the same integer codes that
# are used for the training set. Unknown categories become NaN.
df1['sex'] = df1['sex'].map({'male': 0, 'female': 1})
df1['smoker'] = df1['smoker'].map({'yes': 1, 'no': 0})
df1['region'] = df1['region'].map(
    {'southwest': 1, 'southeast': 2, 'northwest': 3, 'northeast': 4}
)

x_test1 = df1[['age', 'sex', 'bmi', 'smoker', 'region', 'children']]
x_test1

# The model was fitted on standardised features, so the external test set
# must pass through the same fitted scaler before prediction. The original
# predicted on the raw, unscaled values.
y_pred1 = random.predict(scaler.transform(x_test1))
y_pred1

调优

# Hyper-parameter tuning with a cross-validated grid search.
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Candidate values for each random-forest hyper-parameter, written as plain
# list literals instead of list() wrapped around a tuple.
tree_grid_parameter = {
    'min_samples_split': [3, 6, 9],
    'n_estimators': [10, 50, 100],
}

# 3-fold cross-validation, ranking candidates by negative mean squared error.
grid = GridSearchCV(
    RandomForestRegressor(),
    param_grid=tree_grid_parameter,
    cv=3,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid.fit(x_train, y_train)

# Best parameter combination found by the search.
print(grid.best_params_)

调优后预测当年所需医保支出(charge)

# Predict charges for the external test set with the tuned model. The model
# was fitted on standardised features, so the inputs are scaled the same way.
grid_pred = grid.predict(scaler.transform(x_test1))
grid_pred

from sklearn.metrics import explained_variance_score

# Score the tuned model against the held-out labels. The original compared
# grid_pred with y_pred1, i.e. two sets of predictions, which only measures
# how closely the two models agree and says nothing about accuracy.
report = explained_variance_score(y_test, grid.predict(x_test))

print("准确率:",report)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

消失的狐狸君

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值