数据读取
我们的目标是找到一种方法，利用其他列中的数值来估计“charges”（收费）列中的数值。如果我们能够对历史数据做到这一点，那么也就能够估计新客户的收费，只需询问他们的年龄、性别、BMI、子女数量、吸烟习惯和所在地区等信息。
# Load the medical-charges dataset into a DataFrame and preview the first rows.
import pandas as pd
# NumPy must not share the `pd` alias: binding it to `pd` shadows pandas and
# makes `pd.read_csv` fail with AttributeError.
import numpy as np

medical_df = pd.read_csv('medical.csv')
medical_df.head()
|
age |
sex |
bmi |
children |
smoker |
region |
charges |
0 |
19 |
female |
27.900 |
0 |
yes |
southwest |
16884.92400 |
1 |
18 |
male |
33.770 |
1 |
no |
southeast |
1725.55230 |
2 |
28 |
male |
33.000 |
3 |
no |
southeast |
4449.46200 |
3 |
33 |
male |
22.705 |
0 |
no |
northwest |
21984.47061 |
4 |
32 |
male |
28.880 |
0 |
no |
northwest |
3866.85520 |
# Print a summary of the DataFrame: index range, column names, non-null
# counts, dtypes and memory usage.
medical_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 1338 non-null int64
1 sex 1338 non-null object
2 bmi 1338 non-null float64
3 children 1338 non-null int64
4 smoker 1338 non-null object
5 region 1338 non-null object
6 charges 1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
medical_df.describe()
|
age |
bmi |
children |
charges |
count |
1338.000000 |
1338.000000 |
1338.000000 |
1338.000000 |
mean |
39.207025 |
30.663397 |
1.094918 |
13270.422265 |
std |
14.049960 |
6.098187 |
1.205493 |
12110.011237 |
min |
18.000000 |
15.960000 |
0.000000 |
1121.873900 |
25% |
27.000000 |
26.296250 |
0.000000 |
4740.287150 |
50% |
39.000000 |
30.400000 |
1.000000 |
9382.033000 |
75% |
51.000000 |
34.693750 |
2.000000 |
16639.912515 |
max |
64.000000 |
53.130000 |
5.000000 |
63770.428010 |
import matplotlib.pyplot as plt
import seaborn as sns
探索性分析：寻找变量之间的关系
# Distribution of each numeric column, one histogram per column.
# The trailing semicolons suppress the notebook's echo of the returned object.
sns.histplot(medical_df['charges']);
sns.histplot(medical_df['children']);
# 47 bins: presumably one bin per integer age, given the 18-64 range
# reported by describe() (64 - 18 + 1 = 47).
sns.histplot(medical_df['age'], bins = 47);
sns.histplot(medical_df['bmi']);
# Open a 12x6 inch figure for the scatter plot that follows.
plt.figure(figsize = (12,6))
# Charges against age, with points coloured by sex.
sns.scatterplot(x = 'age', y = 'charges', hue = 'sex', data = medical_df);
# Charges against age with a single fitted linear regression line.
sns.regplot(x = 'age', y = 'charges', data = medical_df);
# Same relationship, with a separate regression fit per sex (figure-level plot).
sns.lmplot(x = 'age', y = 'charges', hue = 'sex', data = medical_df);