引言
本文尝试使用 Jupyter Notebook 作为工具,对 WPBC(Wisconsin Prognostic Breast Cancer,威斯康星乳腺癌预后)数据集进行一些描述性统计分析。源码如下,不足之处望读者多加指正。
#引用约定
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
# Load the dataset from the local CSV file and preview its first rows
data = pd.read_csv('data_2.csv')
data.head()
id | outcome | time | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | ... | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | diameter of the excised tumor in centimeters | Lymph node status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 119513 | N | 31 | 18.02 | 27.600000 | 117.50 | 1013.0 | 0.094890 | 0.103600 | 0.1086 | ... | 139.70 | 1436.0 | 0.119500 | 0.192600 | 0.3140 | 0.1170 | 0.267700 | 0.08113 | 5.0 | 5 |
1 | 8423 | N | 61 | 17.99 | 22.300979 | 122.80 | 1001.0 | 0.118400 | 0.142642 | 0.3001 | ... | 184.60 | 2019.0 | 0.162200 | 0.665600 | 0.7119 | 0.2654 | 0.460100 | 0.11890 | 3.0 | 2 |
2 | 842517 | N | 116 | 21.37 | 17.440000 | 137.50 | 1373.0 | 0.088360 | 0.118900 | 0.1255 | ... | 159.10 | 1949.0 | 0.118800 | 0.344900 | 0.3414 | 0.2032 | 0.433400 | 0.09067 | 2.5 | 0 |
3 | 843483 | N | 123 | 11.42 | 20.380000 | 77.58 | 386.1 | 0.102774 | 0.142642 | 0.2414 | ... | 98.87 | 567.7 | 0.143921 | 0.364567 | 0.6869 | 0.2575 | 0.322251 | 0.17300 | 2.0 | 0 |
4 | 843584 | R | 27 | 20.29 | 14.340000 | 135.10 | 1297.0 | 0.100300 | 0.132800 | 0.1980 | ... | 152.20 | 1575.0 | 0.137400 | 0.205000 | 0.4000 | 0.1625 | 0.236400 | 0.07678 | 3.5 | 0 |
5 rows × 35 columns
# Inspect the column labels of the loaded table
col = data.columns
print(col)
Index(['id', 'outcome', 'time', 'radius_mean', 'texture_mean',
'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean',
'concavity_mean', 'concave points_mean', 'symmetry_mean',
'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se',
'area_se', 'smoothness_se', 'compactness_se', 'concavity_se',
'concave points_se', 'symmetry_se', 'fractal_dimension_se',
'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
'smoothness_worst', 'compactness_worst', 'concavity_worst',
'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
'diameter of the excised tumor in centimeters', 'Lymph node status'],
dtype='object')
# Separate the label to analyse (outcome) from the feature columns;
# id, outcome and time are removed from the feature table.
y = data['outcome']
x = data.drop(columns=['id', 'outcome', 'time'])
x.head()
radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | diameter of the excised tumor in centimeters | Lymph node status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18.02 | 27.600000 | 117.50 | 1013.0 | 0.094890 | 0.103600 | 0.1086 | 0.07055 | 0.1865 | 0.063330 | ... | 139.70 | 1436.0 | 0.119500 | 0.192600 | 0.3140 | 0.1170 | 0.267700 | 0.08113 | 5.0 | 5 |
1 | 17.99 | 22.300979 | 122.80 | 1001.0 | 0.118400 | 0.142642 | 0.3001 | 0.14710 | 0.2419 | 0.078710 | ... | 184.60 | 2019.0 | 0.162200 | 0.665600 | 0.7119 | 0.2654 | 0.460100 | 0.11890 | 3.0 | 2 |
2 | 21.37 | 17.440000 | 137.50 | 1373.0 | 0.088360 | 0.118900 | 0.1255 | 0.08180 | 0.2333 | 0.060100 | ... | 159.10 | 1949.0 | 0.118800 | 0.344900 | 0.3414 | 0.2032 | 0.433400 | 0.09067 | 2.5 | 0 |
3 | 11.42 | 20.380000 | 77.58 | 386.1 | 0.102774 | 0.142642 | 0.2414 | 0.10520 | 0.2597 | 0.062743 | ... | 98.87 | 567.7 | 0.143921 | 0.364567 | 0.6869 | 0.2575 | 0.322251 | 0.17300 | 2.0 | 0 |
4 | 20.29 | 14.340000 | 135.10 | 1297.0 | 0.100300 | 0.132800 | 0.1980 | 0.10430 | 0.1809 | 0.058830 | ... | 152.20 | 1575.0 | 0.137400 | 0.205000 | 0.4000 | 0.1625 | 0.236400 | 0.07678 | 3.5 | 0 |
5 rows × 32 columns
# Descriptive statistics: class balance of the outcome label
# Name the axis explicitly: newer seaborn releases make the plot variables
# keyword-only, so a bare positional series is no longer read as `x`.
ax = sns.countplot(x=y, label="Count")
# Look each count up by its label instead of unpacking value_counts() by
# position: value_counts() orders by frequency, so positional unpacking would
# silently swap the two numbers if 'R' ever outnumbered 'N', and would raise
# if only one class were present.
outcome_counts = y.value_counts()
N = outcome_counts.get('N', 0)
R = outcome_counts.get('R', 0)
print('Number of recur',R)
print('Number of nonrecur:', N)
Number of recur 46
Number of nonrecur: 148
# Summary statistics for every feature column
des = x.describe()
# Draw the correlation heatmap of the "mean" feature group
# `data` still holds id, outcome and time in positions 0-2, so the feature
# groups start at position 3: ten *_mean columns, ten *_se columns, ten
# *_worst columns, then the two clinical measurements (tumor diameter and
# lymph node status). Starting the slices at 2 would pull `time` into the
# first group and drop the last column of every group.
feature_mean = list(data.columns[3:13])
feature_se = list(data.columns[13:23])
feature_worst = list(data.columns[23:33])
feature_other = list(data.columns[33:35])
corr = data[feature_mean].corr()
plt.figure(figsize=(14, 14))
sns.heatmap(corr, annot=True)
<matplotlib.axes._subplots.AxesSubplot at 0x26a5d0e56d8>
# Data visualization
# Split violin plot comparing the two outcomes on feature columns 0-9 of x
data_dia = y
data = x
# Standardize every feature (subtract the mean, divide by the standard
# deviation) so all of them can share one value axis.
data_n_2 = (data - data.mean()) / data.std()
# Reshape to long form: one row per (outcome, feature name, value)
data = pd.melt(
    pd.concat([y, data_n_2.iloc[:, :10]], axis=1),
    id_vars='outcome',
    var_name='features',
    value_name='value',
)
plt.figure(figsize=(10, 10))
sns.violinplot(
    data=data, x='features', y='value', hue='outcome',
    split=True, inner='quart',
)
plt.xticks(rotation=90)
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), <a list of 10 Text xticklabel objects>)
# Box plot of the long-form frame currently held in `data`, grouped by outcome
plt.figure(figsize=(10, 10))
sns.boxplot(data=data, x='features', y='value', hue='outcome')
plt.xticks(rotation=90)
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), <a list of 10 Text xticklabel objects>)
# Split violin plot comparing the two outcomes on feature columns 10-19 of x
data_dia = y
data = x
# Standardize every feature (subtract the mean, divide by the standard
# deviation) so all of them can share one value axis.
data_n_2 = (data - data.mean()) / data.std()
# Reshape to long form: one row per (outcome, feature name, value)
data = pd.melt(
    pd.concat([y, data_n_2.iloc[:, 10:20]], axis=1),
    id_vars='outcome',
    var_name='features',
    value_name='value',
)
plt.figure(figsize=(10, 10))
sns.violinplot(
    data=data, x='features', y='value', hue='outcome',
    split=True, inner='quart',
)
plt.xticks(rotation=90)
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), <a list of 10 Text xticklabel objects>)
# Box plot of the long-form frame currently held in `data`, grouped by outcome
plt.figure(figsize=(10, 10))
sns.boxplot(data=data, x='features', y='value', hue='outcome')
plt.xticks(rotation=90)
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), <a list of 10 Text xticklabel objects>)
# Split violin plot comparing the two outcomes on feature columns 20-29 of x
data_dia = y
data = x
# Standardize every feature (subtract the mean, divide by the standard
# deviation) so all of them can share one value axis.
data_n_2 = (data - data.mean()) / data.std()
# Reshape to long form: one row per (outcome, feature name, value)
data = pd.melt(
    pd.concat([y, data_n_2.iloc[:, 20:30]], axis=1),
    id_vars='outcome',
    var_name='features',
    value_name='value',
)
plt.figure(figsize=(10, 10))
sns.violinplot(
    data=data, x='features', y='value', hue='outcome',
    split=True, inner='quart',
)
plt.xticks(rotation=90)
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), <a list of 10 Text xticklabel objects>)
# Box plot of the long-form frame currently held in `data`, grouped by outcome
plt.figure(figsize=(10, 10))
sns.boxplot(data=data, x='features', y='value', hue='outcome')
plt.xticks(rotation=90)
(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), <a list of 10 Text xticklabel objects>)
# Split violin plot comparing the two outcomes on feature columns 30-31 of x
data_dia = y
data = x
# Standardize every feature (subtract the mean, divide by the standard
# deviation) so all of them can share one value axis.
data_n_2 = (data - data.mean()) / data.std()
# Reshape to long form: one row per (outcome, feature name, value)
data = pd.melt(
    pd.concat([y, data_n_2.iloc[:, 30:32]], axis=1),
    id_vars='outcome',
    var_name='features',
    value_name='value',
)
plt.figure(figsize=(10, 10))
sns.violinplot(
    data=data, x='features', y='value', hue='outcome',
    split=True, inner='quart',
)
plt.xticks(rotation=90)
(array([0, 1]), <a list of 2 Text xticklabel objects>)
# Box plot of the long-form frame currently held in `data`, grouped by outcome
plt.figure(figsize=(10, 10))
sns.boxplot(data=data, x='features', y='value', hue='outcome')
plt.xticks(rotation=90)
(array([0, 1]), <a list of 2 Text xticklabel objects>)