import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Demo data: 20 uniform random samples per condition. Conditions 2 and 3
# are rescaled by 0.9 and 1.1 so the three columns differ slightly in range.
df = pd.DataFrame()
df['Condition 1'] = np.random.rand(20)
df['Condition 2'] = 0.9 * np.random.rand(20)
df['Condition 3'] = 1.1 * np.random.rand(20)
# Bare expression so the notebook renders the frame as the cell output.
df
| | Condition 1 | Condition 2 | Condition 3 |
---|---|---|---|
0 | 0.150388 | 0.319698 | 0.307660 |
1 | 0.969830 | 0.613011 | 0.695216 |
2 | 0.660890 | 0.552131 | 0.229432 |
3 | 0.574232 | 0.679883 | 0.738781 |
4 | 0.527174 | 0.578460 | 0.981132 |
5 | 0.952754 | 0.388025 | 0.935823 |
6 | 0.077330 | 0.331501 | 0.663525 |
7 | 0.288425 | 0.755113 | 0.829731 |
8 | 0.398153 | 0.668251 | 0.674626 |
9 | 0.687752 | 0.540433 | 0.971847 |
10 | 0.470583 | 0.352360 | 0.249517 |
11 | 0.643588 | 0.240827 | 0.640346 |
12 | 0.278763 | 0.012188 | 0.506313 |
13 | 0.486791 | 0.538330 | 0.005713 |
14 | 0.661333 | 0.101712 | 0.868087 |
15 | 0.420160 | 0.640365 | 0.388247 |
16 | 0.932169 | 0.580433 | 0.594378 |
17 | 0.956558 | 0.878580 | 0.458417 |
18 | 0.637018 | 0.058973 | 0.338527 |
19 | 0.950942 | 0.647577 | 0.687604 |
# Grouped bar chart: one cluster of bars per row, one bar per condition.
fig, ax = plt.subplots(figsize=(10, 8))
# stacked: whether the bars of a row are stacked on top of each other
# (False draws them side by side).
df.plot(kind='bar', stacked=False, ax=ax)
<matplotlib.axes._subplots.AxesSubplot at 0xa263898>
from matplotlib.ticker import FuncFormatter
# Normalise every row so its three conditions sum to 1, then draw the
# shares as a stacked bar chart.
df_ratio = df.div(df.sum(axis=1), axis=0)
fig, ax = plt.subplots()
df_ratio.plot(kind='bar', stacked=True, ax=ax)


def _percent_label(value, _pos):
    """Format a tick value (a fraction) as a whole-number percentage."""
    return '{:.0%}'.format(value)


# Show the y-axis ticks as percentages instead of raw fractions.
ax.yaxis.set_major_formatter(FuncFormatter(_percent_label))
# UCI "risk factors for cervical cancer" data set; the file marks missing
# entries with a literal "?", which is mapped to NaN while parsing.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv'
df = pd.read_csv(filepath_or_buffer=url, na_values="?")
# Preview the first five rows.
df.head(5)
| | Age | Number of sexual partners | First sexual intercourse | Num of pregnancies | Smokes | Smokes (years) | Smokes (packs/year) | Hormonal Contraceptives | Hormonal Contraceptives (years) | IUD | ... | STDs: Time since first diagnosis | STDs: Time since last diagnosis | Dx:Cancer | Dx:CIN | Dx:HPV | Dx | Hinselmann | Schiller | Citology | Biopsy |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18 | 4.0 | 15.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 15 | 1.0 | 14.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 34 | 1.0 | NaN | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 52 | 5.0 | 16.0 | 4.0 | 1.0 | 37.0 | 37.0 | 1.0 | 3.0 | 0.0 | ... | NaN | NaN | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
4 | 46 | 3.0 | 21.0 | 4.0 | 0.0 | 0.0 | 0.0 | 1.0 | 15.0 | 0.0 | ... | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 36 columns
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement.
# Fall back to the old class only on releases that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer

# Fill in missing values: with the default settings every NaN is replaced
# by the mean of its column. fit_transform returns a bare ndarray, so the
# original column labels and row index are re-attached.
impute = pd.DataFrame(
    Imputer().fit_transform(df),
    columns=df.columns,
    index=df.index,
)
impute.head()
| | Age | Number of sexual partners | First sexual intercourse | Num of pregnancies | Smokes | Smokes (years) | Smokes (packs/year) | Hormonal Contraceptives | Hormonal Contraceptives (years) | IUD | ... | STDs: Time since first diagnosis | STDs: Time since last diagnosis | Dx:Cancer | Dx:CIN | Dx:HPV | Dx | Hinselmann | Schiller | Citology | Biopsy |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 4.0 | 15.0000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 6.140845 | 5.816901 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 15.0 | 1.0 | 14.0000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 6.140845 | 5.816901 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 34.0 | 1.0 | 16.9953 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 6.140845 | 5.816901 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 52.0 | 5.0 | 16.0000 | 4.0 | 1.0 | 37.0 | 37.0 | 1.0 | 3.0 | 0.0 | ... | 6.140845 | 5.816901 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 46.0 | 3.0 | 21.0000 | 4.0 | 0.0 | 0.0 | 0.0 | 1.0 | 15.0 | 0.0 | ... | 6.140845 | 5.816901 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 36 columns
%matplotlib notebook
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
# Separate the PCA inputs from the label that is used to colour the points.
features = impute.drop('Dx:Cancer', axis=1)
y = impute["Dx:Cancer"]

# Project the features onto their first three principal components.
# NOTE(review): the features are not standardised first, so columns with
# large numeric ranges will dominate the components - confirm this is intended.
pca = PCA(n_components=3)
X_r = pca.fit_transform(features)
# n_components=3, so the ratio array unpacks to exactly PC1, PC2 and PC3.
print("Explained variance:\nPC1 {:.2%}\nPC2 {:.2%}\nPC3 {:.2%}"
      .format(*pca.explained_variance_ratio_))

fig = plt.figure()
# Create the 3D axes through the figure. Constructing `Axes3D(fig)` directly
# no longer registers the axes with the figure on recent matplotlib releases,
# which leaves the plot empty. The `Axes3D` import is still what registers
# the '3d' projection on old releases.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_r[:, 0], X_r[:, 1], X_r[:, 2], c=y, cmap=plt.cm.coolwarm)

# Label the axes
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')