Pandas与sklearn结合实例

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Build a small demo frame: 20 rows of uniform random samples, one column per
# condition. Columns 2 and 3 are rescaled so the three series have slightly
# different ranges ([0, 0.9) and [0, 1.1) instead of [0, 1)).
n_rows = 20
df = pd.DataFrame({
    'Condition 1': np.random.rand(n_rows),
    'Condition 2': 0.9 * np.random.rand(n_rows),
    'Condition 3': 1.1 * np.random.rand(n_rows),
})
df
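|    | Condition 1 | Condition 2 | Condition 3 |
|----|-------------|-------------|-------------|
| 0  | 0.150388 | 0.319698 | 0.307660 |
| 1  | 0.969830 | 0.613011 | 0.695216 |
| 2  | 0.660890 | 0.552131 | 0.229432 |
| 3  | 0.574232 | 0.679883 | 0.738781 |
| 4  | 0.527174 | 0.578460 | 0.981132 |
| 5  | 0.952754 | 0.388025 | 0.935823 |
| 6  | 0.077330 | 0.331501 | 0.663525 |
| 7  | 0.288425 | 0.755113 | 0.829731 |
| 8  | 0.398153 | 0.668251 | 0.674626 |
| 9  | 0.687752 | 0.540433 | 0.971847 |
| 10 | 0.470583 | 0.352360 | 0.249517 |
| 11 | 0.643588 | 0.240827 | 0.640346 |
| 12 | 0.278763 | 0.012188 | 0.506313 |
| 13 | 0.486791 | 0.538330 | 0.005713 |
| 14 | 0.661333 | 0.101712 | 0.868087 |
| 15 | 0.420160 | 0.640365 | 0.388247 |
| 16 | 0.932169 | 0.580433 | 0.594378 |
| 17 | 0.956558 | 0.878580 | 0.458417 |
| 18 | 0.637018 | 0.058973 | 0.338527 |
| 19 | 0.950942 | 0.647577 | 0.687604 |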
# Draw all three conditions as a grouped bar chart on a 10x8 inch canvas.
fig, ax = plt.subplots(figsize=(10, 8))
# stacked controls whether the bars are stacked; False keeps the bars of
# each row side by side instead of piling them on top of one another.
df.plot(kind='bar', ax=ax, stacked=False)
<matplotlib.axes._subplots.AxesSubplot at 0xa263898>

from matplotlib.ticker import FuncFormatter

# Convert each row to shares of its own total, so that every stacked bar
# below adds up to 1 (i.e. 100%).
row_totals = df.sum(axis=1)
df_ratio = df.div(row_totals, axis=0)


def _percent_label(value, _pos):
    """Render a y-axis tick value (a fraction) as a whole-number percentage."""
    return '{:.0%}'.format(value)


fig, ax = plt.subplots()
df_ratio.plot(kind='bar', ax=ax, stacked=True)
ax.yaxis.set_major_formatter(FuncFormatter(_percent_label))

# Cervical-cancer risk-factor data set from the UCI repository.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00383/risk_factors_cervical_cancer.csv'
# "?" entries in the CSV are parsed as NaN rather than as text.
df = pd.read_csv(url, na_values=["?"])
df.head(5)
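|   | Age | Number of sexual partners | First sexual intercourse | Num of pregnancies | Smokes | Smokes (years) | Smokes (packs/year) | Hormonal Contraceptives | Hormonal Contraceptives (years) | IUD | ... | STDs: Time since first diagnosis | STDs: Time since last diagnosis | Dx:Cancer | Dx:CIN | Dx:HPV | Dx | Hinselmann | Schiller | Citology | Biopsy |
|---|-----|---------------------------|--------------------------|--------------------|--------|----------------|---------------------|-------------------------|---------------------------------|-----|-----|----------------------------------|---------------------------------|-----------|--------|--------|----|------------|----------|----------|--------|
| 0 | 18 | 4.0 | 15.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 15 | 1.0 | 14.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 34 | 1.0 | NaN | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 52 | 5.0 | 16.0 | 4.0 | 1.0 | 37.0 | 37.0 | 1.0 | 3.0 | 0.0 | ... | NaN | NaN | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 4 | 46 | 3.0 | 21.0 | 4.0 | 0.0 | 0.0 | 0.0 | 1.0 | 15.0 | 0.0 | ... | NaN | NaN | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |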

5 rows × 36 columns

# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed
# in 0.22; sklearn.impute.SimpleImputer is its replacement. Import the new class
# under the old name and fall back to the legacy location on old installations.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer

# Fill in missing values: with default settings every NaN is replaced by the
# mean of its column. fit_transform returns a bare ndarray, so the original
# column labels and row index are re-attached when rebuilding the DataFrame.
impute = pd.DataFrame(Imputer().fit_transform(df),
                      columns=df.columns,
                      index=df.index)

impute.head()

 

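|   | Age | Number of sexual partners | First sexual intercourse | Num of pregnancies | Smokes | Smokes (years) | Smokes (packs/year) | Hormonal Contraceptives | Hormonal Contraceptives (years) | IUD | ... | STDs: Time since first diagnosis | STDs: Time since last diagnosis | Dx:Cancer | Dx:CIN | Dx:HPV | Dx | Hinselmann | Schiller | Citology | Biopsy |
|---|-----|---------------------------|--------------------------|--------------------|--------|----------------|---------------------|-------------------------|---------------------------------|-----|-----|----------------------------------|---------------------------------|-----------|--------|--------|----|------------|----------|----------|--------|
| 0 | 18.0 | 4.0 | 15.0000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 6.140845 | 5.816901 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 1 | 15.0 | 1.0 | 14.0000 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 6.140845 | 5.816901 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 34.0 | 1.0 | 16.9953 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 6.140845 | 5.816901 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 52.0 | 5.0 | 16.0000 | 4.0 | 1.0 | 37.0 | 37.0 | 1.0 | 3.0 | 0.0 | ... | 6.140845 | 5.816901 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 46.0 | 3.0 | 21.0000 | 4.0 | 0.0 | 0.0 | 0.0 | 1.0 | 15.0 | 0.0 | ... | 6.140845 | 5.816901 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |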

5 rows × 36 columns

%matplotlib notebook
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
# Split the imputed table into the target column and the remaining predictors.
y = impute["Dx:Cancer"]
features = impute.drop(labels=['Dx:Cancer'], axis=1)

# Project the predictors onto their first three principal components.
pca = PCA(n_components=3)
X_r = pca.fit_transform(features)

# Report the share of total variance captured by each retained component.
pc1_share, pc2_share, pc3_share = pca.explained_variance_ratio_
report = "Explained variance:\nPC1 {:.2%}\nPC2 {:.2%}\nPC3 {:.2%}"
print(report.format(pc1_share, pc2_share, pc3_share))

# 3D scatter of the samples in principal-component space, coloured by the
# Dx:Cancer label held in y.
fig = plt.figure()
# Request the 3D axes from the figure instead of calling Axes3D(fig): recent
# matplotlib releases no longer attach a directly-constructed Axes3D to the
# figure, which leaves the canvas blank. The Axes3D import above still
# registers the '3d' projection on older matplotlib versions.
ax = fig.add_subplot(111, projection='3d')

ax.scatter(X_r[:, 0], X_r[:, 1], X_r[:, 2], c=y, cmap=plt.cm.coolwarm)

# Label the axes
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值