Data analysis – the iris dataset
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib.pylab import figure, subplot, hist, xlim, show
from sklearn.datasets import load_iris

# Load the iris dataset: 150 samples, 4 features, 3 classes
data = load_iris()
features = data.data
targets = data.target

# Scatter plots per class: sepal length vs. sepal width ('o') and
# petal length vs. petal width ('+'); blue=setosa, green=versicolor, red=virginica
plt.plot(features[targets==0,0],features[targets==0,1],'bo',features[targets==0,2],features[targets==0,3],'b+')
plt.plot(features[targets==1,0],features[targets==1,1],'go',features[targets==1,2],features[targets==1,3],'g+')
plt.plot(features[targets==2,0],features[targets==2,1],'ro',features[targets==2,2],features[targets==2,3],'r+')
# Histograms of sepal length (feature 0): one panel per class plus one for
# all samples, sharing the same x-axis limits so the distributions line up
xmin = min(features[:,0])
xmax = max(features[:,0])
subplot(411)   # setosa
hist(features[targets==0,0],color='b',alpha=.7)
xlim(xmin,xmax)
subplot(412)   # versicolor
hist(features[targets==1,0],color='r',alpha=.7)
xlim(xmin,xmax)
subplot(413)   # virginica
hist(features[targets==2,0],color='y',alpha=.7)
xlim(xmin,xmax)
subplot(414)   # all classes together
hist(features[:,0],color='g',alpha=.7)
xlim(xmin,xmax)
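The four histogram panels are easier to tell apart with titles. A minimal variant as a sketch (the titles follow the 0/1/2 class order of load_iris: setosa, versicolor, virginica; the figure size is an arbitrary choice):
names = ['setosa', 'versicolor', 'virginica', 'all samples']
colors = ['b', 'r', 'y', 'g']
values = [features[targets==0,0], features[targets==1,0], features[targets==2,0], features[:,0]]
fig = figure(figsize=(6, 8))
for i, (name, color, v) in enumerate(zip(names, colors, values)):
    ax = subplot(4, 1, i + 1)
    ax.hist(v, color=color, alpha=.7)   # sepal length histogram for this subset
    ax.set_xlim(xmin, xmax)
    ax.set_title(name)
fig.tight_layout()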
Using a Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes classifier on the full dataset
classifier = cf = GaussianNB()
cf.fit(features, targets)

# Compare the predictions on the training data with the true labels
print(cf.predict(features))
print(targets)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2]
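Comparing the two printed arrays by eye shows a handful of versicolor/virginica mix-ups. As a sketch, a confusion matrix summarizes them in one table:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes;
# the off-diagonal entries count the versicolor/virginica confusions seen above
print(confusion_matrix(targets, cf.predict(features)))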
Splitting the data into training and test sets
from sklearn.model_selection import train_test_split

# Hold out 40% of the samples as a test set
train, test, t_train, t_test = train_test_split(features, targets, test_size=0.4, random_state=0)
cf.fit(train, t_train)

# Accuracy on the held-out test set and on the training set
t_score = cf.score(test, t_test)
print(t_score)
train_score = cf.score(train, t_train)
print(train_score)
0.933333333333
0.977777777778
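A single 60/40 split depends on how the samples happen to be shuffled. As a sketch (the choice of 6 folds is arbitrary), k-fold cross-validation reports one accuracy per fold:
from sklearn.model_selection import cross_val_score
# Accuracy on each of the 6 folds, then their mean
scores = cross_val_score(cf, features, targets, cv=6)
print(scores)
print(scores.mean())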
Dimensionality reduction with PCA
from sklearn.decomposition import PCA
# Project the 4-dimensional feature space onto its first 2 principal components
pca = PCA(n_components=2)
pcad = pca.fit_transform(features)
# Scatter plot of the projected data, one color per class
plt.plot(pcad[targets==0,0],pcad[targets==0,1],'bo')
plt.plot(pcad[targets==1,0],pcad[targets==1,1],'go')
plt.plot(pcad[targets==2,0],pcad[targets==2,1],'ro')
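To see how much of the original variance the two components keep, the fitted PCA object exposes the per-component explained variance ratio; a quick check:
# Fraction of the total variance captured by each component, and their sum
print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))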
Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import completeness_score, homogeneity_score

# Cluster the samples into 3 groups, ignoring the labels
kms = KMeans(n_clusters=3)
kms.fit(features)
c = kms.predict(features)

# Compare the cluster assignments with the true species labels
print(completeness_score(targets, c))
print(homogeneity_score(targets, c))
0.764986151449
0.751485402199
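Homogeneity and completeness can be folded into a single score, their harmonic mean; a sketch using v_measure_score:
from sklearn.metrics import v_measure_score
# Harmonic mean of homogeneity and completeness
print(v_measure_score(targets, c))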
Correlation analysis
from numpy import corrcoef

# Correlation matrix of the four features (each row of features.T is one variable)
corr = corrcoef(features.T)
print(corr)
[[ 1.         -0.10936925  0.87175416  0.81795363]
 [-0.10936925  1.         -0.4205161  -0.35654409]
 [ 0.87175416 -0.4205161   1.          0.9627571 ]
 [ 0.81795363 -0.35654409  0.9627571   1.        ]]
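The matrix shows that petal length and petal width are almost perfectly correlated and both track sepal length closely, while sepal width is only weakly (and negatively) related to the rest. A sketch that renders it as a heat map (the axis labels assume load_iris's feature order: sepal length, sepal width, petal length, petal width):
from numpy import arange
labels = ['sepal len', 'sepal wid', 'petal len', 'petal wid']  # assumed feature order
plt.pcolor(corr)
plt.colorbar()
plt.xticks(arange(0.5, 4.5), labels)
plt.yticks(arange(0.5, 4.5), labels)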