from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset bundled with scikit-learn.
cancer = load_breast_cancer()
cancer.feature_names  # names of the recorded attributes (only displayed in a notebook/REPL)
cancer_data = cancer.data  # feature matrix, one row per sample
cancer_target = cancer.target  # diagnosis label for each sample
# The second print had been fused into the trailing comment of the first one
# and therefore never ran; it is restored as a separate statement.
print(cancer_data.shape)  # dimensions of the feature matrix
print(cancer_target.shape)  # dimensions of the label vector
2、划分数据集----训练集(实验组)与测试集(对比组)
from sklearn.model_selection import train_test_split
# Split features and labels into training and test subsets.
# test_size=0.2 holds out 20% of the samples for evaluation;
# random_state=42 is the shuffle seed that makes the split reproducible.
cancer_data_train, cancer_data_test, cancer_target_train, cancer_target_test = (
    train_test_split(cancer_data, cancer_target, test_size=0.2, random_state=42)
)
# These four prints had been swallowed by the trailing comment of the call
# above; they are restored so the resulting shapes are actually reported.
print(cancer_data_train.shape)
print(cancer_target_train.shape)
print(cancer_data_test.shape)
print(cancer_target_test.shape)
3、对数据进行预处理----标准化
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Min-max scaling: the scaler is fitted on the training data only, and the
# same fitted transformation is then applied to both subsets so no
# information from the test set leaks into preprocessing.
scale = MinMaxScaler().fit(cancer_data_train)
cancer_trainscaler = scale.transform(cancer_data_train)
cancer_testscaler = scale.transform(cancer_data_test)
# Compare the largest value before and after scaling. These prints had been
# fused into the trailing comment of the previous statement and never ran.
print(np.max(cancer_data_train))
print(np.max(cancer_trainscaler))
4、PCA降维
from sklearn.decomposition import PCA
# Project the scaled features onto 10 principal components. The projection is
# learned from the scaled training data and reused unchanged for the test data.
pca_model = PCA(n_components=10)
pca_model.fit(cancer_trainscaler)
cancer_testPca = pca_model.transform(cancer_testscaler)
cancer_trainPca = pca_model.transform(cancer_trainscaler)
cancer_trainPca.shape  # shape after reduction (only displayed in a notebook/REPL)
二、KMeans聚类
1.KMeans聚类预测
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
# Load the iris dataset and rescale it with a min-max scaler.
iris = load_iris()
iris_data = iris.data
iris_target = iris.target
scale = MinMaxScaler().fit(iris_data)
iris_datascale = scale.transform(iris_data)

# Cluster the scaled samples into 3 groups; random_state fixes the centroid
# initialisation so the run is reproducible.
kmeans = KMeans(n_clusters=3, random_state=128).fit(iris_datascale)

# Reference: the last sample and its true label. These prints had been fused
# into the trailing comment of the KMeans line and never ran.
print(iris_data[149])
print(iris_target[149])

# Test: assign a new measurement to a cluster. The model was fitted on scaled
# data, so the raw sample must pass through the same scaler before predict();
# the original fed raw values straight into the model.
sample = [[6, 4, 5, 2]]
result = kmeans.predict(scale.transform(sample))
print([6, 4, 5, 2])
print(result[0])
2.图像观察KMeans聚类
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Embed the iris samples in 2-D with t-SNE so the clusters can be drawn.
tsne = TSNE(n_components=2, init="random", random_state=177)
tsne.fit(iris_data)

# One row per sample: the two embedding coordinates plus its KMeans label.
df = pd.DataFrame(tsne.embedding_)
df["labels"] = kmeans.labels_

# Separate the embedded points by cluster label.
df1 = df.loc[df["labels"] == 0]
df2 = df.loc[df["labels"] == 1]
df3 = df.loc[df["labels"] == 2]

# Draw each cluster with its own marker and colour on a single figure.
fig = plt.figure(figsize=(9, 6))
plt.plot(df1[0], df1[1], "bo")
plt.plot(df2[0], df2[1], "r*")
plt.plot(df3[0], df3[1], "gD")
3.对KMeans聚类评分
# 1. FMI (Fowlkes-Mallows index): compares the cluster labels with the true
#    labels, so it needs iris_target.
# The import had been fused into the heading comment, the loop header read
# "inrange", and the loop body was unindented with print() glued onto the
# assignment; all of that is repaired here.
from sklearn.metrics import fowlkes_mallows_score

for i in range(2, 7):
    kmeans = KMeans(n_clusters=i, random_state=123).fit(iris_data)
    score = fowlkes_mallows_score(iris_target, kmeans.labels_)
    print(i, score)

# 2. CHS (Calinski-Harabasz score): an internal index computed from the data
#    and the cluster labels only.
# scikit-learn removed the misspelled calinski_harabaz_score; the correctly
# spelled calinski_harabasz_score is its replacement.
from sklearn.metrics import calinski_harabasz_score

for i in range(2, 7):
    kmeans = KMeans(n_clusters=i, random_state=123).fit(iris_data)
    score = calinski_harabasz_score(iris_data, kmeans.labels_)
    print(i, score)