大数据理论课(第八节----机器学习和KMeans的使用)

一、机器学习

1、数据读取

from sklearn.datasets import load_breast_cancer

# Load the breast-cancer dataset that ships with scikit-learn.
cancer = load_breast_cancer()

# Names of the feature columns (only echoed when run interactively).
cancer.feature_names

# Feature matrix and the per-sample target labels.
cancer_data, cancer_target = cancer.data, cancer.target

print(cancer_data.shape)    # shape of the feature matrix
print(cancer_target.shape)  # shape of the label vector

2、划分训练集与测试集

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set. random_state=42 is the seed
# that makes the shuffle reproducible (it is not a dispersion measure).
(
    cancer_data_train,
    cancer_data_test,
    cancer_target_train,
    cancer_target_test,
) = train_test_split(cancer_data, cancer_target, test_size=0.2, random_state=42)

# Show the shapes of the four resulting arrays.
print(cancer_data_train.shape)
print(cancer_target_train.shape)
print(cancer_data_test.shape)
print(cancer_target_test.shape)

3、对数据进行预处理----标准化

import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Fit the min-max scaler on the training split only, then apply the same
# fitted scaler to both the training and the test split.
scale = MinMaxScaler()
scale.fit(cancer_data_train)
cancer_trainscaler = scale.transform(cancer_data_train)
cancer_testscaler = scale.transform(cancer_data_test)

# Compare the largest value before and after scaling.
print(np.max(cancer_data_train))
print(np.max(cancer_trainscaler))

4、PCA降维

from sklearn.decomposition import PCA

# Fit a 10-component PCA on the scaled training data.
pca_model = PCA(n_components=10)
pca_model.fit(cancer_trainscaler)

# Project both splits with the PCA fitted on the training split.
cancer_trainPca = pca_model.transform(cancer_trainscaler)
cancer_testPca = pca_model.transform(cancer_testscaler)

# Shape of the reduced training matrix (only echoed when run interactively).
cancer_trainPca.shape

二、KMeans聚类

1.KMeans聚类预测

from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

# Load the iris dataset: feature matrix and target labels.
iris = load_iris()
iris_data = iris.data
iris_target = iris.target

# Min-max scale the features; `scale` is rebound to the iris scaler here.
scale = MinMaxScaler().fit(iris_data)
iris_datascale = scale.transform(iris_data)

# Cluster the *scaled* samples into 3 groups. random_state is the seed for
# centroid initialisation (reproducibility), not a dispersion setting.
kmeans = KMeans(n_clusters=3, random_state=128).fit(iris_datascale)

# Reference: the last sample and its target label.
print(iris_data[149])
print(iris_target[149])

# Predict the cluster of a new sample. The model was fitted on scaled data,
# so the sample must go through the same scaler before predict(); feeding
# raw values would compare it against centroids in a different space.
# NOTE: the printed cluster id is an arbitrary cluster index and need not
# equal the class id stored in iris_target.
result = kmeans.predict(scale.transform([[6, 4, 5, 2]]))
print([6, 4, 5, 2])
print(result[0])

2.用图像观察KMeans聚类结果

import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Embed the iris features into 2 dimensions with t-SNE for plotting.
# NOTE(review): the embedding uses the raw iris_data while the cluster labels
# come from the `kmeans` model in scope at this point -- confirm intended.
tsne = TSNE(n_components=2, init="random", random_state=177).fit(iris_data)

# One row per sample: the two embedding coordinates plus its cluster label.
df = pd.DataFrame(tsne.embedding_)
df["labels"] = kmeans.labels_

# Split the rows by cluster label.
df1 = df.loc[df["labels"] == 0]
df2 = df.loc[df["labels"] == 1]
df3 = df.loc[df["labels"] == 2]

# Draw the three clusters with distinct markers: blue circles, red stars,
# green diamonds.
fig = plt.figure(figsize=(9, 6))
plt.plot(
    df1[0], df1[1], "bo",
    df2[0], df2[1], "r*",
    df3[0], df3[1], "gD",
)

3.对KMeans聚类结果评分

# 1. Fowlkes-Mallows (FMI) score: compares the cluster labels with the
#    iris target labels for each candidate number of clusters.
from sklearn.metrics import fowlkes_mallows_score

for i in range(2, 7):
    # Fit KMeans with i clusters; this rebinds the module-level `kmeans`.
    kmeans = KMeans(n_clusters=i, random_state=123)
    kmeans.fit(iris_data)
    score = fowlkes_mallows_score(iris_target, kmeans.labels_)
    print(i, score)


# 2. Calinski-Harabasz (CH) score: computed from the data and the cluster
#    labels only, so no target labels are needed.
# The metric's correct name is `calinski_harabasz_score`; the misspelled
# `calinski_harabaz_score` was deprecated in scikit-learn 0.20 and removed
# in 0.23, so importing it fails on current versions.
try:
    from sklearn.metrics import calinski_harabasz_score
except ImportError:
    # Very old scikit-learn releases only provide the misspelled name.
    from sklearn.metrics import calinski_harabaz_score as calinski_harabasz_score

for i in range(2, 7):
    # Fit KMeans with i clusters; this rebinds the module-level `kmeans`.
    kmeans = KMeans(n_clusters=i, random_state=123).fit(iris_data)
    score = calinski_harabasz_score(iris_data, kmeans.labels_)
    print(i, score)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值