Python 机器学习 sklearn 库:从头到尾

一,简单示例

1.数据准备

from sklearn import datasets

from sklearn.model_selection import train_test_split

# Load the iris dataset as a Bunch (features, targets, names, metadata).
iris=datasets.load_iris()

2.将特征与标签分开

# Load features (x) and labels (y) directly as arrays.
x,y=datasets.load_iris(return_X_y=True)

# Hold out 30% of the samples as a test set (random split on every run).
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)

3.建立模型

from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default settings (n_neighbors=5,
# see the fitted-estimator repr printed below).
knn=KNeighborsClassifier()

4.训练

# Fit the classifier on the training split.
knn.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

5.测试

# Mean accuracy on the held-out test set.
acc=knn.score(x_test,y_test)

print(acc)

0.9333333333333333

二,线性回归与交叉验证

# Section 2: linear regression evaluated with 5-fold cross-validation.
from sklearn.datasets import fetch_california_housing

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet

from sklearn.model_selection import cross_val_score

# California housing: 8 numeric features, target = median house value.
x,y=fetch_california_housing(return_X_y=True)

lr=LinearRegression()

# cross_val_score returns negated MSE per fold; flip the sign of the mean
# to report an ordinary loss.
cv_scores=cross_val_score(lr,x,y,cv=5,scoring='neg_mean_squared_error')
loss=-cv_scores.mean()

# Target range first, then the cross-validated MSE.
print(y.min(),y.max())

print(loss)

0.14999 5.00001

0.5582901717686815

Lasso(L1正则化),Ridge(L2正则化),ElasticNet(L1和L2正则化都有)

# Lasso (L1), Ridge (L2) and ElasticNet (L1+L2) regularised regression,
# all scored with the same 5-fold CV MSE as the plain LinearRegression above.
def _cv_mse(model,x,y):
    """Return the mean 5-fold cross-validated MSE of *model* on (x, y)."""
    return -cross_val_score(model,x,y,cv=5,scoring='neg_mean_squared_error').mean()

lasso=Lasso(alpha=0.01)

loss1=_cv_mse(lasso,x,y)

ridge=Ridge(alpha=0.01)

loss2=_cv_mse(ridge,x,y)

elasticnet=ElasticNet(alpha=0.01)

loss3=_cv_mse(elasticnet,x,y)

print(loss1,loss2,loss3)

0.564023420597941 0.558290056170532 0.5603421342140549

三,逻辑回归与学习曲线

from sklearn.datasets import load_breast_cancer

from sklearn.linear_model import LogisticRegression as LR

from sklearn.model_selection import learning_curve

import matplotlib.pyplot as plt

# Binary classification data: breast-cancer diagnostics.
x,y=load_breast_cancer(return_X_y=True)

# L1- and L2-penalised logistic regression (liblinear supports both).
lrl1=LR(penalty='l1',solver='liblinear',C=1,max_iter=1000)

# NOTE(review): lrl2 is built but never used below — the article presumably
# intended to draw a second learning curve for it as well.
lrl2=LR(penalty='l2',solver='liblinear',C=1,max_iter=1000)

# Accuracy as a function of training-set size, over 5 CV folds.
train_size,train_acc,test_acc=learning_curve(lrl1,x,y,cv=5)

plt.plot(train_size,train_acc.mean(axis=1),label='train_acc')

plt.plot(train_size,test_acc.mean(axis=1),label='test_acc')

plt.legend()

四,手动调参与参数验证曲线

1,第一种方法

# Section 4, method 1: tune n_neighbors by scanning candidate values manually.
from sklearn.datasets import load_iris

from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score,validation_curve

import matplotlib.pyplot as plt

# Use the load_iris imported just above (the original called
# datasets.load_iris, relying on an import from an earlier section).
x,y=load_iris(return_X_y=True)

# Mean 5-fold CV accuracy for each k in 1..10.  The original transcript had
# lost the loop-body indentation; restored here.
acc=[]

for i in range(1,11):
    knn=KNeighborsClassifier(n_neighbors=i)
    acc.append(cross_val_score(knn,x,y,cv=5).mean())

plt.plot(range(1,11),acc,'o-')

2,第二种方法

# Section 4, method 2: validation_curve scans the hyper-parameter for us.
knn=KNeighborsClassifier()

# One column of scores per CV fold; average across folds before plotting.
k_range=range(1,11)
train_acc,test_acc=validation_curve(knn,x,y,param_name='n_neighbors',param_range=k_range,cv=5)

mean_train=train_acc.mean(axis=1)
mean_test=test_acc.mean(axis=1)

plt.plot(k_range,mean_train,'o-',label='train_acc')

plt.plot(k_range,mean_test,'o-',label='test_acc')

plt.legend()

五,数据预处理

from sklearn import datasets,preprocessing

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier

# Raw iris features and labels for the preprocessing experiments below.
x,y=datasets.load_iris(return_X_y=True)

1.标准化

# Standardise each feature to zero mean / unit variance, then evaluate KNN.
x=preprocessing.StandardScaler().fit_transform(x)

# Fixed random_state so the split is comparable across the sections below.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=13)

knn=KNeighborsClassifier().fit(x_train,y_train)

print(knn.score(x_test,y_test))

0.9333333333333333

2.归一化

# Min-max scale every feature into [0, 1], then evaluate KNN.
# Reload the raw features first: in the original, the scaler ran on the
# output of the previous StandardScaler step instead of on the raw data.
x,y=datasets.load_iris(return_X_y=True)
x=preprocessing.MinMaxScaler().fit_transform(x)

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=13)

knn=KNeighborsClassifier().fit(x_train,y_train)

print(knn.score(x_test,y_test))

0.9555555555555556

3.处理异常值

# RobustScaler centres on the median and scales by the IQR, so outliers have
# less influence.  Reload the raw features first: in the original this ran on
# the output of the previous scaling steps instead of on the raw data.
x,y=datasets.load_iris(return_X_y=True)
x=preprocessing.RobustScaler().fit_transform(x)

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=13)

knn=KNeighborsClassifier().fit(x_train,y_train)

print(knn.score(x_test,y_test))

0.9555555555555556

4.稀疏矩阵:按每列最大绝对值进行缩放(maxabs_scale,保留稀疏结构)

# maxabs_scale divides each feature by its maximum absolute value (useful for
# sparse data, since it preserves zeros).  Reload the raw features first: in
# the original this ran on already-scaled data from the previous steps.
x,y=datasets.load_iris(return_X_y=True)
x=preprocessing.maxabs_scale(x)

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=13)

knn=KNeighborsClassifier().fit(x_train,y_train)

print(knn.score(x_test,y_test))

0.9555555555555556

5. 处理缺失值

from sklearn.impute import SimpleImputer

# SimpleImputer replaces missing values (NaN by default) per column.
# Reload the raw features so the earlier scaling steps do not leak in; note
# that the iris data contains no missing values, so this is a no-op demo.
x,y=datasets.load_iris(return_X_y=True)
x=SimpleImputer().fit_transform(x)

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=13)

knn=KNeighborsClassifier().fit(x_train,y_train)

print(knn.score(x_test,y_test))

0.9555555555555556

六,降维

# Section 6: project the 4-D iris features onto their first 2 principal
# components and plot the three classes.
from sklearn.decomposition import PCA

from sklearn.datasets import load_iris

import matplotlib.pyplot as plt

# Use freshly loaded iris data (the original called datasets.load_iris and
# then transformed whatever x/y were left over from the preprocessing
# section, not the raw iris features).
iris=load_iris()
x,y=iris.data,iris.target

pca=PCA(2)

x=pca.fit_transform(x)

plt.figure()

# One colour per class, labelled with the species name.
plt.scatter(x[y==0,0],x[y==0,1],c='r',label=iris.target_names[0])

plt.scatter(x[y==1,0],x[y==1,1],c='b',label=iris.target_names[1])

plt.scatter(x[y==2,0],x[y==2,1],c='y',label=iris.target_names[2])

plt.legend()

plt.title('PCA of iris dataset')

七,SVM

from sklearn.svm import LinearSVC,SVC

from sklearn.model_selection import cross_val_score

from sklearn.datasets import make_blobs

import matplotlib.pyplot as plt

# Two well-separated Gaussian blobs as a toy binary dataset.
x,y=make_blobs(n_samples=50,centers=2,random_state=0,cluster_std=0.6)

# NOTE(review): LinearSVC/SVC and cross_val_score are imported but never
# used — the SVM section appears to be incomplete in the original article.
plt.scatter(x[:,0],x[:,1],c=y,s=50,cmap='rainbow')

八,朴素贝叶斯 先验概率 正态分布

from sklearn.naive_bayes import GaussianNB,BernoulliNB

from sklearn.model_selection import cross_val_score

from sklearn.datasets import load_digits

# 8x8 handwritten-digit images flattened to 64 features, 10 classes.
x,y=load_digits(return_X_y=True)

# Gaussian naive Bayes assumes normally distributed features.
print(cross_val_score( GaussianNB(),x,y,cv=5,scoring='accuracy').mean())

0.8069281956050759

# Bernoulli naive Bayes models binary/boolean features.
print(cross_val_score(BernoulliNB(),x,y,cv=5,scoring='accuracy').mean())

0.8241736304549674

九,聚类算法

# Section 9: k-means on synthetic blobs, compared against the ground-truth
# labelling via the silhouette score.
# make_blobs added to the import (the original relied on the SVM section's
# import); load_digits is kept although it is unused in this section.
from sklearn.datasets import load_digits,make_blobs

from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_score

import matplotlib.pyplot as plt

# 500 points in 2-D drawn from 4 Gaussian clusters.
x,y=make_blobs(n_samples=500,n_features=2,centers=4,random_state=22)

# Three panels: raw data, true clusters, k-means clusters.
fig,ax=plt.subplots(1,3,figsize=(12,4))

ax[0].scatter(x[:,0],x[:,1],s=8)

color=['r','green','b','orange']

# Loop-body indentation restored (lost in the original transcript).
for i in range(4):
    ax[1].scatter(x[y==i,0],x[y==i,1],s=8,c=color[i])

pred=KMeans(n_clusters=4,random_state=22).fit_predict(x)

# One call colours every point by its predicted cluster; the original drew
# the identical full scatter four times inside a loop.
ax[2].scatter(x[:,0],x[:,1],s=8,c=pred)

# Silhouette score of the true labelling, then of the k-means labelling.
print(silhouette_score(x,y))

print(silhouette_score(x,pred))

十,神经网络

from sklearn.datasets import fetch_california_housing

from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import cross_val_score

x,y=fetch_california_housing(return_X_y=True)

# Single hidden layer of 100 units; fixed seed for reproducible results.
NN=MLPRegressor(hidden_layer_sizes=(100,),random_state=22)

# Mean 5-fold cross-validated MSE (sign flipped from neg_mean_squared_error).
loss=-cross_val_score(NN,x,y,cv=5,scoring='neg_mean_squared_error').mean()

print(loss)

0.838450263979525

# Same evaluation with two hidden layers of 100 units each.
NN=MLPRegressor(hidden_layer_sizes=(100,100),random_state=22)

scores=cross_val_score(NN,x,y,cv=5,scoring='neg_mean_squared_error')
loss=-scores.mean()

print(loss)

3.4253744725964888

# One wider hidden layer: 150 units.
NN=MLPRegressor(hidden_layer_sizes=(150,),random_state=22)

loss=-cross_val_score(NN,x,y,cv=5,scoring='neg_mean_squared_error').mean()

print(loss)

1.372901714431039

# A narrower hidden layer: 50 units.
NN=MLPRegressor(hidden_layer_sizes=(50,),random_state=22)

loss=-cross_val_score(NN,x,y,cv=5,scoring='neg_mean_squared_error').mean()

print(loss)

0.6867883583636181

# Smallest network tried: 16 hidden units (best loss in this transcript).
NN=MLPRegressor(hidden_layer_sizes=(16,),random_state=22)

loss=-cross_val_score(NN,x,y,cv=5,scoring='neg_mean_squared_error').mean()

print(loss)

0.6464623806087815

十一,集成学习

from sklearn.datasets import load_wine

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier

from sklearn.tree import DecisionTreeClassifier

x,y=load_wine(return_X_y=True)

# NOTE: test_size=0.8 leaves only 20% of the data for training — presumably
# chosen to make the gap between a single tree and the ensembles visible.
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.8,random_state=0)

# Baseline single decision tree vs. a random forest, same seed.
dtc=DecisionTreeClassifier(random_state=22).fit(train_x,train_y)

rfc=RandomForestClassifier(random_state=22).fit(train_x,train_y)

print(dtc.score(test_x,test_y))

print(rfc.score(test_x,test_y))

0.8951048951048951

0.972027972027972

# Bagging (bootstrap aggregation) of decision trees.
bgc=BaggingClassifier(random_state=22).fit(train_x,train_y)

print(bgc.score(test_x,test_y))

# AdaBoost with a small learning rate.
adc= AdaBoostClassifier(learning_rate=0.1,random_state=22).fit(train_x,train_y)

print(adc.score(test_x,test_y))

0.9090909090909091

0.9370629370629371

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值