I. A simple example
1. Data preparation
from sklearn import datasets
from sklearn.model_selection import train_test_split
iris=datasets.load_iris()
2. Separate features and labels
x,y=datasets.load_iris(return_X_y=True)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3)
3. Build the model
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()
4. Train
knn.fit(x_train,y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
5. Test
acc=knn.score(x_test,y_test)
print(acc)
0.9333333333333333
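score() wraps predict() plus an accuracy computation; to inspect individual predictions on the held-out set, a minimal sketch:
# compare a few predicted labels against the ground truth
print(knn.predict(x_test[:5]))
print(y_test[:5])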
II. Linear regression and cross-validation
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.model_selection import cross_val_score
x,y=fetch_california_housing(return_X_y=True)
lr=LinearRegression()
loss=-cross_val_score(lr,x,y,cv=5,scoring='neg_mean_squared_error').mean()
print(y.min(),y.max())
print(loss)
0.14999 5.00001
0.5582901717686815
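Since the targets span roughly 0.15 to 5.0, the root of the MSE is easier to interpret; a small sketch:
import numpy as np
# RMSE is on the same scale as the targets (here roughly 0.75)
print(np.sqrt(loss))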
Lasso (L1 regularization), Ridge (L2 regularization), and ElasticNet (both L1 and L2 regularization):
lasso=Lasso(alpha=0.01)
loss1=-cross_val_score(lasso,x,y,cv=5,scoring='neg_mean_squared_error').mean()
ridge=Ridge(alpha=0.01)
loss2=-cross_val_score(ridge,x,y,cv=5,scoring='neg_mean_squared_error').mean()
elasticnet=ElasticNet(alpha=0.01)
loss3=-cross_val_score(elasticnet,x,y,cv=5,scoring='neg_mean_squared_error').mean()
print(loss1,loss2,loss3)
0.564023420597941 0.558290056170532 0.5603421342140549
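The penalty strength alpha matters more than the choice among the three models; a sketch sweeping alpha for Lasso (the values are illustrative):
for a in [0.001,0.01,0.1,1.0]:
    lasso=Lasso(alpha=a)
    # larger alpha shrinks coefficients harder and usually raises the CV loss here
    print(a,-cross_val_score(lasso,x,y,cv=5,scoring='neg_mean_squared_error').mean())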
III. Logistic regression and learning curves
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
x,y=load_breast_cancer(return_X_y=True)
lrl1=LR(penalty='l1',solver='liblinear',C=1,max_iter=1000)
lrl2=LR(penalty='l2',solver='liblinear',C=1,max_iter=1000)
train_size,train_acc,test_acc=learning_curve(lrl1,x,y,cv=5)
plt.plot(train_size,train_acc.mean(axis=1),label='train_acc')
plt.plot(train_size,test_acc.mean(axis=1),label='test_acc')
plt.legend()
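A quick way to see the practical difference between the two penalties is to count nonzero coefficients after fitting; a minimal sketch using the two models above:
lrl1.fit(x,y)
lrl2.fit(x,y)
# L1 drives many coefficients exactly to zero; L2 only shrinks them
print((lrl1.coef_!=0).sum(),(lrl2.coef_!=0).sum())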
IV. Manual hyperparameter tuning and validation curves
1. First method
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score,validation_curve
import matplotlib.pyplot as plt
x,y=load_iris(return_X_y=True)
acc=[]
for i in range(1,11):
    knn=KNeighborsClassifier(n_neighbors=i)
    acc.append(cross_val_score(knn,x,y,cv=5).mean())
plt.plot(range(1,11),acc,'o-')
2. Second method
knn=KNeighborsClassifier()
train_acc,test_acc=validation_curve(knn,x,y,param_name='n_neighbors',param_range=range(1,11),cv=5)
plt.plot(range(1,11),train_acc.mean(axis=1),'o-',label='train_acc')
plt.plot(range(1,11),test_acc.mean(axis=1),'o-',label='test_acc')
plt.legend()
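Both methods above tune a single parameter by hand; GridSearchCV automates the same search. A sketch on the same data:
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(KNeighborsClassifier(),{'n_neighbors':list(range(1,11))},cv=5)
grid.fit(x,y)
# best n_neighbors and its mean CV accuracy
print(grid.best_params_,grid.best_score_)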
V. Data preprocessing
from sklearn import datasets,preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
x,y=datasets.load_iris(return_X_y=True)
1. Standardization
x=preprocessing.StandardScaler().fit_transform(x)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=13)
knn=KNeighborsClassifier().fit(x_train,y_train)
print(knn.score(x_test,y_test))
0.9333333333333333
2. Min-max normalization
x=preprocessing.MinMaxScaler().fit_transform(x)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=13)
knn=KNeighborsClassifier().fit(x_train,y_train)
print(knn.score(x_test,y_test))
0.9555555555555556
3. Handling outliers
x=preprocessing.RobustScaler().fit_transform(x)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=13)
knn=KNeighborsClassifier().fit(x_train,y_train)
print(knn.score(x_test,y_test))
0.9555555555555556
4. Sparse matrices: divide by the maximum absolute value
x=preprocessing.maxabs_scale(x)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=13)
knn=KNeighborsClassifier().fit(x_train,y_train)
print(knn.score(x_test,y_test))
0.9555555555555556
5. Handling missing values
from sklearn.impute import SimpleImputer
x=SimpleImputer().fit_transform(x)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=13)
knn=KNeighborsClassifier().fit(x_train,y_train)
print(knn.score(x_test,y_test))
0.9555555555555556
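Note that each snippet above fits its scaler on the full dataset (and on the output of the previous step) before splitting, which leaks test-set statistics into training. A pipeline fits the scaler on the training part only; a minimal sketch:
from sklearn.pipeline import make_pipeline
x,y=datasets.load_iris(return_X_y=True)  # reload the raw, untransformed features
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=13)
# the scaler learns its statistics from x_train only and reuses them on x_test
model=make_pipeline(preprocessing.StandardScaler(),KNeighborsClassifier())
model.fit(x_train,y_train)
print(model.score(x_test,y_test))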
VI. Dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
iris=load_iris()
x,y=iris.data,iris.target
pca=PCA(2)
x=pca.fit_transform(x)
plt.figure()
plt.scatter(x[y==0,0],x[y==0,1],c='r',label=iris.target_names[0])
plt.scatter(x[y==1,0],x[y==1,1],c='b',label=iris.target_names[1])
plt.scatter(x[y==2,0],x[y==2,1],c='y',label=iris.target_names[2])
plt.legend()
plt.title('PCA of iris dataset')
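A quick check on how much information the two components keep:
# fraction of the original variance captured by each principal component
print(pca.explained_variance_ratio_)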
VII. SVM
from sklearn.svm import LinearSVC,SVC
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
x,y=make_blobs(n_samples=50,centers=2,random_state=0,cluster_std=0.6)
plt.scatter(x[:,0],x[:,1],c=y,s=50,cmap='rainbow')
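The imports above include the classifiers themselves; a minimal sketch (added here) fitting a linear and an RBF-kernel SVM on the same blobs:
# cross-validated accuracy of a linear vs. an RBF-kernel SVM
print(cross_val_score(LinearSVC(),x,y,cv=5).mean())
print(cross_val_score(SVC(kernel='rbf'),x,y,cv=5).mean())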
VIII. Naive Bayes: prior probabilities and the normal distribution
from sklearn.naive_bayes import GaussianNB,BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_digits
x,y=load_digits(return_X_y=True)
print(cross_val_score(GaussianNB(),x,y,cv=5,scoring='accuracy').mean())
0.8069281956050759
print(cross_val_score(BernoulliNB(),x,y,cv=5,scoring='accuracy').mean())
0.8241736304549674
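BernoulliNB binarizes each feature at a threshold (0 by default), so every nonzero pixel counts as "on". Since digits pixels range 0-16, a higher threshold is worth trying; a sketch (the threshold 7 is an arbitrary choice):
# treat only pixels brighter than 7 as 'on'
print(cross_val_score(BernoulliNB(binarize=7),x,y,cv=5,scoring='accuracy').mean())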
IX. Clustering algorithms
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
x,y=make_blobs(n_samples=500,n_features=2,centers=4,random_state=22)
fig,ax=plt.subplots(1,3,figsize=(12,4))
ax[0].scatter(x[:,0],x[:,1],s=8)
color=['r','green','b','orange']
for i in range(4):
    ax[1].scatter(x[y==i,0],x[y==i,1],s=8,c=color[i])
pred=KMeans(n_clusters=4,random_state=22).fit_predict(x)
# one scatter call, colored by the predicted cluster label
ax[2].scatter(x[:,0],x[:,1],s=8,c=pred)
print(silhouette_score(x,y))    # silhouette of the true labels
print(silhouette_score(x,pred)) # silhouette of the KMeans assignments
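When the number of clusters is unknown, the silhouette score can guide the choice of k; a sketch scanning a few values (the true number here is 4):
for k in range(2,7):
    labels=KMeans(n_clusters=k,random_state=22).fit_predict(x)
    # the k with the highest silhouette is the best candidate
    print(k,silhouette_score(x,labels))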
X. Neural networks
from sklearn.datasets import fetch_california_housing
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import cross_val_score
x,y=fetch_california_housing(return_X_y=True)
NN=MLPRegressor(hidden_layer_sizes=(100,),random_state=22)
loss=-cross_val_score(NN,x,y,cv=5,scoring='neg_mean_squared_error').mean()
print(loss)
0.838450263979525
NN=MLPRegressor(hidden_layer_sizes=(100,100),random_state=22)
loss=-cross_val_score(NN,x,y,cv=5,scoring='neg_mean_squared_error').mean()
print(loss)
3.4253744725964888
NN=MLPRegressor(hidden_layer_sizes=(150,),random_state=22)
loss=-cross_val_score(NN,x,y,cv=5,scoring='neg_mean_squared_error').mean()
print(loss)
1.372901714431039
NN=MLPRegressor(hidden_layer_sizes=(50,),random_state=22)
loss=-cross_val_score(NN,x,y,cv=5,scoring='neg_mean_squared_error').mean()
print(loss)
0.6867883583636181
NN=MLPRegressor(hidden_layer_sizes=(16,),random_state=22)
loss=-cross_val_score(NN,x,y,cv=5,scoring='neg_mean_squared_error').mean()
print(loss)
0.6464623806087815
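MLPs are sensitive to feature scale, and the raw California-housing features span very different ranges, which likely contributes to the unstable losses above. A sketch standardizing the inputs inside a pipeline:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# the scaler is fit per CV training fold; the MLP then trains on standardized inputs
NN=make_pipeline(StandardScaler(),MLPRegressor(hidden_layer_sizes=(16,),random_state=22))
loss=-cross_val_score(NN,x,y,cv=5,scoring='neg_mean_squared_error').mean()
print(loss)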
XI. Ensemble learning
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
x,y=load_wine(return_X_y=True)
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.8,random_state=0)
dtc=DecisionTreeClassifier(random_state=22).fit(train_x,train_y)
rfc=RandomForestClassifier(random_state=22).fit(train_x,train_y)
print(dtc.score(test_x,test_y))
print(rfc.score(test_x,test_y))
0.8951048951048951
0.972027972027972
bgc=BaggingClassifier(random_state=22).fit(train_x,train_y)
print(bgc.score(test_x,test_y))
adc= AdaBoostClassifier(learning_rate=0.1,random_state=22).fit(train_x,train_y)
print(adc.score(test_x,test_y))
0.9090909090909091
0.9370629370629371
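The single split above holds out 80% of the data, so the scores are noisy; a cross-validated comparison of the four models is more stable. A sketch on the full dataset:
from sklearn.model_selection import cross_val_score
for model in [DecisionTreeClassifier(random_state=22),RandomForestClassifier(random_state=22),
              BaggingClassifier(random_state=22),AdaBoostClassifier(learning_rate=0.1,random_state=22)]:
    # mean 5-fold accuracy for each ensemble vs. the single tree
    print(type(model).__name__,cross_val_score(model,x,y,cv=5).mean())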