Linear Regression, Regularization, and Multiclass Linear Models
- Full code below:
import mglearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Linear regression on the wave dataset
x,y = mglearn.datasets.make_wave(n_samples=60)
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=42)
lr = LinearRegression().fit(x_train,y_train)
print("lr.coef_:{}".format(lr.coef_))
print("lr.intercept_:{}".format(lr.intercept_))
print("training set score:{:.5f}".format(lr.score(x_train,y_train)))
print("test set score:{:.2f}".format(lr.score(x_test,y_test)))
# Linear regression on the extended Boston housing dataset
x,y = mglearn.datasets.load_extended_boston()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
lr = LinearRegression().fit(x_train,y_train)
print("training set score:{:.2f}".format(lr.score(x_train,y_train)))
print("test set score:{:.2f}".format(lr.score(x_test,y_test)))
# Ridge regression
from sklearn.linear_model import Ridge
ridge = Ridge().fit(x_train,y_train)
print("train set score:{:.2f}".format(ridge.score(x_train,y_train)))
print("test set score:{:.2f}".format(ridge.score(x_test,y_test)))
ridge10 = Ridge(alpha=10).fit(x_train,y_train)
print("train set score:{:.2f}".format(ridge10.score(x_train,y_train)))
print("test set score:{:.2f}".format(ridge10.score(x_test,y_test)))
ridge01 = Ridge(alpha=0.1).fit(x_train,y_train)
print("train set score:{:.2f}".format(ridge01.score(x_train,y_train)))
print("test set score:{:.2f}".format(ridge01.score(x_test,y_test)))
plt.plot(ridge.coef_,'s',label='Ridge alpha=1')
plt.plot(ridge10.coef_,'^',label='Ridge alpha=10')
plt.plot(ridge01.coef_,'v',label='Ridge alpha=0.1')
plt.plot(lr.coef_,'o',label='LinearRegression')
plt.xlabel('coefficient index')
plt.ylabel('coefficient magnitude')
plt.hlines(0,0,len(lr.coef_))
plt.ylim(-25,25)
plt.legend()
plt.show()
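# The learning curve below (from mglearn) shows that with enough training
# data, ridge and plain linear regression reach similar performance.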
mglearn.plots.plot_ridge_n_samples()
plt.show()
# Lasso
from sklearn.linear_model import Lasso
lasso = Lasso().fit(x_train,y_train)
print("train set score:{:.2f}".format(lasso.score(x_train,y_train)))
print("test set score:{:.2f}".format(lasso.score(x_test,y_test)))
print('number of features used:{}'.format(np.sum(lasso.coef_ != 0)))
lasso001 = Lasso(alpha=0.01,max_iter=100000).fit(x_train,y_train)
print("train set score:{:.2f}".format(lasso001.score(x_train,y_train)))
print("test set score:{:.2f}".format(lasso001.score(x_test,y_test)))
print('number of features used:{}'.format(np.sum(lasso001.coef_ != 0)))
lasso00001 = Lasso(alpha=0.0001,max_iter=100000).fit(x_train,y_train)
print("train set score:{:.2f}".format(lasso00001.score(x_train,y_train)))
print("test set score:{:.2f}".format(lasso00001.score(x_test,y_test)))
print('number of features used:{}'.format(np.sum(lasso00001.coef_ != 0)))
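# At alpha=0.0001 the penalty is nearly gone: the model behaves much like
# unregularized LinearRegression and starts to overfit again.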
plt.plot(lasso.coef_,'s',label='lasso alpha=1')
plt.plot(lasso001.coef_,'^',label='lasso alpha=0.01')
plt.plot(lasso00001.coef_,'v',label='lasso alpha=0.0001')
plt.plot(ridge01.coef_,'o',label='Ridge alpha=0.1')
plt.xlabel('coefficient index')
plt.ylabel('coefficient magnitude')
plt.ylim(-25,25)
plt.legend(ncol=2,loc=(0,1.05))
plt.show()
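# Optional sketch (added, not in the original): rather than trying alpha
# values by hand, RidgeCV/LassoCV choose alpha by cross-validation.
from sklearn.linear_model import RidgeCV, LassoCV
ridge_cv = RidgeCV(alphas=[0.01,0.1,1,10,100]).fit(x_train,y_train)
print('ridge alpha chosen by CV:{}'.format(ridge_cv.alpha_))
lasso_cv = LassoCV(alphas=[0.0001,0.01,1],max_iter=100000).fit(x_train,y_train)
print('lasso alpha chosen by CV:{}'.format(lasso_cv.alpha_))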
# Linear models for classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
x,y = mglearn.datasets.make_forge()
fig,axes = plt.subplots(1,2,figsize=(10,3))
for model,ax in zip([LinearSVC(),LogisticRegression()],axes):
    clf = model.fit(x,y)
    mglearn.plots.plot_2d_separator(clf,x,fill=False,eps=0.5,ax=ax,alpha=.7)
    mglearn.discrete_scatter(x[:,0],x[:,1],y,ax=ax)
    ax.set_title('{}'.format(clf.__class__.__name__))
    ax.set_xlabel('feature 0')
    ax.set_ylabel('feature 1')
axes[0].legend()
plt.show()
mglearn.plots.plot_linear_svc_regularization()
plt.show()
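# In both LinearSVC and LogisticRegression the regularization strength is
# set by C: larger C means weaker regularization (the plot above varies C).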
# Logistic regression on the breast cancer dataset (L2 regularization by default)
# Note: recent scikit-learn versions may warn about convergence here;
# raising max_iter or scaling the features avoids the warning.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
x_train,x_test,y_train,y_test = train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=42)
logreg = LogisticRegression().fit(x_train,y_train)
print("training set score:{:.3f}".format(logreg.score(x_train,y_train)))
print("test set score:{:.3f}".format(logreg.score(x_test,y_test)))
logreg100 = LogisticRegression(C=100).fit(x_train,y_train)
print("training set score:{:.3f}".format(logreg100.score(x_train,y_train)))
print("test set score:{:.3f}".format(logreg100.score(x_test,y_test)))
logreg001 = LogisticRegression(C=0.01).fit(x_train,y_train)
print("training set score:{:.3f}".format(logreg001.score(x_train,y_train)))
print("test set score:{:.3f}".format(logreg001.score(x_test,y_test)))
plt.plot(logreg.coef_.T,'o',label='C=1')
plt.plot(logreg100.coef_.T,'^',label='C=100')
plt.plot(logreg001.coef_.T,'v',label='C=0.01')
plt.xticks(range(cancer.data.shape[1]),cancer.feature_names,rotation=90)
plt.hlines(0,0,cancer.data.shape[1])
plt.ylim(-5,5)
plt.xlabel("coefficient index")
plt.ylabel("coefficient magnitude")
plt.legend()
plt.show()
# L1 regularization
# Note: the default lbfgs solver in recent scikit-learn does not support
# the l1 penalty, so liblinear is requested explicitly.
for C,marker in zip([0.001,1,100],['o','^','v']):
    lr_l1 = LogisticRegression(C=C,penalty='l1',solver='liblinear').fit(x_train,y_train)
    print('train accuracy of l1 logreg with C={:.3f}:{:.2f}'.format(C,lr_l1.score(x_train,y_train)))
    print('test accuracy of l1 logreg with C={:.3f}:{:.2f}'.format(C,lr_l1.score(x_test,y_test)))
    plt.plot(lr_l1.coef_.T,marker,label='C={:.3f}'.format(C))
plt.xticks(range(cancer.data.shape[1]),cancer.feature_names,rotation=90)
plt.hlines(0,0,cancer.data.shape[1])
plt.ylim(-5,5)
plt.xlabel("coefficient index")
plt.ylabel("coefficient magnitude")
plt.legend(loc=3)
plt.show()
# Linear models for multiclass classification
from sklearn.datasets import make_blobs
from sklearn.svm import LinearSVC
x,y = make_blobs(random_state=42)
mglearn.discrete_scatter(x[:,0],x[:,1],y)
plt.xlabel('feature 0')
plt.ylabel('feature 1')
plt.legend(['class 0','class 1','class 2'])
plt.show()
linear_svm = LinearSVC().fit(x,y)
print('coefficient shape:',linear_svm.coef_.shape)
print('intercept shape:',linear_svm.intercept_.shape)
mglearn.discrete_scatter(x[:,0],x[:,1],y)
line = np.linspace(-15,15)
for coef,intercept,color in zip(linear_svm.coef_,linear_svm.intercept_,['b','r','g']):
    plt.plot(line,-(line*coef[0]+intercept)/coef[1],c=color)
plt.ylim(-10,15)
plt.xlim(-10,8)
plt.xlabel('feature 0')
plt.ylabel('feature 1')
plt.legend(['class 0','class 1','class 2','line class 0','line class 1','line class 2'],loc=(1.01,0.3))
plt.show()
mglearn.plots.plot_2d_classification(linear_svm,x,fill=True,alpha=.7)
mglearn.discrete_scatter(x[:,0],x[:,1],y)
line = np.linspace(-15,15)
for coef,intercept,color in zip(linear_svm.coef_,linear_svm.intercept_,['b','r','g']):
    plt.plot(line,-(line*coef[0]+intercept)/coef[1],c=color)
plt.legend(['class 0','class 1','class 2','line class 0','line class 1','line class 2'],loc=(1.01,0.3))
plt.xlabel('feature 0')
plt.ylabel('feature 1')
plt.show()
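- Supplementary sketch (added): the one-vs-rest prediction can be reproduced by hand. decision_function returns one score per class, and the class with the largest score wins:
print('decision function shape:',linear_svm.decision_function(x).shape)
print('argmax equals predict:',np.all(np.argmax(linear_svm.decision_function(x),axis=1) == linear_svm.predict(x)))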
Binary Classification, Regression Algorithms, the Wisconsin Breast Cancer and Boston Housing Datasets
import mglearn
import matplotlib.pyplot as plt
x,y = mglearn.datasets.make_forge()
mglearn.discrete_scatter(x[:,0],x[:,1],y)
plt.legend(['Class 0','Class 1'],loc = 4)
plt.xlabel('first feature')
plt.ylabel('second feature')
print('x.shape:{}'.format(x.shape))
plt.show()
# output: x.shape:(26, 2)
x,y = mglearn.datasets.make_wave(n_samples=40)
plt.plot(x,y,'o')
plt.ylim(-3,3)
plt.xlabel('feature')
plt.ylabel('target')
plt.show()
- Full program:
import mglearn
import matplotlib.pyplot as plt
import numpy as np
x,y = mglearn.datasets.make_forge()
mglearn.discrete_scatter(x[:,0],x[:,1],y)
plt.legend(['Class 0','Class 1'],loc = 4)
plt.xlabel('first feature')
plt.ylabel('second feature')
print('x.shape:{}'.format(x.shape))
plt.show()
x,y = mglearn.datasets.make_wave(n_samples=40)
plt.plot(x,y,'o')
plt.ylim(-3,3)
plt.xlabel('feature')
plt.ylabel('target')
plt.show()
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
print("cancer.keys():\n{}".format(cancer.keys()))
print('shape of cancer data:\n{}'.format(cancer.data.shape))
print('sample counts per class:\n{}'.format({n : v for n,v in zip(cancer.target_names,np.bincount(cancer.target))}))
print('feature names:\n{}'.format(cancer.feature_names))
from sklearn.datasets import load_boston   # note: removed in scikit-learn 1.2; requires an older version
boston = load_boston()
print('data shape:\n{}'.format(boston.data.shape))
x,y = mglearn.datasets.load_extended_boston()
print('x.shape:{}'.format(x.shape))
- Output:
x.shape:(26, 2)
cancer.keys():
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
shape of cancer data:
(569, 30)
sample counts per class:
{'malignant': 212, 'benign': 357}
feature names:
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
'mean smoothness' 'mean compactness' 'mean concavity'
'mean concave points' 'mean symmetry' 'mean fractal dimension'
'radius error' 'texture error' 'perimeter error' 'area error'
'smoothness error' 'compactness error' 'concavity error'
'concave points error' 'symmetry error' 'fractal dimension error'
'worst radius' 'worst texture' 'worst perimeter' 'worst area'
'worst smoothness' 'worst compactness' 'worst concavity'
'worst concave points' 'worst symmetry' 'worst fractal dimension']
data shape:
(506, 13)
x.shape:(506, 104)
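- The 104 columns of the extended Boston data are the 13 original features plus all degree-2 terms (13 squares and 78 pairwise products: 13 + 13 + 78 = 104). A rough reproduction (a sketch assuming mglearn scales the features before expanding them; it reuses the boston object loaded above, so it also needs scikit-learn < 1.2):
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
x_scaled = MinMaxScaler().fit_transform(boston.data)
x_poly = PolynomialFeatures(degree=2,include_bias=False).fit_transform(x_scaled)
print('x_poly.shape:{}'.format(x_poly.shape))   # (506, 104)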
K-Nearest Neighbors, Binary Classification, and Regression
- Full code below:
import matplotlib.pyplot as plt
import mglearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
x,y = mglearn.datasets.make_forge()
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state = 0)
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(x_train,y_train)
print("test set predictions:{}".format(clf.predict(x_test)))
print('test set accuracy:{:.3f}'.format(clf.score(x_test,y_test)))
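# Sketch (added): the k=3 prediction is a majority vote over the nearest
# training points, which kneighbors() exposes directly.
dist,ind = clf.kneighbors(x_test[:1])
print('labels of the 3 nearest neighbors:{}'.format(y_train[ind]))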
fig,axes = plt.subplots(1,3,figsize=(10,3))
for n_neighbors,ax in zip([1,3,9],axes):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(x,y)
    mglearn.plots.plot_2d_separator(clf,x,fill=True,eps=0.5,ax=ax,alpha=.4)
    mglearn.discrete_scatter(x[:,0],x[:,1],y,ax=ax)
    ax.set_title('{} neighbor(s)'.format(n_neighbors))
    ax.set_xlabel('feature 0')
    ax.set_ylabel('feature 1')
axes[0].legend(loc=3)
plt.show()
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
x_train,x_test,y_train,y_test = train_test_split(cancer.data,cancer.target,stratify=cancer.target,random_state=66)
training_accuracy = []
test_accuracy = []
neighbors_settings = range(1,11)
for n_neighbors in neighbors_settings:
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    clf.fit(x_train,y_train)
    training_accuracy.append(clf.score(x_train,y_train))
    test_accuracy.append(clf.score(x_test,y_test))
plt.plot(neighbors_settings,training_accuracy,label='training accuracy')
plt.plot(neighbors_settings,test_accuracy,label='test accuracy')
plt.ylabel('accuracy')
plt.xlabel('n_neighbors')
plt.legend()
plt.show()
from sklearn.neighbors import KNeighborsRegressor
x,y = mglearn.datasets.make_wave(n_samples=40)
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)
reg = KNeighborsRegressor(n_neighbors=3)
reg.fit(x_train,y_train)
print('test set predictions:{}'.format(reg.predict(x_test)))
print('test set r^2 :{:.2f}'.format(reg.score(x_test,y_test)))
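# Sketch (added): score() for regressors is the R^2 coefficient,
# R^2 = 1 - sum((y - y_pred)^2) / sum((y - mean(y))^2).
pred = reg.predict(x_test)
print('manual r^2:{:.2f}'.format(1 - np.sum((y_test - pred)**2)/np.sum((y_test - np.mean(y_test))**2)))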
fig,axes = plt.subplots(1,3,figsize=(15,4))
line = np.linspace(-3,3,1000).reshape(-1,1)
for n_neighbors,ax in zip([1,3,9],axes):
    reg = KNeighborsRegressor(n_neighbors=n_neighbors)
    reg.fit(x_train,y_train)
    ax.plot(line,reg.predict(line))
    ax.plot(x_train,y_train,'^',c=mglearn.cm2(0),markersize=8)
    ax.plot(x_test,y_test,'v',c=mglearn.cm2(1),markersize=8)
    ax.set_title('{} neighbor(s) train score:{:.2f} test score:{:.2f}'.format(n_neighbors,reg.score(x_train,y_train),reg.score(x_test,y_test)))
    ax.set_xlabel('feature')
    ax.set_ylabel('target')
axes[0].legend(['model predictions','training data/target','test data/target'],loc='best')
plt.show()
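- Supplementary sketch (added): with the default uniform weights, KNeighborsRegressor predicts the mean target of the k nearest training points; the last model from the loop above (n_neighbors=9) shows this:
dist,ind = reg.kneighbors(x_test[:1])
print('mean of neighbor targets:{:.2f}'.format(y_train[ind].mean()))
print('reg.predict:{:.2f}'.format(reg.predict(x_test[:1])[0]))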