# 龙珠计划-机器学习-day01-逻辑回归

• 优点：实现简单，易于理解和实现；计算代价不高，速度很快，存储资源低；
• 缺点：容易欠拟合，分类精度可能不高

# 1. 逻辑回归demo

!pip install seaborn
!pip install scikit-learn


• 首先，导入必需的工具包。
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression

• 之后 构造数据集
# Toy binary-classification dataset: six 2-D points.
# NOTE(review): "fearures" is a typo for "features"; kept as-is because
# every later cell references this exact name.
x_fearures = np.array([[-1, -2], [-2, -1], [-3, -2], [1, 3], [2, 1], [3, 2]])
# Class labels: the first three points are class 0, the last three class 1.
y_label = np.array([0, 0, 0, 1, 1, 1])

• 对数据集进行可视化
# Visualize the six training points, colored by their class label.
plt.figure()
plt.scatter(
    x_fearures[:, 0],
    x_fearures[:, 1],
    c=y_label,
    s=50,
    cmap='viridis',
)
plt.title('Dataset')
plt.show()

• 现在我们开始对数据集进行训练
# Fit a logistic-regression classifier on the toy dataset.
# LogisticRegression.fit returns the estimator itself, so chaining the
# constructor and fit is equivalent to the original two-step form.
lr_clf = LogisticRegression().fit(x_fearures, y_label)

• 训练完成之后,我们看一下模型得到的参数.

# Learned weight vector w (one coefficient per feature); shown as cell output.
lr_clf.coef_


# Learned bias (intercept) term w0; shown as cell output.
lr_clf.intercept_



# Re-plot the training data and overlay the learned decision boundary.
plt.figure()
plt.scatter(x_fearures[:, 0], x_fearures[:, 1], c=y_label, s=50, cmap='viridis')
plt.title('Dataset')

# Build a dense grid spanning the current axis limits.
grid_cols, grid_rows = 200, 100
x_lo, x_hi = plt.xlim()
y_lo, y_hi = plt.ylim()
xs = np.linspace(x_lo, x_hi, grid_cols)
ys = np.linspace(y_lo, y_hi, grid_rows)
x_grid, y_grid = np.meshgrid(xs, ys)

# Predicted probability of class 1 at every grid point.
grid_points = np.c_[x_grid.ravel(), y_grid.ravel()]
z_prob = lr_clf.predict_proba(grid_points)[:, 1].reshape(x_grid.shape)

# The 0.5-probability contour line is the decision boundary.
plt.contour(x_grid, y_grid, z_prob, [0.5], linewidths=2, colors='blue')


• 新数据点分类
# Plot two new points against the training data and the decision boundary.
plt.figure()

## new point 1
x_fearures_new1 = np.array([[0, -1]])
plt.scatter(x_fearures_new1[:, 0], x_fearures_new1[:, 1], s=50, cmap='viridis')
# FIX: the `s=` keyword of plt.annotate was deprecated in matplotlib 3.3 and
# removed in 3.5; the annotation text is now the first positional argument.
plt.annotate('New point 1', xy=(0, -1), xytext=(-2, 0), color='blue',
             arrowprops=dict(arrowstyle='-|>', connectionstyle='arc3', color='red'))

## new point 2
x_fearures_new2 = np.array([[1, 2]])
plt.scatter(x_fearures_new2[:, 0], x_fearures_new2[:, 1], s=50, cmap='viridis')
plt.annotate('New point 2', xy=(1, 2), xytext=(-1.5, 2.5), color='red',
             arrowprops=dict(arrowstyle='-|>', connectionstyle='arc3', color='red'))

## training samples
plt.scatter(x_fearures[:, 0], x_fearures[:, 1], c=y_label, s=50, cmap='viridis')
plt.title('Dataset')

# Overlay the decision boundary (x_grid/y_grid/z_prob from the previous cell).
plt.contour(x_grid, y_grid, z_prob, [0.5], linewidths=2., colors='blue')

plt.show()


# 2. 基于鸢尾花数据集的实战

## 2.1 数据集


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# FIX: `data` was referenced but never defined — this is the missing cell that
# loads the iris dataset (the next cell uses `iris_features`/`iris_target`).
from sklearn.datasets import load_iris

data = load_iris()
# Integer class labels (0, 1, 2 — one per iris species).
iris_target = data.target
# Wrap the 4-column numeric feature matrix in a DataFrame with named columns.
iris_features = pd.DataFrame(data=data.data, columns=data.feature_names)


from sklearn.model_selection import train_test_split

# Binary classification: keep only the first 100 samples (in the iris
# dataset these are the 50 samples each of classes 0 and 1).
iris_features_part = iris_features.head(100)
iris_target_part = iris_target[:100]

# Hold out 20% as a test split; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    iris_features_part,
    iris_target_part,
    test_size=0.2,
    random_state=20210306,
)



sklearn还提供了 train_test_split 函数用于对数据集进行切分。如上所示，我们只取数据集的前100个样本用于二分类。

## 2.2 进行训练

from sklearn.linear_model import LogisticRegression

# Train a logistic-regression model on the 80-sample training split.
# fit() returns the estimator, so the chained form is equivalent.
clf = LogisticRegression(solver='lbfgs', random_state=20210306).fit(x_train, y_train)



# Learned weight coefficients (one per feature); shown as cell output.
clf.coef_



# Learned intercept term; shown as cell output.
clf.intercept_



## 2.3 模型预测与评估

• 模型预测
# Predicted class labels for the training and test splits.
train_predict = clf.predict(x_train)
test_predict = clf.predict(x_test)
print(f"train_predict:{train_predict} \n test_predict:{test_predict}")



• 这些预测结果究竟对不对呢，下面就需要对预测的结果进行评估
from sklearn import metrics

# Accuracy on the training split; shown as cell output.
metrics.accuracy_score(y_train,train_predict)

# Accuracy on the held-out test split; shown as cell output.
metrics.accuracy_score(y_test,test_predict)


# FIX: `confusion_matrix_result` was referenced but never computed.
# Rows = true labels, columns = predicted labels, which matches the
# xlabel/ylabel used below (seaborn heatmap plots rows on the y-axis).
confusion_matrix_result = metrics.confusion_matrix(y_test, test_predict)

plt.figure(figsize=(8,6))
# annot=True writes the count inside each heatmap cell.
sns.heatmap(confusion_matrix_result,annot=True,cmap='Blues')
plt.xlabel('Predict labels')
plt.ylabel('True labels')
plt.show()



• 优点：实现简单，易于理解和实现；计算代价不高，速度很快，存储资源低；
• 缺点：容易欠拟合，分类精度可能不高

08-13 48
08-08 540
05-10 1万+
09-29