一、鸢尾花数据集
鸢尾花(iris)数据集是一个经典数据集,在统计学习和机器学习领域都经常被用作示例。该数据集有3类(setosa、versicolor、virginica),每类各50个记录,共150条记录,每条记录有4项特征:花萼长度、花萼宽度、花瓣长度、花瓣宽度。
下面将利用前100条记录(setosa与versicolor)与前2项特征(花萼长度与花萼宽度)进行预测分类示例。
二、测试代码
分享给有需要的人,代码质量勿喷。
2.1 数据
#coding=utf-8
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
# ---------- 1 data --------- #
iris = load_iris()
# iris.feature_names: sepal length/width, petal length/width (cm)
# iris.target_names: setosa, versicolor, virginica (50 samples each)

# 1.1 Keep only the first 100 samples (classes 0 and 1) and the
#     first two features (sepal length, sepal width).
x = iris.data[:100, :2]
y = iris.target[:100]
sample_0 = x[y == 0]  # setosa rows
sample_1 = x[y == 1]  # versicolor rows

# Visualize the two classes in the sepal-length / sepal-width plane.
plt.scatter(sample_0[:, 0], sample_0[:, 1], marker='o', color='r', label='setosa')
plt.scatter(sample_1[:, 0], sample_1[:, 1], marker='*', color='b', label='versicolor')
plt.xlabel("x")
plt.ylabel("y")
plt.legend(loc=2)
plt.show()

# 1.2 Split: 80 training samples (40 from each class) and
#     20 test samples (rows 40-59, i.e. 10 from each class).
x_train = np.vstack([x[:40], x[60:100]])
y_train = np.concatenate([y[:40], y[60:100]])
x_test = x[40:60, :]
y_test = y[40:60]
2.2 回归模型
# ---------- 2 modeling --------- #
class Logistic_Regression():
    """Binary logistic-regression classifier trained with batch gradient descent.

    Note: the model has no bias/intercept term, so the decision boundary
    always passes through the origin of feature space.
    """

    def __init__(self):
        # Weight row vector, shape (1, num_features); initialized by train().
        self.w = None

    def sigmoid(self, z):
        """Logistic activation 1 / (1 + e^-z), computed overflow-safely."""
        # Clip z so np.exp(-z) cannot overflow for large-magnitude inputs.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def output(self, x):
        """Return predicted probabilities a = sigmoid(w . x^T), shape (1, n)."""
        z = np.dot(self.w, x.T)
        a = self.sigmoid(z)
        return a

    def compute_loss(self, x, y):
        """Return (mean cross-entropy loss, gradient dw) for samples x, labels y."""
        num_train = x.shape[0]
        a = self.output(x)
        # Clip probabilities away from 0/1 so np.log never returns -inf.
        a_safe = np.clip(a, 1e-12, 1 - 1e-12)
        loss = np.sum(-y * np.log(a_safe) - (1 - y) * np.log(1 - a_safe))
        loss /= num_train
        dw = np.dot((a - y), x) / num_train  # gradient of the loss w.r.t. w
        return loss, dw

    def train(self, x, y, learning_rate=0.01, num_iterations=100001):
        """Fit weights by gradient descent; return the per-iteration loss history."""
        num_train, num_features = x.shape
        self.w = 0.001 * np.random.randn(1, num_features)  # small random init
        loss = []
        for i in range(num_iterations):
            error_loss, dw = self.compute_loss(x, y)
            loss.append(error_loss)
            self.w -= dw * learning_rate  # gradient-descent update
            if i % 500 == 0:
                # (fixed typo: "setps" -> "steps")
                print('steps=[%d/%d], loss=%f' % (i, num_iterations, error_loss))
        return loss

    def predict(self, x):
        """Return hard 0/1 class labels (threshold 0.5), shape (1, n)."""
        a = self.output(x)
        y_pre = np.where(a >= 0.5, 1, 0)
        return y_pre
2.3 训练
# ---------- 3 train --------- #
lr = Logistic_Regression()
loss = lr.train(x_train, y_train)

# Loss curve over the gradient-descent iterations.
plt.plot(loss)
plt.show()
print('权重=', lr.w)

# Decision boundary: sigmoid(z) = 0.5 exactly when z = w1*x1 + w2*x2 = 0,
# hence x2 = -(w1/w2) * x1 (the model carries no bias term).
x1 = np.arange(4, 7.5, 0.05)
x2 = (x1 * -lr.w[0][0]) / lr.w[0][1]

plt.scatter(sample_0[:, 0], sample_0[:, 1], marker='o', color='r', label='setosa')
plt.scatter(sample_1[:, 0], sample_1[:, 1], marker='*', color='b', label='versicolor')
plt.xlabel("x")
plt.ylabel("y")
plt.legend(loc=2)
plt.plot(x1, x2, '-', color='black')
plt.show()
2.4 预测
# ---------- 4 predict --------- #
# Classify the 20 held-out samples and compare against the true labels.
prediction = lr.predict(x_test)
print("预测结果=", prediction)
print("真实结果=", y_test)
num_test = x_test.shape[0]
accuracy = np.sum(prediction == y_test) / num_test
print("预测准确率=", accuracy)