#coding:utf-8
from numpy import *
import pandas as pd
# Initial coefficients: a 1x3 row matrix of zeros (bias, x1, x2).
# `asmatrix` is used because the `mat` alias was removed in NumPy 2.0.
theta = asmatrix([0, 0, 0])
alpha = 0.001  # gradient step size
# Dataset location
file_name = "D:\\python_text\\logist.csv"
# Load the dataset
def loadDataSet(file_name):
    """Read a CSV file and return the design matrix and the label column.

    The first two columns of the file are used as features and the third
    column as the dependent variable. The first row of the file is consumed
    by ``pd.read_csv`` as the header, so it does not appear in the data.

    Args:
        file_name: Path (or any source accepted by ``pd.read_csv``) of the
            CSV file.

    Returns:
        A tuple ``(_X_array, y_array)``: the features with a column of ones
        prepended (one row per sample, three columns), and the labels as an
        n-by-1 matrix.
    """
    # Parse the file once and slice it, instead of reading it twice.
    frame = pd.read_csv(file_name, usecols=[0, 1, 2])
    # `asmatrix` replaces the `mat` alias that NumPy 2.0 removed.
    X_array = asmatrix(frame.iloc[:, [0, 1]])
    y_array = asmatrix(frame.iloc[:, [2]])
    # Column of ones so the first coefficient acts as the intercept.
    _x0 = ones(X_array.shape[0])
    _X_array = c_[_x0, X_array]
    return _X_array, y_array
# Define the sigmoid function
def sigmoid(list_x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    Computed as ``exp(-log(1 + exp(-x)))`` through ``logaddexp``, which is
    mathematically the same expression but does not overflow (and therefore
    emits no RuntimeWarning) when ``x`` is a large negative number.

    Args:
        list_x: A number or NumPy array/matrix; it must support unary minus.

    Returns:
        The sigmoid of ``list_x``; array and matrix inputs keep their shape
        and container type.
    """
    return exp(-logaddexp(0, -list_x))
def J(sig_z, y):
    """Return the negative log-likelihood (cross-entropy) loss.

    Evaluates ``-(y.T * log(p) + (1 - y.T) * log(1 - p))`` with ``p = sig_z``.
    When both arguments are n-by-1 ``matrix`` objects, ``*`` is a matrix
    product and the result is a 1x1 matrix holding the summed loss.

    Args:
        sig_z: Predicted probabilities (sigmoid outputs).
        y: Labels, same shape as ``sig_z``.

    Returns:
        The loss, in whatever container the operands produce.
    """
    # Floor the log arguments at the smallest positive normal float. This
    # only changes arguments that would otherwise be 0 (or below that
    # floor), where log() gives -inf and 0 * -inf turns the loss into NaN.
    floor = finfo(float).tiny
    log_p = log(maximum(sig_z, floor))
    log_not_p = log(maximum(1 - sig_z, floor))
    return -(y.T * log_p + (1 - y.T) * log_not_p)  # Loss function
# Solve for the best regression coefficients
def gradAscent(dataMatIn, classLabels, theta, tol=0.001, max_iter=None):
    """Fit logistic-regression coefficients by batch gradient descent on J.

    Each step applies ``theta - alpha * (h - y).T * X`` using the module-level
    step size ``alpha``, and stops once the loss changes by less than ``tol``
    between two consecutive steps.

    Args:
        dataMatIn: Design matrix, one row per sample (np.matrix).
        classLabels: Column matrix of labels, one row per sample.
        theta: Initial coefficients as a row matrix.
        tol: Convergence threshold on the absolute change of the loss.
        max_iter: Optional cap on the number of update steps. ``None`` (the
            default) keeps iterating until the tolerance is met.

    Returns:
        The coefficient row matrix after the last update step.
    """
    hypothesis = sigmoid(dataMatIn * theta.T)
    loss = J(hypothesis, classLabels)  # calculate J(theta)
    steps = 0
    while max_iter is None or steps < max_iter:
        # Use the dataMatIn argument here; the original read the global
        # `data`, which only worked when the caller's matrix had that name.
        theta = theta - alpha * ((hypothesis - classLabels).T * dataMatIn)
        hypothesis = sigmoid(dataMatIn * theta.T)
        updated_loss = J(hypothesis, classLabels)  # J(theta) after update
        if abs(loss - updated_loss) < tol:
            return theta
        # Carry hypothesis and loss into the next step rather than
        # recomputing the same values at the top of the loop.
        loss = updated_loss
        steps += 1
    return theta
# Script section (runs at module level): load the CSV into the design matrix
# `data` (with its leading column of ones) and the label column `label`.
data,label=loadDataSet(file_name)
# Fit the coefficients starting from the initial `theta`; T_value holds the result.
T_value=gradAscent(data, label,theta)
# Plot the decision boundary
def plotBestFit(wei, dataMatrix, labelMat):
    """Scatter the samples by class and draw the fitted decision boundary.

    Args:
        wei: Coefficients (bias, w1, w2), e.g. the 1x3 matrix returned by
            gradAscent.
        dataMatrix: Design matrix; column 1 is plotted on the x axis and
            column 2 on the y axis (column 0 is not plotted).
        labelMat: One label per row of ``dataMatrix``. Labels whose integer
            value is 1 are drawn as red squares, all others as green dots.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    weights = asarray(wei).ravel()  # flat array: bias, w1, w2
    points = asarray(dataMatrix)  # plain ndarray for column indexing
    n = points.shape[0]
    # Flatten the labels first: calling int() on a 1x1 matrix row is a
    # deprecated array-to-scalar conversion in recent NumPy releases.
    labels = asarray(labelMat).ravel()[:n]
    is_positive = labels.astype(int) == 1

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(points[is_positive, 1], points[is_positive, 2],
               s=30, c='red', marker='s')
    ax.scatter(points[~is_positive, 1], points[~is_positive, 2],
               s=30, c="green")
    # Boundary is where bias + w1*x1 + w2*x2 = 0, solved for x2.
    # The x1 range is fixed at [-3, 3), independent of the data.
    x = arange(-3.0, 3.0, 0.1)
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel("x1")  # x-axis label
    plt.ylabel("x2")  # y-axis label
    plt.show()
# Script section: plot the samples and the decision boundary given by the fitted coefficients.
plotBestFit(T_value, data, label)
# Note: my loss function is derived from the maximum-likelihood function (it is the negative log-likelihood).