一、其中 LogisticRegression 是自己写的模块,该模块的代码见《波士顿房价预测》一文。
二、完整代码
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from LogisticRegression import gradDescent,cost_function,accuracy,feature_scalling
def load_data(path='./data/LogiReg_data.txt'):
    """Load the exam-score data set and return it in shuffled row order.

    The file is read as header-less CSV whose three columns are labelled
    ``exam1``, ``exam2`` and ``label``.

    Args:
        path: Location of the CSV file. Defaults to the path this script has
            always read, so existing ``load_data()`` calls are unaffected.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` holds every column except
        the last; ``y`` holds the last column as a 2-D column vector. Both are
        permuted with the same random row index, so features and labels stay
        aligned.
    """
    # ``names=`` supplies the column labels because the file has no header.
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``to_numpy()`` is
    # its replacement and likewise returns a plain ndarray without the
    # row/column labels.
    frame = pd.read_csv(path, names=['exam1', 'exam2', 'label'])
    data = frame.to_numpy()
    X = data[:, :-1]  # feature columns (all but the last)
    y = data[:, -1:]  # label column, kept 2-D via the slice
    print(X)
    print(X.shape)  # original author's note: 100*2 for the linked data file
    print(X.shape[0])  # number of samples (rows)
    # Random permutation of the ROW indices 0..m-1.
    shuffle_index = np.random.permutation(X.shape[0])
    print(shuffle_index)
    X = X[shuffle_index]
    print(X)
    y = y[shuffle_index]
    return X, y  # shuffled features and matching shuffled labels
def visualize_data(X, y):
    """Scatter-plot the first two feature columns, coloured by class label.

    Rows whose label equals 1 are drawn in blue with the legend entry
    "Admitted"; rows whose label equals 0 are drawn in red with the legend
    entry "Not Admitted". The figure is displayed with ``plt.show()``.

    Args:
        X: Feature array; column 0 is used as the x coordinate and column 1
            as the y coordinate.
        y: Label array, compared element-wise against 1 and 0.
    """
    # (label value, colour, legend text); positives are plotted first.
    classes = (
        (1, 'b', 'Admitted'),
        (0, 'r', 'Not Admitted'),
    )
    for value, colour, caption in classes:
        # First element of np.where(...) holds the row indices of the matches.
        rows = np.where(y == value)[0]
        plt.scatter(X[rows, 0], X[rows, 1], s=30, c=colour, marker='o', label=caption)
    plt.legend()  # show the Admitted / Not Admitted labels
    plt.show()
def visualize_cost(ite, cost):
    """Plot the cost value against the iteration number and show the figure.

    Args:
        ite: Number of iterations that were run. ``cost`` should contain this
            many entries so that the x and y data have the same length.
        cost: Sequence of cost values, one per iteration.
    """
    # One integer x coordinate per cost entry: 1, 2, ..., ite.
    # np.linspace(0, ite, ite) also produces ``ite`` points, but they are
    # spaced ite / (ite - 1) apart, so they do not coincide with the actual
    # iteration numbers.
    iterations = np.arange(1, ite + 1)
    plt.plot(iterations, cost, linewidth=1)
    plt.title('cost history', color='r')
    plt.xlabel('iterations')
    plt.ylabel('cost J')
    plt.show()
if __name__ == '__main__':
    # Step 1. Load the (row-shuffled) data set.
    X, y = load_data()

    # Step 2. Show the raw samples before any preprocessing.
    visualize_data(X, y)

    # Step 3. Scale the features and set up the optimisation.
    m, n = X.shape  # original author's note: 100*2
    X = feature_scalling(X)
    # alpha is presumably the learning rate and maxIt the iteration count;
    # confirm against gradDescent in the LogisticRegression module.
    alpha, b, maxIt = 0.1, 0.1, 10000
    W = np.random.randn(n, 1)  # one random initial weight per feature column

    # Step 4. Fit the parameters.
    W, b, cost_history = gradDescent(X, y, W, b, alpha, maxIt)

    # Step 5. Report the results between two banner lines.
    banner = "******************"
    print(banner)
    print(cost_history[:20])  # first 20 recorded cost values
    visualize_cost(maxIt, cost_history)
    score = accuracy(X, y, W, b)
    print("accuracys is : " + str(score))
    print("W:", W)
    print("b: ", b)
    print(banner)
三、数据
https://github.com/TolicWang/MachineLearningWithMe/blob/master/Lecture_02/data/LogiReg_data.txt
四、用sklearn来实现
代码:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from LogisticRegression import feature_scalling
from sklearn.linear_model import LogisticRegression
def load_data(path='./data/LogiReg_data.txt'):
    """Load the exam-score data set and return it in shuffled row order.

    The file is read as header-less CSV whose three columns are labelled
    ``exam1``, ``exam2`` and ``label``.

    Args:
        path: Location of the CSV file. Defaults to the path this script has
            always read, so existing ``load_data()`` calls are unaffected.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays. ``X`` holds every column except
        the last; ``y`` holds the last column as a 2-D column vector. Both are
        permuted with the same random row index, so features and labels stay
        aligned.
    """
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``to_numpy()`` is
    # its replacement and returns a plain ndarray without labels.
    frame = pd.read_csv(path, names=['exam1', 'exam2', 'label'])
    data = frame.to_numpy()
    X = data[:, :-1]  # feature columns (all but the last)
    y = data[:, -1:]  # label column, kept 2-D via the slice
    # Apply one random row permutation to both arrays.
    shuffle_index = np.random.permutation(X.shape[0])
    X = X[shuffle_index]
    y = y[shuffle_index]
    return X, y
def visualize_cost(ite, cost):
    """Plot the cost value against the iteration number and show the figure.

    Args:
        ite: Number of iterations that were run. ``cost`` should contain this
            many entries so that the x and y data have the same length.
        cost: Sequence of cost values, one per iteration.
    """
    # Integer x coordinates 1..ite; np.linspace(0, ite, ite) would space the
    # points ite / (ite - 1) apart and so miss the real iteration numbers.
    iterations = np.arange(1, ite + 1)
    plt.plot(iterations, cost, linewidth=1)
    plt.title('cost history', color='r')
    plt.xlabel('iterations')
    plt.ylabel('cost J')
    plt.show()
if __name__ == '__main__':
    # Load the shuffled data and scale the features with the project helper.
    X, y = load_data()
    X = feature_scalling(X)

    # load_data() returns the labels as a 2-D column (it slices with ``-1:``).
    # scikit-learn expects a 1-D target of shape (n_samples,) and emits a
    # DataConversionWarning for a column vector, so flatten it first.
    labels = np.ravel(y)

    # Fit scikit-learn's logistic regression with its default settings.
    lr = LogisticRegression()
    lr.fit(X, labels)

    print("******************")
    # score() reports the mean accuracy on the data it is given; here that is
    # the training data itself.
    print("accuracys is :", lr.score(X, labels))
    print("W:{},b:{}".format(lr.coef_, lr.intercept_))
    print("******************")