ex1
Dataset: https://www.kesci.com/home/project/5da16a37037db3002d441810/dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# https://www.kesci.com/home/project/5da16a37037db3002d441810/code
# Andrew Ng's Machine Learning: univariate linear regression via gradient descent, in Python
path = r'C:\Users\shinelon\Desktop\MLDataset\ex1\ex1data1.txt'
# Plot the data
data = pd.read_csv(path,header=None, names=['Population', 'Profit'])
print(data.head())  # inspect the first few rows
data.plot(kind='scatter', x='Population', y='Profit', figsize=(12,8))
plt.show()
Figure: scatter plot of the raw data
# Cost function for univariate linear regression
def computeCost(X, y, theta):
    # theta.T is the transpose of theta; np.power raises element-wise
    inner = np.power(((X * theta.T) - y), 2)
    return np.sum(inner) / (2 * len(X))
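For reference, this is the cost being minimized:

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)^2,\qquad h_\theta(x) = \theta^T x$$

where m = len(X) is the number of training examples.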
# Insert a column named 'Ones' (all values 1) at position 0, for the intercept term
data.insert(0, 'Ones', 1)
# Initialize X and y
cols = data.shape[1]
x = data.iloc[:,:-1]   # x: all rows, every column except the last
y = data.iloc[:,cols-1:cols]   # y: the last column
x = np.asmatrix(x.values)   # convert x to a matrix to simplify the computations
y = np.asmatrix(y.values)
theta = np.asmatrix(np.array([0,0]))
print("computeCost(x,y,theta)= ", computeCost(x,y,theta))
# Gradient descent for univariate linear regression
# alpha is the learning rate, iters the number of iterations to run
def gradientDescent(x, y, theta, alpha, iters):
    # zero matrix to hold the updated parameters
    temp = np.asmatrix(np.zeros(theta.shape))
    parameters = int(theta.ravel().shape[1])
    cost = np.zeros(iters)
    for i in range(iters):
        error = (x * theta.T) - y
        for j in range(parameters):
            term = np.multiply(error, x[:,j])
            temp[0,j] = theta[0,j] - ((alpha / len(x)) * np.sum(term))
        theta = temp
        cost[i] = computeCost(x, y, theta)
    return theta, cost
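Each iteration applies the batch update rule to all parameters simultaneously:

$$\theta_j := \theta_j - \frac{\alpha}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\, x_j^{(i)}$$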
# Set the hyperparameters
alpha = 0.01
iters = 1500
g, cost = gradientDescent(x, y, theta,alpha,iters)
print(g)
print(cost)
# Output: g = [[-3.88858772  1.19231101]]
# Output: cost = [6.73719046 5.93159357 5.90115471 ... 4.47697612 4.4769761  4.47697609]
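The inner loop over the parameters can also be vectorized away. A minimal sketch of an equivalent update (my own variant, not from the source notebook; it assumes the same matrix shapes as above: x is m x n, y is m x 1, theta is 1 x n):

def gradientDescentVec(x, y, theta, alpha, iters):
    cost = np.zeros(iters)
    for i in range(iters):
        error = (x * theta.T) - y                         # m x 1 residuals
        theta = theta - (alpha / len(x)) * (error.T * x)  # update every theta_j at once
        cost[i] = computeCost(x, y, theta)
    return theta, cost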
# Plot the cost over the iterations
fig, ax = plt.subplots(figsize=(10,8))
ax.plot(np.arange(iters), cost, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs. Training Epoch')
plt.show()
Figure: cost vs. iterations
# Predict snack-stand profit for city populations of 35,000 and 70,000
# g is the fitted θ (population is measured in units of 10,000)
predict1 = [1,3.5]*g.T
print("predict1:",predict1)
predict2 = [1,7]*g.T
print("predict2:",predict2)
# Output: predict1: [[0.28450083]]
# Output: predict2: [[4.45758937]]
# x spans the range of the Population column; generate 100 evenly spaced points
x = np.linspace(data.Population.min(), data.Population.max(), 100)
f = g[0, 0] + (g[0, 1] * x)
# Plot the fitted regression line together with the training data
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(data.Population, data.Profit, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()
# Done!
Figure: the fitted regression line over the training data
Multivariate linear regression
# Multivariate linear regression
path = r'C:\Users\shinelon\Desktop\MLDataset\ex1\ex1data2.txt'
# Read the data
data2 = pd.read_csv(path, header=None, names=['size','bedrooms','price'])
# Inspect the data
# print(data2.head())
# Feature normalization: subtract each column's mean and divide by its standard deviation
data2 = (data2 - data2.mean()) / data2.std()
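That is, each feature column is standardized:

$$x_j' = \frac{x_j - \mu_j}{\sigma_j}$$

where $\mu_j$ and $\sigma_j$ are the mean and standard deviation of column j. This keeps the features on comparable scales so gradient descent converges faster.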
# Add a column of ones
data2.insert(0,'Ones',1)
# Initialize x and y
cols2 = data2.shape[1]
x2 = data2.iloc[:,:-1]
y2 = data2.iloc[:,cols2-1:cols2]
# Convert to matrices
x2 = np.asmatrix(x2.values)
y2 = np.asmatrix(y2.values)
theta2 = np.asmatrix(np.array([0,0,0]))
g2,cost2 = gradientDescent(x2,y2,theta2,alpha,iters)
# g2=matrix([[-1.10898288e-16, 8.84042349e-01, -5.24551809e-02]])
Normal equation
# Normal equation
def normal(X, y):
    # np.linalg.inv(): matrix inverse
    # X.T@X is equivalent to X.T.dot(X): matrix multiplication
    theta = np.linalg.inv(X.T@X)@X.T@y
    return theta
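The normal equation solves the least-squares problem in closed form, with no learning rate and no iterations:

$$\theta = (X^T X)^{-1} X^T y$$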
# Applied to the ex1data1 matrices (x, y) from above, for comparison with gradient descent
final_theta2 = normal(x,y)
# final_theta2 = matrix([[-3.89578088],
# [ 1.19303364]])
ex2
Reference: https://www.kesci.com/home/project/5da1829c037db3002d445baa
Dataset download: https://www.kesci.com/home/project/5da1829c037db3002d445baa/dataset
Task:
ex2data1.txt: predict whether a student is admitted based on two exam scores. The first column is the first exam score and the second column is the second exam score.
ex2data2.txt: predict whether a microchip is accepted or rejected based on two test results. The first column is the first test score and the second column is the second test score.
See ex2.pdf in the dataset for the full problem statement.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Logistic regression  https://www.kesci.com/home/project/5da1829c037db3002d445baa
path = r'C:\Users\Administrator\Desktop\MLDataSet\ex2\ex2data1.txt'
# ex2data1 columns: exam score 1, exam score 2, admitted or not
data = pd.read_csv(path, header=None, names=('Exam 1','Exam 2','Admitted'))
# print(data)
# Split the data into admitted and not-admitted samples
# isin() filters rows: df[df[col].isin(values)]
positive = data[data['Admitted'].isin([1])]
negative = data[data['Admitted'].isin([0])]
# Scatter plot
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['Exam 1'],positive['Exam 2'],s=50,c='b',marker='o',label='Admitted')
ax.scatter(negative['Exam 1'],negative['Exam 2'],s=50,c='r',marker='x',label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()
Figure: scatter plot of admitted vs. not-admitted students
# Sigmoid function
# Formula: https://www.jianshu.com/p/506595ec4b58
# https://www.kesci.com/home/project/5da1829c037db3002d445baa
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
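The sigmoid squashes any real input into (0, 1), which lets us read the hypothesis as a probability:

$$g(z) = \frac{1}{1 + e^{-z}},\qquad h_\theta(x) = g(\theta^T x)$$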
# Plot the sigmoid function (optional)
# x1 = np.linspace(-100, 100, 100)
# y1 = sigmoid(x1)
# fig, ax = plt.subplots(figsize=(12,8))
# ax.plot(x1,y1,'r',label='Prediction')
# plt.show()
# Cost function and gradient
# Formula: cnblogs.com/qkloveslife/p/9851598.html
def cost(theta, X, y):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    # np.multiply is element-wise multiplication; np.log is the natural logarithm
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1-y), np.log(1 - sigmoid(X * theta.T)))
    return np.sum(first - second) / (len(X))
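This implements the logistic regression cost:

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\Big[-y^{(i)}\log\big(h_\theta(x^{(i)})\big) - (1-y^{(i)})\log\big(1-h_\theta(x^{(i)})\big)\Big]$$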
# Initialize the data
# Add a constant column of ones as the first column of data
data.insert(0,'Ones',1)
# Initialize X, y, θ from the columns of data
cols = data.shape[1]
X = data.iloc[:,:cols-1]
y = data.iloc[:,cols-1:]
# theta has three entries: one for the constant column and one for each exam score
theta = np.zeros(3)
# Convert the DataFrames to matrices
X = np.matrix(X.values)
y = np.matrix(y.values)
# theta stays a 1-D array: cost() converts it internally, and fmin_tnc below expects a 1-D x0
# Compute the cost with the initial θ
print('cost(theta, X, y)= ', cost(theta, X, y))
# Output: cost(theta, X, y) = 0.6931471805599453
# Compute the gradient of the cost
def gradient(theta, X, y):
    # convert types
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    # one entry per parameter in theta
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    # prediction error
    error = sigmoid(X * theta.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:,i])
        grad[i] = np.sum(term) / len(X)
    return grad
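The gradient has the same form as in linear regression, only with the sigmoid hypothesis:

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\, x_j^{(i)}$$

Note that gradient() only computes the gradient; it never performs a descent step. The update itself is left to the scipy optimizer below.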
# Use scipy's optimizer to find θ
import scipy.optimize as opt
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X,y))
print('result = ',result)
# Output: result = (array([-25.1613186 , 0.20623159, 0.20147149]), 36, 0)
# Cost at the optimal θ
print('cost at the optimal θ: ', cost(result[0], X, y))
# Output: cost at the optimal θ: 0.20349770158947464
# Plot the decision boundary
plotting_x1 = np.linspace(30,100,100)
plotting_h1 = ( - result[0][0] - result[0][1] * plotting_x1) / result[0][2]
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(plotting_x1, plotting_h1, 'y', label='Prediction')
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()
Figure: decision boundary over the scatter plot
From the figure, roughly nine data points fall on the wrong side of the decision boundary. Next we evaluate how well the θ found by the model actually classifies the data.
# Evaluate the logistic regression model
# Implement the hypothesis hθ(x)
def hfunc1(theta, X):
    # dot product of θ and x, passed through the sigmoid
    return sigmoid(np.dot(theta.T, X))
# Predict the admission probability for a student with exam scores 45 and 85
print('admission probability for exam scores 45 and 85: ', hfunc1(result[0], [1,45,85]))
# Output: admission probability for exam scores 45 and 85: 0.7762906232076073
# Define the prediction function
def predict(theta, X):
    probability = sigmoid(X * theta.T)
    # predict 1 (admitted) if the probability is at least 0.5, otherwise 0
    return [1 if x >= 0.5 else 0 for x in probability]
# Measure the prediction accuracy
# theta_min is the θ found by the optimizer
theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
# 1 for every correct prediction, 0 for every incorrect one
correct = [1 if (a == 1 and b == 1) or (a == 0 and b == 0) else 0 for (a, b) in zip(predictions, y)]
# Accuracy: the fraction of correctly classified samples
accuracy = 100 * sum(map(int, correct)) // len(correct)
print('accuracy = {0}%'.format(accuracy))
# The accuracy is 89%
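The list comprehension can be replaced by a one-line numpy computation. A sketch (assuming predictions and y as defined above):

acc = 100 * np.mean(np.array(predictions) == np.asarray(y).ravel())  # element-wise comparison of predicted and true labels
print('accuracy = {0}%'.format(acc))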
ex2-data2
In the second part of the exercise we add a regularization term to improve the logistic regression. Suppose you are the product manager of a factory: you have the results of two tests for a batch of microchips, and the test results determine whether each chip is accepted or rejected.
Regularization helps avoid overfitting the data.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Suppose you are a factory's product manager deciding, from two test results, whether each chip is accepted or rejected
# https://www.kesci.com/home/project/5da1829c037db3002d445baa/code
path = r'C:\Users\Administrator\Desktop\MLDataSet\ex2\ex2data2.txt'
data = pd.read_csv(path, header=None, names=['Test 1','Test 2','Accepted'])
positive = data[data['Accepted'].isin([1])]
negative = data[data['Accepted'].isin([0])]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['Test 1'], positive['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative['Test 1'], negative['Test 2'], s=50, c='r', marker='x', label='Not Accepted')
ax.legend()
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
plt.show()
Unlike the previous dataset, this one has no linear decision boundary: no straight line can separate the two classes. Logistic regression on the raw features can only produce a linear boundary, so it cannot be applied to this dataset directly.
A better approach is to create more features from each sample, so we map x1 and x2 to every polynomial term up to the sixth power
(27 new feature columns in total).
# Feature mapping
degree = 6
x1 = data['Test 1']
x2 = data['Test 2']
data.insert(3, 'Ones', 1)
# Add the polynomial feature columns
for i in range(1, degree+1):
    for j in range(0, i+1):
        data['F' + str(i-j) + str(j)] = np.power(x1, i-j) * np.power(x2, j)
# Drop the original Test 1 / Test 2 columns
# axis: 1 = columns, 0 = rows; inplace: True modifies data in place, False returns a new DataFrame
data.drop('Test 1', axis=1, inplace=True)
data.drop('Test 2', axis=1, inplace=True)
print(data)
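The mapping produces every monomial of x1 and x2 up to total degree six:

$$\text{mapFeature}(x_1, x_2) = \big(1,\; x_1,\; x_2,\; x_1^2,\; x_1 x_2,\; x_2^2,\; x_1^3,\; \ldots,\; x_1 x_2^5,\; x_2^6\big)$$

that is, 27 polynomial terms plus the column of ones.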
# Sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
We need to modify the cost function and the gradient to include the regularization term. The regularized formulas are shown below, followed by their implementations.
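The regularized cost adds a penalty on every parameter except the bias θ₀:

$$J(\theta) = \frac{1}{m}\sum_{i=1}^{m}\Big[-y^{(i)}\log\big(h_\theta(x^{(i)})\big) - (1-y^{(i)})\log\big(1-h_\theta(x^{(i)})\big)\Big] + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2$$

and the gradient becomes, for j ≥ 1 (θ₀ keeps its unregularized gradient):

$$\frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\big(h_\theta(x^{(i)}) - y^{(i)}\big)\, x_j^{(i)} + \frac{\lambda}{m}\theta_j$$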
# Regularized cost function (learningRate here is the regularization parameter λ)
def costReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
    second = np.multiply((1-y), np.log(1 - sigmoid(X * theta.T)))
    # penalty term; theta_0 (the first column) is excluded
    reg = (learningRate / (2 * len(X))) * np.sum(np.power(theta[:,1:theta.shape[1]], 2))
    return np.sum(first - second) / len(X) + reg
# Regularized gradient function
def gradientReg(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    parameters = int(theta.ravel().shape[1])
    grad = np.zeros(parameters)
    error = sigmoid(X * theta.T) - y
    for i in range(parameters):
        term = np.multiply(error, X[:,i])
        if i == 0:
            # theta_0 is not regularized
            grad[i] = np.sum(term) / len(X)
        else:
            grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * theta[:,i])
    return grad
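The parameter loop can again be vectorized. A minimal sketch of an equivalent computation (my own variant, not from the source notebook; it assumes theta converts to a 1 x n row matrix and that the optimizer wants a 1-D gradient back):

def gradientRegVec(theta, X, y, learningRate):
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    error = sigmoid(X * theta.T) - y        # m x 1 residuals
    grad = (error.T * X) / len(X)           # 1 x n unregularized gradient
    reg = (learningRate / len(X)) * theta   # 1 x n penalty term
    reg[0, 0] = 0                           # do not regularize theta_0
    return np.asarray(grad + reg).ravel()   # 1-D array for the optimizer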
Next, initialize the variables.
# Initialize X, y, θ
cols = data.shape[1]
X = data.iloc[:,1:cols]
y = data.iloc[:,0:1]
theta = np.zeros(cols-1)
# Type conversion
X = np.matrix(X.values)
y = np.matrix(y.values)
# set λ to 1
learningRate = 1
print('costReg(theta, X, y, learningRate) = ',costReg(theta, X, y, learningRate))
# costReg(theta, X, y, learningRate) = 0.6931471805599454
Use scipy's optimizer to compute θ.
# Use the optimizer to compute θ
import scipy.optimize as opt
result = opt.fmin_tnc(func=costReg, x0=theta, fprime=gradientReg, args=(X, y, learningRate))
print(result)
# The computed parameters:
# result = (array([ 1.27271027, 0.62529965, 1.18111686, -2.01987399, -0.9174319 ,
# -1.43166929, 0.12393227, -0.36553118, -0.35725403, -0.17516291,
# -1.45817009, -0.05098418, -0.61558554, -0.27469165, -1.19271298,
# -0.2421784 , -0.20603299, -0.04466178, -0.2777895 , -0.29539514,
# -0.45645981, -1.04319154, 0.02779373, -0.2924487 , 0.0155576 ,
# -0.32742405, -0.1438915 , -0.92467487]),
# 32,
# 1)
Verify the parameters.
# Verification
# Define the prediction function
def predict(theta, X):
    probability = sigmoid(X * theta.T)
    # predict 1 (accepted) if the probability is at least 0.5, otherwise 0
    return [1 if x >= 0.5 else 0 for x in probability]
theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
# Count the correct predictions
correct = [1 if (a == 1 and b == 1) or (a == 0 and b == 0) else 0 for (a, b) in zip(predictions, y)]
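The notebook stops after building correct; finishing the check the same way as for ex2data1 might look like this (a sketch):

accuracy = 100 * sum(map(int, correct)) // len(correct)  # percentage of correctly classified chips
print('accuracy = {0}%'.format(accuracy))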
Plot the decision boundary.
# Plot the decision boundary
def hfunc2(theta, x1, x2):
    temp = theta[0][0]
    place = 0
    for i in range(1, degree+1):
        for j in range(0, i+1):
            temp += np.power(x1, i-j) * np.power(x2, j) * theta[0][place+1]
            place += 1
    return temp
def find_decision_boundary(theta):
    t1 = np.linspace(-1, 1.5, 1000)
    t2 = np.linspace(-1, 1.5, 1000)
    coordinates = [(x, y) for x in t1 for y in t2]
    x_cord, y_cord = zip(*coordinates)
    h_val = pd.DataFrame({'x1': x_cord, 'x2': y_cord})
    h_val['hval'] = hfunc2(theta, h_val['x1'], h_val['x2'])
    # keep the grid points where hθ is approximately zero
    decision = h_val[np.abs(h_val['hval']) < 2 * 10**-3]
    return decision.x1, decision.x2
# fig, ax = plt.subplots(figsize=(12,8))
# ax.scatter(positive['Test 1'], positive['Test 2'], s=50, c='b', marker='o', label='Accepted')
# ax.scatter(negative['Test 1'], negative['Test 2'], s=50, c='r', marker='x', label='Rejected')
# ax.set_xlabel('Test 1 Score')
# ax.set_ylabel('Test 2 Score')
# x, y = find_decision_boundary(result)
# plt.scatter(x, y, c='y', s=10, label='Prediction')
# ax.legend()
# plt.show()
# Change λ and observe the decision boundary (rerun with learningRate2 = 0 or 100 for the plots below)
learningRate2 = 1
result3 = opt.fmin_tnc(func=costReg, x0=theta, fprime=gradientReg, args=(X, y, learningRate2))
fig1, ax1 = plt.subplots(figsize=(12,8))
ax1.scatter(positive['Test 1'], positive['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax1.scatter(negative['Test 1'], negative['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax1.set_xlabel('Test 1 Score')
ax1.set_ylabel('Test 2 Score')
x1, y1 = find_decision_boundary(result3)
plt.scatter(x1, y1, c='y', s=10, label='Prediction')
ax1.legend()
plt.show()
With the regularization parameter λ = 1:
With λ = 0, the model overfits:
With λ = 100, the model underfits:
The scipy.optimize optimizer
The code above uses scipy.optimize.fmin_tnc().
Common input parameters:
func: the objective function to minimize
x0: the initial guess
fprime: the gradient of func; if it is not supplied, func must return both the function value and the gradient, or approx_grad=True must be set
approx_grad: if True, the gradient is approximated numerically
args: a tuple of extra arguments passed to the objective function
Return values:
x: array, the solution of the optimization problem
nfeval: int, the number of function evaluations
Every call to the objective function during optimization counts as one function evaluation. A single iteration can involve several function evaluations, so this number is not the iteration count and is usually larger.
rc: int, the return code (the optimizer's termination status)
Example:
result = (array([ 1.27271027, 0.62529965, 1.18111686, -2.01987399, -0.9174319 ,
-1.43166929, 0.12393227, -0.36553118, -0.35725403, -0.17516291,
-1.45817009, -0.05098418, -0.61558554, -0.27469165, -1.19271298,
-0.2421784 , -0.20603299, -0.04466178, -0.2777895 , -0.29539514,
-0.45645981, -1.04319154, 0.02779373, -0.2924487 , 0.0155576 ,
-0.32742405, -0.1438915 , -0.92467487]),
32,
1)