This article is a Python implementation of Exercise 1 from Andrew Ng's Machine Learning course.
Introduction
Exercise 1 is about linear regression. We train models on a population (Population) vs. profit (Profit) dataset and on a housing dataset (size, num, price), then use them to make predictions. The datasets can be downloaded at the end of this article.
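Both parts fit the same kind of model: a linear hypothesis whose parameters $\theta$ are learned by batch gradient descent. For the single-variable case,

$$ h_\theta(x) = \theta_0 + \theta_1 x $$

and the multivariate case simply adds one term $\theta_j x_j$ per feature.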
Population-Profit Dataset
# import libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# read data
original_data = pd.read_csv('data/ex1data1.txt', header=None, names=["population","profit"])
original_data.head()
# outputs
# population profit
# 0 6.1101 17.5920
# 1 5.5277 9.1302
# 2 8.5186 13.6620
# 3 7.0032 11.8540
# 4 5.8598 6.8233
# plot data
original_data.plot(kind='scatter',x='population',y='profit',figsize=(20,10))
plt.show()
# define the cost function: half the mean squared error over the training set
# X, y, theta are matrices
def cost_function(X, y, theta):
    error = X * theta.T - y
    return np.sum(np.power(error, 2)) / (2 * len(X))
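For reference, what this computes is the squared-error cost used throughout the course, where $m$ = len(X) is the number of training examples:

$$ J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right)^2 $$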
# insert a column of ones so that theta_0 can act as the intercept (constant) term
original_data.insert(0,'ones',1)
X = original_data.iloc[:,:-1]
y = original_data.iloc[:, -1:]
X = np.matrix(X.values)
y = np.matrix(y.values)
# initialize theta to zeros; the squared-error cost is convex, so the starting point does not change the solution gradient descent converges to
theta = np.matrix(np.array([0,0]))
X.shape,y.shape,theta.shape
# outputs
# ((97, 2), (97, 1), (1, 2))
# define the gradient_descent function
# non-vectorized implementation
# alpha is the learning rate, iter the number of iterations
def gradient_descent(X, y, theta, alpha, iter):
    cost = np.zeros(iter)
    temp = np.matrix(np.zeros(theta.shape))
    for i in range(iter):
        y_prediction = X * theta.T
        error = y_prediction - y
        # update each parameter theta_j in turn, writing into temp
        # so that every update uses the same error term
        for j in range(theta.ravel().shape[1]):
            step = alpha * np.sum(np.multiply(X[:, j], error)) / len(X)
            temp[0, j] = theta[0, j] - step
        theta = temp
        cost[i] = cost_function(X, y, theta)
    return theta, cost
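Each pass of the inner loop performs the standard batch gradient-descent update for one parameter $\theta_j$, using the error computed before any parameter is changed:

$$ \theta_j := \theta_j - \frac{\alpha}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)} $$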
# vectorized implementation gets rid of the inner loop to improve performance
def gradient_descent(X, y, theta, alpha, iter):
    cost = np.zeros(iter)
    temp = np.matrix(np.zeros(theta.shape))
    for i in range(iter):
        y_prediction = X * theta.T
        error = y_prediction - y
        temp = theta - (alpha / len(X)) * error.T * X
        theta = temp
        cost[i] = cost_function(X, y, theta)
    return theta, cost
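In vector form, with theta stored as a row vector, the line `temp = theta - (alpha / len(X)) * error.T * X` computes the same update for all parameters at once:

$$ \theta := \theta - \frac{\alpha}{m} \left( X\theta^{T} - y \right)^{T} X $$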
# fit linear model
alpha = 0.01
iter = 50000
theta, cost = gradient_descent(X, y, theta, alpha, iter)
theta, cost
# outputs
# (matrix([[-3.89578088, 1.19303364]]),
#  array([6.73719046, 5.93159357, 5.90115471, ..., 4.47697138, 4.47697138, 4.47697138]))
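With the fitted parameters, prediction is just the hypothesis evaluated at a new input. A minimal sketch, assuming (as in the original exercise data) that population is measured in units of 10,000 people and profit in units of $10,000; the input 7.0 (a city of 70,000 people) is only illustrative:

# predict the profit for a city of 70,000 people (population feature = 7.0)
new_population = 7.0
predicted_profit = theta[0, 0] + theta[0, 1] * new_population
print(predicted_profit)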
# plot our model
x = np.linspace(original_data.population.min(), original_data.population.max(), 100)
f = theta[0, 0] + (theta[0, 1] * x)
fig, ax = plt.subplots(figsize=(20,10))
ax.plot(x, f, 'r', label='Prediction')
ax.scatter(original_data.population, original_data.profit, label='Training Data')
ax.legend(loc=2)
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs. Population Size')
plt.show()
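A quick sanity check before moving on is to plot the recorded cost against the iteration number; a flat tail means gradient descent has converged. This is a minimal sketch using the `cost` array and `iter` value from the run above:

# plot cost vs. iteration to verify convergence
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(np.arange(iter), cost, 'b')
ax.set_xlabel('Iteration')
ax.set_ylabel('Cost')
ax.set_title('Cost vs. Iteration')
plt.show()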
Housing Dataset
# import libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# read data
original_data2 = pd.read_csv('data/ex1data2.txt', header=None, names=["size","num","price"])
original_data2.head()
# outputs
#    size  num   price
# 0  2104    3  399900
# 1  1600    3  329900
# 2  2400    3  369000
# 3  1416    2  232000
# 4  3000    4  539900
# the columns have very different ranges, so the features need to be normalized.
# note that the next line normalizes every column, including the target price;
# for this exercise only the features X should be normalized, so the data is re-read below
original_data2 = (original_data2 - original_data2.mean()) / original_data2.std()
original_data2.insert(0, 'ones', 1)
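The normalization used here is the usual column-wise standardization: each column is shifted by its own mean and divided by its own standard deviation,

$$ x_j := \frac{x_j - \mu_j}{\sigma_j} $$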
# get params ready
original_data2 = pd.read_csv('data/ex1data2.txt', header=None, names=["size","num","price"])
X = original_data2.iloc[:, :-1]                    # features: size, num
X = (X - X.mean()) / X.std()                       # normalize the features column-wise
X.insert(0, 'ones', 1)                             # intercept column, added after normalization
X = np.matrix(X.values)
y = np.matrix(original_data2.iloc[:, -1:].values)  # target: price, left unscaled
theta = np.matrix(np.zeros(3))
def cost_function(X, y, theta):
    error = X * theta.T - y
    return np.sum(np.power(error, 2)) / (2 * len(X))
def gradient_descent(X, y, theta, alpha, iter):
    cost = np.zeros(iter)
    temp = np.matrix(np.zeros(theta.shape))
    for i in range(iter):
        y_prediction = X * theta.T
        error = y_prediction - y
        temp = theta - (alpha / len(X)) * error.T * X
        theta = temp
        cost[i] = cost_function(X, y, theta)
    return theta, cost
alpha = 0.01
iter = 5000
theta,cost = gradient_descent(X,y,theta,alpha,iter)
theta,cost
# outputs
# (matrix([[-126521.82261505, 140760.46376147, -126330.73512184]]),
#  array([6.20680077e+10, 5.87415374e+10, 5.56011097e+10, ..., 2.05772493e+09, 2.05772493e+09, 2.05772493e+09]))
When normalizing data, the training set and any data you later predict on must go through exactly the same normalization (i.e., reuse the training-set mean and standard deviation); otherwise the predictions will be inaccurate.
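A minimal sketch of what this means in practice: to price a new house, its features must be scaled with the training-set statistics before applying theta. It assumes original_data2 still holds the raw, unscaled data loaded above, and the house (1650 square feet, 3 bedrooms) is only an illustrative input:

# the new example must be scaled with the training-set statistics
mu = original_data2.iloc[:, :-1].mean()     # mean of size, num in the training data
sigma = original_data2.iloc[:, :-1].std()   # std of size, num in the training data
new_house = (np.array([1650, 3]) - mu.values) / sigma.values
new_house = np.matrix(np.insert(new_house, 0, 1))   # prepend the intercept term
predicted_price = (new_house * theta.T)[0, 0]
print(predicted_price)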
Dataset download
Link: https://pan.baidu.com/s/1zteJBsMJ0GRwqRb5opOgwg  Extraction code: 78ah