机器学习练习1-线性回归

YukinoPon

已于 2022-03-05 15:47:24 修改

阅读量1k

点赞数 3

分类专栏： Machine Learning 文章标签：机器学习线性回归 python

于 2022-03-05 15:45:39 首次发布

本文链接：https://blog.csdn.net/m0_56049115/article/details/123287010

版权

Machine Learning 专栏收录该内容

9 篇文章 3 订阅

订阅专栏

本文基于Andrew_Ng的ML课程作业

1-Linear Regression with one variable with gradientDescent:根据城市人口数量，预测开小吃店的利润

导入库

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

函数：计算代价函数J(theta)

def computeCost(X,y,theta): # cost function J(theta)
    """Return the halved mean squared error of the linear model X*theta.T.

    The product is written with ``*``, so the arguments are presumably
    np.matrix objects (X: m x n, y: m x 1, theta: 1 x n) -- confirm against
    the caller before passing plain ndarrays.
    """
    residual=X*theta.T-y            # prediction minus target, one row per sample
    squared=np.power(residual,2)    # scalar exponent: every element is squared
    m=len(X)                        # number of rows, same as X.shape[0]
    return np.sum(squared)/(2*m)

函数：梯度下降法

def gradientDescent(X,y,theta,alpha,iters): # batch gradient descent
    """Fit linear-regression parameters by batch gradient descent.

    Parameters
    ----------
    X : design matrix (m x n); used with the ``*`` operator, so it is
        presumably an np.matrix -- confirm against the caller.
    y : target column (m x 1).
    theta : initial parameters as a 1 x n row. It is never modified in place.
    alpha : learning rate.
    iters : number of update steps.

    Returns
    -------
    (cost, theta) where cost[i] is J(theta) measured right after the i-th
    update and theta is the final 1 x n parameter row.
    """
    m=len(X)
    cost=np.zeros(iters)
    error=(X*theta.T)-y     # residual of the current theta
    for i in range(iters):
        # All parameters are updated at once: error.T*X is the 1 x n row of
        # sum(error * X[:,j]) that the per-column loop used to build.
        theta=theta-(alpha/m)*(error.T*X)
        # The residual of the new theta serves both the cost history and the
        # next gradient, so it is computed only once per iteration.
        error=(X*theta.T)-y
        cost[i]=np.sum(np.power(error,2))/(2*m)
    return cost,theta

函数：代价函数的3D曲面图

def visualCost3D(X, y): # 3D surface of the cost function
    """Plot J(theta0, theta1) as a 3D surface and show the figure.

    The cost is evaluated with computeCost on a fixed 200 x 200 grid:
    theta0 in [-10, 10] and theta1 in [-1, 4].
    """
    theta0 = np.linspace(-10, 10, num=200)
    theta1 = np.linspace(-1, 4, num=200)
    # jvals[i, j] is the cost at (theta0[i], theta1[j]).
    jvals = np.zeros((theta0.size, theta1.size))
    for i, t0 in enumerate(theta0):
        for j, t1 in enumerate(theta1):
            candidate = np.matrix(np.array([t0, t1]))
            jvals[i, j] = computeCost(X, y, candidate)

    fig = plt.figure()
    ax = Axes3D(fig, auto_add_to_figure=False)
    fig.add_axes(ax)
    # meshgrid repeats theta0 along each row and theta1 down each column.
    grid0, grid1 = np.meshgrid(theta0, theta1)
    # jvals is transposed to match that layout; otherwise the axes are swapped.
    ax.plot_surface(grid0, grid1, jvals.T, cmap='rainbow')
    ax.set_xlabel('theta0', c='grey')
    ax.set_ylabel('theta1', c='grey')
    ax.set_zlabel('Cost', c='grey')
    plt.show()

函数：代价函数的等高线图

def contour(g, X, y): # contour plot of the cost function
    """Draw contour lines of J(theta0, theta1) and mark the parameters in g.

    g is flattened with ``.A1`` and its first two entries are plotted as a
    single point. The cost is evaluated with computeCost on a fixed
    200 x 200 grid: theta0 in [-10, 10] and theta1 in [-1, 4].
    """
    fig, ax = plt.subplots()
    fitted = g.A1
    theta0 = np.linspace(-10, 10, num=200)
    theta1 = np.linspace(-1, 4, num=200)
    # jvals[i, j] is the cost at (theta0[i], theta1[j]).
    jvals = np.zeros((theta0.size, theta1.size))
    for i, t0 in enumerate(theta0):
        for j, t1 in enumerate(theta1):
            candidate = np.matrix(np.array([t0, t1]))
            jvals[i, j] = computeCost(X, y, candidate)

    grid0, grid1 = np.meshgrid(theta0, theta1)
    ax.scatter(fitted[0], fitted[1])
    # 50 log-spaced levels between 1e-2 and 1e2 set where the contour lines fall.
    levels = np.logspace(-2, 2, 50)
    ax.contour(grid0, grid1, jvals.T, levels)
    ax.set_title('Contour of Cost Function')
    plt.show()

主函数：

# Linear regression with one variable via gradient descent:
# predict a snack bar's profit from the population of its city.

# Load the training set; the file has no header row, so the columns are named here.
path='ex1data1.txt'
data=pd.read_csv(path,header=None,names=['Population','Profit'])
# Optional checks: data.head() shows the first five rows, and
# data.plot(kind='scatter',x='Population',y='Profit',figsize=(12,8)) plots the raw points.

# Prepend a column of ones (x_0) so the intercept is learned as theta_0.
data.insert(0,'Ones',1)
cols=data.shape[1]  # number of columns, including the new 'Ones'
# Features are every column except the last; the target is the last column.
# Both are converted to np.matrix because computeCost/gradientDescent use matrix products.
X=np.matrix(data.iloc[:,:cols-1].values)
y=np.matrix(data.iloc[:,cols-1:].values)

theta=np.matrix(np.array([0,0]))    # initial parameters
alpha=0.01  # learning rate
iters=1000  # number of gradient-descent steps
cost,g=gradientDescent(X,y,theta,alpha,iters)   # cost history and fitted parameters

# Evaluate the fitted line at 100 evenly spaced populations across the data range.
x=np.linspace(data['Population'].min(),data['Population'].max(),num=100)
f=g[0,1]*x+g[0,0]

# Fitted line on top of the training data.
fig,ax=plt.subplots(figsize=(9,6),dpi=128)
ax.scatter(data['Population'],data['Profit'],label='Training data')
ax.plot(x,f,'r',label='Prediction')
ax.legend(loc='upper left')
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs Population Size')
plt.show()

# Cost recorded after every iteration; it should keep decreasing because the problem is convex.
fig2,ax=plt.subplots(figsize=(9,6),dpi=128)
ax.plot(np.arange(iters),cost,'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title('Error vs Training Epoch')
plt.show()

visualCost3D(X, y)
contour(g,X,y)

预测结果

代价函数-迭代次数

代价函数-theta0-theta1的3D曲面图

代价函数-theta0-theta1的等高线图

2-Linear Regression with multiple variables with normalEquation:根据房屋大小和卧室数量，预测房屋售价

import numpy as np
import pandas as pd

def computeCost(X,y,theta): # cost function J(theta)
    """Return sum((X*theta.T - y)^2) / (2*m), with m the number of rows of X.

    The product uses ``*``, so the arguments are presumably np.matrix objects
    (X: m x n, y: m x 1, theta: 1 x n) -- confirm against the caller.
    """
    m=len(X)
    deviation=X*theta.T-y
    # A scalar exponent makes np.power square each element separately.
    return np.sum(np.power(deviation,2))/(2*m)

def normalEquation(X, y):   # normal equation / ordinary least squares
    """Return the least-squares parameters theta that minimise ||X@theta - y||.

    X is the m x n design matrix and y the m x 1 target column; the result
    is the n x 1 parameter column.

    The Moore-Penrose pseudo-inverse replaces inv(X.T@X)@X.T. The two agree
    when X has full column rank, but the pseudo-inverse also handles a
    singular X.T@X (e.g. duplicated or linearly dependent features), where
    the explicit inverse raises LinAlgError or returns unstable values. In
    that case the minimum-norm solution is returned.
    """
    theta = np.linalg.pinv(X)@y
    return theta

# Linear regression with multiple variables via the normal equation:
# predict a house's selling price from its size and number of bedrooms.

path='ex1data2.txt'
data=pd.read_csv(path,header=None,names=['Size','Bedrooms','Price'])
# Feature scaling (mean normalization): Size is roughly 1000 times larger than
# Bedrooms. The original note motivates this by faster gradient-descent convergence.
# Every column, including Price, is standardised to zero mean and unit standard deviation.
data=data.sub(data.mean()).div(data.std())

# Prepend a column of ones (x_0) so the intercept is learned as theta_0.
data.insert(0,'Ones',1)
cols=data.shape[1]
# Features are every column except the last; the target is the last column.
X=np.matrix(data.iloc[:,:cols-1].values)
y=np.matrix(data.iloc[:,cols-1:].values)

theta=normalEquation(X,y)
print(theta)

3-Linear Regression with one variable with LinearRegression in scikit-learn:根据城市人口数量，预测开小吃店的利润（直接用 scikit-learn 中的线性回归模型）

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Linear regression with one variable using scikit-learn's LinearRegression:
# predict a snack bar's profit from the population of its city.

path='ex1data1.txt'
data=pd.read_csv(path,header=None,names=['Population','Profit'])

data.insert(0,'Ones',1) # column of ones (x_0), kept so X has the same layout as in the other exercises
cols=data.shape[1]
X=data.iloc[:,0:cols-1] # every column except the last
y=data.iloc[:,cols-1:cols]  # last column
# Plain ndarrays are used instead of np.matrix: recent scikit-learn releases
# reject np.matrix input to fit()/predict() with a TypeError.
X=X.values
y=y.values

model=LinearRegression()
model.fit(X,y)
x=X[:,1]    # column 1 (0-based) of X is Population
f=model.predict(X).flatten()    # predictions come back as one column; flatten to 1-D for plotting
fig,ax=plt.subplots(figsize=(9,6),dpi=128) # fig is the whole figure, ax the axes drawn on
ax.plot(x,f,'r',label='Prediction') # fitted line
ax.scatter(data.Population,data.Profit,label='Training data')
ax.legend(loc='upper left')    # legend in the upper-left corner
ax.set_xlabel('Population')
ax.set_ylabel('Profit')
ax.set_title('Predicted Profit vs Population Size')
plt.show()