目录
一、线性回归
算法实现:正规方程法、梯度下降法
linearRegression.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author : Cabbage
# @project : ML
# @FileName: linearRegression.py
# @Blog : https://blog.csdn.net/lzbmc
'''
线性回归:单变量(一元)线性回归、多变量(多元)线性回归
下面代码是单变量线性回归。
多变量只需要在读取数据的时候x对应列增加,注意数据归一化,绘图需要降维
'''
import numpy as np
import matplotlib.pyplot as plt
from numpy import *
import pandas as pd
def loadDataSet(fileName='ex0.txt'):
    """Load a tab-separated, header-less data file into features and labels.

    Args:
        fileName: Path of the file to read. Defaults to 'ex0.txt', the file
            the original script always used.

    Returns:
        A tuple ``(xArr, yArr)`` where ``xArr`` holds the first two columns
        of the file (2-D array, one row per sample) and ``yArr`` holds the
        last column as a 1-D array.
    """
    data = pd.read_csv(fileName, header=None, delimiter='\t')  # file has no header row
    # Handy while exploring the data:
    #   print(data.head())   # first five rows
    #   print(data.corr())   # pairwise correlation between columns
    values = data.values
    # The normal-equation code treats column 0 as the bias term x0
    # (presumably all ones in ex0.txt — confirm against the data file).
    xArr = values[:, 0:2]
    yArr = values[:, -1]  # 1-D, shape (m,)
    return xArr, yArr
def analyzeData(x, y):
    """Scatter-plot column 1 of ``x`` against ``y``.

    Returns the ``plt`` module so the caller can add more artists to the
    same figure before showing it.
    """
    feature = x[:, 1]  # column 0 is skipped; only column 1 is plotted
    plt.scatter(feature, y, s=15)
    return plt
def plotLrEquation(weights, xArr, yArr):
    """Draw the data as a scatter plot and overlay the fitted line in red.

    ``xArr`` is copied before sorting, so the caller's array is untouched.
    NOTE(review): ``ordered * weights`` is only a matrix product when
    ``weights`` is an ``np.matrix`` — confirm callers always pass one.
    """
    canvas = analyzeData(xArr, yArr)
    ordered = xArr.copy()
    # Sorts every column independently along axis 0 so the line is drawn
    # left to right.
    ordered.sort(axis=0)
    fitted = ordered * weights
    canvas.plot(ordered[:, 1], fitted, c='r')
    canvas.show()
# 回归任务常用性能度量:均方误差。
def lr_normalEquation(x, y):
    """Fit linear-regression weights with the normal equation.

    Solves ``(X^T X) w = X^T y``, the condition under which the derivative
    of the squared-error loss is zero.

    Args:
        x: Design matrix of shape (m, n); ``np.matrix`` or array-like.
        y: Targets holding m values; a (1, m) matrix, an (m, 1) matrix or a
            1-D array are all accepted.

    Returns:
        An ``np.matrix`` of shape (n, 1) with the fitted weights, or ``None``
        (after printing a message) when ``X^T X`` is singular.
    """
    xMat = np.asmatrix(x, dtype=float)
    yCol = np.asmatrix(y, dtype=float).reshape(-1, 1)  # always (m, 1)
    xTx = xMat.T * xMat  # (n, m) * (m, n) = (n, n)
    # A rank test is used instead of ``det == 0``: round-off rarely produces
    # an exact zero determinant for a numerically singular matrix.
    if np.linalg.matrix_rank(xTx) < xTx.shape[0]:
        print('This matrix is singular, cannot do inverse')
        return None
    # Solving the linear system avoids forming the explicit inverse.
    ws = np.linalg.solve(xTx, xMat.T * yCol)  # (n, 1)
    return np.asmatrix(ws)
def lr_gradientDecent(x, y, epoch, alpha=0.01):
    """Fit linear-regression weights with batch gradient descent.

    Args:
        x: Design matrix of shape (m, n); ``np.matrix`` or array-like.
        y: Targets holding m values; a (1, m) matrix, an (m, 1) matrix or a
            1-D array are all accepted.
        epoch: Number of update steps to run.
        alpha: Learning rate. Defaults to 0.01, the value that used to be
            hard-coded.

    Returns:
        A tuple ``(weights, cost)``: ``weights`` is an ``np.matrix`` of
        shape (n, 1) and ``cost`` is a list with the value of
        ``cost_function`` after each update.
    """
    xMat = np.asmatrix(x)
    yCol = np.asmatrix(y).reshape(-1, 1)  # always (m, 1)
    m, n = xMat.shape
    # Start from all-ones weights; kept as a matrix so the return type is
    # the same even when epoch == 0.
    weights = np.asmatrix(np.ones((n, 1)))
    cost = []  # cost recorded after every iteration
    for _ in range(epoch):
        error = xMat * weights - yCol  # (m, 1)
        weights = weights - (alpha / m) * (xMat.T * error)
        # cost_function is given the targets as a (1, m) row matrix.
        cost.append(cost_function(xMat, yCol.T, weights))
    return weights, cost
# Cost function
def cost_function(x, y, weights):
    """Return the halved mean squared error of a linear model.

    Computes ``sum((x @ weights - y) ** 2) / (2 * m)`` where m is the number
    of rows in ``x``.

    Args:
        x: Design matrix of shape (m, n); ``np.matrix`` or array-like.
        y: Targets holding m values, in any orientation.
        weights: Model weights holding n values, in any orientation.

    Returns:
        The cost as a NumPy floating-point scalar.
    """
    features = np.asarray(x, dtype=float)
    targets = np.asarray(y, dtype=float).reshape(-1, 1)
    coeffs = np.asarray(weights, dtype=float).reshape(-1, 1)
    m = features.shape[0]  # number of samples
    residual = features.dot(coeffs) - targets  # (m, 1)
    # np.sum is spelled out so the result is a scalar regardless of whether
    # ``from numpy import *`` has shadowed the builtin ``sum``.
    return np.sum(np.square(residual)) / (2 * m)
# Relationship between the iteration count and the cost
def plot_epochCost(cost, epoch=1000):
    """Plot the recorded cost against the iteration index.

    Args:
        cost: Sequence of cost values, one per iteration.
        epoch: Maximum number of iterations to show. Only the first
            ``epoch`` entries of ``cost`` are drawn.
    """
    # The x axis is sized from the data actually plotted, so a ``cost`` list
    # shorter than ``epoch`` no longer causes a length mismatch in plt.plot.
    y = list(cost)[:epoch]
    x = range(len(y))
    plt.plot(x, y)
    plt.xlabel('epoch')
    plt.ylabel('cost')
    plt.show()
if __name__ == "__main__":
    dataArr, labels = loadDataSet()
    # np.asmatrix replaces the np.mat alias, which was removed in NumPy 2.0.
    xMat = np.asmatrix(dataArr)  # (m, n)
    yMat = np.asmatrix(labels)  # (1, m)

    # Uncomment to look at the raw data only:
    # plt = analyzeData(dataArr, labels)
    # plt.show()

    # Least squares via the normal equation.
    w = lr_normalEquation(xMat, yMat)
    print(w)
    # lr_normalEquation returns None for a singular system; there is then
    # nothing to draw.
    if w is not None:
        plotLrEquation(w, dataArr, labels)

    # Gradient-descent alternative:
    # epoch = 1000
    # w, cost = lr_gradientDecent(xMat, yMat, epoch)
    # plotLrEquation(w, dataArr, labels)
    # plot_epochCost(cost, epoch)
sklearn实现
linearRegression_sklearn.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# @Author : Cabbage
# @project : ML
# @FileName: linearRegression_sklearn.py
# @Blog : https://blog.csdn.net/lzbmc
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
def loadDataSet():
data = pd.read_csv('ex0.txt', header=None, delimiter='\t') # 没有表头
# print(data.head()) # 默认查看前五行,可以用来查看数据
# # 查看数据特征之间的相关系数
# r = data.corr() # 0-0.3:弱相关;0.3-0.6:中等程度相关;0.6-1:强相关
# print(r)
trainSet &#