1、首先将utils工具包直接导入项目
normalize.py
"""Normalize features"""
import numpy as np
def normalize(features):
features_normalized = np.copy(features).astype(float)
# 计算均值
features_mean = np.mean(features, 0)
# 计算标准差
features_deviation = np.std(features, 0)
# 标准化操作
if features.shape[0] > 1:
features_normalized -= features_mean
# 防止除以0
features_deviation[features_deviation == 0] = 1
features_normalized /= features_deviation
return features_normalized, features_mean, features_deviation
generate_sinusoids.py
import numpy as np
def generate_sinusoids(dataset, sinusoid_degree):
"""
sin(x).
"""
num_examples = dataset.shape[0]
sinusoids = np.empty((num_examples, 0))
for degree in range(1, sinusoid_degree + 1):
sinusoid_features = np.sin(degree * dataset)
sinusoids = np.concatenate((sinusoids, sinusoid_features), axis=1)
return sinusoids
generate_polynomials.py
"""Add polynomial features to the features set"""
import numpy as np
from .normalize import normalize
def generate_polynomials(dataset, polynomial_degree, normalize_data=False):
"""变换方法:
x1, x2, x1^2, x2^2, x1*x2, x1*x2^2, etc.
"""
features_split = np.array_split(dataset, 2, axis=1)
dataset_1 = features_split[0]
dataset_2 = features_split[1]
(num_examples_1, num_features_1) = dataset_1.shape
(num_examples_2, num_features_2) = dataset_2.shape
if num_examples_1 != num_examples_2:
raise ValueError('Can not generate polynomials for two sets with different number of rows')
if num_features_1 == 0 and num_features_2 == 0:
raise ValueError('Can not generate polynomials for two sets with no columns')
if num_features_1 == 0:
dataset_1 = dataset_2
elif num_features_2 == 0:
dataset_2 = dataset_1
num_features = num_features_1 if num_features_1 < num_features_2 else num_features_2
dataset_1 = dataset_1[:, :num_features]
dataset_2 = dataset_2[:, :num_features]
polynomials = np.empty((num_examples_1, 0))
for i in range(1, polynomial_degree + 1):
for j in range(i + 1):
polynomial_feature = (dataset_1 ** (i - j)) * (dataset_2 ** j)
polynomials = np.concatenate((polynomials, polynomial_feature), axis=1)
if normalize_data:
polynomials = normalize(polynomials)[0]
return polynomials
然后就是重要的prepare_for_training.py
"""Prepares the dataset for training"""
import numpy as np
from .normalize import normalize
from .generate_sinusoids import generate_sinusoids
from .generate_polynomials import generate_polynomials
def prepare_for_training(data, polynomial_degree=0, sinusoid_degree=0, normalize_data=True):
# 计算样本总数
num_examples = data.shape[0]
data_processed = np.copy(data)
# 预处理
features_mean = 0
features_deviation = 0
data_normalized = data_processed
if normalize_data:
(
data_normalized,
features_mean,
features_deviation
) = normalize(data_processed)
data_processed = data_normalized
# 特征变换sinusoidal
if sinusoid_degree > 0:
sinusoids = generate_sinusoids(data_normalized, sinusoid_degree)
data_processed = np.concatenate((data_processed, sinusoids), axis=1)
# 特征变换polynomial
if polynomial_degree > 0:
polynomials = generate_polynomials(data_normalized, polynomial_degree, normalize_data)
data_processed = np.concatenate((data_processed, polynomials), axis=1)
# 加一列1
data_processed = np.hstack((np.ones((num_examples, 1)), data_processed))
return data_processed, features_mean, features_deviation
hypothesis包中的文件:
sigmoid.py
"""Sigmoid function"""
import numpy as np
def sigmoid(matrix):
"""Applies sigmoid function to NumPy matrix"""
return 1 / (1 + np.exp(-matrix))
sigmoid_gradient.py
"""Sigmoid gradient function"""
from .sigmoid import sigmoid
def sigmoid_gradient(matrix):
"""Computes the gradient of the sigmoid function evaluated at z."""
return sigmoid(matrix) * (1 - sigmoid(matrix))
然后就是线性回归linear_regression.py
import numpy as np
from utils.features import prepare_for_training
class LinearRegression:
def __init__(self, data, labels, polynomial_degree = 0, sinusoid_degree = 0, normalize_data = True):
"""
1.对数据进行预处理操作
2.先得到所有的特征个数
3.初始化参数矩阵
"""
# 数据预处理
(data_processed,
feature_mean,
feature_deviation) = prepare_for_training(data, polynomial_degree = 0, sinusoid_degree = 0, normalize_data = True)
self.data = data
self.labels = labels
self.feature_mean = feature_mean
self.feature_deviation = feature_deviation
self.polynomial_degree = polynomial_degree
self.sinusoid_degree = sinusoid_degree
self.normalize_data = normalize_data
num_features = self.data.shape[1]
self.theta = np.zeros((num_features, 1))
def train(self, alpha, num_iterations = 500):
"""
训练模块,执行梯度下降
"""
cost_history = self.gradient_descent(alpha, num_iterations)
return self.theta, cost_history
# 梯度下降
def gradient_descent(self, alpha, num_iterations):
"""
实际迭代模块,会迭代num_iterations次
"""
cost_history = []
for i in range(num_iterations):
self.gradient_step(alpha)
cost_history.append(self.cost_function(self.data, self.labels))
return cost_history
# 进行参数更新的步骤
def gradient_step(self, alpha):
"""
梯度下降参数更新计算方法,注意是矩阵运算
"""
# [0] 索引操作符用于访问元组的第一个元素,即行数。
num_examples = self.data.shape[0]
prediction = LinearRegression.hypothesis(self.data, self.theta)
delta = prediction - self.labels
theta = self.theta
theta = theta - alpha * (1/num_examples) * (np.dot(delta.T, self.data)).T
self.theta = theta
def cost_function(self, data, labels):
"""
损失计算方法
"""
# self.data.shape[0]样本的个数
num_examples = data.shape[0]
delta = LinearRegression.hypothesis(self.data, self.theta) - labels
cost = (1/2)*np.dot(delta.T, delta)
return cost[0][0]
@staticmethod
def hypothesis(data, theta):
"""
直接调用得到预测值
"""
predictions = np.dot(data, theta)
return predictions
def get_cost(self, data, labels):
data_processed = prepare_for_training(data,
self.polynomial_degree,
self.sinusoid_degree,
self.normalize_data
)[0]
return self.cost_function(data_processed, labels)
def predict(self, data):
"""
用训练好的参数模型,去预测得到回归值结果
"""
data_processed = prepare_for_training(data,
self.polynomial_degree,
self.sinusoid_degree,
self.normalize_data
)[0]
predictions = LinearRegression.hypothesis(data_processed, self.theta)
return predictions
最后是主函数:UnivariteLinearRegression.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from linear_regression import LinearRegression
data = pd.read_csv('../data/world-happiness-report-2017.csv')
# 得到训练和测试数据
train_data = data.sample(frac=0.8)
test_data = data.drop(train_data.index)
input_param_name = 'Economy..GDP.per.Capita.'
output_param_name = 'Happiness.Score'
x_train = train_data[[input_param_name]].values
y_train = train_data[[output_param_name]].values
x_test = test_data[input_param_name].values
y_test = test_data[output_param_name].values
plt.scatter(x_train, y_train, label = 'Train data')
plt.scatter(x_test, y_test, label = 'Test data')
plt.xlabel(input_param_name)
plt.ylabel(output_param_name)
plt.title('Happiness')
plt.legend()
plt.show()
num_iterations = 500
learning_rate = 0.01
linear_regression = LinearRegression(x_train, y_train)
(theta, cost_history) = linear_regression.train(learning_rate, num_iterations)
print('开始时的损失:', cost_history[0])
print('训练后的损失:', cost_history[-1])
plt.plot(range(num_iterations), cost_history)
plt.xlabel('Iter')
plt.ylabel('Cost')
plt.title('GD')
plt.show()
predictions_num = 100
x_predictions = np.linspace(x_train.min(), x_train.max(), predictions_num).reshape(predictions_num, 1)
y_predictions = linear_regression.predict(x_predictions)
plt.scatter(x_train, y_train, label = 'Train data')
plt.scatter(x_test, y_test, label = 'Test data')
plt.plot(x_predictions, y_predictions, 'r', label='Prediction')
plt.xlabel(input_param_name)
plt.ylabel(output_param_name)
plt.title('Happiness')
plt.legend()
plt.show()
最后的结果为什么会出现错误:
"D:\PycharmProjectsCode\Machine Learning\.venv\Scripts\python.exe" "D:\PycharmProjectsCode\Machine Learning\LinearRegression\UnivariteLinearRegression.py"
开始时的损失: 1802.075384630608
训练后的损失: 131.20906966096783
Traceback (most recent call last):
File "D:\PycharmProjectsCode\Machine Learning\LinearRegression\UnivariteLinearRegression.py", line 46, in <module>
y_predictions = linear_regression.predict(x_predictions)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\PycharmProjectsCode\Machine Learning\LinearRegression\linear_regression.py", line 97, in predict
predictions = LinearRegression.hypothesis(data_processed, self.theta)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "D:\PycharmProjectsCode\Machine Learning\LinearRegression\linear_regression.py", line 75, in hypothesis
predictions = np.dot(data, theta)
^^^^^^^^^^^^^^^^^^^
ValueError: shapes (100,2) and (1,1) not aligned: 2 (dim 1) != 1 (dim 0)
Process finished with exit code 1
我是跟着唐宇迪老师的视频讲解一步一步写的,唐老师没有出现这个错误,我找了好久也不知道为什么,请各位大佬帮忙解答一下