python模拟简单线性回归

根据已有的数值对 (x, y) 拟合一条直线 y = w0 + w1·x(简单线性回归模型);模型建立后,输入新的 x 值即可预测对应的 y 值。

 

代码如下:

from math import sqrt
from random import seed
from random import randrange
from csv import reader
from math import sqrt
from matplotlib import pyplot as plt
import numpy as np

# Compute the arithmetic mean
def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    Args:
        values: A sized collection of numbers.

    Returns:
        The sum of the values divided by their count.

    Raises:
        ZeroDivisionError: If ``values`` is empty.
    """
    count = float(len(values))
    total = sum(values)
    return total / count


# Compute the (un-normalized) variance
def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    The total is NOT divided by the number of values, so this is the
    variance multiplied by the sample count.

    Args:
        values: Iterable of numbers.
        mean: The value the deviations are measured from.

    Returns:
        The sum of ``(value - mean) ** 2`` over all values (0 when empty).
    """
    deviations = (value - mean for value in values)
    return sum(deviation ** 2 for deviation in deviations)


# Compute the (un-normalized) covariance
def convariance(x, mean_x, y, mean_y):
    """Return the sum of products of the deviations of ``x`` and ``y``.

    The total is NOT divided by the number of samples.

    Args:
        x: Sequence of x values; its length decides how many pairs are used.
        mean_x: Value the x deviations are measured from.
        y: Sequence of y values, indexed in parallel with ``x``.
        mean_y: Value the y deviations are measured from.

    Returns:
        The accumulated sum as a float (0.0 when ``x`` is empty).

    Raises:
        IndexError: If ``y`` has fewer elements than ``x``.
    """
    total = 0.0
    for index, x_value in enumerate(x):
        total += (x_value - mean_x) * (y[index] - mean_y)
    return total

# Compute the regression coefficients
def coefficients(dataset):
    """Estimate the intercept and slope of the line y = w0 + w1 * x.

    Args:
        dataset: Rows whose element 0 is x and element 1 is y.

    Returns:
        A tuple ``(w0, w1)``: intercept first, slope second.

    Raises:
        ZeroDivisionError: If the dataset is empty or every x is identical
            (the sum of squared x deviations is zero).
    """
    xs = []
    ys = []
    for row in dataset:
        xs.append(row[0])
        ys.append(row[1])

    mean_of_x = mean(xs)
    mean_of_y = mean(ys)

    # Slope is the ratio of the two un-normalized sums; the missing 1/n
    # factors would cancel anyway.
    slope = convariance(xs, mean_of_x, ys, mean_of_y) / variance(xs, mean_of_x)
    intercept = mean_of_y - slope * mean_of_x
    return intercept, slope


# Build a simple linear regression
def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict y for every row of ``test``.

    Side effect: the fitted slope and intercept are stored in the module
    globals ``w_k`` and ``w_b`` respectively.

    Args:
        train: Rows used to fit the coefficients (passed to ``coefficients``).
        test: Rows whose element 0 is the x value to predict for.

    Returns:
        A list with one predicted y per test row, in order.
    """
    global w_k, w_b

    intercept, slope = coefficients(train)  # coefficients from the training set

    # Publish the fitted line through module globals.
    w_k = slope
    w_b = intercept

    # Apply the model to each x in the test set.
    return [slope * row[0] + intercept for row in test]


# Compute the root-mean-square error (RMSE)
def rmse_metric(actual, predicted):
    """Return the root-mean-square error between two parallel sequences.

    Args:
        actual: Sequence of observed values; its length sets the pair count.
        predicted: Sequence of predicted values, indexed in parallel.

    Returns:
        The square root of the mean squared difference.

    Raises:
        ZeroDivisionError: If ``actual`` is empty.
        IndexError: If ``predicted`` is shorter than ``actual``.
    """
    squared_error_total = 0.0
    for position, observed in enumerate(actual):
        residual = predicted[position] - observed
        squared_error_total += residual ** 2
    return sqrt(squared_error_total / float(len(actual)))


# Prepare the data for, and coordinate, the evaluation of an algorithm
# NOTE: a later definition of evaluate_algorithm in this file reuses the same
# name, so once the whole module has run that later version is the one bound
# to ``evaluate_algorithm``.
def evaluate_algorithm(dataset, algorithm):
    """Evaluate ``algorithm`` on the same data it is trained on.

    Each row is copied with its last element replaced by ``None`` so the
    algorithm cannot see the target; every prediction is printed.

    Args:
        dataset: Rows whose last element is the target value.
        algorithm: Callable ``algorithm(train, test)`` returning predictions.

    Returns:
        The RMSE between the real targets and the predictions.
    """
    def _without_target(row):
        masked = list(row)
        masked[-1] = None
        return masked

    test_set = [_without_target(row) for row in dataset]
    predicted = algorithm(dataset, test_set)
    for value in predicted:
        print('%.3f\t' % value)

    actual = [row[-1] for row in dataset]
    return rmse_metric(actual, predicted)


# Load a CSV file
def load_csv(filename):
    """Read the data rows of a CSV file, skipping its first (header) line.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of strings as produced by ``csv.reader``.
        Blank lines are skipped. The list is empty when the file is empty;
        if opening or reading fails with an I/O error, the error is printed
        and whatever was read so far (possibly nothing) is returned.
    """
    dataset = list()
    try:
        # newline='' lets the csv module do its own line-ending handling.
        with open(filename, 'r', newline='') as csv_file:
            csv_reader = reader(csv_file)
            # Discard the header row; the None default keeps an empty file
            # from raising StopIteration.
            next(csv_reader, None)
            # Keep every non-empty row (blank lines parse as empty lists).
            dataset.extend(row for row in csv_reader if row)
    except IOError as err:
        print("file error :", str(err))
    return dataset


# Convert a string column to floats
def str_column_to_float(dataset, column):
    """Convert one column of every row from string to float, in place.

    Args:
        dataset: List of mutable rows; modified in place.
        column: Index of the column to convert; each cell must be a string.

    Returns:
        None.
    """
    for record in dataset:
        raw_text = record[column]
        # Surrounding whitespace is removed before the conversion.
        record[column] = float(raw_text.strip())


# Split the dataset into a training part and a test part
def train_test_split(dataset, percent):
    """Randomly split ``dataset`` into training and test rows.

    Rows are drawn without replacement using ``random.randrange``, so the
    result depends on the state of the ``random`` module (seed it for a
    reproducible split). The input list itself is not modified.

    Args:
        dataset: List of rows to split.
        percent: Fraction of the rows to put in the training set. Values
            above 1 put every row in the training set instead of failing.

    Returns:
        A tuple ``(train, test)``; ``test`` holds the rows not drawn.
    """
    train = list()
    train_size = percent * len(dataset)
    remaining = list(dataset)  # shallow copy so the caller's list is untouched
    # Stop when the pool is exhausted too: randrange(0) would raise otherwise.
    while len(train) < train_size and remaining:
        index = randrange(len(remaining))
        train.append(remaining.pop(index))
    return train, remaining


# Run the evaluation using separate training and test sets
def evaluate_algorithm(dataset, algorithm, split_percent, *args):
    """Split the data, run ``algorithm`` on the split, and score it by RMSE.

    Test rows are handed to the algorithm as copies whose last element is
    replaced by ``None`` so the target value is hidden.

    Args:
        dataset: Rows whose last element is the target value.
        algorithm: Callable ``algorithm(train, test, *args)`` returning one
            prediction per test row.
        split_percent: Fraction of rows passed to ``train_test_split`` as the
            training share.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        The RMSE between the test rows' real targets and the predictions.
    """
    train, test = train_test_split(dataset, split_percent)

    def _mask_target(row):
        hidden = list(row)
        hidden[-1] = None
        return hidden

    blind_rows = [_mask_target(row) for row in test]
    predictions = algorithm(train, blind_rows, *args)
    targets = [row[-1] for row in test]
    return rmse_metric(targets, predictions)

def visualization(dateset):
    """Plot the data points together with the fitted regression line.

    The line is drawn from the module globals ``w_k`` (slope) and ``w_b``
    (intercept), which ``simple_linear_regression`` sets; calling this before
    a model has been fitted raises ``NameError``.

    Args:
        dateset: Rows whose element 0 is x and element 1 is y. (The
            misspelled parameter name is kept so existing callers keep
            working.)

    Returns:
        None. Opens a matplotlib window via ``plt.show()``.
    """
    fig = plt.figure()
    # Split the figure into a 1x1 grid and select its first cell.
    ax1 = fig.add_subplot(1, 1, 1)

    # Read from the argument rather than the module-level ``dataset``, so the
    # function plots what it was actually given.
    x = [row[0] for row in dateset]
    y = [row[1] for row in dateset]
    ax1.plot(x, y, 'bs')  # data points as blue squares

    # Sample the fitted line across the observed x range.
    x_1 = np.linspace(min(x), max(x))
    y_1 = x_1 * w_k + w_b
    ax1.plot(x_1, y_1)
    plt.grid()
    plt.show()


# Fix the random seed so the random train/test split below is reproducible.
seed(2)

# Load the insurance data and prepare it for splitting.
filename = 'insurance.csv'
# load_csv returns an empty list when the file cannot be read; in that case
# dataset[0] below raises IndexError.
dataset = load_csv(filename)
print(dataset)
# Convert every column from string to float, in place.
for col in range(len(dataset[0])):
    str_column_to_float(dataset, col)

# Fraction of the rows to use as the training set.
percent = 0.6

# Fitting happens inside evaluate_algorithm; simple_linear_regression also
# stores the fitted slope/intercept in the globals that visualization reads.
rmse = evaluate_algorithm(dataset, simple_linear_regression, percent)
print('RMSE : %.3f' %  rmse)
visualization(dataset)
# dataset = [[1.2, 1.1], [2.4, 3.5], [4.1, 3.2], [3.4, 2.8], [5, 5.4]]
# x = [row[0] for row in dataset]
# y = [row[1] for row in dataset]
# mean_x, mean_y = mean(x), mean(y)       #获取均值
# var_x, var_y = variance(x, mean_x), variance(y, mean_y)
# convar = convariance(x, mean_x, y, mean_y)       #获取协方差
#
# print('x的统计特性:均值 = % .3f 方差 = %.3f' % (mean_x, var_x))
# print('y的统计特性:均值 = % .3f 方差 = %.3f' % (mean_y, var_y))
# print('协方差 = :%.3f' % convar)
#
# w0, w1 = coefficients(dataset)
# print('回归系数分别为: w0 = %.3f, w1 = %.3f' % (w0, w1))
#
# rmse = evaluate_algorithm(dataset, simple_linear_regression)
# print('RMSE : %.3f' % rmse)

 

代码中的 insurance.csv 使用的是相对路径,因此该文件需要放在运行脚本时的当前工作目录下(通常就是代码文件所在的目录)。文件的第一行会被当作表头跳过,其余每一行为一组 x, y 数值。

文件内容如下:

 

运行结果:

 

  • 3
    点赞
  • 22
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值