根据已有的数值对 (x, y) 拟合一元线性回归模型 y = w1·x + w0;模型建立后,输入新的 x 值即可预测对应的 y 值。
代码如下:
from math import sqrt
from random import seed
from random import randrange
from csv import reader
from math import sqrt
from matplotlib import pyplot as plt
import numpy as np
# Arithmetic mean
def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    Args:
        values: Sized collection of numbers (``len()`` and ``sum()`` are
            both applied to it).

    Returns:
        ``sum(values) / len(values)`` computed with float division.

    Raises:
        ZeroDivisionError: If ``values`` is empty.
    """
    return sum(values) / float(len(values))
# Sum of squared deviations (un-normalised variance)
def variance(values, mean):
    """Return the sum of squared deviations of ``values`` from ``mean``.

    Despite the name, the result is NOT divided by the number of samples.

    Args:
        values: Iterable of numbers.
        mean: The value to measure deviations from. Inside this function the
            parameter shadows the module-level ``mean`` function.

    Returns:
        ``sum((x - mean) ** 2 for x in values)``; ``0`` for empty input.
    """
    return sum((x - mean) ** 2 for x in values)
# Sum of co-deviations (un-normalised covariance)
def convariance(x, mean_x, y, mean_y):
    """Return the sum of products of deviations of ``x`` and ``y``.

    The result is NOT divided by the number of samples.

    Args:
        x: Sequence of numbers.
        mean_x: Value that deviations of ``x`` are measured from.
        y: Sequence of numbers, paired element-wise with ``x``.
        mean_y: Value that deviations of ``y`` are measured from.

    Returns:
        ``sum((x[i] - mean_x) * (y[i] - mean_y))`` as a float; ``0.0`` for
        empty input.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length.
    """
    if len(x) != len(y):
        # Fail loudly instead of dropping unpaired samples.
        raise ValueError(
            "x and y must have the same length (got %d and %d)" % (len(x), len(y))
        )
    # Start from 0.0 so that empty input still yields a float.
    return sum(((x_i - mean_x) * (y_i - mean_y) for x_i, y_i in zip(x, y)), 0.0)
# Regression coefficients (intercept and slope)
def coefficients(dataset):
    """Fit the line ``y = w1 * x + w0`` to ``dataset`` by least squares.

    Args:
        dataset: Sequence of rows; ``row[0]`` is used as x and ``row[1]``
            as y.

    Returns:
        Tuple ``(w0, w1)``: intercept and slope.

    Raises:
        ZeroDivisionError: If ``dataset`` is empty, or if every x value is
            identical (the denominator of the slope is then zero).
    """
    x = [row[0] for row in dataset]
    y = [row[1] for row in dataset]
    x_mean, y_mean = mean(x), mean(y)
    # Both terms are un-normalised sums, so the 1/n factors cancel in the ratio.
    w1 = convariance(x, x_mean, y, y_mean) / variance(x, x_mean)
    # The fitted line passes through the point of means.
    w0 = y_mean - w1 * x_mean
    return w0, w1
# Simple linear regression: fit on `train`, predict for `test`
def simple_linear_regression(train, test):
    """Fit a line on ``train`` and predict y for every row of ``test``.

    Args:
        train: Rows passed to ``coefficients`` to obtain the fitted line.
        test: Rows to predict for; only ``row[0]`` (x) is read.

    Returns:
        List of predicted y values, one per row of ``test``, in order.

    Side effects:
        Stores the fitted slope in the module global ``w_k`` and the
        intercept in ``w_b`` (``visualization`` reads both).
    """
    global w_k, w_b
    w0, w1 = coefficients(train)
    # Publish the fitted line for later plotting.
    w_k, w_b = w1, w0
    return [w1 * row[0] + w0 for row in test]
# Root-mean-square error (RMSE)
def rmse_metric(actual, predicted):
    """Return the root-mean-square error between two sequences.

    Args:
        actual: Sequence of observed values.
        predicted: Sequence of predicted values, paired element-wise with
            ``actual``.

    Returns:
        ``sqrt(mean((predicted[i] - actual[i]) ** 2))`` as a float.

    Raises:
        ValueError: If the two sequences differ in length.
        ZeroDivisionError: If ``actual`` is empty.
    """
    if len(actual) != len(predicted):
        # Fail loudly instead of scoring only part of the predictions.
        raise ValueError(
            "actual and predicted must have the same length (got %d and %d)"
            % (len(actual), len(predicted))
        )
    squared_error = sum(
        ((p - a) ** 2 for a, p in zip(actual, predicted)), 0.0
    )
    return sqrt(squared_error / float(len(actual)))
# Evaluate an algorithm on the whole data set (no train/test split)
# NOTE: a second function named ``evaluate_algorithm`` is defined further down
# in this file; once that definition executes it replaces this one.
def evaluate_algorithm(dataset, algorithm):
    """Train and score ``algorithm`` on the same ``dataset``.

    Args:
        dataset: Sequence of rows; the last element of each row is the
            target value.
        algorithm: Callable ``algorithm(train, test)`` returning one
            prediction per test row.

    Returns:
        RMSE between the targets in ``dataset`` and the predictions.

    Side effects:
        Prints every prediction on its own line.
    """
    test_set = []
    for row in dataset:
        # Copy each row and blank the target so the algorithm cannot read it.
        row_copy = list(row)
        row_copy[-1] = None
        test_set.append(row_copy)
    predicted = algorithm(dataset, test_set)
    for val in predicted:
        print('%.3f\t' % val)
    actual = [row[-1] for row in dataset]
    return rmse_metric(actual, predicted)
# Load a CSV file, skipping its header row
def load_csv(filename):
    """Read the data rows of a CSV file as lists of strings.

    The first row is treated as a header and discarded; blank lines are
    skipped.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        List of rows, each a list of strings. The list is empty when the
        file has no data rows or cannot be opened/read (the error is
        printed, not raised).
    """
    dataset = []
    try:
        # newline='' lets the csv module do its own newline handling.
        with open(filename, 'r', newline='') as file:
            csv_reader = reader(file)
            # Discard the header; the default avoids StopIteration on an
            # empty file.
            next(csv_reader, None)
            for row in csv_reader:
                if row:  # the reader yields [] for blank lines
                    dataset.append(row)
    except OSError as err:
        print("file error :", str(err))
    return dataset
# Convert one string column to float, in place
def str_column_to_float(dataset, column):
    """Replace ``row[column]`` with its float value in every row.

    Args:
        dataset: Sequence of mutable rows whose ``column`` entry is a
            string; surrounding whitespace is stripped before conversion.
        column: Index of the column to convert.

    Returns:
        None. ``dataset`` is modified in place.

    Raises:
        ValueError: If an entry cannot be parsed as a float.
    """
    for row in dataset:
        row[column] = float(row[column].strip())
# Randomly split the data set into a training part and a test part
def train_test_split(dataset, percent):
    """Randomly partition ``dataset`` into training and test rows.

    Rows are drawn without replacement using ``random.randrange``, so the
    result depends on the state of the ``random`` module's generator.

    Args:
        dataset: Sequence of rows. It is copied, not modified.
        percent: Fraction of rows to place in the training set.

    Returns:
        Tuple ``(train, test)`` of lists. ``train`` holds the smallest number
        of rows that is at least ``percent * len(dataset)``; ``test`` holds
        the remaining rows in their original relative order.

    Raises:
        ValueError: If ``percent`` asks for more rows than ``dataset`` has.
    """
    train = []
    remaining = list(dataset)
    train_size = percent * len(dataset)
    if train_size > len(remaining):
        # Otherwise the loop below would exhaust `remaining` and randrange(0)
        # would fail with an unhelpful "empty range" error.
        raise ValueError(
            "percent=%r requests more rows than the dataset contains" % (percent,)
        )
    while len(train) < train_size:
        train.append(remaining.pop(randrange(len(remaining))))
    return train, remaining
# Evaluate an algorithm using a random train/test split
def evaluate_algorithm(dataset, algorithm, split_percent, *args):
    """Train ``algorithm`` on part of ``dataset`` and score it on the rest.

    Args:
        dataset: Sequence of rows; the last element of each row is the
            target value.
        algorithm: Callable ``algorithm(train, test, *args)`` returning one
            prediction per test row.
        split_percent: Fraction of rows used for training; passed to
            ``train_test_split``.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        RMSE between the test rows' targets and the predictions.
    """
    train, test = train_test_split(dataset, split_percent)
    test_set = []
    for row in test:
        # Copy each row and blank the target so the algorithm cannot read it.
        row_copy = list(row)
        row_copy[-1] = None
        test_set.append(row_copy)
    predicted = algorithm(train, test_set, *args)
    actual = [row[-1] for row in test]
    return rmse_metric(actual, predicted)
def visualization(dateset):
    """Plot the data points together with the fitted regression line.

    Args:
        dateset: Sequence of rows; ``row[0]`` is plotted as x and ``row[1]``
            as y. (The parameter name is a historical misspelling of
            "dataset", kept so existing callers keep working.)

    Requires:
        The module globals ``w_k`` (slope) and ``w_b`` (intercept), which
        ``simple_linear_regression`` sets; a NameError is raised if they
        have not been assigned yet.

    Side effects:
        Opens a matplotlib figure window via ``plt.show()``.
    """
    fig = plt.figure()
    # One subplot in a 1x1 grid.
    ax1 = fig.add_subplot(1, 1, 1)
    # Read from the parameter, not from a global that happens to be named
    # ``dataset``.
    x = [row[0] for row in dateset]
    y = [row[1] for row in dateset]
    ax1.plot(x, y, 'bs')
    # Evenly spaced x values across the data range for drawing the line.
    x_1 = np.linspace(min(x), max(x))
    y_1 = x_1 * w_k + w_b
    ax1.plot(x_1, y_1)
    plt.grid()
    plt.show()
# Seed the random generator so the random train/test split is reproducible
seed(2)
# Load the insurance data and prepare it for splitting
filename = 'insurance.csv'
dataset = load_csv(filename)
print(dataset)
if not dataset:
    # load_csv returns an empty list when the file cannot be read or has no
    # data rows; stop here instead of failing on dataset[0] below.
    raise SystemExit("no data rows loaded from %s" % filename)
# Convert every column from string to float
for col in range(len(dataset[0])):
    str_column_to_float(dataset, col)
# Fraction of the rows used for training
percent = 0.6
rmse = evaluate_algorithm(dataset, simple_linear_regression, percent)
print('RMSE : %.3f' % rmse)
visualization(dataset)
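# --- Disabled worked example, kept for reference only ---
# It runs the helpers on a small hand-written data set. The final call passes
# only two arguments to evaluate_algorithm, which matches the earlier
# (no-split) definition; the active definition also requires split_percent,
# so that call must be updated before this example is re-enabled.
# dataset = [[1.2, 1.1], [2.4, 3.5], [4.1, 3.2], [3.4, 2.8], [5, 5.4]]
# x = [row[0] for row in dataset]
# y = [row[1] for row in dataset]
# mean_x, mean_y = mean(x), mean(y) # means of x and y
# var_x, var_y = variance(x, mean_x), variance(y, mean_y)
# convar = convariance(x, mean_x, y, mean_y) # covariance term
#
# print('x的统计特性:均值 = % .3f 方差 = %.3f' % (mean_x, var_x))
# print('y的统计特性:均值 = % .3f 方差 = %.3f' % (mean_y, var_y))
# print('协方差 = :%.3f' % convar)
#
# w0, w1 = coefficients(dataset)
# print('回归系数分别为: w0 = %.3f, w1 = %.3f' % (w0, w1))
#
# rmse = evaluate_algorithm(dataset, simple_linear_regression)
# print('RMSE : %.3f' % rmse)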
代码以相对路径打开 insurance.csv,因此该文件需要放在运行脚本时的当前工作目录下(通常即代码文件所在的目录)。文件第一行是表头,其后每行是一对以逗号分隔的数值 x,y。
文件内容如下:
运行结果: