一.数据来源:
数据来源:kaggle
二.使用方法
线性回归(Linear Regression)+随机梯度下降(Stochastic Gradient Descent)
随机梯度下降原理:
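损失函数为:
$$J(\theta)=\frac{1}{2}\sum_{i=1}^{m}\left(h_\theta(x^{(i)})-y^{(i)}\right)^2$$
其中:
$$h_\theta(x)=\theta_0+\theta_1x_1+\cdots+\theta_nx_n$$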
通过设定一个学习比率 $\alpha$,不断进行拟合迭代,以找到全局最优解,获得系数 $\theta$ 的值。
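在训练每一组数据 $(x, y)$ 时,按下式更新系数:
$$\theta_0:=\theta_0-\alpha(\hat{y}-y),\qquad \theta_j:=\theta_j-\alpha(\hat{y}-y)\,x_j\quad(j=1,\dots,n)$$
其中 $\hat{y}$ 为当前系数下的预测值,$\alpha$ 为学习比率。
……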
三.代码实现
从数据读取开始,不调取三方库,纯手工推。
导入基础库
from csv import reader
from math import sqrt
from random import randrange,seed
设置读取文件函数
读取csv文件,由于读取后内容为string,需转换为float。
#读取文件
def csv_loader(file):  # file: path of the CSV file to read
    """Read a CSV file and return its non-empty rows.

    Args:
        file: Path of the CSV file.

    Returns:
        A list of rows; each row is the list of string fields produced by
        ``csv.reader``. Rows that come back empty are skipped.
    """
    # newline='' is how the csv module expects its input file to be opened;
    # without it, line breaks embedded in quoted fields get rewritten.
    # The handle gets its own name so it no longer shadows the ``file`` path.
    with open(file, 'r', newline='') as csv_file:
        return [row for row in reader(csv_file) if row]
#读取文件后,内容为字符串,需要将字符串转换为浮点型
def string_to_float_converter(dataset, skip_rows=5):
    """Convert the data rows of ``dataset`` from strings to floats, in place.

    Args:
        dataset: List of rows (lists of strings). The row lists are mutated.
        skip_rows: Number of leading rows left untouched. The default of 5
            matches the previously hard-coded slice, because the data in the
            downloaded file starts after the first five rows.

    Returns:
        None. The leading ``skip_rows`` rows stay in ``dataset`` as strings,
        so the caller still has to slice them off.

    Raises:
        ValueError: If a cell cannot be parsed as a float.
    """
    data_rows = dataset[skip_rows:]
    if not data_rows:
        # Nothing to convert; previously this raised IndexError.
        return
    # The first data row decides how many columns get converted in every row.
    width = len(data_rows[0])
    for row in data_rows:
        for i in range(width):
            row[i] = float(row[i].strip())
数据切分处理
对数据进行切分,切分为训练集和测试集(train/test)
def train_test_split(dataset, split):
    """Randomly partition ``dataset`` into a training and a test list.

    Rows are drawn one at a time with ``randrange`` from a shallow copy of
    ``dataset`` until the training list holds at least
    ``split * len(dataset)`` rows; whatever remains is the test list.
    The list passed in is not modified.

    Returns:
        A ``(train_data, test_data)`` tuple of lists.
    """
    remaining = list(dataset)  # work on a copy so the caller's list is intact
    target = split * len(dataset)
    picked = []
    while len(picked) < target:
        picked.append(remaining.pop(randrange(len(remaining))))
    return picked, remaining
线性回归预测模型
线性回归模型:
def predict(row, coefficient):
    """Return the linear-model prediction for one row.

    Args:
        row: Sequence of numbers; its last element is not used as a feature
            (callers store the target value there).
        coefficient: ``coefficient[0]`` is the intercept and
            ``coefficient[i + 1]`` multiplies ``row[i]``.

    Returns:
        The intercept plus the weighted sum of the row's features.
    """
    predicted_data = coefficient[0]
    for i in range(len(row) - 1):
        predicted_data += coefficient[i + 1] * row[i]
    # Fixed: this used to return the misspelled name ``predicted_dat``,
    # which raised NameError on every call.
    return predicted_data
用随机梯度下降法获得系数
不断迭代,以达到最小error时,获得系数coefficient
def calulate_coefficient_by_sgd(train_data, learn_rate, n_epochs):
    """Fit linear-regression coefficients with per-row gradient steps.

    All coefficients start at 0.0. For ``n_epochs`` passes over
    ``train_data``, every row produces one update: the prediction error
    (prediction minus the row's last element) scaled by ``learn_rate`` is
    subtracted from the intercept, and the same quantity multiplied by each
    feature is subtracted from that feature's coefficient.

    Returns:
        The coefficient list, with the intercept at index 0.
    """
    weights = [0.0] * len(train_data[0])
    for _ in range(n_epochs):
        for sample in train_data:
            residual = predict(sample, weights) - sample[-1]
            step = learn_rate * residual
            weights[0] -= step
            for j, feature in enumerate(sample[:-1], start=1):
                weights[j] -= step * feature
    return weights
通过线性回归,对测试集数据预测
def linear_regression(train_data, test_data, learn_rate, n_epochs):
    """Train on ``train_data`` and return predictions for ``test_data``.

    The coefficients come from ``calulate_coefficient_by_sgd``; each test row
    is then passed to ``predict``.

    Returns:
        A list with one prediction per row of ``test_data``, in order.
    """
    weights = calulate_coefficient_by_sgd(train_data, learn_rate, n_epochs)
    return [predict(sample, weights) for sample in test_data]
模型拟合度判断方法(Coefficient of Determination)
设置决定系数的计算
def coefficent_of_determine(actual, predicted):
    """Return the coefficient of determination as a percentage.

    Computes ``(1 - SSE / SST) * 100`` where SSE is the sum of squared
    differences between ``predicted`` and ``actual`` and SST is the sum of
    squared deviations of ``actual`` from its mean.

    Args:
        actual: Observed values.
        predicted: Predicted values, indexed in parallel with ``actual``.
    """
    baseline = sum(actual) / float(len(actual))
    residual_ss = 0.0
    total_ss = 0.0
    for idx, observed in enumerate(actual):
        residual_ss += (predicted[idx] - observed) ** 2
        total_ss += (observed - baseline) ** 2
    return (1 - residual_ss / total_ss) * 100.0
模型测试
def model_test(dataset, algo, split, *args):
    """Split the data, run ``algo`` on it, and score the predictions.

    Args:
        dataset: Rows whose last element is the target value.
        algo: Callable invoked as ``algo(train_data, test_data, *args)``;
            it must return one prediction per test row.
        split: Passed to ``train_test_split`` as the training fraction.
        *args: Extra positional arguments forwarded to ``algo``.

    Returns:
        The score computed by ``coefficent_of_determine`` on the test rows.
    """
    train_part, test_part = train_test_split(dataset, split)
    predictions = algo(train_part, test_part, *args)  # pluggable algorithm
    targets = [sample[-1] for sample in test_part]
    return coefficent_of_determine(targets, predictions)
运行与参数调整
对模型进行测试,决定系数越接近100%(代码中已将 R² 乘以100,以百分比表示),说明模型拟合得越好。
# Tune and test the settings below to look for the best fit.
split = 0.7             # fraction of the rows used for training
learn_rate = 0.000001   # learning rate of the gradient steps
n_epochs = 1000         # number of passes over the training rows
algo = linear_regression  # algorithm under test

file = './download_datas/insurance.csv'  # location and name of the input file
dataset = csv_loader(file)
string_to_float_converter(dataset)  # converts the rows from index 5 on, in place
dataset = dataset[5:]  # keep only the converted rows

seed(1)  # fixed seed so randrange draws the same split on every run
R_2 = model_test(dataset, algo, split, learn_rate, n_epochs)
print('The score of my model is %.3f%%' % R_2)
#运行结果为:
The score of my model is 82.113%
四.完整代码
#1.导入基础库
from csv import reader
from math import sqrt
from random import randrange,seed
#2.读取csv文件和转换字符串为浮点型
def csv_loader(file):
    """Read a CSV file and return its non-empty rows.

    Args:
        file: Path of the CSV file.

    Returns:
        A list of rows; each row is the list of string fields produced by
        ``csv.reader``. Rows that come back empty are skipped.
    """
    # newline='' is how the csv module expects its input file to be opened;
    # without it, line breaks embedded in quoted fields get rewritten.
    # The handle gets its own name so it no longer shadows the ``file`` path.
    with open(file, 'r', newline='') as csv_file:
        return [row for row in reader(csv_file) if row]
def string_to_float_converter(dataset, skip_rows=5):
    """Convert the data rows of ``dataset`` from strings to floats, in place.

    Args:
        dataset: List of rows (lists of strings). The row lists are mutated.
        skip_rows: Number of leading rows left untouched. The default of 5
            matches the previously hard-coded slice.

    Returns:
        None. The leading ``skip_rows`` rows stay in ``dataset`` as strings,
        so the caller still has to slice them off.

    Raises:
        ValueError: If a cell cannot be parsed as a float.
    """
    data_rows = dataset[skip_rows:]
    if not data_rows:
        # Nothing to convert; previously this raised IndexError.
        return
    # The first data row decides how many columns get converted in every row.
    width = len(data_rows[0])
    for row in data_rows:
        for i in range(width):
            row[i] = float(row[i].strip())
#3.数据切分
def train_test_split(dataset, split):
    """Randomly partition ``dataset`` into a training and a test list.

    Rows are drawn one at a time with ``randrange`` from a shallow copy of
    ``dataset`` until the training list holds at least
    ``split * len(dataset)`` rows; whatever remains is the test list.
    The list passed in is not modified.

    Returns:
        A ``(train_data, test_data)`` tuple of lists.
    """
    remaining = list(dataset)  # work on a copy so the caller's list is intact
    target = split * len(dataset)
    picked = []
    while len(picked) < target:
        picked.append(remaining.pop(randrange(len(remaining))))
    return picked, remaining
#4.线性回归预测模型
def predict(row, coefficient):
    """Return the linear-model prediction for one row.

    ``coefficient[0]`` is the intercept; ``coefficient[i + 1]`` multiplies
    ``row[i]``. The last element of ``row`` is not used as a feature
    (callers store the target value there).
    """
    total = coefficient[0]
    for position, feature in enumerate(row[:-1], start=1):
        total += coefficient[position] * feature
    return total
#5.用随机梯度下降法计算系数(coefficient, by stochastic gradient descent)
def calulate_coefficient_by_sgd(train_data, learn_rate, n_epochs):
    """Fit linear-regression coefficients with per-row gradient steps.

    All coefficients start at 0.0. For ``n_epochs`` passes over
    ``train_data``, every row produces one update: the prediction error
    (prediction minus the row's last element) scaled by ``learn_rate`` is
    subtracted from the intercept, and the same quantity multiplied by each
    feature is subtracted from that feature's coefficient.

    Returns:
        The coefficient list, with the intercept at index 0.
    """
    weights = [0.0] * len(train_data[0])
    for _ in range(n_epochs):
        for sample in train_data:
            residual = predict(sample, weights) - sample[-1]
            step = learn_rate * residual
            weights[0] -= step
            for j, feature in enumerate(sample[:-1], start=1):
                weights[j] -= step * feature
    return weights
#6.通过线性回归,用测试集数据进行预测
def linear_regression(train_data, test_data, learn_rate, n_epochs):
    """Train on ``train_data`` and return predictions for ``test_data``.

    The coefficients come from ``calulate_coefficient_by_sgd``; each test row
    is then passed to ``predict``.

    Returns:
        A list with one prediction per row of ``test_data``, in order.
    """
    weights = calulate_coefficient_by_sgd(train_data, learn_rate, n_epochs)
    return [predict(sample, weights) for sample in test_data]
#7.模型拟合度判断方法(Coefficient of Determination)
def coefficent_of_determine(actual, predicted):
    """Return the coefficient of determination as a percentage.

    Computes ``(1 - SSE / SST) * 100`` where SSE is the sum of squared
    differences between ``predicted`` and ``actual`` and SST is the sum of
    squared deviations of ``actual`` from its mean.

    Args:
        actual: Observed values.
        predicted: Predicted values, indexed in parallel with ``actual``.
    """
    baseline = sum(actual) / float(len(actual))
    residual_ss = 0.0
    total_ss = 0.0
    for idx, observed in enumerate(actual):
        residual_ss += (predicted[idx] - observed) ** 2
        total_ss += (observed - baseline) ** 2
    return (1 - residual_ss / total_ss) * 100.0
#8.模型测试
def model_test(dataset, algo, split, *args):
    """Split the data, run ``algo`` on it, and score the predictions.

    Args:
        dataset: Rows whose last element is the target value.
        algo: Callable invoked as ``algo(train_data, test_data, *args)``;
            it must return one prediction per test row.
        split: Passed to ``train_test_split`` as the training fraction.
        *args: Extra positional arguments forwarded to ``algo``.

    Returns:
        The score computed by ``coefficent_of_determine`` on the test rows.
    """
    train_part, test_part = train_test_split(dataset, split)
    predictions = algo(train_part, test_part, *args)
    targets = [sample[-1] for sample in test_part]
    return coefficent_of_determine(targets, predictions)
#9.运行
# Settings for the run.
split = 0.7             # fraction of the rows used for training
learn_rate = 0.000001   # learning rate of the gradient steps
n_epochs = 1000         # number of passes over the training rows
algo = linear_regression

file = './download_datas/insurance.csv'
dataset = csv_loader(file)
string_to_float_converter(dataset)  # converts the rows from index 5 on, in place
dataset = dataset[5:]  # keep only the converted rows

seed(1)  # fixed seed so randrange draws the same split on every run
R_2 = model_test(dataset, algo, split, learn_rate, n_epochs)
print('The score of my model is %.3f%%' % R_2)