import numpy as np
from numpy import genfromtxt
from sklearn import linear_model
import matplotlib.pyplot as plt
# Load the data set from CSV.
# genfromtxt cannot parse the text header row, so row 0 of `data` comes
# back as all-NaN; the slicing further down skips it with data[1:, ...].
# NOTE(review): the filename 'lingRegression.csv' looks like a typo for
# 'linearRegression.csv' — confirm against the file on disk before renaming.
data=genfromtxt('lingRegression.csv',delimiter=',')
print(data)
# Pasted interpreter output (not executable code — kept as a comment):
# data =
# [[ nan nan nan nan nan nan nan nan]
# [1947. 83. 234.289 235.6 159. 107.608 1947. 60.323]
# [1948. 88.5 259.426 232.5 145.6 108.632 1948. 61.122]
# [1949. 88.2 258.054 368.2 161.8 109.773 1949. 60.171]
# [1950. 89.5 284.599 335.1 165. 110.929 1950. 61.187]
# [1951. 96.2 328.975 209.9 309.9 112.075 1951. 63.221]
# [1952. 98.1 346.999 193.2 359.4 113.27 1952. 63.639]
# [1953. 99. 365.385 187. 354.7 115.094 1953. 64.989]
# [1954. 100. 363.112 357.8 335. 116.219 1954. 63.761]
# [1955. 101.2 397.469 290.4 304.8 117.388 1955. 66.019]
# [1956. 104.6 419.18 282.2 285.7 118.734 1956. 67.857]
# [1957. 108.4 442.769 293.6 279.8 120.466 1957. 68.169]
# [1958. 110.8 444.546 468.1 263.7 121.95 1958. 66.513]
# [1959. 112.6 482.704 381.3 255.2 123.368 1959. 68.655]
# [1960. 114.2 502.601 393.1 251.4 125.368 1960. 69.564]
# [1961. 115.7 518.173 480.6 257.2 127.852 1961. 69.331]
# [1962. 116.9 554.894 400.7 282.7 130.081 1962. 70.551]]
# Split the loaded matrix into features and target.
# Rows 1.. drop the all-NaN header row; columns 2.. hold the six
# explanatory variables, column 1 holds the response variable.
x_data, y_data = data[1:, 2:], data[1:, 1]
print(x_data)
print(y_data)
# Pasted interpreter output (not executable code — kept as a comment):
# x_data =
# [[ 234.289 235.6 159. 107.608 1947. 60.323]
# [ 259.426 232.5 145.6 108.632 1948. 61.122]
# [ 258.054 368.2 161.8 109.773 1949. 60.171]
# [ 284.599 335.1 165. 110.929 1950. 61.187]
# [ 328.975 209.9 309.9 112.075 1951. 63.221]
# [ 346.999 193.2 359.4 113.27 1952. 63.639]
# [ 365.385 187. 354.7 115.094 1953. 64.989]
# [ 363.112 357.8 335. 116.219 1954. 63.761]
# [ 397.469 290.4 304.8 117.388 1955. 66.019]
# [ 419.18 282.2 285.7 118.734 1956. 67.857]
# [ 442.769 293.6 279.8 120.466 1957. 68.169]
# [ 444.546 468.1 263.7 121.95 1958. 66.513]
# [ 482.704 381.3 255.2 123.368 1959. 68.655]
# [ 502.601 393.1 251.4 125.368 1960. 69.564]
# [ 518.173 480.6 257.2 127.852 1961. 69.331]
# [ 554.894 400.7 282.7 130.081 1962. 70.551]]
# True target values, aligned row-for-row with x_data:
# y_data = [ 83. 88.5 88.2 89.5 96.2 98.1 99. 100. 101.2 104.6 108.4 110.8
# 112.6 114.2 115.7 116.9]
# Build the model.
# Candidate regularization strengths (lambda): np.linspace defaults to
# 50 evenly spaced values between 0.001 and 1; the count is adjustable.
alphas_to_test=np.linspace(0.001,1)
# RidgeCV runs cross-validation over every alpha in alphas_to_test and
# keeps the best one.  store_cv_values=True retains the per-sample
# validation errors in model.cv_values_ (used for the plot below).
# NOTE(review): store_cv_values / cv_values_ were renamed to
# store_cv_results / cv_results_ in scikit-learn 1.5 — confirm the
# installed version still accepts the old names.
model=linear_model.RidgeCV(alphas=alphas_to_test,store_cv_values=True)
model.fit(x_data,y_data)
# Best ridge coefficient (alpha) selected among the 50 candidates.
print(model.alpha_)
# Loss values: shape (16, 50) — with leave-one-out CV each of the 16
# samples serves once as the validation set, giving 16 rows of loss
# values, one column per candidate alpha (50 columns).
print(model.cv_values_.shape)
# Pasted interpreter output (not executable code — kept as a comment):
# best ridge coefficient: λ = 0.40875510204081633
# (16, 50)
# Plot the relationship between alpha (λ) and the CV loss.
# Mean cross-validation loss (averaged over the 16 samples) for each
# candidate alpha.
plt.plot(alphas_to_test,model.cv_values_.mean(axis=0))
# Mark the selected alpha at the minimum of the mean-loss curve.
plt.plot(model.alpha_,min(model.cv_values_.mean(axis=0)),'ro')
plt.show()
# The plot shows the loss-function value at λ = 0.40875510204081633.
# Prediction check: the input below is row 2 of x_data, whose true value
# is y_data[2] = 88.2; the model predicts 88.111..., which is close.
# Predict sample index 2.  Indexing with (2, np.newaxis) selects the
# 1-D row x_data[2] and lifts it to shape (1, n_features), the 2-D
# layout predict() expects.
model.predict(x_data[2,np.newaxis])#x_data的带三行数据,是一维的,np.newaxis将其变为二维的,得到预测结果
# Pasted interpreter output: array([88.11146219])