1、线性回归
给定数据:
t为车辆行驶的时间,与之相关的两个因素:x1为车辆一个运输任务行驶的里数,x2为车辆运输的次数;
(数据保存在 delivery.csv 文件中:无表头,以逗号分隔,每行依次为 x1、x2、t)
#!/usr/bin/python
# -*- coding: utf-8 -*-
# #线性回归模型
from numpy import genfromtxt
from sklearn import datasets, linear_model
# Load the delivery records.
dataPath = r"delivery.csv"
# genfromtxt splits every line on the delimiter and returns a 2-D float
# array. No skip_header/comments options are passed, so the file is
# expected to contain data rows only.
deliveryData = genfromtxt(dataPath, delimiter=',')
# A single pre-formatted argument prints the same text under Python 2 and 3.
print("data:\n%s" % (deliveryData,))
# Every column except the last is a feature (x1, x2); the last column is t.
X = deliveryData[:, :-1]
Y = deliveryData[:, -1]
print("X:\n%s" % (X,))
print("Y:\n%s" % (Y,))
# Ordinary least-squares linear regression model.
regr = linear_model.LinearRegression()
# Fit t = b0 + b1*x1 + b2*x2.
regr.fit(X, Y)
# Coefficients b1, b2.
print("coefficients:\n%s" % (regr.coef_,))
# Intercept b0.
print("intercept:\n%s" % (regr.intercept_,))
# predict() expects a 2-D array of shape (n_samples, n_features), so the
# single sample (x1=102, x2=6) is wrapped in an outer list; a flat list is
# rejected by current scikit-learn releases.
xPred = [[102, 6]]
yPred = regr.predict(xPred)
print("predicted y:\n%s" % (yPred,))
打印出结果为:
data:
[[ 100. 4. 9.3]
[ 50. 3. 4.8]
[ 100. 4. 8.9]
[ 100. 2. 6.5]
[ 50. 2. 4.2]
[ 80. 2. 6.2]
[ 75. 3. 7.4]
[ 65. 4. 6. ]
[ 90. 3. 7.6]
[ 90. 2. 6.1]]
X:
[[ 100. 4.]
[ 50. 3.]
[ 100. 4.]
[ 100. 2.]
[ 50. 2.]
[ 80. 2.]
[ 75. 3.]
[ 65. 4.]
[ 90. 3.]
[ 90. 2.]]
Y:
[ 9.3 4.8 8.9 6.5 4.2 6.2 7.4 6. 7.6 6.1]
coefficients:
[ 0.0611346 0.92342537]
intercept:
-0.868701466782
predicted y:
[ 10.90757981]
则t=b0+b1*x1+b2*x2
b0=-0.868701466782, b1=0.0611346, b2=0.92342537
2、自变量中有分类型变量
给定数据:
t为车辆行驶的时间,与之相关的三个因素:车辆一个运输任务行驶的里数,车辆运输的次数,以及车型(0:卡车,1:MPV,2:SUV);
首先将分类型变量车型转化为 one-hot(独热)编码的三个 0/1 变量(0 → 1 0 0,1 → 0 1 0,2 → 0 0 1),作为数据中的第 3~5 列
#!/usr/bin/python
# -*- coding: utf-8 -*-
# #线性回归模型——自变量中有分类型变量
from numpy import genfromtxt
from sklearn import datasets, linear_model
# Load the delivery records. In this section the file holds six columns:
# distance, number of deliveries, three one-hot vehicle-type columns, and t.
dataPath = r"delivery.csv"
# genfromtxt splits every line on the delimiter and returns a 2-D float
# array. No skip_header/comments options are passed, so the file is
# expected to contain data rows only.
deliveryData = genfromtxt(dataPath, delimiter=',')
# A single pre-formatted argument prints the same text under Python 2 and 3.
print("data:\n%s" % (deliveryData,))
# Every column except the last is a feature; the last column is t.
X = deliveryData[:, :-1]
Y = deliveryData[:, -1]
print("X:\n%s" % (X,))
print("Y:\n%s" % (Y,))
# Ordinary least-squares linear regression model.
# NOTE(review): the three one-hot columns always sum to 1, so together with
# the intercept they are collinear; the individual vehicle-type coefficients
# are therefore not uniquely determined, although predictions are.
regr = linear_model.LinearRegression()
# Fit t = b0 + b1*x1 + ... + b5*x5.
regr.fit(X, Y)
# Coefficients b1..b5.
print("coefficients:\n%s" % (regr.coef_,))
# Intercept b0.
print("intercept:\n%s" % (regr.intercept_,))
# predict() expects a 2-D array of shape (n_samples, n_features), so the
# single sample (82 miles, 4 deliveries, vehicle type 0 encoded as 1 0 0)
# is wrapped in an outer list; a flat list is rejected by current
# scikit-learn releases.
xPred = [[82, 4, 1, 0, 0]]
yPred = regr.predict(xPred)
print("predicted y:\n%s" % (yPred,))
打印出的结果为:
data:
[[ 100. 4. 0. 1. 0. 9.3]
[ 50. 3. 1. 0. 0. 4.8]
[ 100. 4. 0. 1. 0. 8.9]
[ 100. 2. 0. 0. 1. 6.5]
[ 50. 2. 0. 0. 1. 4.2]
[ 80. 2. 0. 1. 0. 6.2]
[ 75. 3. 0. 1. 0. 7.4]
[ 65. 4. 1. 0. 0. 6. ]
[ 90. 3. 1. 0. 0. 7.6]
[ 100. 4. 0. 1. 0. 9.3]
[ 50. 3. 1. 0. 0. 4.8]
[ 100. 4. 0. 1. 0. 8.9]
[ 100. 2. 0. 0. 1. 6.5]]
X:
[[ 100. 4. 0. 1. 0.]
[ 50. 3. 1. 0. 0.]
[ 100. 4. 0. 1. 0.]
[ 100. 2. 0. 0. 1.]
[ 50. 2. 0. 0. 1.]
[ 80. 2. 0. 1. 0.]
[ 75. 3. 0. 1. 0.]
[ 65. 4. 1. 0. 0.]
[ 90. 3. 1. 0. 0.]
[ 100. 4. 0. 1. 0.]
[ 50. 3. 1. 0. 0.]
[ 100. 4. 0. 1. 0.]
[ 100. 2. 0. 0. 1.]]
Y:
[ 9.3 4.8 8.9 6.5 4.2 6.2 7.4 6. 7.6 9.3 4.8 8.9 6.5]
coefficients:
[ 0.05452507 0.70930079 -0.18019642 0.60821607 -0.42801964]
intercept:
0.198995895632
predicted y:
[ 7.32705805]
则预测模型中的b0=0.198995895632
b1=0.05452507 ,b2=0.70930079 ,b3=-0.18019642 ,b4=0.60821607 ,b5=-0.42801964
3、回归模型的拟合度:相关系数 r 与决定系数 R²
# # 多元线性回归模型
#!/usr/bin/python
# -*- coding: utf-8 -*-
import math
import numpy as np
def computeCorrelation(X, Y):
    """Return the Pearson correlation coefficient r between X and Y.

    r = sum((x - mean(x)) * (y - mean(y)))
        / sqrt(sum((x - mean(x))**2) * sum((y - mean(y))**2))

    Squaring the result gives the coefficient of determination of a
    straight-line fit of Y on X.

    Args:
        X: Sequence of numbers.
        Y: Sequence of numbers with the same length as X.

    Returns:
        r as a float in [-1, 1], or nan when the correlation is undefined
        (empty input, or X or Y has zero variance).

    Raises:
        ValueError: If X and Y do not have the same shape.
    """
    x = np.asarray(X, dtype=float)
    y = np.asarray(Y, dtype=float)
    if x.shape != y.shape:
        # Pairing samples only makes sense when both sides line up.
        raise ValueError("X and Y must have the same length")
    if x.size == 0:
        return float("nan")
    xDev = x - np.mean(x)
    yDev = y - np.mean(y)
    # Numerator: sum of the products of the deviations (co-variation).
    coVariation = np.sum(xDev * yDev)
    # Denominator: geometric mean of the two sums of squared deviations.
    scale = math.sqrt(np.sum(xDev ** 2) * np.sum(yDev ** 2))
    if scale == 0:
        # A constant series has no defined correlation.
        return float("nan")
    return coVariation / scale
# #
def polyfit(x, y, degree):
    """Fit a least-squares polynomial to (x, y) and report its fit quality.

    Args:
        x: Sequence of sample x-coordinates.
        y: Sequence of sample y-values, same length as x.
        degree: Highest power of the fitted polynomial.

    Returns:
        A dict with two entries:
        'polynomial'    -- the fitted coefficients as a list, highest
                           power first (as returned by np.polyfit);
        'determination' -- R^2 = SSreg / SStot, where SSreg is the sum of
                           squared deviations of the fitted values from
                           mean(y) and SStot is the same for the observed
                           values.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    results = {}
    # Least-squares polynomial fit; degree is the highest power.
    coeffs = np.polyfit(x, y, degree)
    # Store the coefficients as a plain list.
    results['polynomial'] = coeffs.tolist()
    # Evaluate the fitted polynomial at the sample points.
    p = np.poly1d(coeffs)
    yhat = p(x)
    # np.mean on a float array always divides in floating point.  The former
    # np.sum(y)/len(y) truncated the mean under Python 2 whenever y held
    # integers, which skewed both sums of squares below.
    ybar = np.mean(y)
    ssreg = np.sum((yhat - ybar) ** 2)
    sstot = np.sum((y - ybar) ** 2)
    # Coefficient of determination.
    results['determination'] = ssreg / sstot
    return results
# Sample data. For a straight-line fit the coefficient of determination is
# mathematically equal to r**2, so the last two printed values are expected
# to agree.
testX = [1, 3, 8, 7, 9]
testY = [10, 12, 24, 21, 34]
# Compute r once and reuse it for both lines of output.
testR = computeCorrelation(testX, testY)
# A single pre-formatted argument prints the same text under Python 2 and 3.
print("r:\n%s" % (testR,))
print("r^2:\n%s" % (testR ** 2,))
print(polyfit(testX, testY, 1)["determination"])
打印结果为:
r:
0.940310076545
r^2:
0.884183040052
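0.88424448141
(注:对一元线性拟合,决定系数应与 r^2 完全相等。上面最后一行略有偏差,是因为该结果在 Python 2 下运行得到:ybar 按 np.sum(y)/len(y) 计算时对整数数据做了整数除法,均值 20.2 被截断为 20;改用 np.mean(y) 后该值为 0.884183040052。)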
4、非线性回归
采用梯度下降法(下面示例中的假设函数为 h(x)=θ0+θ1·x,测试数据也是线性的,仅用于演示梯度下降的迭代更新过程):
#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import random
def gradientDescent(x, y, theta, alpha, m, numIterations, verbose=True):
    """Fit a linear model by batch gradient descent.

    Each iteration evaluates the hypothesis x.theta, takes the gradient of
    the cost sum((x.theta - y)**2) / (2*m) and moves theta against it.

    Args:
        x: 2-D NumPy array with one row per sample (it is transposed and
            multiplied with theta).
        y: Array of target values, one per row of x.
        theta: Initial parameter vector; it is not modified in place.
        alpha: Learning rate (step size).
        m: Number of samples, used to average the cost and the gradient.
        numIterations: Number of update steps to perform.
        verbose: When true (the default, matching the historical behaviour)
            print the cost before every update.

    Returns:
        The parameter vector after numIterations updates.
    """
    # The transpose is loop-invariant, so compute it once.
    xTrans = x.transpose()
    for i in range(numIterations):
        # Residual of the current hypothesis x.theta.
        loss = np.dot(x, theta) - y
        if verbose:
            # The cost is needed only for the progress report.
            cost = np.sum(loss ** 2) / (2 * m)
            # Parenthesised single argument: valid on Python 2 and 3.
            print("Iteration %d | Cost: %f" % (i, cost))
        gradient = np.dot(xTrans, loss) / m
        # Rebinding (rather than -=) leaves the caller's array untouched.
        theta = theta - alpha * gradient
    return theta
def genData(numPoints, bias, variance):
    """Build a noisy straight-line data set for testing.

    Row i of the feature matrix is [1, i]. Target i is
    (i + bias) + u * variance, where u is drawn with random.uniform(0, 1),
    one draw per row in row order.

    Args:
        numPoints: Number of samples to generate.
        bias: Constant offset added to every target.
        variance: Scale factor applied to the uniform noise.

    Returns:
        (x, y): x is a float array of shape (numPoints, 2); y is a float
        array of shape (numPoints,).
    """
    # Column 0 stays at 1 (intercept term); column 1 holds the row index.
    features = np.ones(shape=(numPoints, 2))
    features[:, 1] = np.arange(numPoints)
    # Scalar arithmetic per row keeps the draws in the original order.
    targets = np.array(
        [(idx + bias) + random.uniform(0, 1) * variance
         for idx in range(numPoints)],
        dtype=float,
    )
    return features, targets
# 100 samples of y = i + 25 plus uniform noise scaled by 10.
x, y = genData(100, 25, 10)
# print("x: %s" % (x,))
# print("y: %s" % (y,))
# m = number of samples, n = number of parameters (columns of x).
m, n = np.shape(x)
numIterations = 100000
alpha = 0.0005
# Start from theta = [1, 1].
theta = np.ones(n)
theta = gradientDescent(x, y, theta, alpha, m, numIterations)
# Parenthesised single argument: valid on Python 2 and 3.
print(theta)
打印结果:
Iteration 68227 | Cost: 3.658161
Iteration 68228 | Cost: 3.658161
Iteration 68229 | Cost: 3.658161
Iteration 68230 | Cost: 3.658161
Iteration 68231 | Cost: 3.658161
Iteration 68232 | Cost: 3.658161
Iteration 68233 | Cost: 3.658161
Iteration 68234 | Cost: 3.658161
Iteration 68235 | Cost: 3.658161
...
Iteration 99992 | Cost: 3.658158
Iteration 99993 | Cost: 3.658158
Iteration 99994 | Cost: 3.658158
Iteration 99995 | Cost: 3.658158
Iteration 99996 | Cost: 3.658158
Iteration 99997 | Cost: 3.658158
Iteration 99998 | Cost: 3.658158
Iteration 99999 | Cost: 3.658158
[ 29.57407612 1.01316847]