机器学习笔记——Logistic Regression

最新推荐文章于 2022-05-29 21:14:13 发布

阿卡蒂奥

最新推荐文章于 2022-05-29 21:14:13 发布

阅读量593

点赞数

分类专栏：机器学习文章标签：机器学习

本文链接：https://blog.csdn.net/akadiao/article/details/77990308

版权

机器学习专栏收录该内容

17 篇文章 13 订阅

订阅专栏

1、线性回归

给定数据：
t为车辆行驶的时间，与之相关的两个因素：x1为车辆一个运输任务行驶的里数，x2为车辆运输的次数；
这里写图片描述

（data数据保存在.csv文件中）

#!/usr/bin/python
# -*- coding: utf-8 -*-

# Ordinary least-squares linear regression on the delivery data set.
# The print calls below pass one pre-formatted string so the script emits the
# same text on Python 2 and also runs on Python 3.
from numpy import genfromtxt
from sklearn import datasets, linear_model

# Location of the input data (a CSV file).
dataPath = r"delivery.csv"
# genfromtxt loads a text file into an array and fills in missing values;
# each line is split at the `delimiter` character.
deliveryData = genfromtxt(dataPath, delimiter=',')
print("data:\n%s" % (deliveryData,))

# Every column except the last is a feature; the last column is the target.
X = deliveryData[:, :-1]
Y = deliveryData[:, -1]

print("X:\n%s" % (X,))
print("Y:\n%s" % (Y,))

# Create the ordinary least-squares linear regression model.
regr = linear_model.LinearRegression()
# Fit the linear model.
regr.fit(X, Y)
# Fitted coefficients, one per feature column.
print("coefficients:\n%s" % (regr.coef_,))
# Fitted intercept.
print("intercept:\n%s" % (regr.intercept_,))

# Predict for one new sample.  predict() expects a 2-D array of shape
# (n_samples, n_features), so the single sample is wrapped in an outer list;
# a bare 1-D list is rejected by current scikit-learn releases.
xPred = [[102, 6]]
yPred = regr.predict(xPred)
print("predicted y:\n%s" % (yPred,))

打印出结果为：

data:
[[ 100.     4.     9.3]
 [  50.     3.     4.8]
 [ 100.     4.     8.9]
 [ 100.     2.     6.5]
 [  50.     2.     4.2]
 [  80.     2.     6.2]
 [  75.     3.     7.4]
 [  65.     4.     6. ]
 [  90.     3.     7.6]
 [  90.     2.     6.1]]
X:
[[ 100.    4.]
 [  50.    3.]
 [ 100.    4.]
 [ 100.    2.]
 [  50.    2.]
 [  80.    2.]
 [  75.    3.]
 [  65.    4.]
 [  90.    3.]
 [  90.    2.]]
Y:
[ 9.3  4.8  8.9  6.5  4.2  6.2  7.4  6.   7.6  6.1]
coefficients:
[ 0.0611346   0.92342537]
intercept:
-0.868701466782
predicted y:
[ 10.90757981]

则t=b0+b1*x1+b2*x2
b0=-0.868701466782, b1=0.0611346, b2=0.92342537

2、自变量中有分类型变量

给定数据：
t为车辆行驶的时间，与之相关的三个因素：车辆一个运输任务行驶的里数，车辆运输的次数，以及车型（0：卡车，1：MPV，2：SUV）；
这里写图片描述
首先将分类型变量车型转化为数字型的哑变量（one-hot 编码，0：1 0 0，1：0 1 0，2：0 0 1），即数据中新增的三列；

#!/usr/bin/python
# -*- coding: utf-8 -*-

# Linear regression where one of the independent variables is categorical.
# The print calls below pass one pre-formatted string so the script emits the
# same text on Python 2 and also runs on Python 3.
from numpy import genfromtxt
from sklearn import datasets, linear_model

# Location of the input data (a CSV file).
# NOTE(review): presumably this file holds the one-hot encoded data set with
# five feature columns (xPred below has five values) -- confirm.
dataPath = r"delivery.csv"
# genfromtxt loads a text file into an array and fills in missing values;
# each line is split at the `delimiter` character.
deliveryData = genfromtxt(dataPath, delimiter=',')
print("data:\n%s" % (deliveryData,))

# Every column except the last is a feature; the last column is the target.
X = deliveryData[:, :-1]
Y = deliveryData[:, -1]

print("X:\n%s" % (X,))
print("Y:\n%s" % (Y,))

# Create the ordinary least-squares linear regression model.
regr = linear_model.LinearRegression()
# Fit the linear model.
regr.fit(X, Y)
# Fitted coefficients, one per feature column.
print("coefficients:\n%s" % (regr.coef_,))
# Fitted intercept.
print("intercept:\n%s" % (regr.intercept_,))

# Predict for one new sample.  predict() expects a 2-D array of shape
# (n_samples, n_features), so the single sample is wrapped in an outer list;
# a bare 1-D list is rejected by current scikit-learn releases.
xPred = [[82, 4, 1, 0, 0]]
yPred = regr.predict(xPred)
print("predicted y:\n%s" % (yPred,))

打印出的结果为：

data:
[[ 100.     4.     0.     1.     0.     9.3]
 [  50.     3.     1.     0.     0.     4.8]
 [ 100.     4.     0.     1.     0.     8.9]
 [ 100.     2.     0.     0.     1.     6.5]
 [  50.     2.     0.     0.     1.     4.2]
 [  80.     2.     0.     1.     0.     6.2]
 [  75.     3.     0.     1.     0.     7.4]
 [  65.     4.     1.     0.     0.     6. ]
 [  90.     3.     1.     0.     0.     7.6]
 [ 100.     4.     0.     1.     0.     9.3]
 [  50.     3.     1.     0.     0.     4.8]
 [ 100.     4.     0.     1.     0.     8.9]
 [ 100.     2.     0.     0.     1.     6.5]]
X:
[[ 100.    4.    0.    1.    0.]
 [  50.    3.    1.    0.    0.]
 [ 100.    4.    0.    1.    0.]
 [ 100.    2.    0.    0.    1.]
 [  50.    2.    0.    0.    1.]
 [  80.    2.    0.    1.    0.]
 [  75.    3.    0.    1.    0.]
 [  65.    4.    1.    0.    0.]
 [  90.    3.    1.    0.    0.]
 [ 100.    4.    0.    1.    0.]
 [  50.    3.    1.    0.    0.]
 [ 100.    4.    0.    1.    0.]
 [ 100.    2.    0.    0.    1.]]
Y:
[ 9.3  4.8  8.9  6.5  4.2  6.2  7.4  6.   7.6  9.3  4.8  8.9  6.5]
coefficients:
[ 0.05452507  0.70930079 -0.18019642  0.60821607 -0.42801964]
intercept:
0.198995895632

predicted y:
[ 7.32705805]

则预测模型中的b0=0.198995895632
b1=0.05452507 ,b2=0.70930079 ,b3=-0.18019642 ,b4=0.60821607 ,b5=-0.42801964

3、多元线性回归模型

# # 多元线性回归模型
#!/usr/bin/python
# -*- coding: utf-8 -*-
import math
import numpy as np


# #计算X、Y的相关系数
def computeCorrelation(X,Y):
    """Return the Pearson correlation coefficient r of two samples.

    r = sum((x - mean(x)) * (y - mean(y)))
        / sqrt(sum((x - mean(x))**2) * sum((y - mean(y))**2))

    Squaring the result gives r^2, which for a simple (one-variable) linear
    fit equals the coefficient of determination.

    Args:
        X: sequence of numbers.
        Y: sequence of numbers with the same number of elements as X.

    Returns:
        The correlation coefficient as a numpy float.  If either input has
        zero variance the division is 0/0 and the result is nan.

    Raises:
        ValueError: if the inputs are empty or differ in shape.
    """
    xs = np.asarray(X, dtype=float)
    ys = np.asarray(Y, dtype=float)
    # Pairing element i of X with element i of Y only makes sense when both
    # samples have the same size; reject anything else instead of silently
    # mixing a truncated sum with a mean taken over every element.
    if xs.shape != ys.shape:
        raise ValueError("X and Y must have the same length")
    if xs.size == 0:
        raise ValueError("X and Y must not be empty")

    devX = xs - np.mean(xs)
    devY = ys - np.mean(ys)
    # Sum of products of the deviations (numerator of r).
    coDeviation = np.sum(devX * devY)
    # Square root of the product of the two sums of squared deviations.
    normaliser = np.sqrt(np.sum(devX ** 2) * np.sum(devY ** 2))
    return coDeviation / normaliser

# #
def polyfit(x,y,degree):
    """Fit a least-squares polynomial and report its coefficient of determination.

    Args:
        x: sequence of sample x values.
        y: sequence of sample y values, same length as x.
        degree: highest power of the fitted polynomial.

    Returns:
        dict with two keys:
            'polynomial'    -- list of fitted coefficients, highest power first
                               (the order returned by np.polyfit);
            'determination' -- ssreg / sstot, where ssreg is the sum of squared
                               deviations of the fitted values from mean(y) and
                               sstot the sum of squared deviations of y from
                               mean(y).
    """
    results = {}
    # Work on float arrays so every step below is floating-point arithmetic.
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)

    # Least-squares polynomial fit; `degree` is the highest power.
    coeffs = np.polyfit(xs, ys, degree)
    # Store the coefficients as a plain list.
    results['polynomial'] = coeffs.tolist()

    # Evaluate the fitted polynomial at the sample points.
    yhat = np.poly1d(coeffs)(xs)
    # np.mean always returns the true mean.  The former np.sum(y)/len(y)
    # truncated to an integer on Python 2 when y held integers, which skewed
    # the coefficient of determination.
    ybar = np.mean(ys)
    ssreg = np.sum((yhat - ybar) ** 2)
    sstot = np.sum((ys - ybar) ** 2)
    # Coefficient of determination.
    results['determination'] = ssreg / sstot
    return results


# Small sample used to compare r, r^2 and the coefficient of determination
# of a degree-1 fit.
testX = [1, 3, 8, 7, 9]
testY = [10, 12, 24, 21, 34]

# Compute r once and reuse it for r^2.
r = computeCorrelation(testX, testY)

# Single-argument print calls emit the same text on Python 2 and also run on
# Python 3.
print("r:\n%s" % (r,))
print("r^2:\n%s" % (str(r ** 2),))
print(polyfit(testX, testY, 1)["determination"])

打印结果为：

r:
0.940310076545
r^2:
0.884183040052
0.88424448141

4、非线性回归

采用梯度下降法（下面的示例数据由 y = x + bias + 噪声 生成，用梯度下降拟合的是线性模型 y = θ0 + θ1·x）：

#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import random

# #更新法则;采用梯度下降法更新 theta向量;
# # alpha为学习率/步长,m 为实例个数,numIterations重复更新的次数;
def gradientDescent(x, y, theta, alpha, m, numIterations):
    """Update theta by batch gradient descent on the squared-error cost.

    Each iteration computes the predictions x . theta, the cost
    sum((x . theta - y)**2) / (2 * m), prints that cost, and then moves theta
    against the gradient x^T . (x . theta - y) / m scaled by alpha.

    Args:
        x: array of input rows (one row per instance).
        y: array of target values, one per row of x.
        theta: initial parameter vector.
        alpha: learning rate / step size.
        m: number of instances (used to average cost and gradient).
        numIterations: number of update steps to perform.

    Returns:
        The parameter vector after numIterations updates; the argument itself
        is returned unchanged when numIterations is 0.
    """
    # The transpose does not change between iterations, so take it once.
    xTrans = x.transpose()
    for i in range(0, numIterations):
        # Predictions of the current model: x . theta
        hypothesis = np.dot(x, theta)
        loss = hypothesis - y
        cost = np.sum(loss ** 2) / (2 * m)
        # Single-argument call: same output on Python 2, valid on Python 3.
        print("Iteration %d | Cost: %f" % (i, cost))
        gradient = np.dot(xTrans, loss) / m
        # Rebinding (not in-place update) leaves the caller's array untouched.
        theta = theta - alpha * gradient
    return theta

# #创建测试实例数据
def genData(numPoints, bias, variance):
    """Build a noisy straight-line test data set.

    Row i of the returned x is [1, i].  Element i of the returned y is
    (i + bias) plus a random.uniform(0, 1) draw scaled by `variance`.

    Args:
        numPoints: number of instances to generate.
        bias: constant offset added to every target.
        variance: scale applied to each uniform random draw.

    Returns:
        (x, y): x is a float array of shape (numPoints, 2), y a float array of
        shape (numPoints,).
    """
    indices = np.arange(numPoints)
    # One draw per instance, taken in index order.
    noise = np.array([random.uniform(0, 1) for _ in range(numPoints)], dtype=float)
    # First column is the constant 1, second column is the instance index.
    x = np.column_stack((np.ones(numPoints), indices)).astype(float)
    y = (indices + bias) + noise * variance
    return x, y

# Generate 100 instances with bias 25 and noise scale 10.
x, y = genData(100, 25, 10)
# Uncomment to inspect the generated data:
# print("x: %s" % (x,))
# print("y: %s" % (y,))
# m = number of instances (rows), n = number of columns of x.
m, n = np.shape(x)

numIterations = 100000
alpha = 0.0005
# Start from a parameter vector of ones, one entry per column of x.
theta = np.ones(n)
theta = gradientDescent(x, y, theta, alpha, m, numIterations)

# Call form prints the same text on Python 2 and also runs on Python 3.
print(theta)

打印结果：

Iteration 68227 | Cost: 3.658161
Iteration 68228 | Cost: 3.658161
Iteration 68229 | Cost: 3.658161
Iteration 68230 | Cost: 3.658161
Iteration 68231 | Cost: 3.658161
Iteration 68232 | Cost: 3.658161
Iteration 68233 | Cost: 3.658161
Iteration 68234 | Cost: 3.658161
Iteration 68235 | Cost: 3.658161

...

Iteration 99992 | Cost: 3.658158
Iteration 99993 | Cost: 3.658158
Iteration 99994 | Cost: 3.658158
Iteration 99995 | Cost: 3.658158
Iteration 99996 | Cost: 3.658158
Iteration 99997 | Cost: 3.658158
Iteration 99998 | Cost: 3.658158
Iteration 99999 | Cost: 3.658158
[ 29.57407612   1.01316847]