Machine Learning (4): Solving a Binary Classification Problem with Logistic Regression

Continuing with the logistic regression project. While working through it I hit a problem I could not solve on my own, which I describe below; none of the fixes I tried worked, so in the end I turned to the sklearn library, and it really is great, no need to build the wheel yourself. Still, for a beginner it is worth understanding the underlying principles, since that helps with everything that comes later.

1. Importing Libraries and the Dataset

The dataset covers 100 students: the first two columns are their scores on two exams, and the last column records whether they were admitted to university.

34.62365962451697,78.0246928153624,0
30.28671076822607,43.89499752400101,0
35.84740876993872,72.90219802708364,0
60.18259938620976,86.30855209546826,1
79.0327360507101,75.3443764369103,1
45.08327747668339,56.3163717815305,0
61.10666453684766,96.51142588489624,1
75.02474556738889,46.55401354116538,1
76.09878670226257,87.42056971926803,1
84.43281996120035,43.53339331072109,1
95.86155507093572,38.22527805795094,0
75.01365838958247,30.60326323428011,0
82.30705337399482,76.48196330235604,1
69.36458875970939,97.71869196188608,1
39.53833914367223,76.03681085115882,0
53.9710521485623,89.20735013750205,1
69.07014406283025,52.74046973016765,1
67.94685547711617,46.67857410673128,0
70.66150955499435,92.92713789364831,1
76.97878372747498,47.57596364975532,1
67.37202754570876,42.83843832029179,0
89.67677575072079,65.79936592745237,1
50.534788289883,48.85581152764205,0
34.21206097786789,44.20952859866288,0
77.9240914545704,68.9723599933059,1
62.27101367004632,69.95445795447587,1
80.1901807509566,44.82162893218353,1
93.114388797442,38.80067033713209,0
61.83020602312595,50.25610789244621,0
38.78580379679423,64.99568095539578,0
61.379289447425,72.80788731317097,1
85.40451939411645,57.05198397627122,1
52.10797973193984,63.12762376881715,0
52.04540476831827,69.43286012045222,1
40.23689373545111,71.16774802184875,0
54.63510555424817,52.21388588061123,0
33.91550010906887,98.86943574220611,0
64.17698887494485,80.90806058670817,1
74.78925295941542,41.57341522824434,0
34.1836400264419,75.2377203360134,0
83.90239366249155,56.30804621605327,1
51.54772026906181,46.85629026349976,0
94.44336776917852,65.56892160559052,1
82.36875375713919,40.61825515970618,0
51.04775177128865,45.82270145776001,0
62.22267576120188,52.06099194836679,0
77.19303492601364,70.45820000180959,1
97.77159928000232,86.7278223300282,1
62.07306379667647,96.76882412413983,1
91.56497449807442,88.69629254546599,1
79.94481794066932,74.16311935043758,1
99.2725269292572,60.99903099844988,1
90.54671411399852,43.39060180650027,1
34.52451385320009,60.39634245837173,0
50.2864961189907,49.80453881323059,0
49.58667721632031,59.80895099453265,0
97.64563396007767,68.86157272420604,1
32.57720016809309,95.59854761387875,0
74.24869136721598,69.82457122657193,1
71.79646205863379,78.45356224515052,1
75.3956114656803,85.75993667331619,1
35.28611281526193,47.02051394723416,0
56.25381749711624,39.26147251058019,0
30.05882244669796,49.59297386723685,0
44.66826172480893,66.45008614558913,0
66.56089447242954,41.09209807936973,0
40.45755098375164,97.53518548909936,1
49.07256321908844,51.88321182073966,0
80.27957401466998,92.11606081344084,1
66.74671856944039,60.99139402740988,1
32.72283304060323,43.30717306430063,0
64.0393204150601,78.03168802018232,1
72.34649422579923,96.22759296761404,1
60.45788573918959,73.09499809758037,1
58.84095621726802,75.85844831279042,1
99.82785779692128,72.36925193383885,1
47.26426910848174,88.47586499559782,1
50.45815980285988,75.80985952982456,1
60.45555629271532,42.50840943572217,0
82.22666157785568,42.71987853716458,0
88.9138964166533,69.80378889835472,1
94.83450672430196,45.69430680250754,1
67.31925746917527,66.58935317747915,1
57.23870631569862,59.51428198012956,1
80.36675600171273,90.96014789746954,1
68.46852178591112,85.59430710452014,1
42.0754545384731,78.84478600148043,0
75.47770200533905,90.42453899753964,1
78.63542434898018,96.64742716885644,1
52.34800398794107,60.76950525602592,0
94.09433112516793,77.15910509073893,1
90.44855097096364,87.50879176484702,1
55.48216114069585,35.57070347228866,0
74.49269241843041,84.84513684930135,1
89.84580670720979,45.35828361091658,1
83.48916274498238,48.38028579728175,1
42.2617008099817,87.10385094025457,1
99.31500880510394,68.77540947206617,1
55.34001756003703,64.9319380069486,1
74.77589300092767,89.52981289513276,1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
plt.rcParams['font.sans-serif']=['SimHei'] # render CJK characters in labels correctly
plt.rcParams['axes.unicode_minus']=False # render minus signs correctly
np.set_printoptions(suppress=True)# suppress scientific notation in printed arrays


data = pd.read_csv("students_data_1.csv",header=None,names=["Exam1","Exam2","Admitted"])

Take a look at the dataset's basic info:

print(data.head())
print(data.describe())

 

Draw a scatter plot to see how the two classes are distributed:

# select the positive (admitted) samples
positive = data[data["Admitted"].isin([1])]
# select the negative (not admitted) samples
negative = data[data['Admitted'].isin([0])]
plt.figure(figsize=(12,8))
plt.scatter(positive['Exam1'],positive['Exam2'],color='g',label = 'Admitted')
plt.scatter(negative['Exam1'],negative['Exam2'],color='r',label = 'Not admitted')
plt.xlabel("Exam1 score")
plt.ylabel("Exam2 score")
plt.title('Logistic regression binary classification of exam scores')
plt.legend()  # without this call the scatter labels are not displayed
plt.show()

 

2. Constructing the Matrices

# build the feature matrix: prepend a column of ones for the bias term
data.insert(0,'ones',1)
n = data.shape[1]
x = data.iloc[:,:n-1]
y = data.iloc[:,n-1:]

w = np.zeros((1,x.shape[1]))
x = np.matrix(x)
y = np.matrix(y)
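
Before moving on, a quick shape check helps catch construction mistakes (100 samples, 3 feature columns including the bias, 1 label column):

print(x.shape, y.shape, w.shape)  # expect (100, 3) (100, 1) (1, 3)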

3. The Model Function (Sigmoid)

def sigmoid(x):
    res =  1/(1+np.exp(-x))
    return res
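
As an aside: one way to sidestep the log(0) error I run into later is to keep the sigmoid output strictly inside (0, 1) by clipping it. A minimal sketch (the epsilon value is my own arbitrary choice):

def sigmoid_safe(x, eps=1e-10):
    # clip the output away from exactly 0 and 1 so that the log()
    # calls in the cost function never receive 0
    return np.clip(1.0/(1.0 + np.exp(-x)), eps, 1.0 - eps)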

4. The Cost Function

def cost_function(x,y,w):
    hx = sigmoid(x*w.T)
    first = np.multiply(-y, np.log(hx))
    second = np.multiply((1 - y), np.log(1 - hx))
    return np.sum(first - second) / (len(x))
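
A quick sanity check: with w initialized to all zeros, sigmoid outputs 0.5 for every sample, so the initial cost should be -ln(0.5) ≈ 0.6931 regardless of the labels:

print(cost_function(x, y, w))  # expect roughly 0.6931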

5. Gradient Descent

def gradient_descent(x,y,w,alpha,iters):
    '''
    :param x: feature matrix
    :param y: actual target values
    :param w: weight matrix
    :param alpha: learning rate (step size)
    :param iters: number of iterations
    :return: the weight matrix w after all iterations, and an array cost
             holding the value of the cost function after each iteration
    '''
    temp = np.matrix(np.zeros(w.shape))
    x_len = x.shape[0]
    w_len = w.shape[1]
    cost = np.zeros(iters)
    for i in range(iters):
        error = x*w.T-y  # note: the raw linear error, no sigmoid applied
        for j in range(w_len):
            temp[:,j] =w[:,j] - sum(np.multiply(error,x[:,j]))*(alpha/x_len)
        w = temp
        cost[i] = cost_function(x,y,w)
    return w,cost

When it came to actually finding the weight matrix with gradient descent, this code would not work for me. It kept raising an error about the operand type, because log() was being passed a 0; my guess was that Python's floating-point precision was to blame. I tried alpha = 0.001 and it failed; alpha = 0.0001 ran fine; when I tried to push alpha up a bit to 0.0003 it errored out right away; so I settled on alpha = 0.0002 with iters = 1000000, which ran for ages, and the result still was not acceptable...

Out of ideas, I can only set this aside for now and come back to it later when I have a better approach, for example trying another framework.
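
One likely culprit, though I have not gone back and re-verified it on the runs above: the gradient of the logistic regression cost is (1/m) * X.T * (sigmoid(X*w) - y), but the loop above uses the raw linear error x*w.T - y without the sigmoid. That lets the weights grow without bound, the sigmoid saturates to exactly 0 or 1, and log() blows up. A sketch of the loop with that one change, keeping the same interface as above:

def gradient_descent_v2(x, y, w, alpha, iters):
    # identical to gradient_descent above, except the error term passes
    # the linear prediction through sigmoid first, which is what the
    # logistic regression gradient actually calls for
    x_len = x.shape[0]
    cost = np.zeros(iters)
    for i in range(iters):
        error = sigmoid(x*w.T) - y           # shape (100, 1)
        w = w - (alpha/x_len)*(error.T*x)    # shape (1, 3)
        cost[i] = cost_function(x, y, w)
    return w, cost

Since the sigmoid bounds each error term inside (-1, 1), larger step sizes become usable; normalizing the raw 30-100 exam scores first would also speed up convergence.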

6. Logistic Regression with sklearn

One thing to note: y has to be converted to 1-D here, otherwise it will raise an error!

Also, the column of ones inserted into data earlier has to be dropped, since sklearn fits the intercept by itself!
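
A sketch of what that preparation could look like, using the column names from above:

x = data[['Exam1','Exam2']].values    # drop the inserted 'ones' column
y = data['Admitted'].values.ravel()   # flatten y to 1-D for fit()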

lr = linear_model.LogisticRegression()
lr.fit(x,y)
w0 = lr.intercept_
w = lr.coef_
print(w0)
print(w)
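
Once fitted, the model can also report its training accuracy and predict per-class probabilities. The [[40, 80]] input is just an example pair of exam scores, the same one tried in the commented-out lines in the appendix:

print(lr.score(x, y))                # mean accuracy on the training set
print(lr.predict_proba([[40, 80]]))  # [[P(not admitted), P(admitted)]]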

Use the sklearn result to draw the classification boundary. The boundary is where the predicted probability equals 0.5, i.e. where w0 + w1*Exam1 + w2*Exam2 = 0, which rearranges to Exam2 = -(w1/w2)*Exam1 - w0/w2; that is exactly what f computes below.

w1 = w[0,0]
w2 = w[0,1]

c = np.linspace(min(data["Exam1"]),max(data['Exam1']),100)
f = [-i*(w1/w2)-w0[0]/w2 for i in c]  # w0 = lr.intercept_ is an array, so index it

plt.plot(c,f,label = 'decision boundary',color = '#8B008B')
plt.legend()
plt.show()

sklearn has logistic regression built in, and you can use it straight away. It is very convenient, although I can't shake the feeling that it isn't really my own work...

7. Notes (Summary)

①: I have been writing gradient descent code every day these past few days, and I feel my grasp of it is decent now.

②: For the alpha and iters parameters I still have no feel at all; I just set them at random, with no theoretical basis (see the sketch after this list).

③: Having finished linear regression and logistic regression, I feel I am starting to find my way into machine learning, at least to the point of knowing some of the terminology.

④: In the end I got this classification working, and I am very happy about that!!!
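
On point ②: one practical way to build some intuition for alpha is to plot the cost array that gradient_descent returns for a few candidate values and check that the curve decreases smoothly; if alpha is too large the curve oscillates or shoots up. A minimal sketch, assuming the matrices x and y from section 2 are still in scope (the alpha values are arbitrary guesses):

for a in (0.00005, 0.0001, 0.0002):
    w_init = np.zeros((1, x.shape[1]))
    _, cost = gradient_descent(x, y, w_init, a, 500)
    plt.plot(cost, label='alpha=%g' % a)
plt.xlabel('iteration')
plt.ylabel('cost')
plt.legend()
plt.show()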

8. Appendix: Full Source (not directly runnable as-is; it contains code for both approaches, so pick the parts you need)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
plt.rcParams['font.sans-serif']=['SimHei'] # render CJK characters in labels correctly
plt.rcParams['axes.unicode_minus']=False # render minus signs correctly
np.set_printoptions(suppress=True)# suppress scientific notation in printed arrays


data = pd.read_csv("students_data_1.csv",header=None,names=["Exam1","Exam2","Admitted"])
# print(data.head())
# print(data.describe())
# select the positive (admitted) samples
positive = data[data["Admitted"].isin([1])]
# select the negative (not admitted) samples
negative = data[data['Admitted'].isin([0])]
plt.figure(figsize=(12,8))
plt.scatter(positive['Exam1'],positive['Exam2'],color='g',label = 'Admitted')
plt.scatter(negative['Exam1'],negative['Exam2'],color='r',label = 'Not admitted')
plt.xlabel("Exam1 score")
plt.ylabel("Exam2 score")
plt.title('Logistic regression binary classification of exam scores')
plt.legend()
# plt.show()
# build the feature matrix: prepend a column of ones for the bias term
data.insert(0,'ones',1)
n = data.shape[1]
x = data.iloc[:,:n-1]
y = data.iloc[:,n-1:]

w = np.zeros((1,x.shape[1]))
x = np.matrix(x)
y = np.matrix(y)

def sigmoid(x):
    res =  1.0/(1.0+np.exp(-x))
    return res


def cost_function(x,y,w):
    hx = sigmoid(np.dot(x,w.T))
    first = np.multiply(-y, np.log(hx))
    second = np.multiply((1-y), np.log(1.0-hx))
    return np.sum(first - second) / (len(x))


def gradient_descent(x,y,w,alpha,iters):
    '''
    :param x: feature matrix
    :param y: actual target values
    :param w: weight matrix
    :param alpha: learning rate (step size)
    :param iters: number of iterations
    :return: the weight matrix w after all iterations, and an array cost
             holding the value of the cost function after each iteration
    '''
    temp= np.zeros((w.shape[0],w.shape[1]))
    x_len = x.shape[0]
    w_len = w.shape[1]
    cost = np.zeros(iters)
    for i in range(iters):
        print("第%d次迭代",i)
        error = x*w.T-y
        for j in range(w_len):
            temp[:,j] =w[:,j] - sum(np.multiply(error,x[:,j]))*(alpha/x_len)
        w = temp
        cost[i] = cost_function(x,y,w)
    return w,cost


alpha = 0.0002
iters = 1000000
w,cost = gradient_descent(x,y,w,alpha,iters)
print(w)

# lr = linear_model.LogisticRegression()
# lr.fit(x,y)
# res = lr.predict_proba([[40,80]])
# print(res)
# print(lr.score(x,y))

 
