1.3 机器学习方法之分类问题--决策树、贝叶斯、SVM支持向量机、逻辑回归

风轻云淡_Cauchy

已于 2022-06-25 10:35:19 修改

阅读量1.2k

点赞数 1

分类专栏：机器学习文章标签：机器学习分类人工智能

于 2022-06-03 22:30:50 首次发布

本文链接：https://blog.csdn.net/duanyuwangyuyan/article/details/125116218

版权

机器学习专栏收录该内容

13 篇文章 0 订阅

订阅专栏

1.3 机器学习方法之分类问题

1. 分类问题

1. 分类问题

分类问题是监督学习的一个核心问题，它从数据中学习一个分类决策函数或分类模型（分类器），对新的输入进行输出预测，输出变量取有限个离散值。
常用的分类方法包括：决策树、贝叶斯、支持向量机（SVM）、逻辑回归、集成学习。

1.1.决策树

给定训练数据，如何构建决策树：
以下三步完成。
(1) 特征选择
决定用哪个特征来划分特征空间，选出对训练数据集具有分类能力的特征。
算法
在这里插入图片描述
(2) 决策树生成
在决策树的各个结点上按照一定的方法选择特征，递归地构建决策树。
(3) 决策树剪枝

1.2. 贝叶斯

参见博文《机器学习之贝叶斯样本分类》

1.3. 支持向量机SVM

在这里插入图片描述
一种有监督学习的方法，尝试寻找一个最优决策边界，使该边界到两个类别中距离它最近的样本（支持向量）的间隔最大。

import csv

import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
# from beyes_classify import load_data
def load_data(path='F:/study/AI/src/ml/sklearn/bayes.csv'):
    """Load a labelled sample set from a CSV file.

    The first row is a header whose first two fields are the number of
    samples and the number of features. Every following non-empty row holds
    the feature values followed by an integer class label in its last column.

    Args:
        path: Location of the CSV file. Defaults to the path the script was
            originally written against.

    Returns:
        A tuple ``(n_sample, n_feature, train_sets, target_sets)`` where
        ``train_sets`` is a float64 array of shape ``(n_sample, n_feature)``
        and ``target_sets`` is an integer array of shape ``(n_sample,)``.

    Raises:
        ValueError: If the number of data rows does not match the count
            declared in the header, or a row has the wrong number of features.
    """
    # newline='' is what the csv module recommends for files handed to it.
    with open(path, newline='') as csv_file:
        reader = csv.reader(csv_file)
        header = next(reader)
        n_sample = int(header[0])
        n_feature = int(header[1])

        # Size the matrix from the header instead of assuming two features.
        train_sets = np.empty((n_sample, n_feature), dtype=np.float64)
        # The builtin int replaces the np.int alias removed from NumPy.
        target_sets = np.empty((n_sample,), dtype=int)

        n_rows = 0
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; they carry no sample.
                continue
            if n_rows >= n_sample:
                raise ValueError(
                    'more data rows than the %d declared in the header'
                    % n_sample)
            train_sets[n_rows] = np.asarray(row[:-1], dtype=np.float64)
            target_sets[n_rows] = int(row[-1])
            n_rows += 1

    if n_rows != n_sample:
        # Otherwise the tail of the np.empty buffers would be uninitialised.
        raise ValueError(
            'header declares %d samples but %d rows were read'
            % (n_sample, n_rows))
    return n_sample, n_feature, train_sets, target_sets
if __name__ == '__main__':
    # Fit a linear-kernel SVM on the loaded samples and draw its boundary.
    n_sample, n_feature, train_sets, target_sets = load_data()
    clf = svm.SVC(kernel='linear')
    clf.fit(train_sets, target_sets)

    # The first five rows are drawn in black, all remaining rows in blue.
    head_rows = train_sets[:5]
    tail_rows = train_sets[5:]
    plt.scatter(head_rows[:, 0], head_rows[:, 1], color='black')
    plt.scatter(tail_rows[:, 0], tail_rows[:, 1], color='blue')

    # The boundary w[0]*x + w[1]*y + b = 0 is rewritten as
    # y = slope*x + offset so it can be drawn as an ordinary line.
    weights = clf.coef_[0]
    bias = clf.intercept_[0]
    slope = -weights[0] / weights[1]
    offset = -bias / weights[1]
    line_x = np.linspace(0, 5)  # evenly spaced x values covering [0, 5]
    line_y = slope * line_x + offset
    plt.plot(line_x, line_y)
    plt.show()

在这里插入图片描述
10个训练样本，二分类0/1。

1.4. 逻辑回归

参见博文《Tensorflow之逻辑回归二分类以及交叉熵》
在这里插入图片描述

import numpy as np
import matplotlib.pyplot as plt

def loadDataSet(path='D:\\02-Work\\01-AI\\csdn_code\\machine-learning\\classification\\logistic_data.txt'):
    """Read whitespace-separated ``x1 x2 label`` rows into training arrays.

    Each non-blank line contributes one sample. A constant 1.0 is prepended
    to the two features so the first weight acts as the intercept term.

    Args:
        path: Location of the data file. Defaults to the path the script was
            originally written against.

    Returns:
        A tuple ``(X_train, y_train)``. ``X_train`` is a float64 array of
        shape ``(m, 3)`` with rows ``[1.0, x1, x2]``; ``y_train`` is an
        integer array of shape ``(m, 1)``. ``m`` is the number of non-blank
        lines in the file.
    """
    samples = []
    labels = []
    # The context manager closes the handle even if parsing raises.
    with open(path) as fr:
        for line in fr:
            fields = line.split()
            if not fields:
                # Tolerate blank or trailing empty lines.
                continue
            samples.append([1.0, float(fields[0]), float(fields[1])])
            labels.append(int(fields[2]))

    # The reshape keeps the documented 2-D shapes even for an empty file,
    # and the row count follows the data instead of a fixed 100.
    X_train = np.array(samples, dtype=np.float64).reshape(-1, 3)
    # The builtin int replaces the np.int alias removed from NumPy.
    y_train = np.array(labels, dtype=int).reshape(-1, 1)
    return X_train, y_train

class log(object):
    """Binary logistic-regression model trained by batch gradient descent."""

    # Bound used to keep the arguments of log() strictly inside (0, 1).
    _EPS = 1e-12

    def __init__(self):
        # Weight column vector of shape (n, 1); stays None until train() runs.
        self.W = None

    def sigmoid(self, X):
        """Return the element-wise logistic function 1 / (1 + exp(-X))."""
        # For very negative X, exp(-X) overflows to inf and the quotient
        # becomes 0.0, which is the correct limit; only the warning is muted.
        with np.errstate(over='ignore'):
            return 1.0 / (1.0 + np.exp(-X))

    def loss(self, X_train, y_train):
        """Compute the mean cross-entropy loss and its gradient w.r.t. W.

        Args:
            X_train: Array of shape (m, n) holding one sample per row.
            y_train: 0/1 labels, either of shape (m, 1) or (m,).

        Returns:
            A tuple ``(loss, dW)``: the scalar loss and the gradient array
            of shape (n, 1).
        """
        m, n = X_train.shape
        # Force a column vector so a 1-D label array cannot broadcast
        # (h - y) into an (m, m) matrix.
        y = np.asarray(y_train).reshape(-1, 1)
        h = self.sigmoid(X_train.dot(self.W))
        # Clip only inside the logarithms: a saturated h of exactly 0 or 1
        # would otherwise give 0 * log(0) = nan.
        h_safe = np.clip(h, self._EPS, 1.0 - self._EPS)
        # Cross-entropy cost.
        loss = (y.T.dot(np.log(h_safe)) + (1 - y).T.dot(np.log(1 - h_safe))) / -m
        loss = loss[0, 0]
        # Partial derivative of the cost; uses the unclipped predictions.
        dW = X_train.T.dot(h - y) / m
        return loss, dW

    def train(self, X_train, y_train, learn_rate=0.01, iters=5000):
        """Fit the weights with a fixed number of gradient-descent steps.

        Args:
            X_train: Array of shape (m, n) holding one sample per row.
            y_train: 0/1 labels, either of shape (m, 1) or (m,).
            learn_rate: Step size applied to every gradient update.
            iters: Number of gradient-descent iterations.

        Returns:
            A tuple ``(loss_list, W)``: the loss recorded before each update
            and the final weight array of shape (n, 1).
        """
        m, n = X_train.shape
        print(m, n)
        # Weights start from uniform random values in [0, 1).
        self.W = np.random.rand(n, 1)
        loss_list = []
        # The weights left after the last step define the decision boundary.
        for i in range(iters):
            loss, dW = self.loss(X_train, y_train)
            self.W -= learn_rate * dW
            loss_list.append(loss)
            if i % 1000 == 0:
                print('iters = %d,loss = %f' % (i, loss))
        print(self.W)
        return loss_list, self.W

def decision_boundary(weight):
    """Plot the decision boundary line.

    Draws the line weight[0] + weight[1]*x + weight[2]*y = 0 for x running
    from -5 up to (but excluding) 5 in steps of 0.5.
    """
    bias, w_x, w_y = weight[0], weight[1], weight[2]
    xs = np.arange(-5, 5, 0.5)
    ys = (-bias - w_x * xs) / w_y
    plt.plot(xs, ys)

def dataLabelPlt(X_train, y_train):
    """Scatter-plot the samples, one point per row.

    Columns 1 and 2 of each row are used as the point coordinates. A row
    whose label equals 1 is drawn in blue; every other row is drawn in red.
    """
    row_count, _ = X_train.shape
    for idx in range(row_count):
        sample = X_train[idx]
        point_color = 'b' if y_train[idx] == 1 else 'r'
        plt.scatter(sample[1], sample[2], color=point_color)
        

if __name__ == '__main__':
    # Train on the file data, then draw the samples and the fitted boundary
    # on the same figure.
    X_train, y_train = loadDataSet()
    _, weight = log().train(X_train, y_train)
    dataLabelPlt(X_train, y_train)
    decision_boundary(weight)
    plt.show()

在这里插入图片描述
样本数据logistic_data.txt：

-0.017612  14.053064  0
-1.395634  4.662541   1
-0.752157  6.538620   0
-1.322371  7.152853   0
0.423363   11.054677  0
0.406704   7.067335   1
0.667394   12.741452  0
-2.460150  6.866805   1
0.569411   9.548755   0
-0.026632  10.427743  0
0.850433   6.920334   1
1.347183   13.175500  0
1.176813   3.167020   1
-1.781871  9.097953   0
-0.566606  5.749003   1
0.931635   1.589505   1
-0.024205  6.151823   1
-0.036453  2.690988   1
-0.196949  0.444165   1
1.014459   5.754399   1
1.985298   3.230619   1
-1.693453  -0.557540  1
-0.576525  11.778922  0
-0.346811  -1.678730  1
-2.124484  2.672471   1
1.217916   9.597015   0
-0.733928  9.098687   0
-3.642001  -1.618087  1
0.315985   3.523953   1
1.416614   9.619232   0
-0.386323  3.989286   1
0.556921   8.294984   1
1.224863   11.587360  0
-1.347803  -2.406051  1
1.196604   4.951851   1
0.275221   9.543647   0
0.470575   9.332488   0
-1.889567  9.542662   0
-1.527893  12.150579  0
-1.185247  11.309318  0
-0.445678  3.297303   1
1.042222   6.105155   1
-0.618787  10.320986  0
1.152083   0.548467   1
0.828534   2.676045   1
-1.237728  10.549033  0
-0.683565  -2.166125  1
0.229456   5.921938   1
-0.959885  11.555336  0
0.492911   10.993324  0
0.184992   8.721488   0
-0.355715  10.325976  0
-0.397822  8.058397   0
0.824839   13.730343  0
1.507278   5.027866   1
0.099671   6.835839   1
-0.344008  10.717485  0
1.785928   7.718645   1
-0.918801  11.560217  0
-0.364009  4.747300   1
-0.841722  4.119083   1
0.490426   1.960539   1
-0.007194  9.075792   0
0.356107   12.447863  0
0.342578   12.281162  0
-0.810823  -1.466018  1
2.530777   6.476801   1
1.296683   11.607559  0
0.475487   12.040035  0
-0.783277  11.009725  0
0.074798   11.023650  0
-1.337472  0.468339   1
-0.102781  13.763651  0
-0.147324  2.874846   1
0.518389   9.887035   0
1.015399   7.571882   0
-1.658086  -0.027255  1
1.319944   2.171228   1
2.056216   5.019981   1
-0.851633  4.375691   1
-1.510047  6.061992   0
-1.076637  -3.181888  1
1.821096   10.283990  0
3.010150   8.401766   1
-1.099458  1.688274   1
-0.834872  -1.733869  1
-0.846637  3.849075   1
1.400102   12.628781  0
1.752842   5.468166   1
0.078557   0.059736   1
0.089392   -0.715300  1
1.825662   12.693808  0
0.197445   9.744638   0
0.126117   0.922311   1
-0.679797  1.220530   1
0.677983   2.556666   1
0.761349   10.693862  0
-2.168791  0.143632   1
1.388610   9.341997   0
0.317029   14.739025  0

风轻云淡_Cauchy

关注

1
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
1.3 机器学习方法之分类问题--决策树、贝叶斯、SVM支持向量机、逻辑回归

分类问题是监督学习的一个核心问题，它从数据中心学习一个分类决策函数或分类模型，对新的输入进行输出预测，输出变量取有限个离散值。决策树、贝叶斯、SVM支持向量机、逻辑回归、集成学习。......
复制链接

扫一扫

专栏目录