第六节 机器学习——决策树、SVM

最新推荐文章于 2022-09-29 17:42:43 发布

import Successful

最新推荐文章于 2022-09-29 17:42:43 发布

阅读量196

点赞数

分类专栏：原创 python 西安机器学习人工智能文章标签：机器学习

本文链接：https://blog.csdn.net/Successful_clown/article/details/99298720

版权

python 同时被 3 个专栏收录

46 篇文章 0 订阅

订阅专栏

原创

42 篇文章 0 订阅

订阅专栏

西安

19 篇文章 0 订阅

订阅专栏

一、项目：随机生成验证码然后进行去除杂质等操作！

1、随机生成150个5位数字的验证码。
验证码中的数字以及每个数字的颜色都是随机的，并且在图片上加入一些不规则的噪点和噪线，作为我们的数据。

from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
import random



def getRandomColor():
    """
    Return a random colour as an ``(r, g, b)`` tuple.

    Each channel is drawn with ``random.randint(0, 255)``; a channel that
    comes out as 255 is replaced by 0, so no channel of the result is 255.

    :return: tuple of three ints
    """
    channels = [random.randint(0, 255) for _ in range(3)]
    # The same remapping rule (255 -> 0) applies to every channel.
    return tuple(0 if value == 255 else value for value in channels)

def getRandomStr():
    """
    Return one random decimal digit as a one-character string.

    :return: str, one of '0' .. '9'
    """
    return str(random.randint(0, 9))
def generate_captcha():
    """
    Draw one 5-digit captcha and save it as ``captcha_images/<digits>.png``.

    The image is a 150x50 white RGB canvas. Five random digits are drawn in
    random colours, then noise is added: three black lines, five coloured
    points and five short black arcs. The file name is the digit string, so a
    later captcha with the same digits overwrites the earlier file.

    :return: None
    """
    import os

    # White RGB canvas, 150 px wide and 50 px high.
    image = Image.new('RGB', (150, 50), (255, 255, 255))
    # Drawing handle bound to the canvas.
    draw = ImageDraw.Draw(image)
    # TrueType font looked up by file name, rendered at size 32.
    font = ImageFont.truetype("LiberationSans-Bold.ttf", size=32)

    label = ""

    for i in range(5):
        random_char = getRandomStr()

        label += random_char

        # Arguments: position, text, colour, font. Digits start 30 px apart.
        draw.text((10+i*30, 0), random_char, getRandomColor(), font=font)

    # Noise lines and points are placed with x in [0, 150] and y in [0, 30].
    # NOTE(review): the canvas is 50 px high but the noise band uses 30;
    # confirm which of the two sizes is the intended one.
    width = 150
    height = 30
    # Three black lines between random end points.
    for i in range(3):
        x1 = random.randint(0, width)
        x2 = random.randint(0, width)
        y1 = random.randint(0, height)
        y2 = random.randint(0, height)
        draw.line((x1, y1, x2, y2), fill=(0, 0, 0))
    # Five random-coloured points, each followed by a small black arc
    # (0 to 90 degrees inside a 4x4 bounding box).
    for i in range(5):
        draw.point([random.randint(0, width), random.randint(0, height)], fill=getRandomColor())
        x = random.randint(0, width)
        y = random.randint(0, height)
        draw.arc((x, y, x + 4, y + 4), 0, 90, fill=(0, 0, 0))

    # Make sure the output folder exists, then let Pillow open and close the
    # file itself (the original passed an open() handle that was never closed).
    # To generate prediction samples instead, save under 'captcha_predict/'.
    os.makedirs('captcha_images', exist_ok=True)
    image.save(''.join(['captcha_images/', label, '.png']), 'png')
if __name__ == '__main__':
    # Build the data set: 150 captcha images.
    for _ in range(150):
        generate_captcha()

效果如下图：
在这里插入图片描述
2、对上述的验证码图片进行去除污点的操作。
代码案例如下：

from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os



def binarizaion(path):
    """
    Load an image and binarise it into an array of 0s and 1s.

    The image is converted to greyscale (PIL mode 'L'); pixels whose grey
    value is <= 220 become 0 and all others become 1. The result is also
    drawn on a pyplot figure with the axes hidden (the figure is not shown).

    :param path: path of the image file to read
    :return: 2-D numpy array (rows x columns) containing only 0 and 1
    """
    # Close the file as soon as the pixel data has been copied into numpy.
    with Image.open(path) as img:
        gray = np.array(img.convert('L'))

    # Vectorised threshold; the cast keeps the dtype of the greyscale array,
    # matching what the former in-place per-pixel loop produced.
    img_gray = (gray > 220).astype(gray.dtype)

    plt.figure('')
    plt.imshow(img_gray, cmap='gray')
    plt.axis('off')
    # plt.show()

    return img_gray


def noiseReduction(img_gray, label):
    """
    Remove isolated dark pixels from a binarised image and save the result.

    Pixels are scanned row by row. A pixel equal to 0 is set to 1 when fewer
    than 4 of its 8 neighbours are 0. The array is modified in place, so
    pixels cleared earlier in the scan influence later counts. Afterwards the
    array is drawn with pyplot and saved to ``clean_captcha_img/<label>.png``.

    :param img_gray: 2-D array of 0/1 values; modified in place
    :param label: file name (without extension) for the saved figure
    :return: None
    """
    height, width = img_gray.shape
    neighbour_offsets = (
        (-1, -1), (-1, 0), (-1, 1),
        (0, -1),           (0, 1),
        (1, -1),  (1, 0),  (1, 1),
    )
    for x in range(height):
        for y in range(width):
            # Pixels equal to 1 never need cleaning.
            if img_gray[x, y] == 1:
                continue

            cnt = 0
            for dx, dy in neighbour_offsets:
                nx, ny = x + dx, y + dy
                # Explicit bounds check: neighbours outside the image are
                # ignored. The former try/except only caught indices past
                # the end, while -1 silently wrapped to the opposite edge.
                if 0 <= nx < height and 0 <= ny < width and img_gray[nx, ny] == 0:
                    cnt += 1

            if cnt < 4:  # fewer than 4 zero-valued neighbours: treat as noise
                img_gray[x, y] = 1

    plt.figure('')
    plt.imshow(img_gray, cmap='gray')
    plt.axis('off')
    # plt.show()
    plt.savefig(''.join(['clean_captcha_img/', label, '.png']))

def cutImg(label):
    """
    Cut the cleaned captcha image for *label* into five digit tiles.

    Reads ``clean_captcha_img/<label>.png`` and, for each of the first five
    characters of *label*, crops one tile and saves it as
    ``cut_number/<digit>/<seq>.png``, where ``seq`` comes from get_save_seq.

    :param label: digit string of the captcha (at least 5 characters)
    :return: None
    """
    with Image.open(''.join(['clean_captcha_img/', label, '.png'])) as img:
        for i in range(5):
            # Crop box is (left, upper, right, lower): columns 100 px wide
            # starting at x=100, rows 170 to 280.
            # NOTE(review): these offsets presumably match the size of the
            # figure written by plt.savefig in noiseReduction -- confirm
            # before changing the figure size or dpi.
            left = 100 * (1 + i)
            pic = img.crop((left, 170, left + 100, 280))
            seq = get_save_seq(label[i])
            pic.save(''.join(['cut_number/', str(label[i]), '/', str(seq), '.png']))

def get_save_seq(num):
    """
    Return the next free sequence number inside ``cut_number/<num>/``.

    Files are expected to be named ``<int>.<ext>``. Entries whose stem is not
    an integer (for example hidden files) are ignored instead of raising.

    :param num: folder name of the digit, as a string
    :return: int; 0 when the folder holds no numbered file, otherwise the
             highest existing number plus one
    :raises OSError: propagated from os.listdir when the folder is missing
    """
    indices = []
    for file_name in os.listdir(''.join(['cut_number/', num, '/'])):
        try:
            indices.append(int(file_name.split('.')[0]))
        except ValueError:
            # Not a "<number>.<ext>" file; it does not take part in numbering.
            continue

    if not indices:
        return 0
    # Floor at 0 so that negative stems cannot pull the result below 1.
    return max(indices + [0]) + 1

def create_dir():
    """
    Create the ten digit folders ``cut_number/0`` .. ``cut_number/9``.

    Missing parent folders are created as well and folders that already
    exist are left untouched, so the function can be called repeatedly.

    :return: None
    """
    for digit in range(10):
        os.makedirs(''.join(['cut_number/', str(digit)]), exist_ok=True)

def img_2_clean():
    """
    Binarise and denoise every file found in ``captcha_images/``.

    The part of each file name before its first '.' is used as the label
    handed to noiseReduction.

    :return: None
    """
    source_dir = 'captcha_images/'
    for file_name in os.listdir(source_dir):
        label = file_name.partition('.')[0]
        # Step 1: binarise the captcha; step 2: remove noise from the result.
        binary = binarizaion(source_dir + file_name)
        noiseReduction(binary, label)

def clean_to_cut():
    """
    Run cutImg for every file found in ``clean_captcha_img/``.

    The label passed to cutImg is the part of the file name before its
    first '.'.

    :return: None
    """
    labels = (name.partition('.')[0] for name in os.listdir('clean_captcha_img/'))
    for label in labels:
        cutImg(label)


if __name__ == '__main__':
    # Pipeline: clean every captcha, create the digit folders, then cut the
    # cleaned images into single digits.
    img_2_clean()
    create_dir()
    clean_to_cut()
    # Finally binarise one sample captcha (the returned array is not used).
    binarizaion('captcha_images/00227.png')

然后对图片进行统一化处理，最后我们将所有图片内的相同数字提取出来，存放在各自对应的文件夹中。
代码如下：

import os
from PIL import Image
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib

# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import confusion_matrix
# from sklearn.metrics import precision_score
# from sklearn.metrics import recall_score

from CAPTCHA.captcha_logistic import *

def load_data():
    """
    Build the training set from the digit tiles stored under ``cut_number/``.

    Every sub-folder name is used as the class label (converted with int())
    and every file inside it is one sample: the image is converted to
    greyscale, binarised (grey <= 240 -> 0, otherwise 1) and flattened into a
    1-D feature vector. For example a 20x5 tile becomes 100 features.

    :return: tuple ``(X, Y)`` of numpy arrays; X holds one flattened tile per
             row, Y the matching integer labels
    """
    X, Y = [], []
    for numC in os.listdir('cut_number'):
        num_list_dir = ''.join(['cut_number/', str(numC), '/'])
        for num_file in os.listdir(num_list_dir):
            # Close each tile as soon as its pixels are copied into numpy.
            with Image.open(''.join([num_list_dir, num_file])) as img:
                gray = np.array(img.convert('L'))

            # Vectorised threshold instead of a per-pixel Python loop.
            # NOTE(review): the cut-off is 240 here but capthca_predict uses
            # 220 -- confirm which value is intended so that training and
            # prediction see the same features.
            binary = (gray > 240).astype(gray.dtype)

            X.append(binary.reshape(-1))
            Y.append(int(numC))
    return np.array(X), np.array(Y)

def generate_model(X, Y):
    """
    Train a one-vs-rest logistic regression on (X, Y) and persist it.

    The data is split 70% / 30%; the model is fitted on the larger part,
    scored on the held-out part and written to
    ``captcha_model/captcha_model.model``.

    :param X: 2-D array of feature vectors
    :param Y: 1-D array of labels
    :return: mean accuracy on the held-out 30% (the original returned None
             and never used the test split)
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
    # NOTE(review): newer scikit-learn releases deprecate the multi_class
    # argument -- confirm against the installed version.
    log_clf = LogisticRegression(multi_class='ovr', solver='sag', max_iter=10000)

    # Hyper-parameters (e.g. tol, C) could be tuned with GridSearchCV; this
    # is not done here.
    log_clf.fit(X_train, Y_train)

    # Put the held-out split to use so the caller can judge the model.
    test_accuracy = log_clf.score(X_test, Y_test)

    # Persist the model, creating the target folder when it is missing.
    os.makedirs('captcha_model', exist_ok=True)
    joblib.dump(log_clf, 'captcha_model/captcha_model.model')
    return test_accuracy

def get_model():
    """
    Load the classifier stored at ``captcha_model/captcha_model.model``.

    :return: the object returned by joblib.load
    """
    return joblib.load('captcha_model/captcha_model.model')

def capthca_predict():
    """
    Predict the five digits of ``captcha_predict/unknown.png``.

    Steps: binarise and denoise the captcha (this writes
    ``clean_captcha_img/unknown.png``), cut it into five tiles (saved as
    ``captcha_predict/0.png`` .. ``captcha_predict/4.png``), binarise each
    tile (grey <= 220 -> 0, otherwise 1) and classify all tiles with the
    persisted model.

    :return: str, the predicted labels concatenated from left to right
    """
    path = 'captcha_predict/unknown.png'
    pre_img_gray = binarizaion(path)
    noiseReduction(pre_img_gray, 'unknown')

    # Cut the cleaned image into tiles and build one feature vector per tile.
    labels = ['0', '1', '2', '3', '4']
    features = []
    with Image.open(''.join(['clean_captcha_img/unknown.png'])) as img:
        for i, tile_name in enumerate(labels):
            left = 100 * (1 + i)
            pic = img.crop((left, 170, left + 100, 280))
            # The tile files are still written as before; the features are
            # taken from the in-memory tile instead of re-reading the file.
            pic.save(''.join(['captcha_predict/', tile_name, '.png']))

            gray = np.array(pic.convert('L'))
            # NOTE(review): the cut-off is 220 here but load_data uses 240
            # for the training tiles -- confirm which value is intended.
            features.append((gray > 220).astype(gray.dtype).reshape(-1))

    # One batched predict call instead of one call per tile.
    model = get_model()
    predictions = model.predict(np.array(features))
    return ''.join(str(p) for p in predictions)

if __name__ == '__main__':
    # Training is disabled; uncomment to rebuild the model from cut_number/.
    # X, Y = load_data()
    # generate_model(X, Y)
    model = get_model()
    print(capthca_predict())

效果如下：
在这里插入图片描述
每个数字文件夹：

在这里插入图片描述

二、决策树

1、贝叶斯
$$P(A\mid B)=\frac{P(B\mid A)\,P(A)}{P(B)}=P(A)\times\frac{P(B\mid A)}{P(B)}$$
后验概率 = 先验概率 × 调整因子

先验概率（Prior probability）:即在B事件发生之前，我们对A事件概率的一个判断

后验概率（Posterior probability）：即在B事件发生后，我们对A事件概率的重新评估

调整因子（可能性函数，Likelihood）：即 P(B|A)/P(B)，用来修正先验概率，使预估概率更接近真实概率。

如果"可能性函数"P(B|A)/P(B)>1，意味着"先验概率"被增强，事件A的发生的可能性变大；

如果"可能性函数"=1，意味着B事件无助于判断事件A的可能性；如果"可能性函数"<1，意味着"先验概率"被削弱，事件A的可能性变小。
2、决策树是一种非线性、有监督的离散型分类模型。
随机森林同样是一种非线性、有监督的离散型分类模型。
案例分析：离散化
在描述图一
数据类型

离散的数据需指明取值数量 2^M 种分割方式

天气：晴天雨天多云
学历：高中本科研究生

连续的数据需离散化，需指明离散化后的数量

车速：
低速（60）中速（80 ）高速
M+1种分割方式

3、决策树是通过固定的条件来对类别进行判断：
如图一的情况，我们通过决策树分析如下图
在这里插入图片描述
4、决策树的生成：数据不断分裂的递归过程。每一次分裂，尽可能让类别一样的数据落在树的同一边；当树的叶子节点中的数据都属于同一类时，则停止分裂（相当于一连串 if else 语句）。

5、计算纯度的方式
在这里插入图片描述
基尼系数：基尼系数原本是国际上通用的、用以衡量一个国家或地区居民收入差距的常用指标；在决策树中借用它来衡量数据集的不纯度，$Gini(D)=1-\sum_{k} p_k^{2}$，其中 $p_k$ 为第 k 类样本所占的比例。
熵越大，方差越大，数据集中的数据越不一样，纯度越低。
6、决策树的分割方式：非线性

单颗决策树的缺点：

运算量大，需要一次加载所有数据进内存。并且找寻分割条件是一个极耗资源的工作。
训练样本中出现异常数据时，将会对决策树产生很大影响。抗干扰能力差。

解决方法：

减少决策树所需训练样本（减少列或者减少行）
随机采样，降低异常数据的影响

逻辑回归的优点：和决策树相比，逻辑回归可以告诉我们概率（或者设置阈值），而决策树只能给出 0 或 1。

7、随机森林
森林：由树组成

随机：生成树的数据都是从数据集中随机选取的。

当数据集很大的时候，我们随机选取数据集的一部分，生成一颗树，重复上述过程，我们可以生成一堆形态各异的树，这些树放在一起就叫森林。
在这里插入图片描述

8、随机森林和逻辑回归的比较

9、剪枝
预剪枝：在这棵树还没开始分裂的时候，提前设定好一些条件，在达到这些条件以后就不再继续生长。

后剪枝：先长，长完了再去掉（比如合并叶子节点）

预剪枝的方式：
（1）控制分裂的层次
（2）控制叶子节点的样本数量

剪枝保证了模型的通用性。
10、决策树代码

# encoding:utf-8
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
# 决策数的分类器
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Load the iris data into a DataFrame and append the class as 'Species'.
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)
data['Species'] = iris.target
print(data)

# Features: the first two columns (sepal length and sepal width);
# target: the last column.
x = data.iloc[:, :2]
y = data.iloc[:, -1]

# Split into 75% training and 25% test data. A fixed random_state makes the
# split identical on every run; a different seed gives a different split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.75, random_state=42
)

# Fit a decision tree (entropy criterion, depth limited to 4) and report its
# accuracy on the test part.
tree_clf = DecisionTreeClassifier(criterion='entropy', max_depth=4).fit(x_train, y_train)
y_test_hat = tree_clf.predict(x_test)
print("acc score:", accuracy_score(y_test, y_test_hat))

# Tree depths to try (1 .. 14) and the test error measured for each.
depth = np.arange(1, 15)
err_list = []

# DecisionTreeClassifier parameters worth knowing:
#   splitter                 - how a split is chosen: 'best' takes the best
#                              split, 'random' the best random split.
#   max_features             - how many features are examined when looking
#                              for the best split: None = all features,
#                              int = that many, float = that fraction,
#                              'sqrt' = square root of the feature count.
#   max_depth                - maximum depth of the tree; None = no limit.
#   min_samples_split        - minimum number of samples a node must hold
#                              before it may be split (int = count,
#                              float = fraction of all samples).
#   min_samples_leaf         - minimum number of samples every leaf must
#                              keep after a split.
#   min_weight_fraction_leaf - minimum fraction of the total sample weight
#                              that every leaf must hold.
#   max_leaf_nodes           - upper bound on the number of leaves.
#   min_impurity_decrease    - a node is split only if the split lowers the
#                              impurity by at least this amount.
# NOTE(review): older notes mention min_impurity_split; presumably it is no
# longer available in current scikit-learn -- confirm for your version.

# Fit one tree per depth and record its error rate on the test set.
for d in depth:
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=d)
    clf.fit(x_train, y_train)
    y_test_hat = clf.predict(x_test)
    result = (y_test_hat == y_test)
    err = 1 - np.mean(result)
    err_list.append(err)
    print(d, '错误率： %.2f%%' % (100 * err))

# Plot error rate against depth to show where deeper trees stop helping.
plt.figure(facecolor='w')
plt.plot(depth, err_list, 'ro-', lw=2)
plt.xlabel('Decision Tree Depth', fontsize=15)
plt.ylabel('Error Rate', fontsize=15)
plt.title('Decision Tree Depth & Over Fit', fontsize=18)
plt.grid(True)
plt.show()

三、支持向量机SVM

1、逻辑回归的改进
在这里插入图片描述

在这里插入图片描述
当 y = 1 时，我们希望 $h_\theta(x) \approx 1$，即 $\theta^{T}x \gg 0$

当 y = 0 时，我们希望 $h_\theta(x) \approx 0$，即 $\theta^{T}x \ll 0$

2、逻辑回归的损失函数：

如果y=1

如果y = 0：

逻辑回归的损失函数为：

支持向量机的损失函数：

控制权衡的方式：
逻辑回归： A + λB
SVM： CA + B

3、支持向量机的损失函数：
在这里插入图片描述
当 y = 1 时我们希望 $\theta^{T}x \ge 1$（而不仅仅是 $\ge 0$）

当 y = 0 时我们希望 $\theta^{T}x \le -1$（而不仅仅是 $< 0$）

支持向量机的损失函数：
当y = 1时

当y = 0时

4、支持向量机的决策边界：线性可分的例子

大间距分类器
5、SVM决策边界：

SVM的核函数：用来使SVM能够处理非线性分类
非线性类：
在这里插入图片描述
SVM的核函数：用来使SVM能够处理非线性分类。
6、核函数和相似度

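以高斯核为例，相似度 $f_1=\exp\left(-\dfrac{\lVert x-l^{(1)}\rVert^{2}}{2\sigma^{2}}\right)$。

如果 x ≈ l(1)：$f_1 \approx 1$

如果 x 远离 l(1)：$f_1 \approx 0$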

7、SVM代码案例

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC

# datasets.make_moons generates two interleaving half-circle classes.
X, y = datasets.make_moons(noise=0.15, random_state=666)
# print(X, y)
# Scatter plot: class 0 first, then class 1, each in its own default colour.
for class_id in (0, 1):
    mask = y == class_id
    plt.scatter(X[mask, 0], X[mask, 1])
plt.show()


def plot_decision_boundary(model, axis):
    """
    Shade the regions predicted by *model* over a rectangular area.

    A grid of points covering the rectangle is classified with
    ``model.predict`` and the predictions are drawn as filled contours on
    the current pyplot axes.

    :param model: fitted estimator whose predict() accepts an (n, 2) array
    :param axis: sequence ``[x_min, x_max, y_min, y_max]``
    :return: None
    """
    from matplotlib.colors import ListedColormap

    # Grid of coordinates, about 100 points per unit along each axis.
    # meshgrid takes 1-D inputs, so no reshape is needed.
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1] - axis[0]) * 100)),
        np.linspace(axis[2], axis[3], int((axis[3] - axis[2]) * 100)),
    )
    # ravel flattens each grid to 1-D; np.c_ pairs them up as (n, 2) points.
    X_new = np.c_[x0.ravel(), x1.ravel()]

    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)

    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])

    # 'linewidth' is not a contourf argument, so it is no longer passed.
    plt.contourf(x0, x1, zz, cmap=custom_cmap)


def PolynomialSVC(degree, C=1.0):
    """
    Build a pipeline: polynomial features -> standardisation -> linear SVM.

    Bundling preprocessing and the model in one Pipeline means a single
    fit/predict call runs every step.

    :param degree: degree handed to PolynomialFeatures
    :param C: C value handed to LinearSVC
    :return: an unfitted Pipeline
    """
    steps = [
        ('poly', PolynomialFeatures(degree=degree)),  # polynomial feature expansion
        ('std_scaler', StandardScaler()),             # feature standardisation
        ('linearSVC', LinearSVC(C=C)),                # linear SVM classifier
    ]
    return Pipeline(steps)

# Fit the degree-3 polynomial-feature linear SVM on the moons data.
poly_svc = PolynomialSVC(degree=3)
poly_svc.fit(X, y)

# Draw its decision regions with the two classes on top.
plot_decision_boundary(poly_svc, axis=[-1.5, 2.5, -1.0, 1.5])
for class_id in (0, 1):
    mask = y == class_id
    plt.scatter(X[mask, 0], X[mask, 1])
plt.show()

# With kernel='poly', SVC() achieves the effect of polynomial features
# directly, without an explicit PolynomialFeatures step.
# The data still has to be standardised before it is handed to SVC().
def PolynomialKernelSVC(degree, C=1.0):
    """
    Build a pipeline: standardisation -> SVC with a polynomial kernel.

    :param degree: degree of the polynomial kernel
    :param C: C value handed to SVC
    :return: an unfitted Pipeline
    """
    steps = [
        ('std_scaler', StandardScaler()),
        ('kernelSVC', SVC(kernel='poly', degree=degree, C=C)),
    ]
    return Pipeline(steps)

# Fit the degree-3 polynomial-kernel SVM on the moons data.
poly_kernel_svc = PolynomialKernelSVC(degree=3)
poly_kernel_svc.fit(X, y)

# Draw its decision regions with the two classes on top.
plot_decision_boundary(poly_kernel_svc, axis=[-1.5, 2.5, -1.0, 1.5])
for class_id in (0, 1):
    mask = y == class_id
    plt.scatter(X[mask, 0], X[mask, 1])
plt.show()

效果图如下：
在这里插入图片描述

import Successful

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
第六节 机器学习——决策树、SVM

一、项目：随机生成验证码然后进行去除杂质等操作！1、随机生成150个5位的验证码。其中验证码的数字和每一个验证码的颜色都是随机的。并且给它加上那一些不规则的污点。作为我们的数据。from PIL import Imagefrom PIL import ImageDrawfrom PIL import ImageFontimport randomdef getRandomColo...
复制链接

扫一扫