通过matplotlib分析LR

LR 的学习除了理论推导之外,结合一些具象化的东西会理解得更深入。本文实现了 3D 的效果图;不过回想起来,还是 http://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic.html#sphx-glr-auto-examples-linear-model-plot-logistic-py 的解释更容易理解一些


当前内容没有完全说清楚,后续会找时间补齐


以下代码可以分析以下问题:

1.对于一维特征,其参数对logit的影响:

      参见 draw_logit() 函数:参数为 0 时,LR 模型就是一条水平线;随着参数值越来越大,logit 曲线越来越陡峭,对正反例集合的区分度也就越高


2.为什么要求特征分布单调(随着特征值增大,label 为某一取值的占比越来越高):因为 LR 曲线本身只能切一刀(与 y=1、y=0 两条直线只有 2 个交点,而正态分布需要 4 个点),对正态分布无法提供良好支持,效果反而会变弱


3.为什么woe的方式会让效果变好:因为对特征的分布进行了调整,让特征分布变为递增


4.为什么分桶会让效果变好:这个跟是不是 LR 没关系……


以下是后续计划分析的:

5.梯度下降与牛顿方法,迭代过程中,会怎么影响logit曲线的形态:后续补充

6.sklearn的调参会怎样影响LR模型:后续分析

7.如何确定特征有效,或可以进一步深挖

8.woe与oneHot方法对于LR哪一个更好

9.如何评价最佳分桶?



#!/usr/bin/env python
# encoding: utf-8

import numpy as np
import matplotlib.pyplot as plt
import math
import random
def get_x2_by_x1(x1,type):
    """Derive an x2 coordinate from ``x1`` according to the rule named by ``type``.

    Supported rules:

    * ``'='``      -- return ``x1`` unchanged.
    * ``'sqrt'``   -- return ``x1 * x1`` (the square, despite the rule name).
    * ``'circle'`` -- return ``sqrt(1 - x1*x1) - u`` where ``u`` is a fresh
      ``random.random()`` draw, with the sign flipped on roughly half of the
      calls (decided by another ``random.random()`` draw).

    Any other rule name returns ``0``.

    Raises:
        ValueError: for the ``'circle'`` rule when ``1 - x1*x1`` is negative,
            because ``math.sqrt`` rejects negative arguments.
    """
    if type == '=':
        return x1
    if type == 'sqrt':
        return x1 * x1
    if type != 'circle':
        return 0

    # Two draws from the global RNG, in this order: the first chooses the
    # sign, the second is the random offset subtracted from the semicircle.
    keep_sign = random.random() < 0.5
    height = math.sqrt(1 - x1 * x1) - random.random()
    return height if keep_sign else -height

def get_cirle(offset_x=0,offset_y=0,_step=500):
    """Generate a random 2-D point cloud and shift it by the given offsets.

    The x1 grid is ``np.arange(-1, 1, 1.0 / _step)``. For every grid value an
    x2 value is produced by ``get_x2_by_x1(value, 'circle')``.

    Args:
        offset_x: amount added to every x1 coordinate.
        offset_y: amount added to every x2 coordinate.
        _step: reciprocal of the grid spacing along x1.

    Returns:
        A pair of lists ``(x1, x2)`` holding the shifted coordinates.
    """
    grid = np.arange(-1, +1, 1.0 / _step)

    # x2 values are generated first, in grid order, so the sequence of
    # random draws is consumed in a fixed order.
    shifted_x2 = []
    for value in grid:
        shifted_x2.append(get_x2_by_x1(value, 'circle') + offset_y)

    shifted_x1 = []
    for value in grid:
        shifted_x1.append(value + offset_x)

    return shifted_x1, shifted_x2


def draw_defalut_circle(_each_sample_num):
    """Scatter-plot two random point clouds and fit a linear model to them.

    Two clouds are generated with ``get_cirle``: one shifted by (0, 3) and
    labelled 0, one shifted by (2, 1) and labelled 1. Both are drawn on a
    single 2-D axes (default markers for class 0, red crosses for class 1).
    A ``sklearn.linear_model.LinearRegression`` is then fitted on the stacked
    coordinates against the 0/1 labels, and the feature-matrix shape, label
    count and fitted coefficients are printed. Nothing derived from the fit
    is drawn.

    Args:
        _each_sample_num: forwarded to ``get_cirle`` as its ``_step`` argument;
            it controls how many points each cloud contains.
    """
    from sklearn import linear_model

    fig, ax = plt.subplots()

    # Class 0 cloud.
    x1_neg, x2_neg = get_cirle(0, 3, _each_sample_num)
    ax.scatter(x1_neg, x2_neg)

    # Class 1 cloud.
    x1_pos, x2_pos = get_cirle(2, 1, _each_sample_num)
    ax.scatter(x1_pos, x2_pos, c='r', marker='x')  # , s=10)

    # One row per sample, columns are (x1, x2).
    f = np.c_[x1_neg + x1_pos, x2_neg + x2_pos]
    # Label counts come from the generated data, so they always match the
    # number of rows in f.
    label = [0] * len(x1_neg) + [1] * len(x1_pos)

    # NOTE(review): this fits an ordinary least-squares LinearRegression on
    # 0/1 labels, not a LogisticRegression -- confirm that is intended.
    lr = linear_model.LinearRegression()
    # Formatted through str.format so the line is valid (and prints the same
    # text) on both Python 2 and Python 3.
    print('{} {}'.format(f.shape, len(label)))
    lr.fit(f, label)
    print('Coefficients: \n', lr.coef_)

    plt.show()



def draw_logit(_each_sample_num):
    """Plot centred sigmoid curves ``1 / (1 + exp(-a*x)) - 0.5`` for several slopes.

    The curves are drawn as scatter plots on one axes over
    ``x = np.arange(-2, 2, 1/500)`` for the slopes -2, -1, 0, 1, 2, 4 and 8.
    Negative slopes use yellow crosses, slopes 0 and 1 use the default
    scatter style, and slopes 2, 4 and 8 use red crosses. The figure is then
    shown with ``plt.show()``.

    Args:
        _each_sample_num: accepted for signature compatibility with the other
            ``draw_*`` functions; it is not used.
    """
    fig, ax = plt.subplots()
    x = np.arange(-2, 2, 1.0 / 500)

    # (slope, scatter keyword arguments), in drawing order.
    yellow_cross = {'c': 'y', 'marker': 'x'}
    red_cross = {'c': 'r', 'marker': 'x'}
    curves = [
        (-2, yellow_cross),
        (-1, yellow_cross),
        (0, {}),
        (1, {}),
        (2, red_cross),
        (4, red_cross),
        (8, red_cross),
    ]

    for a, style in curves:
        # Subtracting 0.5 centres every curve on the origin so the effect of
        # the slope alone is visible.
        y = 1.0 / (1.0 + np.exp(-a * x)) - 0.5
        ax.scatter(x, y, **style)

    plt.show()

def draw_defalut_circle_3D(_each_sample_num):
    """Draw two labelled point clouds and their sigmoid scores in a 3-D plot.

    Two clouds are generated with ``get_cirle``: one shifted by (0, 0) and
    drawn at height z=0 (label 0), one shifted by (2, 2) and drawn at height
    z=1 (label 1, red crosses). A ``sklearn.linear_model.LinearRegression``
    is fitted on the stacked (x1, x2) coordinates against the 0/1 labels.
    For every sample the sigmoid of the coefficient-weighted feature sum is
    computed, printed, and drawn as green dots at height z=score.

    Args:
        _each_sample_num: forwarded to ``get_cirle`` as its ``_step`` argument;
            it controls how many points each cloud contains.
    """
    # Importing Axes3D registers the '3d' projection on older matplotlib
    # releases; the name itself is not used below.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
    from sklearn import linear_model

    fig = plt.figure()
    # add_subplot attaches the 3-D axes to the figure. Constructing
    # Axes3D(fig) directly is not attached automatically on recent
    # matplotlib releases, which leaves the window empty.
    ax = fig.add_subplot(111, projection='3d')

    # Class 0 cloud at z = 0.
    x1_neg, x2_neg = get_cirle(0, 0, _each_sample_num)
    ax.scatter(x1_neg, x2_neg, [0] * len(x1_neg))

    # Class 1 cloud at z = 1.
    x1_pos, x2_pos = get_cirle(2, 2, _each_sample_num)
    ax.scatter(x1_pos, x2_pos, [1] * len(x1_pos), c='r', marker='x')

    # One row per sample, columns are (x1, x2); labels sized from the data so
    # they always match the number of rows.
    f = np.c_[x1_neg + x1_pos, x2_neg + x2_pos]
    label = [0] * len(x1_neg) + [1] * len(x1_pos)

    # NOTE(review): this fits an ordinary least-squares LinearRegression on
    # 0/1 labels, not a LogisticRegression -- confirm that is intended.
    lr = linear_model.LinearRegression()
    # Formatted through str.format so the line is valid (and prints the same
    # text) on both Python 2 and Python 3.
    print('{} {}'.format(f.shape, len(label)))
    lr.fit(f, label)
    print('Coefficients: \n', lr.coef_)

    # Sigmoid of the weighted feature sum for every sample.
    # NOTE(review): lr.intercept_ is not included in the sum, so these scores
    # are not the fitted model's predictions -- confirm that is intended.
    weighted_sum = f.dot(lr.coef_)
    y = (1.0 / (1.0 + np.exp(-weighted_sum))).tolist()
    print(y)

    ax.scatter(f[:, 0], f[:, 1], y, c='g', marker='o')  # , s=10)
    plt.show()

def draw_splashes():
    """Print the marker text ``draw`` and show the current matplotlib figure(s)."""
    # Parenthesised single-argument print is valid, and prints the same
    # text, on both Python 2 and Python 3.
    print('draw')
    plt.show()
# Script entry: exactly one demo is enabled at a time; the others are kept
# commented out so they can be switched on by hand.
#draw_splashes()
#draw_defalut_circle(500)
# Currently enabled demo: the family of centred sigmoid curves.
draw_logit(50)
#draw_defalut_circle_3D(5)


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值