LR(逻辑回归)的学习除了理论学习外,借助一些具象化的东西会理解得比较深入。本文实现了3D的效果图,不过回想起来,还是 http://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic.html#sphx-glr-auto-examples-linear-model-plot-logistic-py 的解释更容易理解一些。
当前内容没有完全说清楚,后续会找时间补齐
以下代码可以分析以下问题:
1.对于一维特征,其参数对logit的影响:
参见 draw_logit() 函数:参数为0时,LR模型的输出就是一条水平线;随着参数的绝对值越来越大,logistic(sigmoid)曲线越来越陡峭,对正反例集合的区分度也就越高。
2.为什么要求特征分布单调(即特征值越大,label取某一类值的占比越高):因为LR曲线本身只能切一刀(与y=1、y=0两条直线只有2个交点,而正态分布需要4个点),对正态分布这类非单调的分布无法提供良好的支持,效果反而会变弱。
3.为什么woe的方式会让效果变好:因为对特征的分布进行了调整,让特征分布变为递增
4.为什么分桶会让效果变好:这个跟是不是LR没关系……
以下是后续计划分析的:
5.梯度下降与牛顿方法,迭代过程中,会怎么影响logit曲线的形态:后续补充
6.sklearn的调参会怎样影响LR模型:后续分析
7.如何确定特征有效,或可以进一步深挖
8.woe与oneHot方法对于LR哪一个更好
9.如何评价最佳分桶?
#!/usr/bin/env python
# encoding: utf-8
import numpy as np
import matplotlib.pyplot as plt
import math
import random
def get_x2_by_x1(x1, type):
    """Derive a second coordinate from ``x1`` using the rule named by ``type``.

    Rules:
        ``'='``      -- return ``x1`` unchanged.
        ``'sqrt'``   -- return ``x1 * x1`` (the square, despite the name).
        ``'circle'`` -- return ``sqrt(1 - x1*x1) - u`` where ``u`` comes from
                        ``random.random()``; the result is negated when a
                        preceding ``random.random()`` draw is >= 0.5.
        anything else -- return ``0``.

    Raises:
        ValueError: from ``math.sqrt`` in ``'circle'`` mode when
            ``1 - x1*x1`` is negative.
    """
    if type == '=':
        return x1
    if type == 'sqrt':
        return x1 * x1
    if type != 'circle':
        return 0

    # Two draws, in this order: the first picks the sign, the second is the
    # amount subtracted from the unit-circle height at x1.
    keep_sign = random.random() < 0.5
    height = math.sqrt(1 - x1 * x1) - random.random()
    return height if keep_sign else -height
def get_cirle(offset_x=0, offset_y=0, _step=500):
    """Sample one noisy point cloud built from the 'circle' rule.

    ``x1`` walks ``[-1, 1)`` in increments of ``1.0 / _step``; for every grid
    value the matching ``x2`` is ``get_x2_by_x1(value, 'circle')``. Both
    coordinates are then shifted by ``offset_x`` / ``offset_y``.

    Returns:
        A ``(x1, x2)`` pair of equally long lists.
    """
    grid = np.arange(-1, +1, 1.0 / _step)
    # Evaluated point by point, in grid order, because each call draws
    # random numbers.
    shifted_x2 = [get_x2_by_x1(value, 'circle') + offset_y for value in grid]
    shifted_x1 = list(grid + offset_x)
    return shifted_x1, shifted_x2
def draw_defalut_circle(_each_sample_num):
    """Scatter two noisy 2D clusters and fit a logistic regression to them.

    The first cluster (label 0) is ``get_cirle`` shifted by (0, 3) and drawn
    with the default marker; the second (label 1) is shifted by (2, 1) and
    drawn as red crosses. The feature-matrix shape, the number of labels and
    the fitted coefficients are printed, then the figure is shown.

    Args:
        _each_sample_num: forwarded to ``get_cirle`` as its ``_step``.
    """
    from sklearn import linear_model

    fig, ax = plt.subplots()

    neg_x1, neg_x2 = get_cirle(0, 3, _each_sample_num)
    ax.scatter(neg_x1, neg_x2)
    pos_x1, pos_x2 = get_cirle(2, 1, _each_sample_num)
    ax.scatter(pos_x1, pos_x2, c='r', marker='x')

    f = np.c_[neg_x1 + pos_x1, neg_x2 + pos_x2]
    # Size the labels from the clusters themselves instead of assuming that
    # np.arange with a float step produced exactly 2 * _each_sample_num points.
    label = [0] * len(neg_x1) + [1] * len(pos_x1)

    # Logistic regression: the labels are binary classes, so an ordinary
    # least-squares fit does not produce logit coefficients.
    lr = linear_model.LogisticRegression()
    # Formatted so the output is the same under Python 2 and Python 3.
    print('%s %s' % (f.shape, len(label)))
    lr.fit(f, label)
    print('Coefficients: \n', lr.coef_)

    # Only the coefficients are reported; nothing derived from the model is
    # drawn on this 2D axes.
    plt.show()
def draw_logit(_each_sample_num):
    """Overlay the curves ``1 / (1 + exp(-a * x)) - 0.5`` for several ``a``.

    The curves are sampled on ``np.arange(-2, 2, 1.0 / 500)`` and drawn as
    scatter plots on one axes, for a = -2, -1, 0, 1, 2, 4, 8 in that order,
    after which the figure is shown.

    Args:
        _each_sample_num: accepted but not used by the drawing.
    """
    grid = np.arange(-2, 2, 1.0 / 500)

    def centred_sigmoid(slope):
        # Logistic curve shifted down by 0.5 so every curve passes through 0.
        return [1.0 / (1 + math.exp(-1 * slope * point)) - 0.5 for point in grid]

    yellow_cross = {'c': 'y', 'marker': 'x'}
    red_cross = {'c': 'r', 'marker': 'x'}
    # (slope, scatter keyword arguments) in drawing order; an empty dict
    # leaves the colour and marker to Matplotlib's defaults.
    curves = (
        (-2, yellow_cross),
        (-1, yellow_cross),
        (0, {}),
        (1, {}),
        (2, red_cross),
        (4, red_cross),
        (8, red_cross),
    )

    fig, ax = plt.subplots()
    for slope, scatter_kwargs in curves:
        ax.scatter(grid, centred_sigmoid(slope), **scatter_kwargs)
    plt.show()
def draw_defalut_circle_3D(_each_sample_num):
    """Plot two labelled clusters in 3D with a fitted logistic surface.

    The first cluster (label 0) is ``get_cirle`` shifted by (0, 0) and drawn
    at height 0; the second (label 1) is shifted by (2, 2) and drawn at
    height 1 as red crosses. A logistic regression is fitted on both
    clusters, and its predicted probability for every sample is drawn as
    green dots at ``(x1, x2, probability)``. The feature-matrix shape, label
    count, coefficients and probabilities are printed.

    Args:
        _each_sample_num: forwarded to ``get_cirle`` as its ``_step``.
    """
    # Importing Axes3D registers the '3d' projection on Matplotlib < 3.2.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
    from sklearn import linear_model

    fig = plt.figure()
    # Axes3D(fig) no longer attaches itself to the figure on current
    # Matplotlib; add_subplot works on old and new releases alike.
    ax = fig.add_subplot(111, projection='3d')

    neg_x1, neg_x2 = get_cirle(0, 0, _each_sample_num)
    ax.scatter(neg_x1, neg_x2, [0] * len(neg_x1))
    pos_x1, pos_x2 = get_cirle(2, 2, _each_sample_num)
    ax.scatter(pos_x1, pos_x2, [1] * len(pos_x1), c='r', marker='x')

    f = np.c_[neg_x1 + pos_x1, neg_x2 + pos_x2]
    # Size the labels from the clusters themselves instead of assuming that
    # np.arange with a float step produced exactly 2 * _each_sample_num points.
    label = [0] * len(neg_x1) + [1] * len(pos_x1)

    # Logistic regression: the labels are binary classes, so an ordinary
    # least-squares fit does not produce logit coefficients.
    lr = linear_model.LogisticRegression()
    # Formatted so the output is the same under Python 2 and Python 3.
    print('%s %s' % (f.shape, len(label)))
    lr.fit(f, label)
    print('Coefficients: \n', lr.coef_)

    # For a binary problem coef_ has shape (1, n_features) and intercept_
    # has shape (1,); the probability of label 1 is sigmoid(w . x + b).
    logits = f.dot(lr.coef_[0]) + lr.intercept_[0]
    y = (1.0 / (1.0 + np.exp(-logits))).tolist()
    print(y)
    ax.scatter(f[:, 0], f[:, 1], y, c='g', marker='o')
    plt.show()
def draw_splashes():
    """Print ``draw`` and call ``plt.show()``."""
    # Call form of print: same output on Python 2, and valid on Python 3.
    print('draw')
    plt.show()
# Script driver: one demo call is enabled; the others are kept commented
# out so they can be switched on by hand.
#draw_splashes()
#draw_defalut_circle(500)
# draw_logit requires one argument, but the value does not affect the curves.
draw_logit(50)
#draw_defalut_circle_3D(5)