import numpy as np
import pandas as pd
import matplotlib as plt
import statsmodels. formula. api as smf
from sklearn import linear_model
import matplotlib. pyplot as plt
% matplotlib inline
# Load the bank-loan workbook. The bare `head` call does not store its result;
# it only shows the first three rows when it ends a notebook cell.
data = pd.read_excel("bankloan_binning.xlsx")
data.head(3)

# Three-sample toy data set for the hand-worked example: two predictors
# (x1, x2) and a binary response (y).
x1, x2, y = (
    np.array(values)
    for values in ([1, 2, 3], [3, 2, 2], [1, 0, 1])
)

# Scatter the response against the first predictor.
plt.scatter(x1, y)
"""
w=w+alpha*(y-p)*p*(1-p)*x
注:w为回归系数,alpha学习率,在[0 1]之间,通常设为0.3
方程:log(p/(1-p))=b0+b1*x1+b2*x2,or p=1/(1+exp(-(b0+b1*x1+b2*x2)))
"""
"""
迭代#1
令:b0=0,b1=0,b2=0,则p=1/(1+exp(-(0+0*1+0*3)))=0.5
因为w=w+alpha*(y-p)*p*(1-p)*x,
所以,
b0=0+0.3*(1-0.5)*0.5*(1-0.5)*1=0.0375 #这里的1和下面的1不一样,这里的是默认为1
b1=0+0.3*(1-0.5)*0.5*(1-0.5)*1=0.0375 #这里的1是x1的值
b2=0+0.3*(1-0.5)*0.5*(1-0.5)*3=0.1125 #这里的3是x2的值
迭代#2
令:b0=0.0375,b1=0.0375,b2=0.1125,则p=1/(1+exp(-(0.0375+0.0375*2+0.1125*2)))=0.584
因为w=w+alpha*(y-p)*p*(1-p)*x,
所以,
b0=0.0375+0.3*(0-0.584)*0.584*(1-0.584)*1≈-0.005
b1=0.0375+0.3*(0-0.584)*0.584*(1-0.584)*2≈-0.048
b2=0.1125+0.3*(0-0.584)*0.584*(1-0.584)*2≈0.027
迭代#3......
注:本例一共3行数据,最后一行迭代完后重新从第一行开始,则3次迭代为一个epoch(全部样本训练一次)
"""
# Print an empty line to standard output.
print ( )
import numpy as np
import pandas as pd
import statsmodels. formula. api as smf
from sklearn import linear_model
import matplotlib. pyplot as plt
% matplotlib inline
from sklearn. model_selection import train_test_split, cross_val_score
# Load the bank-loan workbook and echo its first row as a sanity check.
df = pd.read_excel("bankloan_binning.xlsx")
print(df.head(1))

# Predictors are the columns at positions 2..9; the response is the last column.
# NOTE(review): if the sheet has exactly 10 columns, position 9 is also the
# last column and the target would be included among the features - confirm
# the workbook layout.
feature_positions = list(range(2, 10))
all_features = df.iloc[:, feature_positions]
all_targets = df.iloc[:, -1]

# First split: hold out 20% of the rows as the test set.
xtrain, xtest, ytrain, ytest = train_test_split(
    all_features,
    all_targets,
    test_size=0.2,
    random_state=0,
)

# Second split: take 20% of the remaining training rows as a validation set.
xtrain1, xvalid, ytrain1, yvalid = train_test_split(
    xtrain,
    ytrain,
    test_size=0.2,
    random_state=0,
)
# =============== 1. python数据处理标准流程 ===============
import re

import sklearn
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

# scikit-learn 1.1 renamed the logistic-regression loss from 'log' to
# 'log_loss', and 1.3 removed the old spelling. Pick whichever name the
# installed release accepts so this cell runs on both old and new versions.
_sk_major, _sk_minor = map(
    int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
logistic_loss = 'log_loss' if (_sk_major, _sk_minor) >= (1, 1) else 'log'

# Logistic regression fitted by stochastic gradient descent, with a fixed
# random seed.
sgd_clf = SGDClassifier(loss=logistic_loss, random_state=123)
sgd_clf.fit(xtrain, ytrain)

# Mean accuracy on the held-out test set. The value is not stored; it is only
# displayed when this expression ends a notebook cell.
sgd_clf.score(xtest, ytest)

# Per-class precision / recall / F1 on the test set.
# NOTE(review): the label order assumes the target is coded so that the first
# class is "non-default" and the second is "default" - confirm against the data.
y_sgd = sgd_clf.predict(xtest)
print(classification_report(ytest, y_sgd, target_names=['非违约', '违约']))

# Fitted coefficients and intercept (displayed only as a cell's last expression).
sgd_clf.coef_, sgd_clf.intercept_
# Build an SGDClassifier with every constructor argument written out
# explicitly, as a reference for the available settings.
# NOTE(review): these values presumably mirror the library defaults of the
# scikit-learn release the author used - confirm before relying on that.
# This rebinds `sgd_clf` to a new estimator that has not been fitted.
sgd_params = {
    'loss': 'hinge',
    'penalty': 'l2',
    'alpha': 0.0001,
    'l1_ratio': 0.15,
    'fit_intercept': True,
    'max_iter': 1000,
    'tol': 0.001,
    'shuffle': True,
    'verbose': 0,
    'epsilon': 0.1,
    'n_jobs': None,
    'random_state': None,
    'learning_rate': 'optimal',
    'eta0': 0.0,
    'power_t': 0.5,
    'early_stopping': False,
    'validation_fraction': 0.1,
    'n_iter_no_change': 5,
    'class_weight': None,
    'warm_start': False,
    'average': False,
}
sgd_clf = SGDClassifier(**sgd_params)
# Reference notes on SGDClassifier (features, loss functions, penalties and
# the main hyper-parameters), kept as a runtime string and echoed verbatim.
sgd_notes = """
#特点:1.SGD允许minibatch(在线/核外oob)学习,使用partial_fit方法;
2.拟合大型列和行;
3.稀疏数据处理(loss参数和罚值控制),
4.SGDClassifier支持多分类,依”one-vs-all”的形式
#损失函数:
loss=”hinge”: (soft-margin)线性svm;
loss=”modified_huber”:稳健的异常值处理;
loss=”log”:logistic回归
loss=”perceptron”:感知器算法
其他损失函数如回归张的'huber', 'epsilon_insensitive'
#惩罚项(或正则化):l1与elasticnet可用于稀疏数据
penalty=”l2”: 对coef_的L2范数罚项;
penalty=”l1”: L1范数罚项;
penalty=”elasticnet”: L1与L2的convex组合;
#alpha:乘以正则项的常数或变量(最优化算法);
#l1_ratio:弹性网混合参数,默认为0.15。取值[0,1],l1_ratio=0为L2,l1_ratio=1则为L1,注:(1-l1_ratio)*L2+l1_ratio*L1
#max_iter: int,可选(默认=1000):遍历训练数据的最大值(又名epochs)。只影响fit(),对partial_fit无效;
#shuffle : bool,默认值为True,是否在每次epoch后随机打乱训练数据(洗牌)。
#epsilon : float,如果loss='huber'或 'epsilon_insensitive'或 'squared_epsilon_insensitive'时可用;
如果预测和观测值间的差值小于此阈值,则忽略,即异常值修正参数;
#learning_rate : string,默认'optimal';
learning_rate='constant':eta = eta0,注eta0为初始学习率;
learning_rate='optimal':eta = 1.0 / (alpha * (t + t0)),<---最好的学习率
learning_rate='invscaling':eta = eta0 / pow(t, power_t),注power_t选项另外指定;
learning_rate='adaptive':如果误差持续下降,则eta = eta0,否则(n_iter_no_change等参数满足)学习率除以5;
#validation_fraction : float, default=0.1,验证集比例;
#warm_start : bool,默认False,如果True,调用之前的解决拟合值作为初始化,否则清除;
"""
print(sgd_notes)