LightGBM code (excluding the cross-validation part)

# -*- coding:utf-8 -*-
import json
import pandas as pd
import pymysql
import sklearn
from sqlalchemy import create_engine
import numpy as np
import lightgbm as lgb
from sklearn import datasets
from sklearn.model_selection import train_test_split

## Specify the charset parameter to avoid garbled Chinese characters
dbconn = pymysql.connect(
        host="127.0.0.1",
        database="test",
        user="root",
        password="111111",
        port=3306,
        charset='utf8')
conn = create_engine('mysql+mysqldb://root:111111@localhost:3306/yes?charset=utf8')
#The pymysql.connect block above is equivalent to: conn = create_engine('mysql+mysqldb://root:111111@localhost:3306/test?charset=utf8')

# SQL query
sqlcmd = "select * from smjy2"
data = pd.read_sql(sqlcmd, dbconn)
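
# Hedged side note: pandas officially supports SQLAlchemy engines in read_sql,
# so the same query could go through `conn` instead of the raw pymysql handle
# (assuming the engine URL points at the same `test` database as dbconn):
# data = pd.read_sql(sqlcmd, conn)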


#Target and features
X = data.drop(['cp4_flag','cust_no'],axis=1)
y = data['cp4_flag']

print(X.dtypes)
# #Standardization / normalization#
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler(feature_range=(0,1))
# Collect the non-numeric columns (note: despite its name, `quantity` holds the
# categorical columns, i.e. everything whose dtype is not float64)
quantity = [attr for attr in X.columns if X.dtypes[attr] != 'float64']
print(quantity)  # loop over these and encode them to int, since the dataframe does not handle object dtype well
from sklearn.preprocessing import LabelEncoder
for c in quantity:
    # Encode on X, not `data`: X is a copy created by drop(), so encoding `data`
    # would never reach the training matrix. LabelEncoder also expects 1-D input.
    X[c] = LabelEncoder().fit_transform(X[c])
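# Hedged alternative sketch: LightGBM can also consume pandas 'category' dtype
# directly (no LabelEncoder needed), which avoids implying an ordinal relation:
# for c in quantity:
#     X[c] = X[c].astype('category')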

#print(X.head())
#Handle class imbalance; the majority-to-minority ratio is roughly 10:3
# from imblearn.over_sampling import RandomOverSampler
# ratio = {1:50000,0:1009881}  # these counts change daily, so check the class distribution first
# sm = RandomOverSampler(ratio=ratio,random_state=13)

# sm = SMOTE(random_state=42,m_neighbors=5,ratio=0.3)
#X_res,y_res = sm.fit_sample(X,y)
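# A minimal working sketch of the oversampling step, assuming imbalanced-learn is
# installed; newer imblearn versions renamed ratio -> sampling_strategy and
# fit_sample -> fit_resample:
# from imblearn.over_sampling import RandomOverSampler
# sm = RandomOverSampler(sampling_strategy={1: 50000, 0: 1009881}, random_state=13)
# X_res, y_res = sm.fit_resample(X, y)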

#Split into training and test sets (train_test_split was already imported above)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=None)

params = {
          'boosting_type': 'gbdt',
          'objective': 'binary',
          'metric': 'auc',
          'nthread':4,
          'learning_rate':0.1,
          'num_leaves':30,
          'max_depth': 5,
          'subsample': 0.8,
          'colsample_bytree': 0.8,

         }

#Also convert to LightGBM's Dataset format (histogram-based algorithm) and declare the categorical features
train_data = lgb.Dataset(X_train,label=y_train,categorical_feature=quantity)
test_data = lgb.Dataset(X_test,label=y_test,categorical_feature=quantity)
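# Note: `params` and the two Datasets above belong to LightGBM's native training
# API, an alternative to the sklearn wrapper used further down. A minimal sketch
# (num_boost_round=188 is an assumed value, echoing the n_estimators used in tuning):
# booster = lgb.train(params, train_data, num_boost_round=188,
#                     valid_sets=[test_data])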

#Cross-validation block: after the parameters were tuned here, they were plugged straight into the model below
# params = {
#           'boosting_type': 'gbdt',
#           'objective': 'binary',
#           'metric': 'auc',
#           'nthread':4,
#           'learning_rate':0.1,
#           'num_leaves':30,
#           'max_depth': 5,
#           'subsample': 0.8,
#           'colsample_bytree': 0.8,
#           'boost_from_average':'false',
#
#
#     }
# from sklearn.grid_search import GridSearchCV
# params_test1={'max_depth': range(3,8,1), 'num_leaves':range(5, 100, 5)}
#
# gsearch1 = GridSearchCV(estimator = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.1, n_estimators=188, max_depth=6, bagging_fraction = 0.8,feature_fraction = 0.8),
#                       param_grid = params_test1, scoring='roc_auc',cv=5,n_jobs=-1)
# gsearch1.fit(X_train,y_train)
# gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
#Based on these results we take max_depth=4, num_leaves=10.

#params_test1={'max_depth': range(3,8,1), 'num_leaves':range(5, 100, 5)}
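# sklearn.grid_search was removed from modern scikit-learn; the equivalent search
# with the current API (GridSearchCV from sklearn.model_selection, and cv_results_
# instead of grid_scores_) would look roughly like this:
# from sklearn.model_selection import GridSearchCV
# gsearch1 = GridSearchCV(
#     estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary',
#                                  learning_rate=0.1, n_estimators=188),
#     param_grid={'max_depth': range(3, 8, 1), 'num_leaves': range(5, 100, 5)},
#     scoring='roc_auc', cv=5, n_jobs=-1)
# gsearch1.fit(X_train, y_train)
# print(gsearch1.best_params_, gsearch1.best_score_)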

#print(X)
#Plug the tuned parameters straight into the model
#lgb.cv(categorical_feature=quantity,)
from sklearn import metrics
model=lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',
                         metrics='auc',learning_rate=0.01,
                         n_estimators=1000, max_depth=4, num_leaves=8,
                         max_bin=255,min_data_in_leaf=81,bagging_fraction=0.7,
                         bagging_freq= 30, feature_fraction= 0.8,lambda_l1=0.1,
                         lambda_l2=0,min_split_gain=0.1,
                         )
model.fit(X_train,y_train)

y_pre = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # positive-class probability

print("acc:", metrics.accuracy_score(y_test, y_pre))
print("auc:", metrics.roc_auc_score(y_test, y_proba))  # AUC needs scores, not hard labels


