# -*- coding: utf-8 -*-
"""Train a LightGBM binary classifier on the MySQL table `smjy2`.

Pipeline: load the table, label-encode the categorical (non-float64)
columns, split 70/30 into train/test, fit an LGBMClassifier with the
hyperparameters found by an earlier grid search, and report accuracy
and AUC on the held-out set.
"""
import lightgbm as lgb
import pandas as pd
import pymysql
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# charset='utf8' so Chinese text columns are not mangled on read.
dbconn = pymysql.connect(
    host="127.0.0.1",
    database="test",
    user="root",
    password="111111",
    port=3306,
    charset="utf8",
)

# Pull the whole table; cp4_flag is the binary target, cust_no is only
# a customer identifier and must not be used as a feature.
sqlcmd = "select * from smjy2"
data = pd.read_sql(sqlcmd, dbconn)

X = data.drop(["cp4_flag", "cust_no"], axis=1)
y = data["cp4_flag"]
print(X.dtypes)

# Columns that are NOT float64 are treated as categorical and
# label-encoded to integers.
# BUG FIX: the original encoded `data` AFTER `X` had already been
# copied from it, so the encoding never reached the model; encode X
# itself. Also index with X[c] (1-D Series) — fit_transform rejects
# the 2-D frame that X[[c]] produces.
quantity = [attr for attr in X.columns if X.dtypes[attr] != "float64"]
print(quantity)
for c in quantity:
    # astype(str) guards against mixed/NaN values inside object columns
    X[c] = LabelEncoder().fit_transform(X[c].astype(str))

# 70/30 train/test split. random_state=None keeps the original's
# nondeterministic split; set an integer here for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=None
)

# Hyperparameters (max_depth=4, num_leaves=8, ...) come from an
# earlier GridSearchCV run — presumably on this same table; re-tune
# if the data changes. TODO(review): confirm.
model = lgb.LGBMClassifier(
    boosting_type="gbdt",
    objective="binary",
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=4,
    num_leaves=8,
    max_bin=255,
    min_data_in_leaf=81,
    bagging_fraction=0.7,
    bagging_freq=30,
    feature_fraction=0.8,
    lambda_l1=0.1,
    lambda_l2=0,
    min_split_gain=0.1,
)
# Declare the categorical columns directly on fit(). The original built
# lgb.Dataset objects carrying categorical_feature but then trained via
# the sklearn API, which never saw them.
model.fit(X_train, y_train, categorical_feature=quantity)

y_pre = model.predict(X_test)
print("acc:", metrics.accuracy_score(y_test, y_pre))
# BUG FIX: AUC must be computed from the positive-class probability,
# not from hard 0/1 predictions, which collapses the ranking.
y_score = model.predict_proba(X_test)[:, 1]
print("auc:", metrics.roc_auc_score(y_test, y_score))
除了交叉验证之外的 LightGBM 代码
最新推荐文章于 2022-04-28 00:09:04 发布