lightgbm训练

#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 21:19:09 2018

@author: hello4720
"""
import numpy as np
import pandas as pd
import lightgbm as lgb
import argparse
import os
import json
# Command-line configuration: every setting arrives as an optional flag.
parser = argparse.ArgumentParser(description='manual to this script')
for flag, flag_type in (
        ('--onehot_path', str),   # directory holding the per-column one-hot JSON tables
        ('--train_file', str),    # input CSV (gbk-encoded)
        ('--train_output', str),  # where the scored CSV is written
        ('--thread', int),        # LightGBM worker-thread count
        ('--model_name', str),    # path the trained model is saved to
):
    parser.add_argument(flag, type=flag_type, default=None)
args = parser.parse_args()
# Allow fairly large arrays to be printed before numpy truncates them.
np.set_printoptions(threshold=5000)

# Ensure the one-hot table directory exists.  makedirs inside try/except is
# race-free (the original exists()+mkdir pair could fail if the directory
# appeared between the check and the call) and also creates missing parents.
try:
    os.makedirs(args.onehot_path)
except OSError:
    if not os.path.isdir(args.onehot_path):
        raise
# The training CSV uses Chinese (gbk-encoded) column names.
dataset = pd.read_csv(args.train_file, encoding='gbk')
# Columns 2..11 are the categorical features that get one-hot expanded below;
# column 12 is a raw numeric feature appended as-is later.
train = dataset.iloc[:, 2:12].values
train2 = dataset.iloc[:, 12].values
# NOTE(review): the label comes from column -11 (11th from the end), which
# only lines up with one specific CSV layout — confirm against the data file.
labels = dataset.iloc[:, -11].values

def _decode_onehot(encoded):
    """Turn a stored vector string like '[0. 1. 0.]' into an int array.

    The JSON tables store each one-hot row as the str() of a float array;
    stripping '.', ' ', '[' and ']' leaves the bare 0/1 digits.
    """
    digits = (encoded.replace('.', '').replace(' ', '')
                     .replace('[', '').replace(']', ''))
    return np.array([int(ch) for ch in digits])


def _load_onehot_column(table_name, col):
    """Map column `col` of `train` through the lookup table `<table_name>.json`.

    Returns a 2-D array with one decoded one-hot row per training sample.
    """
    with open('{}/{}.json'.format(args.onehot_path, table_name)) as f:
        table = json.load(f)
    return np.array([_decode_onehot(table[value]) for value in train[:, col]])


# One JSON lookup table per categorical column, in the same order as the
# columns of `train` (CSV columns 2..11).  The runtime file paths are
# identical to the original ten hand-written stanzas this loop replaces.
_ONEHOT_TABLES = ['定损项目名称', '合作类型', '操作类型', '喷漆类型', '国别',
                  '厂牌', '车系', '修理厂类型', '工时单价类型', '是否承修厂牌']
(intermediary0, intermediary1, intermediary2, intermediary3, intermediary4,
 intermediary5, intermediary6, intermediary7, intermediary8,
 intermediary9) = [_load_onehot_column(name, i)
                   for i, name in enumerate(_ONEHOT_TABLES)]


# Stack the ten one-hot blocks side by side.  np.hstack replaces the original
# per-row Python loop of slice assignments with one vectorized copy; the cast
# to float64 matches the dtype of the np.zeros buffer the loop used to fill.
_parts = [intermediary0, intermediary1, intermediary2, intermediary3,
          intermediary4, intermediary5, intermediary6, intermediary7,
          intermediary8, intermediary9]
# Per-table one-hot widths (kept for parity with the original `num` list).
num = [len(part[0]) for part in _parts]
trains = np.hstack(_parts).astype(np.float64)
print(trains.shape, train2.shape)
# Append the raw numeric feature (CSV column 12) as the final column.
train2 = np.expand_dims(train2, axis=1)
trainss = np.concatenate((trains, train2), axis=1)
print('开始')
# Wrap the assembled feature matrix / labels for LightGBM.
lgb_train = lgb.Dataset(trainss, label=labels)
### 开始训练
print('设置参数')
params = {
    # BUG FIX: the original dict set both 'boosting_type': 'gbdt' and its
    # alias 'boosting': 'dart' — two conflicting values for one parameter,
    # with the 'boosting' alias being the one LightGBM actually honoured.
    # Keep a single unambiguous key so DART is explicit.
    'boosting': 'dart',
    'objective': 'regression',
    'metric': 'rmse',

    'learning_rate': 0.01,
    'num_leaves': 50,
    'max_depth': 7,

    'max_bin': 10,
    'min_data_in_leaf': 8,

    'feature_fraction': 0.6,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'num_threads': args.thread,   # canonical spelling of the 'num_thread' alias
    'lambda_l1': 0,
    'lambda_l2': 0,
    'min_split_gain': 0
}

print("开始训练")
# BUG FIX: early_stopping_rounds requires at least one validation set —
# lgb.train rejects it otherwise.  No held-out data exists in this script,
# so we watch the training set itself; note this makes the stopping
# criterion optimistic, and DART boosting does not truly support early
# stopping anyway (consider a real validation split — TODO confirm intent).
gbm = lgb.train(params,                      # parameter dict
                lgb_train,                   # training data
                num_boost_round=2000,        # up to 2000 iterations
                valid_sets=[lgb_train],      # required for early stopping
                early_stopping_rounds=50)    # stop after 50 stale rounds
# cv_results = lgb.cv(
#     params, lgb_train, num_boost_round=2000, nfold=5, stratified=False, shuffle=True, metrics='rmse',
#     early_stopping_rounds=50, verbose_eval=50, show_stdv=True, seed=0)
#
# print('best n_estimators:', len(cv_results['rmse-mean']))
# print('best cv score:', cv_results['rmse-mean'][-1])
from sklearn.model_selection import GridSearchCV
# model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
#                               learning_rate=0.01, n_estimators=1000, max_depth=6,
#                               metric='rmse', bagging_fraction = 1,feature_fraction = 0.6)
#
# params_test1={
#     'max_depth': range(3,8,2),
#     'num_leaves':range(50, 170, 30)
# }
# gsearch1 = GridSearchCV(estimator=model_lgb, param_grid=params_test1, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
# gsearch1.fit(trainss,labels)
# print(gsearch1.best_params_, gsearch1.best_score_)
# print(gsearch1.grid_scores_)
# In-sample prediction: the model is scored on the data it was trained on,
# so the R^2 below is an optimistic training-set estimate, not validation.
preds_offline = gbm.predict(trainss, num_iteration=gbm.best_iteration)
# preds_offline is already a 1-D vector; the original np.c_[...] wrapper was
# a redundant reshape before column assignment.
dataset['预测结果'] = preds_offline
from sklearn.metrics import r2_score
print(r2_score(labels, preds_offline))
# index=False avoids writing a spurious unnamed index column, and writing
# with the same 'gbk' encoding the input CSV was read with keeps the output
# round-trippable with the rest of the pipeline.
dataset.to_csv(args.train_output, index=False, encoding='gbk')
gbm.save_model(args.model_name)
# gbm = lgb.Booster(model_file='model.txt')
## 特征选择
# df = pd.DataFrame(train.columns.tolist(), columns=['feature'])
# df['importance']=list(gbm.feature_importance())
# df = df.sort_values(by='importance',ascending=False)
# df.to_csv("feature_score_20180405.csv",index=None,encoding='gbk')

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值