#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 21:19:09 2018
@author: hello4720
"""
import numpy as np
import pandas as pd
import lightgbm as lgb
import argparse
import os
import json
# Command-line interface: every path/setting arrives as a flag (all optional,
# defaulting to None — the script will fail fast if a required one is missing).
parser = argparse.ArgumentParser(description='manual to this script')
parser.add_argument('--onehot_path', type=str, default=None)   # dir holding the <feature>.json one-hot maps
parser.add_argument('--train_file', type=str, default=None)    # input CSV (GBK-encoded)
parser.add_argument('--train_output', type=str, default=None)  # where to write the CSV with predictions
parser.add_argument('--thread', type=int, default=None)        # LightGBM thread count
parser.add_argument('--model_name', type=str, default=None)    # path for the saved model file
args = parser.parse_args()

np.set_printoptions(threshold=5000)

# makedirs (not mkdir) so missing intermediate directories are created too.
if not os.path.exists(args.onehot_path):
    os.makedirs(args.onehot_path)

dataset = pd.read_csv(args.train_file, encoding='gbk')  # note: adjust to your own data path
train = dataset.iloc[:, 2:12].values   # ten categorical feature columns (cols 2..11)
train2 = dataset.iloc[:, 12].values    # one extra raw feature column (col 12)
labels = dataset.iloc[:, -11].values   # regression target
# NOTE(review): the target is the 11th column from the END of the CSV; confirm
# this index against the actual file layout — it is easy to break by adding columns.
# --- One-hot encode the ten categorical columns of `train` (columns 0-9) ---
# Each JSON file in args.onehot_path maps a raw category value to the STRING
# form of its one-hot vector (e.g. "[0. 1. 0.]"); we parse that string back
# into a 1-D int array.  The original code repeated this stanza ten times.

def _decode_onehot(encoded):
    """Parse a one-hot vector stored as a string like '[0. 1. 0.]' into a 1-D int array."""
    digits = encoded.replace('.', '').replace(' ', '').replace('[', '').replace(']', '')
    return np.array([int(ch) for ch in digits])

def _encode_column(mapping_name, column):
    """Load the value->one-hot mapping for one feature and encode the whole column.

    Returns a 2-D int array of shape (len(column), onehot_width).
    """
    with open('{}/{}.json'.format(args.onehot_path, mapping_name)) as f:
        mapping = json.load(f)
    return np.array([_decode_onehot(mapping[value]) for value in column])

# Order matters: it must match the column order of `train` (columns 0..9).
_ONEHOT_COLUMNS = ['定损项目名称', '合作类型', '操作类型', '喷漆类型', '国别',
                   '厂牌', '车系', '修理厂类型', '工时单价类型', '是否承修厂牌']

# Bind the individual names the rest of the script expects.
(intermediary0, intermediary1, intermediary2, intermediary3, intermediary4,
 intermediary5, intermediary6, intermediary7, intermediary8,
 intermediary9) = [_encode_column(name, train[:, col])
                   for col, name in enumerate(_ONEHOT_COLUMNS)]
# --- Assemble the final feature matrix ---
# The ten encoded blocks are simply laid side by side; the original code did
# this with a per-row Python loop, which is just np.hstack done by hand.
_onehot_parts = [intermediary0, intermediary1, intermediary2, intermediary3,
                 intermediary4, intermediary5, intermediary6, intermediary7,
                 intermediary8, intermediary9]
# Width of each one-hot segment (same values the original collected into `num`).
num = [len(part[0]) for part in _onehot_parts]
# astype(float64) keeps the dtype the original np.zeros buffer produced.
trains = np.hstack(_onehot_parts).astype(np.float64)
print(trains.shape, train2.shape)
train2 = np.expand_dims(train2, axis=1)  # (n,) -> (n, 1) so it can be appended as a column
trainss = np.concatenate((trains, train2), axis=1)  # one-hots + raw column 12
print('开始')
# Wrap the feature matrix and labels in a LightGBM Dataset.
lgb_train = lgb.Dataset(trainss, label=labels)
### start training
print('设置参数')
params = {
    # The original set both 'boosting_type': 'gbdt' and the canonical alias
    # 'boosting': 'dart'; LightGBM resolves the conflict in favour of
    # 'boosting', so 'dart' was the effective choice — keep it, drop the
    # contradictory duplicate.
    'boosting': 'dart',
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.01,
    'num_leaves': 50,
    'max_depth': 7,
    'max_bin': 10,
    'min_data_in_leaf': 8,
    'feature_fraction': 0.6,
    'bagging_fraction': 1,
    'bagging_freq': 0,
    'num_threads': args.thread,  # canonical spelling of the 'num_thread' alias
    'lambda_l1': 0,
    'lambda_l2': 0,
    'min_split_gain': 0
}
print("开始训练")
# Early stopping requires at least one validation set, otherwise LightGBM's
# callback raises at the first iteration.  We evaluate on the training set so
# best_iteration is defined for predict() below.
# NOTE(review): stopping on the training metric rarely triggers — a held-out
# validation split would be more meaningful here.
gbm = lgb.train(params,                     # parameter dict
                lgb_train,                  # training set
                num_boost_round=2000,       # up to 2000 boosting iterations
                valid_sets=[lgb_train],
                early_stopping_rounds=50)   # early-stopping patience
# (Commented-out lgb.cv / GridSearchCV tuning and feature-importance snippets
# removed; recover them from version control if the sweep is needed again.)
from sklearn.metrics import r2_score

# In-sample prediction at the best iteration found during training.
preds_offline = gbm.predict(trainss, num_iteration=gbm.best_iteration)
# Assign the 1-D array directly; the original wrapped it in np.c_[...], which
# builds a fragile (n, 1) 2-D column for the DataFrame assignment.
dataset['预测结果'] = preds_offline
print(r2_score(labels, preds_offline))  # training-set R^2 — optimistic, no held-out data
dataset.to_csv(args.train_output)
gbm.save_model(args.model_name)
# To reload later: gbm = lgb.Booster(model_file=args.model_name)