"""Train an XGBoost regressor on stock feature/label records stored in MongoDB.

Pipeline: query feature vectors and labels by date range from the
`autoTrading.stockDataTable2` collection, assemble them into a DataFrame,
train an XGBoost gbtree regressor with a validation watchlist, save the
model to ./xgb.model, then reload it and predict on a held-out split.
"""
from pymongo import MongoClient
from sklearn.linear_model import LinearRegression  # kept from original; unused here
# NOTE(review): the original imported `sklearn.externals.joblib` and
# `sklearn.cross_validation` — both were removed from scikit-learn
# (0.23 and 0.20 respectively). joblib was unused and is dropped;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
import time
import numpy as np
import pandas as pd
import xgboost as xgb
import os
import csv

# Connect to MongoDB; the database/collections are created lazily on first write.
conn2 = MongoClient('192.168.12.152', 27017)
tradeDB = conn2.autoTrading
tradeTable2 = tradeDB.stockDataTable2
tradeNameTable = tradeDB.stockNameTable


def limitWithStock(oldtime, newtime, symbol):
    """Return a cursor over one stock's records with oldtime < timess < newtime.

    `timess` is an integer date key of the form YYYYMMDD; both bounds are
    exclusive. `symbol` selects a single stock code.
    """
    return tradeTable2.find(
        {"timess": {'$gt': oldtime, '$lt': newtime}, "stock": symbol})


def limitCondition(oldtime, newtime):
    """Return a cursor over all stocks' records with oldtime < timess < newtime."""
    return tradeTable2.find({"timess": {'$gt': oldtime, '$lt': newtime}})


def getTrainData():
    """Collect training features and labels from MongoDB.

    Returns
    -------
    list
        ``[train_Mx, train_My]`` where ``train_Mx`` is a list of feature
        vectors and ``train_My`` the matching list of labels, for records
        dated between 20070101 and 20090101 (exclusive).
    """
    train_Mx = []
    train_My = []
    trainData = limitCondition(20070101, 20090101)
    for item in trainData:
        train_Mx.append(item['feature'])
        train_My.append(item['label'])
    return [train_Mx, train_My]


def trans_df(a):
    """Stack features and labels into one DataFrame with the label column named 'y'.

    Parameters
    ----------
    a : list
        ``[features, labels]`` as returned by :func:`getTrainData`.

    Returns
    -------
    pandas.DataFrame
        Feature columns followed by the label column, renamed to ``'y'``.
    """
    feature = np.array(a[0])
    label = np.array(a[1]).reshape(-1, 1)  # column vector so hstack appends it
    data = pd.DataFrame(np.hstack((feature, label)))
    # FIX: the original hard-coded the label column index as 4, which only
    # worked for exactly 4 features. The label is always the last column.
    data.rename(columns={data.columns[-1]: 'y'}, inplace=True)
    return data


def xg_boost_train(params, num_rounds, xgb_train, stopping_rounds):
    """Train an XGBoost model with early stopping and save it to ./xgb.model.

    Parameters
    ----------
    params : dict
        Booster parameters passed to :func:`xgb.train`.
    num_rounds : int
        Maximum number of boosting rounds.
    xgb_train : pandas.DataFrame
        Training data containing a ``'y'`` label column.
    stopping_rounds : int
        ``early_stopping_rounds`` for :func:`xgb.train`.
    """
    plst = list(params.items())
    # Hold out 30% of the training split for the validation watchlist.
    train_xy, val = train_test_split(xgb_train, test_size=0.3, random_state=1)
    y = train_xy.y
    X = train_xy.drop(['y'], axis=1)
    val_y = val.y
    val_X = val.drop(['y'], axis=1)
    xgb_val = xgb.DMatrix(val_X, label=val_y)
    xgb_train = xgb.DMatrix(X, label=y)
    watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]
    model = xgb.train(plst, xgb_train, num_rounds, watchlist,
                      early_stopping_rounds=stopping_rounds)
    model.save_model('./xgb.model')  # persist the trained booster
    print("best best_ntree_limit", model.best_ntree_limit)


if __name__ == '__main__':
    a = getTrainData()
    data = trans_df(a)
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=1)
    params = {
        'booster': 'gbtree',
        'objective': 'reg:linear',  # squared-error regression objective
        'gamma': 0.1,               # min loss reduction to split; larger = more conservative
        'max_depth': 12,            # tree depth; larger trees overfit more easily
        'lambda': 2,                # L2 regularization weight; larger = less overfitting
        'subsample': 0.7,           # row subsampling per tree
        'colsample_bytree': 0.7,    # column subsampling per tree
        'min_child_weight': 3,      # min sum of instance hessian per leaf;
                                    # smaller values make overfitting easier
        'silent': 0,                # 0 = print training info, 1 = silent
        'eta': 0.007,               # learning rate
        'seed': 1000,
        'nthread': 7,               # CPU thread count
        # 'eval_metric': 'auc'
    }
    num_rounds = 50
    # NOTE(review): stopping_rounds (100) exceeds num_rounds (50), so early
    # stopping can never trigger with these settings — confirm intent.
    stopping_rounds = 100
    xg_boost_train(params, num_rounds, train_data, stopping_rounds)
    print("跑到这里了model.predict")
    model = xgb.Booster(model_file='./xgb.model')  # reload the saved model
    test_data = test_data.drop(['y'], axis=1)
    test_data = xgb.DMatrix(test_data)
    preds = model.predict(test_data)
    preds_list = preds.tolist()
    print(preds_list)
# XGBoost Demo2
# (blog scrape metadata: originally published 2019-06-28 08:39:48)