# gcforest training:
import argparse
import numpy as np
import os
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from gcforest.gcforest import GCForest
from gcforest.utils.config_utils import load_json
def parse_args(argv=None):
    """Parse the command-line options of the training script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            existing ``parse_args()`` calls behave as before.

    Returns:
        argparse.Namespace with two string attributes:
            model: defaults to ``'gcforest'``.
            data: defaults to ``'mooc_data.txt'``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model", type=str, default='gcforest', help='Train Model File')
    parser.add_argument(
        "--data", type=str, default='mooc_data.txt', help='Dataset')
    return parser.parse_args(argv)
def get_toy_config(n_classes=10):
    """Build the default, in-code GCForest configuration.

    The returned dictionary has a single ``"cascade"`` entry holding the
    cascade settings and a list of four estimator specifications
    (XGBClassifier, RandomForestClassifier, ExtraTreesClassifier and
    LogisticRegression), each configured with ``n_folds`` of 5.

    Args:
        n_classes: Value stored under ``config["cascade"]["n_classes"]``.
            Defaults to 10, the value that used to be hard-coded.
            NOTE(review): this presumably has to match the number of
            distinct labels in the training data - confirm against the
            dataset in use.

    Returns:
        A freshly built ``dict``; every call returns new objects, so the
        caller may mutate the result freely.
    """
    estimators = [
        {
            "n_folds": 5,
            "type": "XGBClassifier",
            "n_estimators": 10,
            "max_depth": 5,
            "objective": "multi:softprob",
            "silent": True,
            "nthread": -1,
            "learning_rate": 0.1,
        },
        {
            "n_folds": 5,
            "type": "RandomForestClassifier",
            "n_estimators": 10,
            "max_depth": None,
            "n_jobs": -1,
        },
        {
            "n_folds": 5,
            "type": "ExtraTreesClassifier",
            "n_estimators": 10,
            "max_depth": None,
            "n_jobs": -1,
        },
        {"n_folds": 5, "type": "LogisticRegression"},
    ]
    cascade = {
        "random_state": 0,
        "max_layers": 100,
        "early_stopping_rounds": 3,
        "n_classes": n_classes,
        "estimators": estimators,
    }
    return {"cascade": cascade}
def _load_split(path, train_fraction=0.2):
    """Load a numeric text table, shuffle its rows and split it into train/test parts.

    Column 0 is dropped, the last column is used as the label and the
    columns in between are used as features.
    NOTE(review): column 0 is presumably a record identifier - confirm.

    Args:
        path: Location of the text file readable by ``np.loadtxt``.
        train_fraction: Share of the shuffled rows placed in the training
            part; the remaining rows form the test part.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)`` of NumPy arrays.

    Raises:
        ValueError: If ``path`` is not an existing file, or if the data
            set is too small to yield a single training row.
    """
    if not os.path.isfile(path):
        raise ValueError("The file does not exist!")

    # ndmin=2 keeps a single-row file two-dimensional so that the column
    # slicing below stays valid.
    dataset = np.loadtxt(path, ndmin=2)

    # NOTE(review): the shuffle is not seeded, so the split differs from
    # run to run even though the model config fixes random_state.
    np.random.shuffle(dataset)

    # NOTE(review): with the default fraction only 20% of the rows are
    # used for training and 80% for testing. This is the original
    # behavior; confirm it is not an inverted 80/20 split.
    n_train = int(dataset.shape[0] * train_fraction)
    if n_train == 0:
        raise ValueError("Not enough samples to build a training split!")

    features = dataset[:, 1:-1]
    labels = dataset[:, -1]
    return features[:n_train], labels[:n_train], features[n_train:], labels[n_train:]


def main():
    """Train a GCForest on a data set, report test accuracy and pickle the model.

    The ``--model`` option selects the configuration: the literal value
    ``'gcforest'`` uses ``get_toy_config()``, any other value is handed to
    ``load_json``. The ``--data`` option names a file inside the
    ``dataset`` directory. The fitted model is written to
    ``test_gcforest.pkl`` in the current working directory.
    """
    args = parse_args()
    if args.model == 'gcforest':
        config = get_toy_config()
    else:
        config = load_json(args.model)

    data_path = os.path.join('dataset', args.data)
    X_train, y_train, X_test, y_test = _load_split(data_path)

    gc = GCForest(config)
    # If the model costs too much memory, you can force gcforest not to
    # keep the model in memory with gc.set_keep_model_in_mem(False)
    # (the default is True).

    # Insert a singleton axis: (n_samples, n_features) -> (n_samples, 1, n_features).
    X_train = X_train[:, np.newaxis, :]
    X_test = X_test[:, np.newaxis, :]

    # X_train_enc is only needed by the optional stacking example below.
    X_train_enc = gc.fit_transform(X_train, y_train)
    # X_enc is the concatenated predict_proba result of each estimator of
    # the last layer of the GCForest model.
    # X_enc.shape =
    #   (n_datas, n_estimators * n_classes): if cascade is provided
    #   (n_datas, n_estimators * n_classes, dimX, dimY): if only the fine-grained part is provided
    # You can also pass X_test, y_test to fit_transform; the accuracy on
    # the test data is then logged during training:
    #   X_train_enc, X_test_enc = gc.fit_transform(X_train, y_train, X_test=X_test, y_test=y_test)
    # WARNING: if the model is not kept in memory
    # (gc.set_keep_model_in_mem(False)), you have to use
    #   gc.fit_transform(X_train, y_train, X_test=X_test, y_test=y_test)
    # to evaluate your model.
    # NOTE(review): the original comment said set_keep_model_in_mem(True),
    # which contradicts the note above and looks like a typo - confirm.

    y_pred = gc.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print("Test Accuracy of GcForest = {:.2f} %".format(acc * 100))

    # Optional: pass X_enc to another classifier on top of gcForest,
    # e.g. xgboost/RF. Disabled by default.
    #
    # X_test_enc = gc.transform(X_test)
    # X_train_enc = X_train_enc.reshape((X_train_enc.shape[0], -1))
    # X_test_enc = X_test_enc.reshape((X_test_enc.shape[0], -1))
    # X_train_origin = X_train.reshape((X_train.shape[0], -1))
    # X_test_origin = X_test.reshape((X_test.shape[0], -1))
    # X_train_enc = np.hstack((X_train_origin, X_train_enc))
    # X_test_enc = np.hstack((X_test_origin, X_test_enc))
    # print("X_train_enc.shape={}, X_test_enc.shape={}".format(X_train_enc.shape, X_test_enc.shape))
    # clf = RandomForestClassifier(n_estimators=1000, max_depth=None, n_jobs=-1)
    # clf.fit(X_train_enc, y_train)
    # y_pred = clf.predict(X_test_enc)
    # acc = accuracy_score(y_test, y_pred)
    # print("Test Accuracy of Other classifier using gcforest's X_encode = {:.2f} %".format(acc * 100))

    # Dump the fitted model.
    with open("test_gcforest.pkl", "wb") as f:
        pickle.dump(gc, f, pickle.HIGHEST_PROTOCOL)

    # Optional: load the model back and re-evaluate. Disabled by default.
    # Only unpickle files you created yourself; pickle can execute
    # arbitrary code while loading.
    #
    # with open("test_gcforest.pkl", "rb") as f:
    #     gc = pickle.load(f)
    # y_pred = gc.predict(X_test)
    # acc = accuracy_score(y_test, y_pred)
    # print("Test Accuracy of GcForest (save and load) = {:.2f} %".format(acc * 100))


if __name__ == "__main__":
    main()