from h2o.estimators.gbm import H2OGradientBoostingEstimator
from pysparkling import *
#from h2o.estimators.deeplearning import H2ODeepLearningEstimator
# Spin up a local Spark session sized for single-machine experimentation,
# then attach an H2O context to it.
ss = (
    SparkSession.builder
    .appName("H2O-GBM")
    .master("local[*]")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .config("spark.default.parallelism", "8")
    .getOrCreate()
)
hc = H2OContext.getOrCreate(ss)

# BUG FIX: the original read via an undefined name `spark`; the session
# created above is bound to `ss`.
df = ss.read.csv(
    path='/home/jerry/geoplatform/gai_platform/data/feature_filter/user.csv',
    header=True,
    inferSchema=True,
)
h2o_df = hc.as_h2o_frame(df, framename='df_h20')

model_gbm = H2OGradientBoostingEstimator(
    ntrees=50,
    max_depth=6,
    learn_rate=0.1,
    distribution="bernoulli",
)

# BUG FIX: exclude the response column from the predictor list — the
# original passed every column (including "target") as x.
predictors = [col for col in h2o_df.names if col != "target"]

# 60/20/20 split. split_frame returns frames in ratio order:
# frs[0] = 0.6, frs[1] = 0.2, frs[2] = the remaining 0.2.
ratios = [0.6, 0.2]
frs = h2o_df.split_frame(ratios, seed=12345)
train = frs[0]
train.frame_id = "Train"
valid = frs[2]
valid.frame_id = "Validation"
test = frs[1]
test.frame_id = "Test"

model_gbm.train(x=predictors, y="target", training_frame=train, validation_frame=valid)

# Map each feature name to its percentage importance. varimp() rows are
# (variable, relative_importance, scaled_importance, percentage);
# variable names are unique, so a comprehension matches the original
# setdefault loop exactly.
importance_cols = {row[0]: row[3] for row in model_gbm.varimp()}