# encoding=utf-8
import time
import matplotlib.pyplot as plt
import pylab as pl
print time.strftime("%I:%M:%S")
from operator import itemgetter
from sklearn import linear_model
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.cross_validation import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
# cross_validation.cross_val_score
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.datasets import load_svmlight_file
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import auc, roc_curve, roc_auc_score
import math
import argparse
# ---------------------------------------------------------------------------
# Command-line configuration.
# NOTE(review): several options parsed here (--roc, --cv, --kfold, --scale,
# --aggr, --size, ...) are bound to module globals below but never read in
# this file -- presumably consumed by code elsewhere; confirm before pruning.
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument('--filename')
parser.add_argument('--output')
parser.add_argument('--feat_flag')
parser.add_argument('--model')
# parser = argparse.ArgumentParser(description="skeanrn Classification")
parser.add_argument('--train_file', type=str,
                    default="", help='训练数据集')
parser.add_argument('--test_file', type=str,
                    default="", help='测试数据集')
# GBDT / ensemble hyper-parameters
parser.add_argument('--n_estimators', type=int,
                    default=200, help='n_estimators')
parser.add_argument('--learning_rate', type=float,
                    default=0.2, help='learning_rate')
parser.add_argument('--max_depth', type=int,
                    default=3, help='max_depth')
parser.add_argument('--subsample', type=float,
                    default=1.0, help='subsample')
parser.add_argument('--max_features', type=str,
                    default=None, help='max_features: None,auto,sqrt,log2')
parser.add_argument('--threshold', type=float,
                    default=0.5, help='threshold for classifier')
parser.add_argument('--feature', type=str,
                    default=None, help='file to save feature importances')
parser.add_argument('--city', type=str,
                    default="", help='city file to predict')
parser.add_argument('--load_model', type=int,
                    default=0, help='load model skip trainning if not zero')
parser.add_argument('--model_file', type=str,
                    default="", help='save model_file if not null. '
                    'require not null if load_model.')
parser.add_argument('--cost', type=float,
                    default=1, help='weight of samples')
parser.add_argument('--clf', type=str,
                    default="gbdt", help='Classification 分类器')
# NOTE: a *string* default with type=int is re-parsed through type by
# argparse, so default="0" yields the int 0 here.
parser.add_argument('--roc', type=int,
                    default="0", help='compute roc table if not zero')
parser.add_argument('--roc_file', type=str,
                    default="roc", help='roc_file prefix file name')
parser.add_argument('--roc_percent', type=float,
                    default=-1, help='roc_percent to predict')
parser.add_argument('--roc_thres', type=str,
                    default="0", help='roc_thres to predict, a num or list')
parser.add_argument('--cv', type=int,
                    default=1, help='Cross_Validation')
parser.add_argument('--kfold', type=int, default=0,
                    help='use kfold if not zero')
parser.add_argument('--size', type=float, default=0.7,
                    help='trian set size, float or int')
parser.add_argument('--issvmformat', type=int,
                    default=0, help='issvmformat')
parser.add_argument('--startfield', type=int, default=0,
                    help='which field to start anasy if not svmformat')
parser.add_argument('--scale', type=int, default=0,
                    help='scale the data,标准化')
parser.add_argument('--aggr', type=int, default=0,
                    help='融合模型')
args = parser.parse_args()
# Module-level globals read throughout the rest of the file.
n_estimators = args.n_estimators
subsample = args.subsample
learning_rate = args.learning_rate
max_depth = args.max_depth
cost = args.cost
threshold = args.threshold
feature = args.feature
model_file = args.model_file
roc = args.roc
cv = args.cv
issvmformat = args.issvmformat
kfold = args.kfold
aggr = args.aggr
file_name = args.filename
model_name = args.model
output = args.output  # suffix appended to every result file written below
# feat_flag = int(args.feat_flag)
import plot_features
def getdata(sourceFile, index_type, class_res=None):
    """Load a tab-separated data file: label in column 0, features after.

    :param sourceFile: path to a file of '\t'-separated rows,
        column 0 = label, remaining columns = float features
    :param index_type: when truthy, the label is binarized (0.0 when the
        raw field is the literal '0.0', else 1.0) and the raw float label
        is additionally collected as a regression target
    :param class_res: unused switch kept for caller compatibility; both
        branches read the label identically
    :return: (features, labels, raw_labels) when index_type is truthy,
        otherwise (features, labels)
    """
    train_x_res = []
    train_y_res = []
    train_y_reg = []
    # 'with' guarantees the file is closed even if a row fails to parse
    # (the original opened/closed manually and leaked on error).
    with open(sourceFile, "r") as f:
        for data in f:
            datablock = data.strip('\n').split('\t')
            if index_type:
                # binarize on the exact string form of the label field
                index_label = 0.0 if datablock[0] == '0.0' else 1.0
                train_y_reg.append(float(datablock[0]))
                train_y_res.append(index_label)
            else:
                train_y_res.append(float(datablock[0]))
            # remaining columns are the feature vector
            train_x_res.append([float(ii) for ii in datablock[1:]])
    if index_type:
        return train_x_res, train_y_res, train_y_reg
    return train_x_res, train_y_res
def confusematrix(testlabel=[], prey=[]):
"""
计算混肴矩阵,并输出结果
:param testlabel:
:param prey:
:return:
"""
# dic = {'00': 0, '01': 0, '10': 0, '11': 0}
# 第一个数字代表真实,第二个数字代表预测
# for i in range(len(testlabel)):
# key = str(int(testlabel[i])) + str(int(prey[i]))
# dic[key] += 1
cnt = 0
true_one = 0
true_zero = 0
tt = 0
tf = 0
ft = 0
ff = 0
for i in range(len(testlabel)):
cnt += 1
if cnt % 100000 == 0:
# print "读取第[%d]万行" % (cnt / 10000)
pass
# print testlabel[i],prey[i]
tagkey = str(int(testlabel[i])) + str(int(prey[i]))
try:
if tagkey == "11":
true_one += 1
tt += 1
elif tagkey == "01":
true_zero += 1
tf += 1
elif tagkey == "10":
true_one += 1
ft += 1
elif tagkey == "00":
true_zero += 1
ff += 1
except Exception as e:
print line, e
predict_one = tt + tf
predict_zero = ft + ff
print "数据总数: %d" % (cnt)
if cnt <= 1:
exit()
print "真实不为0: ", true_one, ", 真实为0: ", true_zero
print "预测不为0: ", predict_one, ", 预测为0: ", predict_zero
print "类别\t\t真实不为0\t真实为0\t精确率"
if predict_one == 0:
print "预测不为0\t%d\t\t%d\t\t%.4f" % (tt, tf, 0)
else:
print "预测不为0\t%d\t\t%d\t\t%.4f" % (tt, tf, 1.0 * tt / predict_one)
if predict_zero == 0:
print "预测为0: \t\t%d\t\t%d\t\t%.4f" % (ft, ff, 0)
else:
print "预测为0\t\t%d\t\t%d\t\t%.4f" % (ft, ff, 1.0 * ff / predict_zero)
if true_one == 0 and true_zero != 0:
print "召回率\t\t%.4f\t\t%.4f\t\t%.4f" % (
0, 1.0 * ff / true_zero, 1.0 * (tt + ff) / cnt)
elif true_one == 0 and true_zero == 0:
print "召回率\t\t%.4f\t\t%.4f\t\t%.4f" % (0, 0, 1.0 * (tt + ff) / cnt)
elif true_one != 0 and true_zero == 0:
print "召回率\t\t%.4f\t\t%.4f\t\t%.4f" % (
1.0 * tt / true_one, 0, 1.0 * (tt + ff) / cnt)
else:
print "召回率\t\t%.4f\t\t%.4f\t\t%.4f" % (
1.0 * tt / true_one, 1.0 * ff / true_zero, 1.0 * (tt + ff) / cnt)
print "F1值\t\t%.4f" % (2.0 * ff / (predict_zero + true_zero))
print "--------------------------------------------------------------"
def write_ks(traindata, trainlabel, testdata, testlabel, write_file, feat_name):
    """Compute a per-feature KS record over train+test combined.

    For each feature column, gathers the column values and labels from
    both sets, counts entries different from -1 (-1 is treated as the
    "missing" sentinel), and asks plot_features.plot_one_feature for the
    feature's KS value (presumably the KS statistic -- confirm against
    plot_features).

    :param write_file: directory/prefix passed through to plot_one_feature
    :param feat_name: list of feature names, parallel to the columns
    :return: list of [feature_name, ks_value, non_missing_ratio] records
    """
    feat_lenth = len(traindata[0])
    func_name = {"plot_ks"}
    feat_ks = []
    alldata_len = len(traindata) + len(testdata)
    for i in range(feat_lenth):
        data_value = []
        lable_value = []
        feat_true_num = 0
        index_res = []
        index_res.append(feat_name[i])
        # training-set portion of the column
        for ii in range(len(traindata)):
            data_value.append(traindata[ii][i])
            lable_value.append(trainlabel[ii])
            if not (traindata[ii][i] == -1):
                feat_true_num += 1
        # test-set portion of the column
        for ii in range(len(testdata)):
            data_value.append(testdata[ii][i])
            lable_value.append(testlabel[ii])
            if not (testdata[ii][i] == -1):
                feat_true_num += 1
        print feat_name[i]
        resent_ks = plot_features.plot_one_feature(data_value, lable_value, 100, write_file, func_name, 0, feat_name[i], feat_name[i], 0)
        index_res.append(resent_ks)
        # fraction of rows where this feature is present (not -1)
        index_res.append(feat_true_num / float(alldata_len))
        feat_ks.append(index_res)
    return feat_ks
def cacul_feat(feat_imp, feat_ks):
    """Append each feature's importance to its KS record and dump two
    rankings (by KS and by importance) to text files.

    Each feat_ks entry is [name, ks, coverage_ratio]; feat_imp[ii] is
    appended in place, giving [name, ks, ratio, importance].  Writes
    './feat_result_kssort<output>' and './feat_result_impsort<output>'
    using the module-level ``output`` suffix.

    Improvements vs. the original: files are opened with ``with`` (no
    leaked handle on a write error), the quadratic ``+=`` string build is
    replaced by join, and the sort lambdas that shadowed ``feat_ks`` are
    replaced by itemgetter.
    """
    for record, imp in zip(feat_ks, feat_imp):
        record.append(imp)

    def _dump(path, sort_index):
        # one ranking file: records sorted descending on column sort_index
        rows = sorted(feat_ks, key=itemgetter(sort_index), reverse=True)
        lines = [r[0] + ', ks:' + str(r[1]) + ', import:' + str(r[3]) +
                 ', radio:' + str(r[2]) + '\n' for r in rows]
        with open(path, 'w') as fp:
            fp.write(''.join(lines))

    _dump('./feat_result_kssort' + output, 1)   # ranked by KS
    _dump('./feat_result_impsort' + output, 3)  # ranked by importance
def write_res(prey_value, testlabel):
    """Write 'prediction,label' pairs, one per line, to './result_<output>'
    (``output`` is the module-level suffix).

    Improvements vs. the original: ``with`` instead of a manual
    open/close, and join instead of a quadratic ``+=`` string build.
    """
    lines = [str(p) + ',' + str(t) + '\n'
             for p, t in zip(prey_value, testlabel)]
    with open('./result_' + output, 'w') as fp:
        fp.write(''.join(lines))
def cross_spilit(traindata, trainlabel, number):
    """Split the sample into train/validation folds by row index.

    Row ii goes to the validation fold when ii % 10 is in ``number``,
    otherwise to the training fold.

    :param number: collection of fold ids (0..9) to hold out
    :return: (train_x, train_y, val_x, val_y)
    """
    train_x, train_y = [], []
    val_x, val_y = [], []
    for idx, (row, label) in enumerate(zip(traindata, trainlabel)):
        if idx % 10 in number:
            val_x.append(row)
            val_y.append(label)
        else:
            train_x.append(row)
            train_y.append(label)
    return train_x, train_y, val_x, val_y
def calcul_spilit(traindata, trainlabel):
    """Derive a probability threshold by 10-fold CV on the training set.

    Fits a GBDT per fold, feeds the fold's positive-class probabilities
    to plot_features.plot_one_feature ("plot_ks") and accumulates its
    return value plus the fold AUC, then returns the averaged value.

    :return: sum of per-fold plot_one_feature results / 1000.0
        NOTE(review): divisor is 1000 although there are only 10 folds;
        presumably plot_one_feature returns a value scaled by 100 --
        confirm against plot_features before changing.
    """
    clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                     subsample=subsample,
                                     learning_rate=learning_rate,
                                     max_features=0.5,
                                     max_depth=max_depth)
    sum_split = 0.0
    sum_auc = 0.0
    for i in range(10):
        next_i = i + 1  # NOTE(review): computed but never used
        if next_i > 9:
            next_i = next_i % 10
        # hold out fold i, train on the rest (same estimator re-fitted)
        train_x, train_y, val_x, val_y = cross_spilit(traindata, trainlabel, [i])
        clf.fit(train_x, train_y)
        prey_prob = clf.predict_proba(val_x)
        prey_value = []
        new_label_all = []  # NOTE(review): never used
        index_flag = 0
        for ii in prey_prob:
            # column 1 is the positive-class probability
            prey_value.append(ii[1])
        func_name = {"plot_ks"}
        sum_split += plot_features.plot_one_feature(prey_value, val_y, 200, './ks/', func_name, 0, 'classfication' + str(i), 'classfication', 0)
        sum_auc += auc_score(clf, val_x, val_y)[4]  # index 4 = AUC
    print 'average_split', sum_split / float(1000)
    print 'average_auc', sum_auc / float(10)
    return sum_split / float(1000)
def Classification(traindata, trainlabel, testdata, testlabel, feat_ks, test_y_reg=None,
                   clf="gbdt", cost=1):
    """Train the chosen classifier, print confusion matrices for both the
    default 0.5 decision and a CV-derived probability threshold, plot the
    KS curve, print metrics, and return the thresholded 0/1 labels.

    :param feat_ks: per-feature KS records (only consumed by the
        commented-out cacul_feat call)
    :param test_y_reg: raw regression targets, only referenced by
        commented-out debugging code
    :param clf: selector string: rf / svm / bg / ada / lr / dt / gbdt
    :param cost: positive-sample weight (applied on the gbdt/svm paths)
    :return: list of 0/1 labels from the re-thresholded probabilities
    """
    if cost != 1:
        # NOTE(review): this needs trainlabel to support vectorized
        # arithmetic (numpy array); a plain list would raise here.
        sample_weight = (cost - 1) * trainlabel + 1
    else:
        sample_weight = None
    clfname = clf.lower()
    # n_estimators = 200
    # 'clf' is re-bound from the selector string to the estimator object.
    if clfname == "rf":
        clf = RandomForestClassifier(n_estimators=n_estimators,
                                     # subsample=subsample,
                                     # learning_rate=learning_rate,
                                     max_depth=max_depth)
    elif clfname == "svm":
        clf = SVC(C=0.01,
                  kernel="rbf", degree=3, gamma=1,
                  coef0=0.0, probability=True, max_iter=n_estimators)
    elif clfname == "bg":
        clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(
            max_depth=max_depth),
            n_estimators=n_estimators,
            max_samples=subsample
            #learning_rate=learning_rate
            #max_depth=max_depth)
            )
    elif clfname == "ada":
        clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(
            max_depth=max_depth),
            n_estimators=n_estimators,
            #subsample=subsample,
            learning_rate=learning_rate,
            #max_depth=max_depth)
            )
    elif clfname == "lr":
        clf = LogisticRegression(penalty="l1",
                                 C=cost,
                                 class_weight='auto',
                                 max_iter=n_estimators,
                                 solver='liblinear')
    elif clfname == "dt":
        clf = DecisionTreeClassifier(splitter='best',
                                     max_features='auto',
                                     max_depth=max_depth)
    elif clfname == "gbdt":
        print 'n_estimators-----------------', n_estimators
        clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                         subsample=subsample,
                                         learning_rate=learning_rate,
                                         max_features=0.5,
                                         max_depth=max_depth)
    else:
        print "clf must be one of them(svm, lr, dt, ada, bg, gbdt)"
        exit()
    # only the gbdt/svm paths forward the per-sample weights
    if clfname in ["gbdt", "svm"]:
        clf.fit(traindata, trainlabel, sample_weight=sample_weight)
        print 'train ok!!!'
    else:
        clf.fit(traindata, trainlabel)
        print 'trian ok!!!!'
    # joblib.dump(clf, './model/gbdt_allmerg_0608' + output + '.model')
    print 'dumpok------------'
    # clf = joblib.load('./model/gbdt_allmerg_0608' + output + '.model')
    # NOTE(review): feature_importances_ exists only on the tree-based
    # estimators; the svm/lr paths would raise AttributeError here.
    print clf.feature_importances_
    feat_imp = clf.feature_importances_
    # if feat_flag:
    #     cacul_feat(feat_imp,feat_ks)
    prey_prob = clf.predict_proba(testdata)
    result = clf.predict(testdata)
    # confusion matrix at the estimator's default 0.5 decision
    confusematrix(testlabel, result)
    prey_value = []
    new_label_all = []
    # derive a custom probability threshold via 10-fold CV on train data
    split_value = calcul_spilit(traindata, trainlabel)
    # split_value = 0.0675
    index_flag = 0
    for ii in prey_prob:
        prey_value.append(ii[1])
        index_label = 0
        if ii[1] > split_value:
            index_label = 1
        # elif testlabel[index_flag]:
        #     print ii[1],test_y_reg[index_flag]
        index_flag += 1
        new_label_all.append(index_label)
    # confusion matrix at the CV-derived threshold
    confusematrix(testlabel, new_label_all)
    func_name = {"plot_ks"}
    plot_features.plot_one_feature(prey_value, testlabel, 200, './', func_name, 0, 'classfication', 'classfication', 0)
    auc_score(clf, testdata, testlabel)
    return new_label_all
    # write_res(prey_value,testlabel)
# write_res(prey_value,testlabel)
def get_rnn_res(file):
    """Read one float per line from ``file`` and return them as a list.

    Improvements vs. the original: ``with`` instead of a manual
    open/readline/close loop (no leaked handle on a parse error).

    :param file: path to a text file, one numeric value per line
    :return: list of floats in file order
    """
    with open(file, "r") as f:
        return [float(line.strip('\n')) for line in f]
def log_loss(test_y, predict_y):
    """Mean squared difference of the targets on a log10 scale.

    The original computed math.log(math.exp(v), 10), which equals
    v / ln(10) but overflows for v > ~709; the ratio is now computed
    directly.  Entries whose raw value is falsy (0) contribute 0 on
    that side, as before.

    :param test_y: true values
    :param predict_y: predicted values, parallel to test_y
    :return: sum((t - p)**2) / len(predict_y) over the log10-scaled values
    """
    ln10 = math.log(10)
    all_loss = 0.0
    for t, p in zip(test_y, predict_y):
        test_res = t / ln10 if t else 0
        predict_res = p / ln10 if p else 0
        diff = test_res - predict_res
        all_loss += diff * diff
    return all_loss / float(len(predict_y))
def merge_rnn(train_x, train_rnn):
    """Append train_rnn[ii] onto feature row train_x[ii] in place.

    :return: the mutated train_x (same list object)
    """
    for row, extra in zip(train_x, train_rnn):
        row.append(extra)
    return train_x
def regression(train_x, train_y, test_x, test_y, class_res):
    """Fit a GBDT regressor, force to 0 the test predictions whose row
    the classification stage marked as zero, and print the log loss.

    :param class_res: per-test-row 0/1 sequence from Classification();
        rows with a falsy entry are forced to 0.0
    """
    # clf = linear_model.LinearRegression()
    # train_rnn = get_rnn_res('./lstm-regression/train2_rnn.txt')
    # test_rnn = get_rnn_res('./lstm-regression/test2_rnn.txt')
    print len(train_x[0])
    # train_x = merge_rnn(train_x,train_rnn)
    print len(train_x[0])
    # test_x = merge_rnn(test_x,test_rnn)
    clf = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=n_estimators, subsample=1, min_samples_split=2, min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False)
    clf.fit(train_x, train_y)
    # joblib.dump(clf, './huodong.model')
    # gbdt = joblib.load('./huodong.model')
    # print 'feature_importances_'
    # print clf.feature_importances_
    # print 'end'
    # print clf.alpha_
    # load test data
    # predict with test data
    predict_y = clf.predict(test_x)
    # print predict_y
    # test_y = array(test_y)
    for ii in range(len(predict_y)):
        # falsy class_res entry means "classifier said zero" for this row
        if not class_res[ii]:
            predict_y[ii] = 0.0
    all_loss = log_loss(test_y, predict_y)
    print 'sum_loss', all_loss
def get_split_data_file(file):
    """Parse one tab-separated fold file.

    Column 0 is the raw (regression) target; the remaining columns are
    float features.  A classification label is derived from the raw
    label's exact string form: 0.0 for the literal '0.0', else 1.0.

    Improvements vs. the original: ``with`` instead of a manual
    open/readline/close loop; dead row counter removed.

    :param file: path to the fold file
    :return: (features, regression_targets, class_labels)
    """
    train_x = []
    train_y = []
    train_y_class = []
    with open(file, "r") as f:
        for data in f:
            datablock = data.strip('\n').split('\t')
            train_x.append([float(ii) for ii in datablock[1:]])
            train_y.append(float(datablock[0]))
            # binarize on the exact string form of the label field
            train_y_class.append(0.0 if datablock[0] == '0.0' else 1.0)
    return train_x, train_y, train_y_class
def get_split_data():
    """Load the ten pre-split fold files './split/0' .. './split/9'.

    :return: three parallel lists holding, per fold, the features, the
        regression targets and the class labels.
    """
    folds = [get_split_data_file('./split/' + str(i)) for i in range(10)]
    train_x = [fold[0] for fold in folds]
    train_y = [fold[1] for fold in folds]
    train_y_class = [fold[2] for fold in folds]
    return train_x, train_y, train_y_class
def cross_regress():
    """10-fold cross validation of the two-stage pipeline: a GBDT
    classifier decides which test rows to force to 0, then a GBDT
    regressor predicts the remaining values.  Prints per-fold and
    averaged KS, AUC and log loss.  Reads the pre-split fold files via
    get_split_data().
    """
    train_x_all, train_y_all, train_y_all_class = get_split_data()
    # NOTE(review): both estimators are re-created inside the fold loop
    # below, so these two initial instances are never fitted.
    clf_class = GradientBoostingClassifier(n_estimators=n_estimators,
                                           subsample=subsample,
                                           learning_rate=learning_rate,
                                           max_features=0.5,
                                           max_depth=max_depth)
    # clf.fit(traindata, trainlabel, sample_weight=None)
    # prey_prob = clf.predict_proba(testdata)
    clf = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=n_estimators, subsample=1, min_samples_split=2, min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False)
    sum_split = 0.0
    split_num = 0.1725  # probability threshold below which a row is predicted 0
    sum_auc = 0.0
    sum_loss = 0.0
    sum_zero = 0     # rows predicted non-zero, accumulated over folds
    sum_wrong_0 = 0  # of those, rows whose true value is 0
    # predict_1
    for i in range(10):
        # fold i is the held-out test set; the other nine are training data
        test_x = train_x_all[i]
        test_y = train_y_all[i]
        test_y_class = train_y_all_class[i]
        train_x = []
        train_y = []
        train_y_class = []
        clf_class = GradientBoostingClassifier(n_estimators=n_estimators,
                                               subsample=subsample,
                                               learning_rate=learning_rate,
                                               max_features=0.5,
                                               max_depth=max_depth)
        # clf.fit(traindata, trainlabel, sample_weight=None)
        # prey_prob = clf.predict_proba(testdata)
        clf = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=n_estimators, subsample=1, min_samples_split=2, min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False)
        for ii in range(10):
            if i == ii:
                continue
            else:
                train_x.extend(train_x_all[ii])
                train_y.extend(train_y_all[ii])
                train_y_class.extend(train_y_all_class[ii])
        # stage 1: classification (zero vs non-zero)
        clf_class.fit(train_x, train_y_class)
        prey_prob = clf_class.predict_proba(test_x)
        prey_value = []
        index_flag = 0
        predict_zero = {}  # test-row index -> 1 when classified as zero
        index_flag = 0
        index_all_zero = 0
        index_zero = 0
        for ii in prey_prob:
            prey_value.append(ii[1])
            if ii[1] < split_num:
                predict_zero[index_flag] = 1
            else:
                # predicted non-zero; count how many are actually zero
                index_all_zero += 1
                if not test_y[index_flag]:
                    index_zero += 1
            index_flag += 1
        sum_zero += index_all_zero
        sum_wrong_0 += index_zero
        # if i == 1:
        #     print predict_zero
        func_name = {"plot_ks"}
        sum_split += plot_features.plot_one_feature(prey_value, test_y_class, 200, './', func_name, 0, 'classfication', 'classfication', 0)
        sum_auc += auc_score(clf_class, test_x, test_y_class)[4]  # index 4 = AUC
        # train_x_reg,train_y_reg = split_reg(train_x,train_y)
        train_x_reg, train_y_reg = train_x, train_y
        # historical results of zero-sample downsampling experiments:
        # 0.378656777915
        # 0.75 result 1.39204250948
        # 0.378656777915 result 1.62
        # 0.6 result 1.48319360175
        # 0.9 result 1.3901058072
        # 1 result 1.39485026152
        # 0.8 1.39003133975
        # train_x_reg,train_y_reg = split_reg_radio(train_x,train_y,0.8)
        print len(train_y_reg), len(train_y)
        # stage 2: regression on the training rows, zeros forced afterwards
        clf.fit(train_x_reg, train_y_reg)
        predict_y = clf.predict(test_x)
        # zero_num = 0
        for ii in range(len(predict_y)):
            has_flag = predict_zero.has_key(ii)  # NOTE: dict.has_key is Python-2-only
            if has_flag:
                predict_y[ii] = 0.0
        all_loss = log_loss(test_y, predict_y)
        sum_loss += all_loss
        print 'sum_loss', all_loss
    print sum_split / float(10), sum_auc / float(10), sum_loss / float(10), sum_wrong_0 / float(sum_zero)
    print 'cross valadation endddddd!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'
def split_reg_radio(train_x, train_y, zero_radio):
    """Downsample the zero-label rows so they make up at most roughly
    ``zero_radio`` of the returned sample.

    All non-zero rows are kept; zero rows are then added in index-mod-10
    passes until the zero fraction first exceeds zero_radio (so the
    final fraction slightly overshoots the target).

    :return: (sampled features, sampled labels)
    """
    train_x_res = []
    train_y_res = []
    train_x_0 = []
    train_y_0 = []
    # partition rows by label truthiness; non-zero rows go straight
    # into the result
    for ii in range(len(train_y)):
        if train_y[ii]:
            train_x_res.append(train_x[ii])
            train_y_res.append(train_y[ii])
        else:
            train_x_0.append(train_x[ii])
            train_y_0.append(train_y[ii])
    # zero_num = int(float(len(train_x)) * zero_radio)
    add_zero_num = 0
    for i in range(10):
        index_flag = 0
        # pass i adds the zero rows whose index % 10 == i
        for ii in range(len(train_x_0)):
            index_flag = ii % 10
            if i == index_flag:
                add_zero_num += 1
                train_x_res.append(train_x_0[ii])
                train_y_res.append(train_y_0[ii])
                index_radio = add_zero_num / float(len(train_x_res))
                if index_radio > zero_radio:
                    break
        # re-check after the pass; stop once the target ratio is exceeded
        index_radio = add_zero_num / float(len(train_x_res))
        if index_radio > zero_radio:
            break
    print 'index 0 zero:', add_zero_num / float(len(train_x_res))
    return train_x_res, train_y_res
def split_reg(train_x, train_y):
    """Keep only the samples whose label is truthy (non-zero).

    :return: (filtered features, filtered labels)
    """
    kept = [(x, y) for x, y in zip(train_x, train_y) if y]
    train_x_new = [pair[0] for pair in kept]
    train_y_new = [pair[1] for pair in kept]
    return train_x_new, train_y_new
def pre_regression(train_x, train_y, test_x, class_res, train_x_id):
    """Final prediction path: fit a GBDT regressor, zero out the rows the
    classifier flagged, exponentiate the prediction (the targets appear
    to be log-scale -- see the exp below), and write 'uid,predict' rows
    to QCfWsd.csv.

    :param class_res: dict mapping test-row index -> 1 for predicted zeros
    :param train_x_id: id string per *test* row (parallel to test_x,
        despite the name)
    """
    clf = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1, min_samples_split=2, min_samples_leaf=1, max_depth=3, init=None, random_state=None, max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False)
    clf.fit(train_x, train_y)
    predict_y = clf.predict(test_x)
    # train_x_1,train_y_1,train_x_0,train_y_0 = split_right(train_x,train_y)
    # split_10_num(train_x_1,train_y_1,train_x_0,train_y_0)
    # print predict_y
    content_res = 'uid,predict\n'
    file = 'QCfWsd.csv'
    zero_num = 0
    for ii in range(len(predict_y)):
        has_flag = class_res.has_key(ii)  # NOTE: dict.has_key is Python-2-only
        if has_flag:
            predict_y[ii] = 0.0
            zero_num += 1
        index_result = math.exp(predict_y[ii])
        # exp(0) == 1 marks a zeroed row; map it back to 0.0 for output
        if (index_result == 1):
            index_result = 0.0
        content_res += train_x_id[ii] + ',' + str(index_result) + '\n'
    print 'zero_num', zero_num
    fp = open(file, 'w')
    fp.write(content_res)
    fp.close()
def get_csv(file):
    """Parse a comma-separated file whose first (header) row is skipped.

    Column 0 of each data row is an id string; the remaining columns are
    float features.  Every 100 rows a progress marker is printed.

    Bug fix vs. the original: the row counter was only ever incremented
    on the header line, so the ``% 100`` progress print could never
    trigger; the counter now advances per row.  Also uses ``with`` in
    place of the manual open/readline/close loop.

    :param file: path to the CSV file
    :return: (features, ids) as parallel lists
    """
    train_x_res = []
    train_x_id = []
    with open(file, "r") as f:
        for flag, data in enumerate(f):
            if flag == 0:
                # skip the header row
                continue
            if not (flag % 100):
                print(flag)  # progress marker every 100 rows
            datablock = data.strip('\n').split(',')
            train_x_res.append([float(ii) for ii in datablock[1:]])
            train_x_id.append(datablock[0])
    return train_x_res, train_x_id
def auc_score(clf, Xtest, ytest, pos_label=1, auc_score=True):
    """Print and return precision / recall / accuracy / f1 (and AUC).

    :param clf: fitted classifier exposing predict and predict_proba
    :param pos_label: label treated as the positive class
    :param auc_score: when True, append ROC AUC to the result tuple
        (note: this parameter shadows the function's own name -- kept
        for caller compatibility)
    :return: tuple (precision, recall, accuracy, f1[, auc]); callers in
        this file read index 4 for the AUC
    """
    pytest = clf.predict(Xtest)
    pytestprob = clf.predict_proba(Xtest)
    # keep only the positive-class probability column
    pytestprob = [k[pos_label] for k in pytestprob]
    fpr, tpr, thresholds = roc_curve(ytest, pytestprob, pos_label=pos_label)
    header = "pos=" + str(pos_label) + "\tprecision\trecall\taccuracy\tf1_score"
    scores = (precision_score(ytest, pytest, pos_label=pos_label),
              recall_score(ytest, pytest, pos_label=pos_label),
              accuracy_score(ytest, pytest),
              f1_score(ytest, pytest, pos_label=pos_label))
    if auc_score:
        header += "\tauc"
        scores = scores + (auc(fpr, tpr), )
    print header
    print scores
    return scores
def split_right(traindata, trainlabel):
    """Partition the sample by label truthiness.

    :return: (x_nonzero, y_nonzero, x_zero, y_zero)
    """
    train_x_1, train_y_1 = [], []
    train_x_0, train_y_0 = [], []
    for row, label in zip(traindata, trainlabel):
        if label:
            train_x_1.append(row)
            train_y_1.append(label)
        else:
            train_x_0.append(row)
            train_y_0.append(label)
    return train_x_1, train_y_1, train_x_0, train_y_0
def split_10_num(train_x_1,train_y_1,train_x_0,train_y_0):
for i in range(10):
content = ''
train_x,train_y,val_x_0,val_y_0 = cross_spilit(train_x_0,train_y_0,[i])
train_x,train_y,val_x_1,val_y_1 = cross_spilit(train_x_1,train_y_1,[i])
for ii in range(len(val_x_0)):
content += str(val_y_0[ii])
for iii in val_x_0[ii]:
content += '\t' + str(iii)
content += '\n'
for ii in range(len(val_x_1)):
content += str(val_y_1[ii])
for iii in val_x_1[ii]:
content += '\t' + str(iii)
content += '\n'
print i,len(content.split('\n'))
fp = open('./split/' + str(i) ,'w')
fp.write(content)
fp.close()
return sum_split / float(1000)
def predict_classfication(traindata, trainlabel, testdata):
    """Train a GBDT classifier and mark which test rows look like zeros.

    :return: dict mapping test-row index -> 1 for rows whose
        positive-class probability is below the fixed 0.1725 threshold
        (i.e. predicted zero); all other rows are absent from the dict.
    """
    clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                     subsample=subsample,
                                     learning_rate=learning_rate,
                                     max_features=0.5,
                                     max_depth=max_depth)
    clf.fit(traindata, trainlabel, sample_weight=None)
    prey_prob = clf.predict_proba(testdata)
    prey_value = []
    new_label_all = {}
    # split_value = calcul_spilit(traindata,trainlabel)
    split_value = 0.1725  # fixed threshold; see calcul_spilit for its derivation
    index_flag = 0
    for ii in prey_prob:
        # column 1 is the positive-class probability
        prey_value.append(ii[1])
        index_label = 0  # NOTE(review): written but never used
        if ii[1] < split_value:
            new_label_all[index_flag] = 1
        index_flag += 1
        # new_label_all.append(index_label)
    return new_label_all
def main():
    """Entry point.  ``index_process`` selects between the prediction
    pipeline that writes the submission CSV (1) and a local train/test
    evaluation run (0).
    """
    index_process = 1
    if index_process:
        # prediction path: CV report first, then train on all data
        cross_regress()
        test_x, test_id = get_csv('./all_test.csv')
        print len(test_x), len(test_id)
        index_type = 1
        train_x, train_y, train_y_reg = getdata('./all_train.txt', index_type)
        print len(train_x), len(train_y), len(train_y_reg)
        # stage 1: which test rows are zeros
        class_res = predict_classfication(train_x, train_y, test_x)
        # print class_res
        index_type = 0
        # reload with raw labels for the regression stage
        train_x, train_y = getdata('./all_train.txt', index_type)
        # test_x,test_y = getdata(test2,index_type,class_res)
        print len(train_x), len(train_y), len(train_y_reg), len(test_x), len(class_res)
        pre_regression(train_x, train_y, test_x, class_res, test_id)
    else:
        # evaluation path on a labelled train/test split
        train2 = './train2.txt'
        test2 = './test2.txt'
        # classfication
        index_type = 1
        train_x, train_y, train_y_reg = getdata(train2, index_type)
        test_x, test_y, test_y_reg = getdata(test2, index_type)
        class_res = Classification(train_x, train_y, test_x, test_y, 0, test_y_reg)
        index_type = 0
        train_x, train_y = getdata(train2, index_type)
        test_x, test_y = getdata(test2, index_type, class_res)
        print len(train_x), len(train_y), len(test_x), len(test_y)
        regression(train_x, train_y, test_x, test_y, class_res)
# script entry point
if __name__=='__main__':
    main()
# sklearn gbdt
# (blog-scrape trailer, commented out so the module parses: 最新推荐文章于 2022-11-21 12:02:07 发布)