# 数据读取和交叉验证:分别使用了 LR 分类、决策树和随机森林。
# 但出现一个问题:在交叉验证中提升的准确率在提交后并无提升,有待考证。
# 经过试验,随机森林的效果要好一些但不稳定,LR 又比决策树稍高,但仅是在该数据和处理方法下的结论。
# Load the training data, shuffle it, and run a quick hold-out experiment
# with one of several classifiers (the alternatives are kept commented out).
datafile = load_data('train.csv')
train_target, train_data = data_clean(datafile)

# Shuffle features and labels with the SAME seed so they stay aligned.
# NOTE: stdlib random.randint requires both bounds; the original one-arg
# call only works with numpy's randint and raises TypeError otherwise.
r = random.randint(0, 2147483647)
random.seed(r)
random.shuffle(train_data)
random.seed(r)
random.shuffle(train_target)
print('load finished')

# Cross-validation: hold out half the data and score a single classifier.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train_data, train_target, test_size=0.5, random_state=0)
# clf = LogisticRegression()                    # LR with default parameters
# clf = RandomForestClassifier(n_estimators=8)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_predicted = clf.predict(X_test).tolist()
calculate_result(y_test, y_predicted)
# 对 test 分类及输出
# Retrain the final model on ALL training data and classify the test set.
clf = RandomForestClassifier(n_estimators=12)
clf.fit(train_data, train_target)
datafile1 = load_data('test.csv')
test_data = data_clean1(datafile1)
doc_class_predicted = clf.predict(test_data)

# Read reference labels from submission.csv (second column of each row)
# to score the predictions.  NOTE(review): assumes the file has no header
# row — confirm against the actual submission format.
# Context manager guarantees the file is closed even on error (the
# original opened it in 'rb', which breaks the csv module on Python 3).
with open('submission.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    datafile2 = [line for line in reader]
test_target = [row[1] for row in datafile2]
calculate_result(test_target, doc_class_predicted)

# Write the predictions out.  The original leaked the handle via file(),
# a Python-2-only builtin, and never closed it, so the writer's buffer
# was never guaranteed to be flushed.
with open('csv_test.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(doc_class_predicted)
# 随机森林的参数设置:树的棵数、树的深度
# LR 参数设置:学习率、正则化项
# Alternative classifiers tried during parameter tuning (kept for reference):
#clf = LogisticRegression(C=0.2, dual=False, fit_intercept=True,penalty='l2', tol=0.0001)
#clf = RandomForestClassifier(n_estimators=12, max_depth=3)
# Gradient boosting with a small ensemble; subsample=0.5 trains each tree
# on half the samples, adding randomness to reduce overfitting.
clf = GradientBoostingClassifier(n_estimators=11, subsample=0.5, max_depth=3)
# 并没有太多试验,初步发现 10-12 棵树的情况较好些。
# 我认为样本数量太少是训练不稳定以及结果稍差的重要原因,接下来可能尝试使用随机抽样再重组的方法提高样本数量;KNN 与贝叶斯分类表现较差。
# 下面是一种有趣的调参方法:
# Applying method: grid-search the number of trees for a random forest by
# 10-fold cross-validation, keeping the n with the best mean accuracy.
# (The original paste had lost all indentation; structure restored here.)
max_score = 0
best_n = 0
for n in range(1, 100):
    print(n)
    rfc_scr = 0.
    rfc = RandomForestClassifier(n_estimators=n)
    for train, test in KFold(len(train_data), n_folds=10, shuffle=True):
        # .T[idx].T selects rows by position, assuming a default integer
        # index (equivalent to .iloc[idx] — TODO confirm the real index).
        rfc.fit(train_data[predictors].T[train].T,
                train_data["Survived"].T[train].T)
        # Dividing each fold's score by 10 accumulates the mean accuracy.
        rfc_scr += rfc.score(train_data[predictors].T[test].T,
                             train_data["Survived"].T[test].T) / 10
    if rfc_scr > max_score:
        max_score = rfc_scr
        best_n = n