当输入数据中存在非线性关系的时候,基于线性回归的模型就会失效,而基于树的算法则不受数据中非线性关系的影响。基于树的方法最大的一个困扰是为了避免过拟合而对树进行剪枝的难度:大型的树容易受潜在数据中噪声的影响,表现为低偏差但高方差(即过度拟合)。不过如果我们生成大量的树,最终的预测值采用集成所有树产生的输出的平均值,就可以避免方差的问题。
1. 随机森林:集成技术,采用大量的树来建模,但这里我们要保证树之间没有相互关联,不能选择所有属性,而是随机选择一个属性的子集给某个树。虽然我们在随机森林中构建最大深度的树,这样它们可以很好适应自举的样本,得到的偏差较低,后果是引入了高方差,但通过构建大量树,使用平均法则作为最后的预测值,可以解决方差问题。
2. 超随机树:比随机森林引入更多随机化,可以更高效地解决方差问题,它的运算复杂度也略有降低。随机森林是自举部分实例来给每棵树,但超随机树是使用完整的训练集数据,另外关于给定K作为给定节点随机选择的属性数量,它随机选择割点,不考虑目标变量,不像随机森林那样基于基尼不纯度或熵标准。这种更多随机化带来的架构可以更好地降低方差。而且由于划分节点不需要相关标准,因此不需要花费时间来鉴定最适合用来划分数据集的属性。
3. 旋转森林:前两种需要集成大量的树才能获得好效果,而旋转森林可以用较小的树来获取相同甚至更好的效果。算法场景是投票场景,属性被划分为大小相等的K个不重叠的子集,然后结合PCA、旋转矩阵来完成模型的构建。
随机森林:
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 09:57:49 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import RandomizedSearchCV
from operator import itemgetter
import numpy as np
# Build the synthetic data set used by this script.
def get_data():
    """Return a 500-sample, 30-feature binary classification problem.

    60% of the features are informative, 10% are redundant linear
    combinations and 10% are repeated copies; flip_y=0.03 adds 3%
    label noise.  random_state=7 keeps the data reproducible.
    """
    feature_total = 30
    x, y = make_classification(n_samples=500,
                               n_features=feature_total,
                               flip_y=0.03,
                               n_informative=int(0.6 * feature_total),
                               n_redundant=int(0.1 * feature_total),
                               n_repeated=int(0.1 * feature_total),
                               random_state=7)
    return x, y
# Fit a plain random forest and report its accuracy.
def build_model(x, y, x_dev, y_dev):
    """Train a 100-tree RandomForestClassifier on (x, y) and print the
    accuracy on both the training data and the dev split.

    Returns nothing; output is printed only.
    """
    no_trees = 100
    estimator = RandomForestClassifier(n_estimators=no_trees)
    estimator.fit(x, y)
    train_predicted = estimator.predict(x)
    train_score = accuracy_score(y, train_predicted)
    dev_predicted = estimator.predict(x_dev)
    dev_score = accuracy_score(y_dev, dev_predicted)
    # Single-argument print() is valid under both Python 2 and Python 3;
    # the original py2-only print statement is a syntax error under py3.
    print("training accuracy = %0.2f dev accuracy = %0.2f"
          % (train_score, dev_score))
# Randomized hyper-parameter search for the random forest.
def search_parameters(x, y, x_dev, y_dev):
    """Run a randomized search (20 iterations, 5-fold CV) over forest
    size, split criterion and max_features, print the best models and
    dev-set report, and return (best_estimator_, grid_scores_)."""
    model = RandomForestClassifier()
    feature_count = x.shape[1]
    search_rounds = 20
    root_features = int(np.sqrt(feature_count))
    # criterion: measure used to split a node, drawn from gini/entropy.
    # max_features: how many randomly chosen attributes are examined at
    # each split; candidates are multiples/offsets of sqrt(n_features).
    param_space = {
        "n_estimators": np.random.randint(75, 200, search_rounds),
        "criterion": ["gini", "entropy"],
        "max_features": [root_features, root_features * 2,
                         root_features * 3, root_features + 10],
    }
    # n_jobs=-1 uses all CPUs; cv=5 with 20 iterations fits 100 models.
    searcher = RandomizedSearchCV(estimator=model,
                                  param_distributions=param_space,
                                  verbose=1,
                                  n_iter=search_rounds,
                                  random_state=77,
                                  n_jobs=-1,
                                  cv=5)
    searcher.fit(x, y)
    print_model_worth(searcher, x_dev, y_dev)
    return searcher.best_estimator_, searcher.grid_scores_
# Report the best models found by a randomized search.
def print_model_worth(grid, x_dev, y_dev):
    """Print the five best cross-validation results stored in
    grid.grid_scores_, then a classification report for the dev split."""
    # Sort by mean validation score (field 1), descending; keep top 5.
    scores = sorted(grid.grid_scores_, key=itemgetter(1), reverse=True)[0:5]
    for model_no, score in enumerate(scores):
        # Single-argument print() works under both Python 2 and Python 3;
        # the original py2 print statements break under py3.
        print("model %d, score = %0.3f" % (model_no + 1,
                                           score.mean_validation_score))
        print("parameters = {0}".format(score.parameters))
        print("")
    dev_predicted = grid.predict(x_dev)
    print(classification_report(y_dev, dev_predicted))
# Show the ten most important features of a fitted tree ensemble.
def get_feature_importance(model):
    """Print the indices and scores of the ten largest entries of
    model.feature_importances_ in descending order."""
    feature_importance = model.feature_importances_
    # Pair each importance with its feature index so the index survives
    # the sort.
    fm_with_id = [(i, importance) for i, importance
                  in enumerate(feature_importance)]
    fm_with_id = sorted(fm_with_id, key=itemgetter(1), reverse=True)[0:10]
    # Single-argument print() is compatible with Python 2 and Python 3.
    print("Top 10 Features")
    for importance in fm_with_id:
        print("feature %d importance = %0.3f" % (importance[0], importance[1]))
# Script entry point.
if __name__ == "__main__":
    x, y = get_data()
    # Hold out 30% of the rows, then split that hold-out again into a
    # dev set (70%) and a final test set (30%).
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)
    # Baseline forest on the training split.
    build_model(x_train, y_train, x_dev, y_dev)
    # Search on the training split only: the original fit the search on
    # the full (x, y), leaking dev/test rows into model selection.
    model, score_all = search_parameters(x_train, y_train, x_dev, y_dev)
    get_feature_importance(model)
超随机树:
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 14:46:59 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report,accuracy_score
from sklearn.cross_validation import train_test_split,cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.grid_search import RandomizedSearchCV
from operator import itemgetter
import numpy as np
def get_data():
    """Synthetic 500-sample, 30-feature problem for the extra-trees demo.

    60% informative, 10% redundant and 10% repeated features.
    """
    feature_total = 30
    informative = int(0.6 * feature_total)
    redundant = int(0.1 * feature_total)
    repeated = int(0.1 * feature_total)
    # NOTE(review): flip_y=0.3 flips 30% of the labels; the companion
    # random-forest script uses 0.03 -- confirm which level is intended.
    x, y = make_classification(n_samples=500,
                               n_features=feature_total,
                               flip_y=0.3,
                               n_informative=informative,
                               n_repeated=repeated,
                               n_redundant=redundant,
                               random_state=7)
    return x, y
def build_forest(x, y, x_dev, y_dev):
    """Fit a 100-tree ExtraTreesClassifier and report train accuracy,
    dev accuracy and a 5-fold cross-validation score on the dev split."""
    no_trees = 100
    estimator = ExtraTreesClassifier(n_estimators=no_trees, random_state=51)
    estimator.fit(x, y)
    train_predicted = estimator.predict(x)
    train_score = accuracy_score(y, train_predicted)
    dev_predicted = estimator.predict(x_dev)
    dev_score = accuracy_score(y_dev, dev_predicted)
    # Single-argument print() is valid under both Python 2 and Python 3;
    # the original py2-only print statements fail under py3.
    print("training accuracy = %0.2f\n dev accuracy = %0.2f"
          % (train_score, dev_score))
    print("cross validation")
    print(cross_val_score(estimator, x_dev, y_dev, cv=5))
def print_model_worth(grid, x_dev, y_dev):
    """Print the five best cross-validation results from
    grid.grid_scores_ and a classification report on the dev split."""
    # Best mean validation score first; keep only the top five entries.
    scores = sorted(grid.grid_scores_, key=itemgetter(1), reverse=True)[0:5]
    for model_no, score in enumerate(scores):
        # print() with one argument works under Python 2 and Python 3.
        print("model %d, score = %0.3f" % (model_no + 1,
                                           score.mean_validation_score))
        print("parameters = {0}".format(score.parameters))
        print("")
    dev_predicted = grid.predict(x_dev)
    print(classification_report(y_dev, dev_predicted))
def search_parameters(x, y, x_dev, y_dev):
    """Randomized search (20 iterations, 5-fold CV) over the extra-trees
    hyper-parameters; prints the best models and the dev-set report and
    returns the refitted best estimator."""
    estimator = ExtraTreesClassifier()
    no_features = x.shape[1]
    no_iterations = 20
    sqr_no_features = int(np.sqrt(no_features))
    parameters = {"n_estimators": np.random.randint(75, 200, no_iterations),
                  "criterion": ["gini", "entropy"],
                  "max_features": [sqr_no_features, sqr_no_features * 2,
                                   sqr_no_features * 3, sqr_no_features + 10]}
    grid = RandomizedSearchCV(estimator=estimator,
                              param_distributions=parameters,
                              verbose=1,
                              n_iter=no_iterations,
                              random_state=77,
                              n_jobs=-1,
                              cv=5)
    # BUG FIX: the original called grid.fit(x.y) -- attribute access on
    # the feature array instead of passing the label vector -- which
    # raises AttributeError before any model is fit.
    grid.fit(x, y)
    print_model_worth(grid, x_dev, y_dev)
    return grid.best_estimator_
# Script entry point.
if __name__ == "__main__":
    x, y = get_data()
    # 70/30 split, then split the hold-out into dev (70%) and test (30%).
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)
    build_forest(x_train, y_train, x_dev, y_dev)
    # Search on the training split only: the original fit the search on
    # the full (x, y), leaking dev/test rows into model selection.
    model = search_parameters(x_train, y_train, x_dev, y_dev)
旋转森林:
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 11 17:01:22 2018
@author: Alvin AI
"""
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report
from sklearn.cross_validation import train_test_split
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
import numpy as np
# Load a synthetic data set for the rotation-forest demo.
def get_data():
    """Return a 500-sample, 50-feature classification problem
    (60% informative, 10% redundant, 10% repeated, 3% label noise)."""
    feature_total = 50
    x, y = make_classification(n_samples=500,
                               n_features=feature_total,
                               flip_y=0.03,
                               n_informative=int(0.6 * feature_total),
                               n_redundant=int(0.1 * feature_total),
                               n_repeated=int(0.1 * feature_total),
                               random_state=7)
    return x, y
# Split a shuffled list of feature indices into disjoint subsets.
def get_random_subset(iterable, k):
    """Shuffle `iterable` in place and carve it into floor(len/k)
    subsets of size k taken from its tail.

    `iterable` must be a mutable sequence (a list): consumed entries are
    deleted from the caller's list, and any remainder of fewer than k
    elements is left in it.  Returns the list of subsets.
    """
    subsets = []
    iteration = 0
    np.random.shuffle(iterable)  # randomize which indices group together
    subset = 0
    # Integer division: under Python 3 the original "/" produced a float
    # limit, which ran one extra iteration on non-divisible input and
    # emitted a short trailing subset.  "//" restores the py2 behavior.
    limit = len(iterable) // k
    while iteration < limit:
        if k <= len(iterable):
            subset = k
        else:
            subset = len(iterable)
        subsets.append(iterable[-subset:])
        del iterable[-subset:]
        iteration += 1
    return subsets
# Build the rotation-forest ensemble.
def build_rotationtree_model(x_train, y_train, d, k):
    """Train `d` decision trees, each on the training data rotated by a
    block-diagonal PCA rotation built from random k-sized feature subsets.

    Returns (models, r_matrices, feature_subsets) where r_matrices[i] is
    the rotation applied to the data before fitting models[i].
    """
    models = []           # one DecisionTreeClassifier per iteration
    r_matrices = []       # rotation matrix used for each tree
    feature_subsets = []  # feature subsets drawn for each tree
    for i in range(d):
        # Keep 70% of the rows for this tree's PCA fitting step.
        # NOTE(review): random_state=7 gives every tree the same split;
        # confirm whether a varying seed was intended.
        x, _, _, _ = train_test_split(x_train, y_train,
                                      test_size=0.3, random_state=7)
        # list() so get_random_subset can shuffle and delete in place:
        # a bare range object is immutable under Python 3 (same result
        # as the original under Python 2, where range returns a list).
        feature_index = list(range(x.shape[1]))
        random_k_subset = get_random_subset(feature_index, k)
        feature_subsets.append(random_k_subset)
        # Fill one rotation matrix: each feature subset contributes its
        # PCA loading block at its own rows/columns.
        R_matrix = np.zeros((x.shape[1], x.shape[1]), dtype=float)
        for each_subset in random_k_subset:
            pca = PCA()
            x_subset = x[:, each_subset]  # columns for this subset only
            pca.fit(x_subset)
            for ii in range(0, len(pca.components_)):
                for jj in range(0, len(pca.components_)):
                    R_matrix[each_subset[ii], each_subset[jj]] = \
                        pca.components_[ii, jj]
        # Rotate the full training set and fit one tree on it.
        x_transformed = x_train.dot(R_matrix)
        model = DecisionTreeClassifier()
        model.fit(x_transformed, y_train)
        models.append(model)
        r_matrices.append(R_matrix)
    return models, r_matrices, feature_subsets
def model_worth(models, r_matrices, x, y):
    """Evaluate the rotation forest by majority vote.

    Each tree predicts on `x` rotated by its own matrix; an instance is
    classified 1 when more than half the trees predict non-zero.  Prints
    a classification report and returns the (n_models x n_samples)
    prediction matrix.
    """
    predicted_ys = []
    for i, model in enumerate(models):
        x_mod = x.dot(r_matrices[i])  # apply this tree's rotation
        predicted_ys.append(model.predict(x_mod))
    predicted_matrix = np.asmatrix(predicted_ys)
    final_prediction = []
    for i in range(len(y)):
        # Column i holds every tree's vote for sample i.
        pred_from_all_models = np.ravel(predicted_matrix[:, i])
        # Votes for class 1 are the non-zero predictions (binary labels).
        non_zero_pred = np.nonzero(pred_from_all_models)[0]
        is_one = len(non_zero_pred) > len(models) / 2  # majority vote
        final_prediction.append(is_one)
    # print() with a single argument is valid in both Python 2 and 3;
    # the original py2-only print statement breaks under py3.
    print(classification_report(y, final_prediction))
    return predicted_matrix
# Script entry point.
if __name__ == "__main__":
    x, y = get_data()
    # 70/30 split, then split the hold-out again into dev and test.
    x_train, x_test_all, y_train, y_test_all = train_test_split(
        x, y, test_size=0.3, random_state=9)
    x_dev, x_test, y_dev, y_test = train_test_split(
        x_test_all, y_test_all, test_size=0.3, random_state=9)
    # 25 trees, feature subsets of size 5.
    models, r_matrices, features = build_rotationtree_model(
        x_train, y_train, 25, 5)
    predicted_matrix1 = model_worth(models, r_matrices, x_train, y_train)
    predicted_matrix2 = model_worth(models, r_matrices, x_dev, y_dev)