机器学习之随机森林

#随机森林存在两重随机性:(1) 样本的随机性;(2) 特征的随机性
import pandas

# Column names for the income data set; they are supplied explicitly to
# read_csv through `names`.
columns = ["age", "workclass", "fnlwgt", "education", "education_num", "marital_status", "occupation", "relationship", "race", "sex",
           "capital_gain", "capital_loss", "hours_per_week", "native_country", "high_income"]
# Alternative when the CSV sits in the working directory:
#income = pandas.read_csv("income.csv", index_col=False,names=columns)
# Every backslash is escaped so the literal does not rely on invalid escape
# sequences (\m, \i); the resulting path string is unchanged.
income = pandas.read_csv("D:\\test\\machineLearning\\income.csv", names=columns)
# print(...) with a single argument behaves the same on Python 2 and 3.
print(income.head(2))
# Columns whose string labels are replaced by integer category codes.
columns = ["education","marital_status","occupation","relationship","race", "sex", "native_country", "high_income"]
for name in columns:
    # Map each distinct label to an integer code.  The Categorical
    # constructor is used because Categorical.from_array is no longer
    # available in current pandas releases.
    income[name] = pandas.Categorical(income[name]).codes
print(income.head(2))
   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   

        marital_status        occupation    relationship    race    sex  \
0        Never-married      Adm-clerical   Not-in-family   White   Male   
1   Married-civ-spouse   Exec-managerial         Husband   White   Male   

   capital_gain  capital_loss  hours_per_week  native_country high_income  
0          2174             0              40   United-States       <=50K  
1             0             0              13   United-States       <=50K  
   age          workclass  fnlwgt  education  education_num  marital_status  \
0   39          State-gov   77516          9             13               4   
1   50   Self-emp-not-inc   83311          9             13               2   

   occupation  relationship  race  sex  capital_gain  capital_loss  \
0           1             1     4    1          2174             0   
1           4             0     4    1             0             0   

   hours_per_week  native_country  high_income  
0              40              26            0  
1              13              26            0  
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
import math

# Feature columns used to train and evaluate the models.  The label
# "high_income" is deliberately NOT listed: with the label among the
# features the trees can read the answer directly, which is why the scores
# recorded for this script were all a trivially perfect 1.0.
columns = ["age","capital_gain","occupation","relationship","race", "sex", "native_country"]
# The file imports numpy as `np`, so that alias is used here; the bare name
# `numpy` is not bound and would raise NameError.
np.random.seed(10)
# Shuffle the rows.
income = income.reindex(np.random.permutation(income.index))
# Index of the first test row: the first 80% of the shuffled rows (rounded
# down) are used for training, the rest for testing.
train_max_row = int(math.floor(income.shape[0]*.8))
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]

# Tree 1: limited by a minimum number of samples per leaf.
clf = DecisionTreeClassifier(random_state=1,min_samples_leaf=2)
clf.fit(train[columns],train["high_income"])

# Tree 2: limited by a maximum depth.
clf2 = DecisionTreeClassifier(random_state=1,max_depth=5)
clf2.fit(train[columns],train["high_income"])

# Score each tree's hard 0/1 predictions on the held-out rows.
predictions = clf.predict(test[columns])
print (roc_auc_score(test["high_income"],predictions))

predictions = clf2.predict(test[columns])
print (roc_auc_score(test["high_income"],predictions))
1.0
1.0
# Combine the two fitted trees: take the second column of predict_proba
# from each model and average the two probability vectors.
predict, predict2 = [
    model.predict_proba(test[columns])[:, 1] for model in (clf, clf2)
]
combined = 0.5 * (predict + predict2)
# Round the averaged probability to a hard 0/1 label before scoring.
rounded = combined.round()

print(roc_auc_score(test["high_income"], rounded))
1.0
# Bagging: every tree is trained on its own random sample of the training rows.
tree_count = 10
bag_proportion = .6  # fraction of the training rows drawn for each tree
predictions = []
for i in range(tree_count):
    # A different random_state per iteration gives each tree a different
    # sample (drawn with replacement).
    bag = train.sample(frac=bag_proportion,replace=True,random_state=i)
    clf = DecisionTreeClassifier(random_state=1,min_samples_leaf=2)
    clf.fit(bag[columns],bag["high_income"])
    predictions.append(clf.predict_proba(test[columns])[:,1])
# Average the per-tree probabilities.  Dividing by tree_count (not a literal
# 10) keeps the average correct if the number of trees is changed, and `np`
# is the alias under which this file imports numpy.
combined = np.sum(predictions,axis=0)/tree_count
# Round the averaged probability to a hard 0/1 label before scoring.
rounded = np.round(combined)
print(roc_auc_score(test["high_income"],rounded))
1.0
# Bagging with random feature subsets: every tree is trained on its own
# random sample of the training rows and only considers a random subset of
# the features at each split.
tree_count = 10
bag_proportion = .6  # fraction of the training rows drawn for each tree
predictions = []
for i in range(tree_count):
    # A different random_state per iteration gives each tree a different
    # sample (drawn with replacement).
    bag = train.sample(frac=bag_proportion,replace=True,random_state=i)
    # splitter="random" picks random split points; max_features="sqrt"
    # limits each split to sqrt(n_features) candidate features.  "sqrt" is
    # what the former "auto" value meant for classifiers, and "auto" is no
    # longer accepted by current scikit-learn releases.
    clf = DecisionTreeClassifier(random_state=1,min_samples_leaf=2,splitter="random",max_features="sqrt")
    clf.fit(bag[columns],bag["high_income"])
    predictions.append(clf.predict_proba(test[columns])[:,1])
# Average the per-tree probabilities.  Dividing by tree_count (not a literal
# 10) keeps the average correct if the number of trees is changed, and `np`
# is the alias under which this file imports numpy.
combined = np.sum(predictions,axis=0)/tree_count
# Round the averaged probability to a hard 0/1 label before scoring.
rounded = np.round(combined)
print(roc_auc_score(test["high_income"],rounded))
1.0
#使用库函数,同样实现以上功能
from sklearn.ensemble import RandomForestClassifier

# n_estimators sets how many trees the forest contains.
clf = RandomForestClassifier(
    min_samples_leaf=2,
    n_estimators=5,
    random_state=1,
).fit(train[columns], train["high_income"])

# Score the forest's hard 0/1 predictions on the held-out rows.
predict = clf.predict(test[columns])
print(roc_auc_score(test["high_income"], predict))
1.0
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值