python决策树sklearn_Python_sklearn机器学习库学习笔记(四)decision_tree(决策树)...

# 决策树

import pandas as pd

from sklearn.tree import DecisionTreeClassifier

from sklearn.cross_validation import train_test_split

from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline

from sklearn.grid_search import GridSearchCV

import zipfile

#压缩节省空间

z=zipfile.ZipFile('ad-dataset.zip')

# df=pd.read_csv(z.open(z.namelist()[0]),header=None,low_memory=False)

# df = pd.read_csv(z.open(z.namelist()[0]), header=None, low_memory=False)

df=pd.read_csv('.\\tree_data\\ad.data',header=None)

explanatory_variable_columns=set(df.columns.values)

response_variable_column=df[len(df.columns.values)-1]

#最后一列是代表的标签类型

explanatory_variable_columns.remove(len(df.columns)-1)

y=[1 if e =='ad.' else 0 for e in response_variable_column]

X=df.loc[:,list(explanatory_variable_columns)]

#匹配?字符,并把值转化为-1

X.replace(to_replace=' *\?', value=-1, regex=True, inplace=True)

copycode.gif

X_train,X_test,y_train,y_test=train_test_split(X,y)

#用信息增益启发式算法建立决策树

pipeline=Pipeline([('clf',DecisionTreeClassifier(criterion='entropy'))])

parameters = {

'clf__max_depth': (150, 155, 160),

'clf__min_samples_split': (1, 2, 3),

'clf__min_samples_leaf': (1, 2, 3)

}

#f1查全率和查准率的调和平均

grid_search=GridSearchCV(pipeline,parameters,n_jobs=-1,

verbose=1,scoring='f1')

grid_search.fit(X_train,y_train)

print '最佳效果:%0.3f'%grid_search.best_score_

print '最优参数'

best_parameters=grid_search.best_estimator_.get_params()

best_parameters

copycode.gif

输出结果:

Fitting 3 folds for each of 27 candidates, totalling 81 fits

[Parallel(n_jobs=-1)]: Done 46 tasks | elapsed: 21.0s

[Parallel(n_jobs=-1)]: Done 81 out of 81 | elapsed: 34.7s finished

最佳效果:0.888

最优参数

Out[123]:

{'clf': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=160,

max_features=None, max_leaf_nodes=None, min_samples_leaf=1,

min_samples_split=3, min_weight_fraction_leaf=0.0,

presort=False, random_state=None, splitter='best'),

'clf__class_weight': None,

'clf__criterion': 'entropy',

'clf__max_depth': 160,

'clf__max_features': None,

'clf__max_leaf_nodes': None,

'clf__min_samples_leaf': 1,

'clf__min_samples_split': 3,

'clf__min_weight_fraction_leaf': 0.0,

'clf__presort': False,

'clf__random_state': None,

'clf__splitter': 'best',

'steps': [('clf',

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=160,

max_features=None, max_leaf_nodes=None, min_samples_leaf=1,

min_samples_split=3, min_weight_fraction_leaf=0.0,

presort=False, random_state=None, splitter='best'))]}

for param_name in sorted(parameters.keys()):

print ('\t%s:%r'%(param_name,best_parameters[param_name]))

predictions=grid_search.predict(X_test)

print classification_report(y_test,predictions)

输出结果:

clf__max_depth:150

clf__min_samples_leaf:1

clf__min_samples_split:1

precision recall f1-score support

0 0.97 0.99 0.98 703

1 0.91 0.84 0.87 117

avg / total 0.96 0.96 0.96 820

df.head()

输出结果;

0123456789...1549155015511552155315541555155615571558

0

125

125

1.0

1

0

0

0

0

0

0

...

0

0

0

0

0

0

0

0

0

ad.

1

57

468

8.2105

1

0

0

0

0

0

0

...

0

0

0

0

0

0

0

0

0

ad.

2

33

230

6.9696

1

0

0

0

0

0

0

...

0

0

0

0

0

0

0

0

0

ad.

3

60

468

7.8

1

0

0

0

0

0

0

...

0

0

0

0

0

0

0

0

0

ad.

4

60

468

7.8

1

0

0

0

0

0

0

...

0

0

0

0

0

0

0

0

0

ad.

# 决策树集成

copycode.gif

#coding:utf-8

import pandas as pd

from sklearn.ensemble import RandomForestClassifier

from sklearn.cross_validation import train_test_split

from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline

from sklearn.grid_search import GridSearchCV

df=pd.read_csv('.\\tree_data\\ad.data',header=None,low_memory=False)

explanatory_variable_columns=set(df.columns.values)

response_variable_column=df[len(df.columns.values)-1]

copycode.gif

df.head()

0123456789...1549155015511552155315541555155615571558

0

125

125

1.0

1

0

0

0

0

0

0

...

0

0

0

0

0

0

0

0

0

ad.

1

57

468

8.2105

1

0

0

0

0

0

0

...

0

0

0

0

0

0

0

0

0

ad.

2

33

230

6.9696

1

0

0

0

0

0

0

...

0

0

0

0

0

0

0

0

0

ad.

3

60

468

7.8

1

0

0

0

0

0

0

...

0

0

0

0

0

0

0

0

0

ad.

4

60

468

7.8

1

0

0

0

0

0

0

...

0

0

0

0

0

0

0

0

0

ad.

copycode.gif

#The last column describes the targets(去掉最后一列)

explanatory_variable_columns.remove(len(df.columns.values)-1)

y=[1 if e=='ad.' else 0 for e in response_variable_column]

X=df.loc[:,list(explanatory_variable_columns)]

#置换有?的为-1

X.replace(to_replace=' *\?', value=-1, regex=True, inplace=True)

X_train,X_test,y_train,y_test=train_test_split(X,y)

pipeline=Pipeline([('clf',RandomForestClassifier(criterion='entropy'))])

parameters = {

'clf__n_estimators': (5, 10, 20, 50),

'clf__max_depth': (50, 150, 250),

'clf__min_samples_split': (1, 2, 3),

'clf__min_samples_leaf': (1, 2, 3)

}

grid_search = GridSearchCV(pipeline,parameters,n_jobs=-1,verbose=1,scoring='f1')

grid_search.fit(X_train,y_train)

copycode.gif

print(u'最佳效果:%0.3f'%grid_search.best_score_)

print u'最优的参数:'

best_parameters=grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):

print('\t%s:%r'%(param_name,best_parameters[param_name]))

输出结果:

最佳效果:0.929 最优的参数: clf__max_depth:250 clf__min_samples_leaf:1 clf__min_samples_split:3 clf__n_estimators:50

predictions=grid_search.predict(X_test)

print classification_report(y_test,predictions)

输出结果:

precision recall f1-score support

0 0.98 1.00 0.99 705

1 0.97 0.90 0.93 115

avg / total 0.98 0.98 0.98 820

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值