sklearn构建决策树

1.可视化树

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import tree
import graphviz #可视化
import pydotplus #画.dot文件
from IPython.display import Image #图片
from sklearn.model_selection import train_test_split #数据集划分为测试集和训练集

# The private module path ``sklearn.datasets.california_housing`` is gone in
# current scikit-learn releases; the loader is exposed from the public
# ``sklearn.datasets`` package.
from sklearn.datasets import fetch_california_housing  # California housing dataset shipped with scikit-learn

house = fetch_california_housing()
# print(house.data.shape)  # (20640, 8)

# Fit a shallow regression tree using only columns 6 and 7 of the feature
# matrix; fit() takes the feature matrix X and the target vector y.
dtr = tree.DecisionTreeRegressor(max_depth=2)
dtr.fit(house.data[:, [6, 7]], house.target)

# Visualize the tree.
# export_graphviz returns the DOT source as a string because out_file is None;
# this call rarely needs to change apart from the estimator and feature names.
dot_data = tree.export_graphviz(
    dtr,  # the fitted estimator instance
    out_file=None,
    feature_names=house.feature_names[6:8],  # names of the two columns used above
    filled=True,
    impurity=False,
    rounded=True,
)

# Build the graph from the DOT source.
graph = pydotplus.graph_from_dot_data(dot_data)
# The original colour "#FF2DD" had only five hex digits, which is not a valid
# "#RRGGBB" value; "#FFF2DD" is assumed to be the intended colour.
# NOTE(review): which tree node index 7 refers to depends on how pydotplus
# orders the parsed nodes - confirm.
graph.get_nodes()[7].set_fillcolor("#FFF2DD")  # fill colour of one node

# Save the rendered tree as a PNG image.
# NOTE(review): this is a raw string, so the doubled backslashes are kept
# literally in the path - confirm that is intended.
graph.write_png(r'C:\\Users\\Administrator\\Desktop\\dtr.png')

在这里插入图片描述

2.训练数据

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import tree
import graphviz #可视化
import pydotplus #画.dot文件
from IPython.display import Image #图片
from sklearn.model_selection import train_test_split #数据集划分为测试集和训练集

# The private module path ``sklearn.datasets.california_housing`` is gone in
# current scikit-learn releases; the loader is exposed from the public
# ``sklearn.datasets`` package.
from sklearn.datasets import fetch_california_housing  # California housing dataset shipped with scikit-learn

house = fetch_california_housing()
# print(house.data.shape)  # (20640, 8)

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    house.data,
    house.target,
    test_size=0.1,
    random_state=42,
)

# Fit an unconstrained regression tree on the training split.
dtr = tree.DecisionTreeRegressor(random_state=42)
dtr.fit(x_train, y_train)

# Evaluate on the held-out split using the estimator's default score().
score = dtr.score(x_test, y_test)
print(score)  # 0.637318351331017 in the original run

3.小练习(用随机森林预测科比是否投中球)

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time

from sklearn.ensemble import RandomForestClassifier #随机森林
from sklearn.model_selection import KFold #交叉验证
from sklearn.metrics import confusion_matrix,log_loss
from sklearn import metrics

# Load the raw shot log from a CSV file in the working directory.
filename = 'data.csv'
raw = pd.read_csv(filename)
# print(raw.shape)  # (30697, 25)
# print(raw.tail())
# The shot_made_flag column (whether the shot went in) serves as the label.
# Many of its values are empty; those rows are treated as the ones to predict.

# Keep only the rows whose shot_made_flag is present.
kebe = raw[raw["shot_made_flag"].notna()]
# print(kebe.shape)  # (25697, 25); roughly 5000 rows are left over as the set to predict

# Disabled plotting code, kept as a bare string literal so that it never runs:
# two side-by-side scatter plots of kebe (loc_x vs loc_y, and lon vs lat).
"""
#画图
plt.figure()

#loc_x and loc_y #位置坐标
plt.subplot(121)
plt.scatter(kebe.loc_x,kebe.loc_y,c='blue',alpha=0.02) #alpha透明度
plt.title("loc_x and loc_y")

#lat and lon #经纬度
plt.subplot(122)
plt.scatter(kebe.lon,kebe.lat,c="red",alpha=0.02)
plt.title("lat and lon")

#显示
plt.show()
"""

# Data preprocessing
# Merge the remaining minutes and seconds into one column measured in seconds.
raw["remaining_time"] = raw["minutes_remaining"] * 60 + raw["seconds_remaining"]
# print(raw["remaining_time"][0:5])  # first five rows
# print(kebe.action_type.unique())  # distinct values of a column
# print(kebe.shot_type.value_counts())  # 2PT Field Goal    20285  3PT Field Goal     5412

# season values look like "2008-09"; keep the part after "-" as an int so the
# column no longer contains the non-numeric "-" character.
raw["season"] = raw["season"].apply(lambda x: int(x.split("-")[1]))

# Collect two columns of the labelled rows into their own DataFrame.
matchup_opponent = pd.DataFrame({"matchup": kebe.matchup, "opponent": kebe.opponent})
# print(matchup_opponent)

# Drop redundant / strongly correlated columns (two columns may carry the same
# information).
drops = ["shot_id", "team_id", "team_name", "shot_zone_range", "shot_zone_basic", "lon", "lat", "seconds_remaining",
         "minutes_remaining", "game_event_id", "game_id", "game_date", "shot_zone_area", "matchup"]
# Use the ``columns=`` keyword: passing the axis positionally (``drop(x, 1)``)
# is rejected by pandas >= 2.0. One call removes every listed column.
raw = raw.drop(columns=drops)
# print(raw.shape)  # (30697, 14)

# One-hot encode the categorical attributes.
categorical_vars = ["action_type", "combined_shot_type", "shot_type", "opponent", "period", "season"]
for var in categorical_vars:
    var_dummies = pd.get_dummies(raw[var], prefix=var)  # indicator columns named "<var>_<value>"
    # ``axis`` must be a keyword for pd.concat in pandas >= 2.0.
    raw = pd.concat([raw, var_dummies], axis=1)  # append the indicator columns
    raw = raw.drop(columns=var)  # remove the original column
# print(raw.shape)  # (30697, 133)


# Build the model inputs for scikit-learn.
# Rows with a known shot_made_flag form the training set; rows where it is
# missing form the set to predict. The label column is removed from both
# feature frames. ``columns=`` replaces the positional axis argument
# (``drop(x, 1)``), which pandas >= 2.0 rejects.
x_train = raw[pd.notnull(raw["shot_made_flag"])]
y_train = x_train["shot_made_flag"]
x_train = x_train.drop(columns="shot_made_flag")
x_test = raw[pd.isnull(raw["shot_made_flag"])]
x_test = x_test.drop(columns="shot_made_flag")

# Disabled experiment, kept as a bare string literal so that it never runs:
# a search for the best n_estimators using 5-fold cross-validation, keeping the
# candidate with the lowest accumulated log_loss.
# - np.logspace(0,2,num=3).astype(int) yields the fixed candidates 1, 10, 100
#   (they are not chosen at random, despite the inline remark).
# - t1 / t2 are recorded but never used.
# NOTE(review): log_loss is fed hard class labels from predict();
# predict_proba may have been intended - confirm before re-enabling.
"""
#find the best n_estimators for RandomForestClassifier
min_score = 100000
best_n = 0
score_n = []
range_n = np.logspace(0,2,num=3).astype(int)#在10**0-10**2中随机选择随机选择三个数
for n in range_n:
    print("the number of trees:",n)
    t1 = time.time() #返回当前时间的时间戳(1970纪元后经过的浮点秒数)

    rfc_score = 0.0
    rfc = RandomForestClassifier(n_estimators=n,max_depth=5) #示例化,也可以用相同的方法找一下树的最大深度的最优值

    #交叉验证
    kf = KFold(n_splits=5)
    kf.get_n_splits(x_train)

    for train_k,test_k in kf.split(x_train): #train_k,test_k返回索引
        rfc.fit(x_train.iloc[train_k],y_train.iloc[train_k])
        pred = rfc.predict(x_train.iloc[test_k])
        #print(pred)
        rfc_score += log_loss(y_train.iloc[test_k],pred) / 5 #希望log_loss值越低越好

    score_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n

    t2 = time.time()
print(best_n,min_score) #
"""

# Train the final random forest (8 trees, depth capped at 5) on the labelled
# rows, then predict shot_made_flag for the unlabelled rows.
# No random_state is passed, so results may differ from run to run.
rfc = RandomForestClassifier(max_depth=5, n_estimators=8).fit(x_train, y_train)
pred = rfc.predict(x_test)
print(pred)  # output of the original run: [0. 0. 1. ... 1. 1. 1.]
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值