# 1. 可视化树
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
import graphviz  # visualization backend
import pydotplus  # renders Graphviz .dot source
from IPython.display import Image  # inline image display in notebooks
from sklearn.model_selection import train_test_split  # train/test split helper
# NOTE: the sklearn.datasets.california_housing module was deprecated and
# removed (scikit-learn 0.24+); import directly from sklearn.datasets.
from sklearn.datasets import fetch_california_housing  # built-in housing dataset

house = fetch_california_housing()
# print(house.data.shape)  # (20640, 8)

# Keep the tree shallow (max_depth=2) so the rendered graph stays readable.
dtr = tree.DecisionTreeRegressor(max_depth=2)
dtr.fit(house.data[:, [6, 7]], house.target)  # train on columns 6 and 7 only; fit(X, y)

# Export the fitted tree as Graphviz .dot source (out_file=None returns a string).
dot_data = tree.export_graphviz(
    dtr,  # the fitted estimator
    out_file=None,
    feature_names=house.feature_names[6:8],  # names of the two columns used above
    filled=True,
    impurity=False,
    rounded=True,
)

# Render the tree
graph = pydotplus.graph_from_dot_data(dot_data)
# Highlight one node. The original "#FF2DD" is an invalid 5-digit hex color;
# a fill color must be 6 hex digits. Guard the index: the node count depends
# on the fitted tree and node 7 may not exist.
nodes = graph.get_nodes()
if len(nodes) > 7:
    nodes[7].set_fillcolor("#FFF2DD")
# Save the image. A raw string already keeps backslashes literal, so they
# must not be doubled (r'C:\\...' would produce a path with \\ separators).
graph.write_png(r'C:\Users\Administrator\Desktop\dtr.png')
# 2. 训练数据
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
import graphviz  # visualization backend
import pydotplus  # renders Graphviz .dot source
from IPython.display import Image  # inline image display in notebooks
from sklearn.model_selection import train_test_split  # train/test split helper
# NOTE: the sklearn.datasets.california_housing module was deprecated and
# removed (scikit-learn 0.24+); import directly from sklearn.datasets.
from sklearn.datasets import fetch_california_housing  # built-in housing dataset

house = fetch_california_housing()
# print(house.data.shape)  # (20640, 8)

# Hold out 10% of the rows for testing; fixed random_state for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    house.data, house.target, test_size=0.1, random_state=42)
dtr = tree.DecisionTreeRegressor(random_state=42)
dtr.fit(x_train, y_train)
# score() returns the R^2 coefficient of determination on the held-out data.
score = dtr.score(x_test, y_test)
print(score)  # ~0.637
# 3. 小练习(用随机森林预测科比是否投中球)
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
from sklearn.ensemble import RandomForestClassifier #随机森林
from sklearn.model_selection import KFold #交叉验证
from sklearn.metrics import confusion_matrix,log_loss
from sklearn import metrics
# --- Load and preprocess the Kobe Bryant shot-selection dataset ---
filename = 'data.csv'
raw = pd.read_csv(filename)
# print(raw.shape)  # (30697, 25)
# shot_made_flag is the label column and contains many NaNs; the ~5000 rows
# where it is missing are used later as the prediction (test) set.
kobe = raw[pd.notnull(raw["shot_made_flag"])]  # rows with a known label
# print(kobe.shape)  # (25697, 25)
"""
# Exploratory plots (disabled)
plt.figure()
# loc_x / loc_y: shot coordinates on the court
plt.subplot(121)
plt.scatter(kobe.loc_x, kobe.loc_y, c='blue', alpha=0.02)  # alpha = transparency
plt.title("loc_x and loc_y")
# lon / lat: longitude / latitude (mirrors the court coordinates)
plt.subplot(122)
plt.scatter(kobe.lon, kobe.lat, c="red", alpha=0.02)
plt.title("lat and lon")
plt.show()
"""
# --- Feature engineering ---
# Fold minutes and seconds remaining into a single column measured in seconds.
raw["remaining_time"] = raw["minutes_remaining"] * 60 + raw["seconds_remaining"]
# print(raw["remaining_time"][0:5])  # first 5 rows
# print(kobe.action_type.unique())  # distinct values of a column
# print(kobe.shot_type.value_counts())  # 2PT Field Goal 20285 / 3PT Field Goal 5412
# season looks like "2008-09"; keep only the numeric part after the dash.
raw["season"] = raw["season"].apply(lambda s: int(s.split("-")[1]))
# matchup and opponent carry overlapping information — collect them side by
# side for inspection before deciding which one to drop.
matchup_opponent = pd.DataFrame({"matchup": kobe.matchup, "opponent": kobe.opponent})
# print(matchup_opponent)
# Drop id columns and columns strongly correlated with ones we keep
# (e.g. lon/lat duplicate loc_x/loc_y, matchup duplicates opponent).
drops = ["shot_id", "team_id", "team_name", "shot_zone_range", "shot_zone_basic",
         "lon", "lat", "seconds_remaining", "minutes_remaining", "game_event_id",
         "game_id", "game_date", "shot_zone_area", "matchup"]
# df.drop(label, 1) with a positional axis argument was removed in pandas 2.0;
# use the columns= keyword and drop everything in a single call.
raw = raw.drop(columns=drops)
# print(raw.shape)  # (30697, 14)
# One-hot encode the categorical columns.
categorical_vars = ["action_type", "combined_shot_type", "shot_type",
                    "opponent", "period", "season"]
for var in categorical_vars:
    dummies = pd.get_dummies(raw[var], prefix=var)  # indicator columns
    raw = pd.concat([raw, dummies], axis=1)         # append them (column-wise)
    raw = raw.drop(columns=[var])                   # remove the original column
# print(raw.shape)  # (30697, 133)
# --- Split into train (labelled) and test (unlabelled) sets ---
x_train = raw[pd.notnull(raw["shot_made_flag"])]
y_train = x_train["shot_made_flag"]
x_train = x_train.drop(columns=["shot_made_flag"])
x_test = raw[pd.isnull(raw["shot_made_flag"])]
x_test = x_test.drop(columns=["shot_made_flag"])
"""
#find the best n_estimators for RandomForestClassifier
min_score = 100000
best_n = 0
score_n = []
range_n = np.logspace(0,2,num=3).astype(int)#在10**0-10**2中随机选择随机选择三个数
for n in range_n:
print("the number of trees:",n)
t1 = time.time() #返回当前时间的时间戳(1970纪元后经过的浮点秒数)
rfc_score = 0.0
rfc = RandomForestClassifier(n_estimators=n,max_depth=5) #示例化,也可以用相同的方法找一下树的最大深度的最优值
#交叉验证
kf = KFold(n_splits=5)
kf.get_n_splits(x_train)
for train_k,test_k in kf.split(x_train): #train_k,test_k返回索引
rfc.fit(x_train.iloc[train_k],y_train.iloc[train_k])
pred = rfc.predict(x_train.iloc[test_k])
#print(pred)
rfc_score += log_loss(y_train.iloc[test_k],pred) / 5 #希望log_loss值越低越好
score_n.append(rfc_score)
if rfc_score < min_score:
min_score = rfc_score
best_n = n
t2 = time.time()
print(best_n,min_score) #
"""
#训练
rfc = RandomForestClassifier(n_estimators=8,max_depth=5)
rfc.fit(x_train,y_train)
pred = rfc.predict(x_test)
print(pred) #[0. 0. 1. ... 1. 1. 1.]