python_nba_tree

#! /usr/bin/env python
#coding=utf-8
import pandas as pd
# Directory that holds the raw data files.
root = "F:/Data/data/"

# Load the game log (parsing the "Date" column as dates) and give the columns
# readable names.
column_names = [
    "Date", "Start", "Visitor Team", "Visitor Pts", "Home Team",
    "Home Pts", "Score Type", "OT?", "Notes",
]
nba = pd.read_csv(root + "nba.txt", parse_dates=["Date"])
nba.columns = column_names
# Drop the second column ("Start"); nothing below reads it.
nba = nba.drop("Start", axis=1)

# Add the HomeWin feature: True when the home side outscored the visitors.
nba["HomeWin"] = nba["Home Pts"] > nba["Visitor Pts"]
# Class labels for the classifiers, as a NumPy array.
y_true = nba["HomeWin"].values
# Add the HomeLastWin / VisitorLastWin features: did each side win the
# previous game it played?
from collections import defaultdict

# Maps team name -> result of that team's most recent game.  No keys are set
# up front: a defaultdict creates the key on first lookup, so a team that has
# not played yet reads as int() == 0, i.e. "did not win its last game".
won_last = defaultdict(int)
home_last_win = []
visitor_last_win = []
for homeTeam, visitorTeam, home_won in zip(
        nba["Home Team"], nba["Visitor Team"], nba["HomeWin"]):
    # Record the state *before* this game, then update it with the result.
    home_last_win.append(bool(won_last[homeTeam]))
    visitor_last_win.append(bool(won_last[visitorTeam]))
    won_last[homeTeam] = bool(home_won)
    won_last[visitorTeam] = not home_won
# Assign whole columns at once instead of writing each row back through
# DataFrame.ix: that accessor is gone from current pandas, and a row
# write-back can only fill columns that already exist (which is why
# placeholder columns had to be created before the loop).
nba["HomeLastWin"] = home_last_win
nba["VisitorLastWin"] = visitor_last_win
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=14)
# Feature matrix built from the two "won the previous game" columns.
x_previousWins = nba[["HomeLastWin", "VisitorLastWin"]].values
# Author's note: an earlier import of cross_val_score failed because of a
# version mismatch; sklearn.model_selection is the location used here.
from sklearn.model_selection import cross_val_score
import numpy as np
scores = cross_val_score(clf, x_previousWins, y_true, scoring="accuracy")
# print(...) with a single argument behaves the same on Python 2 and 3.
# Output recorded by the author: 0.574679022572
print("精确度:{}".format(np.mean(scores)))
# Add the HomeTeamRanksHigher feature from the standings table.
expStanding = pd.read_csv(root + "nba1.txt", skiprows=[0])  # skip the first line of the file

# Team -> rank look-up, built once instead of filtering the standings frame
# twice per game.  setdefault keeps the first row seen for a team, which is
# the row the previous `.values[0]` look-up selected.
team_rank = {}
for team, rank in zip(expStanding["Team"], expStanding["Rk"]):
    team_rank.setdefault(team, rank)

# The game log says "New Orleans Pelicans"; the standings are looked up under
# "New Orleans Hornets".  The alias is applied to both sides of a game.
team_alias = {"New Orleans Pelicans": "New Orleans Hornets"}

home_ranks_higher = []
for homeTeam, visitorTeam in zip(nba["Home Team"], nba["Visitor Team"]):
    # A team missing from the standings raises KeyError naming that team.
    homeRank = team_rank[team_alias.get(homeTeam, homeTeam)]
    visitorRank = team_rank[team_alias.get(visitorTeam, visitorTeam)]
    # Assumes Rk == 1 is the best-placed team (usual standings order --
    # confirm against nba1.txt), so "ranks higher" means a *smaller* Rk.
    # The comparison used to be `>`, which flagged the lower-ranked side.
    home_ranks_higher.append(int(homeRank < visitorRank))
nba["HomeTeamRanksHigher"] = home_ranks_higher

# Feature matrix: the two last-win columns plus the ranking feature.
x_homehigher = nba[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, x_homehigher, y_true, scoring="accuracy")
# Output recorded by the author with the original `>` comparison: 0.596657347967
print("精确度:{}".format(np.mean(scores)))
# Another feature, HomeTeamWonLast: did the home team win the previous
# meeting between these two teams?
last_match_winner = defaultdict(int)
home_team_won_last = []
for homeTeam, visitorTeam, home_won in zip(
        nba["Home Team"], nba["Visitor Team"], nba["HomeWin"]):
    # Sort the pair so the key is the same whichever side plays at home.
    teams = tuple(sorted([homeTeam, visitorTeam]))
    # For a pair that has not met yet the defaultdict yields 0, which never
    # equals a team name, so the feature is 0 for a first meeting.
    home_team_won_last.append(1 if last_match_winner[teams] == homeTeam else 0)
    # Remember who won this meeting for the next time the pair plays.
    winner = homeTeam if home_won else visitorTeam
    last_match_winner[teams] = winner
# Assign the column in one step rather than writing each row back through
# DataFrame.ix, which current pandas no longer provides.
nba["HomeTeamWonLast"] = home_team_won_last

# Data set built from the last-win, ranking and previous-meeting features.
x_lastwinner = nba[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher", "HomeTeamWonLast"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, x_lastwinner, y_true, scoring="accuracy")
# Output recorded by the author: 0.603482432526
print("精确度:{}".format(np.mean(scores)))
from sklearn.preprocessing import LabelEncoder
encoding = LabelEncoder()  # turns the team-name strings into integer codes
home_names = nba["Home Team"].values
visitor_names = nba["Visitor Team"].values
# Fit on both columns so a team that only ever appears as the visitor still
# gets a code; when every team also appears at home the codes are the same
# as fitting on the home column alone.
encoding.fit(np.concatenate([home_names, visitor_names]))
homeTeam = encoding.transform(home_names)
visitorTeam = encoding.transform(visitor_names)
# transform() returns 1-D arrays, so vstack produces two rows (one per
# column); transposing gives one row per game with two features.
x_teams = np.vstack([homeTeam, visitorTeam]).T
# A decision tree would treat the integer codes as continuous values, so
# expand them into binary (one-hot) columns instead.
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder()
# toarray() yields a plain ndarray; todense() would return numpy.matrix,
# which recent scikit-learn releases refuse as estimator input.
x_teams_expanded = onehot.fit_transform(x_teams).toarray()
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, x_teams_expanded, y_true, scoring="accuracy")
# Output recorded by the author: 0.595154276248
print("精确度:{}".format(np.mean(scores)))
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
# Random forest on the integer-encoded team columns only.
scores = cross_val_score(clf, x_teams, y_true, scoring="accuracy")
# print(...) with a single argument behaves the same on Python 2 and 3.
# Output recorded by the author: 0.583773383033
print("精确度:{}".format(np.mean(scores)))
# Combine the hand-built feature columns with the two team-code columns and
# score the same forest configuration on the wider matrix.
x_all = np.hstack([x_lastwinner, x_teams])
scores = cross_val_score(clf, x_all, y_true, scoring="accuracy")
# Output recorded by the author: 0.579208945952
print("精确度:{}".format(np.mean(scores)))
# Candidate hyper-parameters for a random-forest grid search.
# NOTE(review): x_all is built from 4 feature columns plus 2 team-code
# columns, so max_features=10 exceeds the number of features -- confirm the
# installed scikit-learn accepts that before enabling the search below.
paramete_space = {
    "max_features": [2, 10],
    "n_estimators": [100],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
# The grid search is kept disabled inside a string literal (it is never
# executed).  The snippet imports GridSearchCV from sklearn.model_selection,
# the same module the script already uses for cross_val_score, and reports
# the search's own best score rather than the stale `scores` variable.
"""
from sklearn.model_selection import GridSearchCV
clf=RandomForestClassifier(random_state=14)
grid=GridSearchCV(clf,paramete_space)
grid.fit(x_all,y_true)
print("精确度:{}".format(grid.best_score_))
print(grid.best_estimator_)
"""











  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值