#! /usr/bin/env python
# coding=utf-8
"""Predict NBA home-team wins with decision trees and random forests.

Game results are read from ``nba.txt`` and standings from ``nba1.txt``
(both under ``root``).  Several hand-built features are added to the game
table and each feature set is scored with cross-validated accuracy.

The feature builders are plain functions that return one value per game so
they can be used (and tested) without scikit-learn or the data files; the
scikit-learn imports live inside the functions that need them.
"""
from collections import defaultdict

import numpy as np
import pandas as pd

# Directory that holds nba.txt and nba1.txt.
root = "F:/Data/data/"

# Hyper-parameter grid for the optional random-forest grid search.
# NOTE(review): main() builds x_all with 6 columns (4 engineered features +
# 2 team ids), so max_features=10 may be rejected by scikit-learn depending
# on its version -- confirm before enabling the grid search.
paramete_space = {
    "max_features": [2, 10],
    "n_estimators": [100],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}

# Standings are looked up under this name for the team on the left.
TEAM_ALIASES = {"New Orleans Pelicans": "New Orleans Hornets"}


def load_games(path):
    """Read the game table at *path*, rename its columns and drop "Start".

    The file is expected to have nine columns; the second one (renamed to
    "Start") is removed, exactly as the original script did.
    """
    games = pd.read_csv(path, parse_dates=["Date"])
    games.columns = [
        "Date", "Start", "Visitor Team", "Visitor Pts", "Home Team",
        "Home Pts", "Score Type", "OT?", "Notes",
    ]
    return games.drop(games.columns[1], axis=1)


def last_win_features(games):
    """Return ``(home_last_win, visitor_last_win)`` lists, one entry per game.

    Games are processed in row order.  Each entry tells whether that team won
    its previous game in the table; a team with no previous game counts as
    not having won.  *games* needs the columns "Home Team", "Visitor Team"
    and "HomeWin".
    """
    # Keys are team names, values say whether the team won its latest game.
    # The defaultdict creates the key (as False) on a team's first lookup.
    won_last = defaultdict(bool)
    home_last, visitor_last = [], []
    rows = zip(games["Home Team"], games["Visitor Team"], games["HomeWin"])
    for home, visitor, home_win in rows:
        home_last.append(won_last[home])
        visitor_last.append(won_last[visitor])
        won_last[home] = bool(home_win)
        won_last[visitor] = not home_win
    return home_last, visitor_last


def ranks_higher_feature(games, standings):
    """Return the "HomeTeamRanksHigher" value (0 or 1) for every game.

    *standings* needs the columns "Team" and "Rk".  Team names listed in
    ``TEAM_ALIASES`` are translated before the lookup.  A team missing from
    *standings* raises ``KeyError`` naming that team.
    """
    # Build the lookup once instead of filtering the table for every game.
    # setdefault keeps the first row of a duplicated team, like ``values[0]``.
    rank_of = {}
    for team, rank in zip(standings["Team"], standings["Rk"]):
        rank_of.setdefault(team, rank)

    flags = []
    for home, visitor in zip(games["Home Team"], games["Visitor Team"]):
        home_rank = rank_of[TEAM_ALIASES.get(home, home)]
        visitor_rank = rank_of[TEAM_ALIASES.get(visitor, visitor)]
        # NOTE(review): kept as in the original.  If Rk 1 is the best team,
        # this flag is 1 when the home team is ranked *worse* -- confirm the
        # meaning of Rk before changing the comparison.
        flags.append(int(home_rank > visitor_rank))
    return flags


def head_to_head_feature(games):
    """Return the "HomeTeamWonLast" value (0 or 1) for every game.

    The value is 1 when the home team won the previous meeting of the same
    two teams (regardless of venue) earlier in the table, otherwise 0.
    """
    last_winner = {}
    flags = []
    rows = zip(games["Home Team"], games["Visitor Team"], games["HomeWin"])
    for home, visitor, home_win in rows:
        # Sorting makes the key independent of who plays at home.
        pairing = tuple(sorted([home, visitor]))
        flags.append(1 if last_winner.get(pairing) == home else 0)
        # Remember who won this meeting for the next one.
        last_winner[pairing] = home if home_win else visitor
    return flags


def report_accuracy(clf, features, labels):
    """Cross-validate *clf*, print the mean accuracy and return the scores."""
    from sklearn.model_selection import cross_val_score

    scores = cross_val_score(clf, features, labels, scoring="accuracy")
    print("精确度:{}".format(np.mean(scores)))
    return scores


def grid_search_forest(features, labels):
    """Grid-search a random forest over ``paramete_space`` and print the result.

    This step was disabled in the original script and is not called by
    ``main``; call it explicitly to run the search.  Returns the fitted
    ``GridSearchCV`` object.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    grid = GridSearchCV(RandomForestClassifier(random_state=14), paramete_space)
    grid.fit(features, labels)
    # Report the search's own best score rather than an earlier experiment's.
    print("精确度:{}".format(grid.best_score_))
    print(grid.best_estimator_)
    return grid


def main(data_root=root):
    """Run every experiment of the script and print one accuracy per step."""
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
    from sklearn.tree import DecisionTreeClassifier

    nba = load_games(data_root + "nba.txt")

    # Class labels: True when the home team scored more than the visitor.
    nba["HomeWin"] = nba["Visitor Pts"] < nba["Home Pts"]
    y_true = nba["HomeWin"].values

    # Did each team win its previous game?  The columns are assigned in one
    # go because rows yielded by DataFrame.iterrows are copies, so writing
    # into them does not reliably change the frame.
    home_last, visitor_last = last_win_features(nba)
    nba["HomeLastWin"] = home_last
    nba["VisitorLastWin"] = visitor_last
    x_previous_wins = nba[["HomeLastWin", "VisitorLastWin"]].values.astype(int)
    # Accuracy noted by the original author: 0.574679022572
    report_accuracy(DecisionTreeClassifier(random_state=14),
                    x_previous_wins, y_true)

    # Add the standings-based feature (the first line of the file is skipped).
    standings = pd.read_csv(data_root + "nba1.txt", skiprows=[0])
    nba["HomeTeamRanksHigher"] = ranks_higher_feature(nba, standings)
    x_home_higher = nba[
        ["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]
    ].values.astype(int)
    # Accuracy noted by the original author: 0.596657347967
    report_accuracy(DecisionTreeClassifier(random_state=14),
                    x_home_higher, y_true)

    # Add the result of the two teams' previous meeting.
    nba["HomeTeamWonLast"] = head_to_head_feature(nba)
    x_last_winner = nba[
        ["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher",
         "HomeTeamWonLast"]
    ].values.astype(int)
    # Accuracy noted by the original author: 0.603482432526
    report_accuracy(DecisionTreeClassifier(random_state=14),
                    x_last_winner, y_true)

    # Turn team names into integer ids.  The encoder is fitted on the home
    # teams only, so a visitor that never plays at home raises in transform.
    encoding = LabelEncoder()
    encoding.fit(nba["Home Team"].values)
    home_ids = encoding.transform(nba["Home Team"].values)
    visitor_ids = encoding.transform(nba["Visitor Team"].values)
    # transform returns 1-D arrays; stacking gives 2 x n, so transpose to get
    # one row per game with two features.
    x_teams = np.vstack([home_ids, visitor_ids]).T

    # A decision tree treats the integer ids as continuous values, so expand
    # them into one-hot columns.  toarray() yields a plain ndarray.
    onehot = OneHotEncoder()
    x_teams_expanded = onehot.fit_transform(x_teams).toarray()
    # Accuracy noted by the original author: 0.595154276248
    report_accuracy(DecisionTreeClassifier(random_state=14),
                    x_teams_expanded, y_true)

    forest = RandomForestClassifier(random_state=14)
    # Accuracy noted by the original author: 0.583773383033
    report_accuracy(forest, x_teams, y_true)

    x_all = np.hstack([x_last_winner, x_teams])
    # Accuracy noted by the original author: 0.579208945952
    report_accuracy(forest, x_all, y_true)


if __name__ == "__main__":
    main()
# python_nba_tree
# (Scraped page footer: "latest recommended article published 2024-08-25 09:26:04")