# Dataset: https://www.basketball-reference.com/leagues/NBA_2014_games.html
import os.path
from collections import defaultdict
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
# Load the game log "NBAdata.csv" from the current working directory.
# skiprows=[1] drops file line 1 (the line right after the header row);
# the "Date" column is parsed into datetimes.
datafile = os.path.join(os.getcwd(), "NBAdata.csv")
dataset = pd.read_csv(datafile, skiprows=[1], parse_dates=["Date"])

# Replace the header that came from the CSV with descriptive column names.
dataset.columns = [
    "Date",
    "Start",
    "Visitor Team",
    "Visitor PTS",
    "Home Team",
    "Home PTS",
    "Score Type",
    "Attend",
    "Arena",
    "Notes",
]

# Prediction target: True when the home side scored strictly more points.
dataset["HomeWin"] = dataset["Home PTS"] > dataset["Visitor PTS"]
y_true = dataset["HomeWin"].values
# Feature engineering: did each side win its own previous game?
#
# won_last maps team name -> result of that team's most recent game seen so
# far. A team with no earlier game defaults to False, so both feature columns
# stay purely boolean (an int default of 0 would mix ints into bool columns).
won_last = defaultdict(bool)
home_last_win = []
visitor_last_win = []
# Iterate only the three columns that are needed, in file order, instead of
# materialising every row with iterrows() and writing the whole row back.
for home_team, visitor_team, home_win in zip(
    dataset["Home Team"], dataset["Visitor Team"], dataset["HomeWin"]
):
    # Record what was known *before* this game is played ...
    home_last_win.append(won_last[home_team])
    visitor_last_win.append(won_last[visitor_team])
    # ... then update the running state with this game's outcome.
    won_last[home_team] = bool(home_win)
    won_last[visitor_team] = not home_win

# Assign each feature column once, positionally aligned with the rows.
dataset["HomeLastWin"] = home_last_win
dataset["VisitorLastWin"] = visitor_last_win
# First model: a decision tree that sees only the two "won previous game"
# flags. The same `clf` instance is reused by the later tree experiments.
X_previouswins = dataset[["HomeLastWin", "VisitorLastWin"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_previouswins, y_true, scoring="accuracy")

# The label below is the Chinese text for "decision tree 1".
print("决策树1:")
mean_accuracy_pct = scores.mean() * 100
print("Accuracy: {0:.1f}%".format(mean_accuracy_pct))
# Load the standings table "NBA_standing.csv" from the current working
# directory; file line 0 is skipped so the following line supplies the header.
datafile = os.path.join(os.getcwd(), "NBA_standing.csv")
standing = pd.read_csv(datafile, skiprows=[0])

# Build the Team -> Rk lookup once instead of scanning the standings table
# twice per game. Keeping the first row per team matches the previous
# `.values[0]` behaviour if a team were ever listed more than once.
rank_by_team = standing.drop_duplicates(subset="Team").set_index("Team")["Rk"]
home_rank = dataset["Home Team"].map(rank_by_team)
visitor_rank = dataset["Visitor Team"].map(rank_by_team)

# Fail loudly, naming the offending teams, when a team has no usable rank
# (e.g. a franchise whose name differs between the two files).
home_unranked = home_rank.isna()
visitor_unranked = visitor_rank.isna()
if home_unranked.any() or visitor_unranked.any():
    unranked_teams = pd.concat(
        [
            dataset.loc[home_unranked, "Home Team"],
            dataset.loc[visitor_unranked, "Visitor Team"],
        ]
    ).unique()
    raise KeyError(
        "No 'Rk' found in standings for: {0}".format(list(unranked_teams))
    )

# NOTE(review): assumes Rk == 1 is the best team (the usual standings
# convention), so "ranks higher" means a *smaller* Rk — confirm against the
# standings file. The earlier `>` comparison flagged the worse-ranked home
# team, contradicting the column name.
dataset["HomeTeamRanksHigher"] = (home_rank < visitor_rank).astype(int)
# Second model: the two previous-result flags plus the ranking flag, scored
# with the same decision tree instance.
homehigher_columns = ["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]
X_homehigher = dataset[homehigher_columns].values
scores = cross_val_score(clf, X_homehigher, y_true, scoring="accuracy")

# The label below is the Chinese text for "decision tree 2".
print("决策树2:")
mean_accuracy_pct = scores.mean() * 100
print("Accuracy: {0:.1f}%".format(mean_accuracy_pct))
# Feature engineering: did the home team win the previous meeting between
# these same two teams (regardless of who hosted it)?
#
# last_match_winner maps an alphabetically sorted (team, team) pair to the
# name of the winner of their latest meeting. Pairs that have not met yet
# yield the int default 0, which never equals a team name, so the feature is
# 0 for a first meeting.
last_match_winner = defaultdict(int)
home_team_won_last = []
# Walk only the needed columns in file order rather than copying every row
# with iterrows() and writing the full row back on each iteration.
for home_team, visitor_team, home_win in zip(
    dataset["Home Team"], dataset["Visitor Team"], dataset["HomeWin"]
):
    # Sorting makes the key independent of which side is at home.
    teams = tuple(sorted([home_team, visitor_team]))
    # Read the state from *before* this game ...
    home_team_won_last.append(int(last_match_winner[teams] == home_team))
    # ... then remember who won this meeting for the pair's next game.
    last_match_winner[teams] = home_team if home_win else visitor_team

# Assign the feature column once, positionally aligned with the rows.
dataset["HomeTeamWonLast"] = home_team_won_last
# Third model: ranking flag plus the head-to-head flag, scored with the same
# decision tree instance.
lastwinner_columns = ["HomeTeamRanksHigher", "HomeTeamWonLast"]
X_lastwinner = dataset[lastwinner_columns].values
scores = cross_val_score(clf, X_lastwinner, y_true, scoring="accuracy")

# The label below is the Chinese text for "decision tree 3".
print("决策树3:")
mean_accuracy_pct = scores.mean() * 100
print("Accuracy: {0:.1f}%".format(mean_accuracy_pct))
# Fourth model: use the team identities themselves as features.
#
# Fit the label encoder ONCE on the names from both columns. Calling fit()
# twice keeps only the second call's classes, so a team that appeared only in
# the "Home Team" column would make transform() fail.
encoding = LabelEncoder()
encoding.fit(
    np.concatenate(
        [dataset["Home Team"].values, dataset["Visitor Team"].values]
    )
)
home_team = encoding.transform(dataset["Home Team"].values)
visitor_team = encoding.transform(dataset["Visitor Team"].values)

# One row per game: [home team id, visitor team id].
X_teams = np.column_stack([home_team, visitor_team])

# The integer ids carry no ordering, so expand them into one-hot columns for
# the tree. toarray() yields a plain ndarray directly (todense() returns an
# np.matrix that then needs converting before scikit-learn will take it).
onehot = OneHotEncoder()
X_teams_expanded = onehot.fit_transform(X_teams).toarray()
scores = cross_val_score(clf, X_teams_expanded, y_true, scoring='accuracy')

# The label below is the Chinese text for "decision tree 4".
print("决策树4:")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
# Random forest, first on the label-encoded team ids alone.
clf1 = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf1, X_teams, y_true, scoring="accuracy")
print("随机深林:")
forest_accuracy_pct = scores.mean() * 100
print("Accuracy: {0:.1f}%".format(forest_accuracy_pct))

# Same forest again, now with the three engineered flags placed in front of
# the two team-id columns.
X_all = np.concatenate([X_homehigher, X_teams], axis=1)
scores = cross_val_score(clf1, X_all, y_true, scoring="accuracy")
print("随机深林2:")
forest_accuracy_pct = scores.mean() * 100
print("Accuracy: {0:.1f}%".format(forest_accuracy_pct))
# Exhaustive search over random-forest hyper-parameters on the combined
# feature matrix.
parameter_space = {
    # 'sqrt' replaces the former 'auto' option: for a classifier the two were
    # equivalent, and 'auto' is rejected by scikit-learn >= 1.3.
    "max_features": [2, 5, 'sqrt'],
    "n_estimators": [100, 200],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
# No `scoring` is given, so the estimator's own score() method ranks the
# candidate settings.
grid = GridSearchCV(clf1, parameter_space)
grid.fit(X_all, y_true)
print("Grid result:")
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))
print(grid.best_params_)
# Re-score a forest whose hyper-parameters are spelled out explicitly
# (presumably the settings reported by an earlier grid-search run — confirm).
clf2 = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    max_features=5,
    max_depth=None,
    max_leaf_nodes=None,
    min_samples_leaf=2,
    min_samples_split=2,
    bootstrap=True,
    oob_score=False,
    n_jobs=1,
    random_state=14,
    verbose=0,
)
# Rebuild the combined matrix: engineered flags first, then the team ids.
X_all = np.concatenate([X_homehigher, X_teams], axis=1)
scores = cross_val_score(clf2, X_all, y_true, scoring="accuracy")
print("Grid result 2 :")
tuned_accuracy_pct = scores.mean() * 100
print("Accuracy: {0:.1f}%".format(tuned_accuracy_pct))