1. Importing Libraries and the Dataset
Environment:
python==3.10
scikit-learn==1.0.2
xgboost==2.0.3
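If your results differ from those shown here, first confirm that the installed versions match the list above. A minimal check, assuming the packages are already installed:
# Print installed versions to verify the environment.
import sklearn
import xgboost
print("scikit-learn:", sklearn.__version__)
print("xgboost:", xgboost.__version__)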
Importing the modules:
# Import the necessary libraries.
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from IPython.display import display
%matplotlib inline
Read the dataset into pandas and preview the first five rows:
# Read the data and drop redundant columns.
data = pd.read_csv('D:/pycharm/2024_ml_learn/football_predict/Datasets/final_dataset.csv')
# Keep only matches after the first 3 matchweeks.
data = data[data.MW > 3]
data.drop(['Unnamed: 0', 'HomeTeam', 'AwayTeam', 'Date', 'MW', 'HTFormPtsStr', 'ATFormPtsStr', 'FTHG', 'FTAG',
           'HTGS', 'ATGS', 'HTGC', 'ATGC', 'HomeTeamLP', 'AwayTeamLP', 'DiffPts', 'HTFormPts', 'ATFormPts',
           'HM4', 'HM5', 'AM4', 'AM5', 'HTLossStreak5', 'ATLossStreak5', 'HTWinStreak5', 'ATWinStreak5',
           'HTWinStreak3', 'HTLossStreak3', 'ATWinStreak3', 'ATLossStreak3'], axis=1, inplace=True)
# Preview data.
display(data.head())
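Before moving on to exploration, it is worth confirming the column dtypes and checking for missing values; a quick sketch:
# Inspect dtypes and count missing values per column.
data.info()
print(data.isnull().sum())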
2. Data Exploration
# Calculate the total number of matches.
n_matches = data.shape[0]
# Calculate number of features.
n_features = data.shape[1] - 1
# Calculate matches won by home team.
n_homewins = len(data[data.FTR == 'H'])
# Calculate win rate for home team.
win_rate = (float(n_homewins) / (n_matches)) * 100
# Print the results
print("Total number of matches: {}".format(n_matches))
print("Number of features: {}".format(n_features))
print("Number of matches won by home team: {}".format(n_homewins))
print("Win rate of home team: {:.2f}%".format(win_rate))
# Visualising distribution of data
from pandas.plotting import scatter_matrix
scatter_matrix(data[['HTGD','ATGD','HTP','ATP','DiffFormPts','DiffLP']], figsize=(10,10))
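The scatter matrix gives a visual impression of pairwise relationships; for a numeric summary, the Pearson correlations of the same columns can be printed (a minimal sketch):
# Pairwise correlations of the continuous features.
print(data[['HTGD', 'ATGD', 'HTP', 'ATP', 'DiffFormPts', 'DiffLP']].corr().round(2))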
3. Model Training
Encoding the categorical target
def transform_ftr(value):
    ''' Map a home win ('H') to 1 and any other full-time result to 0. '''
    if value == "H":
        return 1
    else:
        return 0

data['FTR'] = data.FTR.apply(transform_ftr)
data.head()
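After binarising the target, it helps to confirm the class balance, since an imbalanced target is one reason to report F1 alongside accuracy later on; a quick check:
# Proportion of home wins (1) versus draws/away wins (0).
print(data['FTR'].value_counts(normalize=True))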
Splitting the dataset and standardising the features
# Separate into feature set and target variable.
X_all = data.drop(['FTR'], axis=1)
y_all = data['FTR']
# Standardise the continuous features.
from sklearn.preprocessing import scale
cols = ['HTGD', 'ATGD', 'HTP', 'ATP', 'DiffLP']
X_all[cols] = scale(X_all[cols])
# Treat the last-three-match form columns as categorical strings so they get one-hot encoded below.
for col in ['HM1', 'HM2', 'HM3', 'AM1', 'AM2', 'AM3']:
    X_all[col] = X_all[col].astype(str)
def preprocess_features(X):
    ''' Preprocesses the football data and converts categorical variables into dummy variables. '''
    # Initialize new output DataFrame
    output = pd.DataFrame(index=X.index)
    # Investigate each feature column for the data
    # (items() replaces iteritems(), which was removed in pandas 2.0)
    for col, col_data in X.items():
        # If data type is categorical, convert to dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col)
        # Collect the revised columns
        output = output.join(col_data)
    return output
X_all = preprocess_features(X_all)
print("Processed feature columns ({} total features):\n{}".format(len(X_all.columns), list(X_all.columns)))
from sklearn.model_selection import train_test_split
# Shuffle and split the dataset into training and testing sets.
# Note: test_size=50 holds out 50 matches (an absolute count, not a fraction).
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                    test_size=50,
                                                    random_state=2,
                                                    stratify=y_all)
Training and evaluating the models
from time import time
from sklearn.metrics import f1_score
def train_classifier(clf, X_train, y_train):
    ''' Fits a classifier to the training data. '''
    # Start the clock, train the classifier, then stop the clock
    start = time()
    clf.fit(X_train, y_train)
    end = time()
    # Print the results
    print("Trained model in {:.4f} seconds".format(end - start))
def predict_labels(clf, features, target):
    ''' Makes predictions with a fitted classifier and returns its F1 score and accuracy. '''
    # Start the clock, make predictions, then stop the clock
    start = time()
    y_pred = clf.predict(features)
    end = time()
    # Print timing, then return F1 score and accuracy
    print("Made predictions in {:.4f} seconds.".format(end - start))
    return f1_score(target, y_pred, pos_label=1), sum(target == y_pred) / float(len(y_pred))
def train_predict(clf, X_train, y_train, X_test, y_test):
    ''' Train a classifier and report its F1 score and accuracy on both sets. '''
    # Indicate the classifier and the training set size
    print("Training a {} using a training set size of {}. . .".format(clf.__class__.__name__, len(X_train)))
    # Train the classifier
    train_classifier(clf, X_train, y_train)
    # Print the results of prediction for both training and testing
    f1, acc = predict_labels(clf, X_train, y_train)
    print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1, acc))
    f1, acc = predict_labels(clf, X_test, y_test)
    print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1, acc))
# Initialize the three baseline models.
clf_A = LogisticRegression(random_state=42)
clf_B = SVC(random_state=912, kernel='rbf')
clf_C = xgb.XGBClassifier(random_state=82)
train_predict(clf_A, X_train, y_train, X_test, y_test)
train_predict(clf_B, X_train, y_train, X_test, y_test)
train_predict(clf_C, X_train, y_train, X_test, y_test)
Fine-tuning the XGBoost hyperparameters
# Import 'GridSearchCV' and 'make_scorer'.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
# Create the dictionary of parameters you wish to tune.
parameters = {'learning_rate': [0.1],
              'n_estimators': [40],
              'max_depth': [3],
              'min_child_weight': [3],
              'gamma': [0.4],
              'subsample': [0.8],
              'colsample_bytree': [0.8],
              'scale_pos_weight': [1],
              'reg_alpha': [1e-5]}
# Initialize the classifier.
clf = xgb.XGBClassifier(random_state=2)
# Make an F1 scoring function using 'make_scorer'.
f1_scorer = make_scorer(f1_score, pos_label=1)
# Perform grid search on the classifier using f1_scorer as the scoring method.
grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)
# Fit the grid search object to the training data.
grid_obj = grid_obj.fit(X_train, y_train)
# Get the best estimator.
clf = grid_obj.best_estimator_
print(clf)
# Report the final F1 score for training and testing after parameter tuning
f1, acc = predict_labels(clf, X_train, y_train)
print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1 , acc))
f1, acc = predict_labels(clf, X_test, y_test)
print("F1 score and accuracy score for test set: {:.4f} , {:.4f}.".format(f1 , acc))
Training the final model on the full dataset
# GridSearchCV, make_scorer, and f1_scorer are already available from the previous step.
# Create the dictionary of parameters for the final model.
parameters = {'learning_rate': [0.03],
              'n_estimators': [20],
              'max_depth': [5],
              'min_child_weight': [5],
              'gamma': [0.2],
              'subsample': [0.8],
              'colsample_bytree': [0.8],
              'scale_pos_weight': [1],
              'reg_alpha': [1e-2]}
# Initialize the classifier.
clf = xgb.XGBClassifier(random_state=2)
# Perform grid search using the f1_scorer defined earlier as the scoring method.
grid_obj = GridSearchCV(clf,
                        scoring=f1_scorer,
                        param_grid=parameters,
                        cv=5)
# Fit the grid search object to the full dataset.
grid_obj = grid_obj.fit(X_all, y_all)
# Get the best estimator.
clf = grid_obj.best_estimator_
print(clf)
# Report the F1 score and accuracy on the training split after fitting on the full dataset.
f1, acc = predict_labels(clf, X_train, y_train)
print("F1 score and accuracy score for training set: {:.4f} , {:.4f}.".format(f1, acc))
4. Data Source and Code Availability
The training data comes from Kaggle, where readers can download it themselves.
Alternatively, add the author on WeChat to obtain the dataset and source code: