他山之石可以攻玉。
本文参考华为AI Gallery社区的一篇技术贴。该文章将预测结果按二分类问题处理(胜 / 平负两类)。
所需技能Notebook+Python+Pandas。
本次预测标签为赛果(win_result):计算home_score和away_score的差值,若差值大于0,则赛果win_result为胜(1),否则(平局或落败)记为0。
# Goal difference from the home side's point of view.
df['diff'] = df['home_score'].sub(df['away_score'])
# Binary label: 1 when the home side scored more, 0 otherwise (draw or loss).
df['win_result'] = (df['diff'] > 0).astype('int64')
接下来进行特征衍生,主要分为5种衍生方式:
- 主队最近5、3、1场比赛的胜负情况,包括胜利场数、失败场数、净胜球(主队分数减去客队分数)均值。
- 主队与当前客队的最近5、3、1场比赛的胜负情况,包括胜利场数、失败场数、净胜球均值。
- 主队最近15、7、3、2、1年比赛的胜负情况,包括总场数、胜利场数、失败场数。
- 主客队历史上全部赛事的胜负情况,包括比赛场数、胜利场数、失败场数、胜率。
- 赛事发生的月份、季节。
# Exclude friendly matches (24908 samples in total).
df = df.loc[df['tournament'].ne('Friendly')]
def process_home_team_latest_info(df, windows=(5, 3, 1)):
    """Add recent-form features for the home team of every row, in place.

    For each row and each window size ``n`` the function walks backwards over
    the rows directly above it, for as long as they carry the same
    ``home_team`` and no more than ``n`` rows have been collected.  When at
    least one such row exists, these cells are written on the current row:

    * ``num_<n>``      -- number of earlier rows found (between 1 and n)
    * ``diff_num_<n>`` -- mean of ``diff`` over those rows
    * ``win_num_<n>``  -- sum of ``win_result`` over those rows
    * ``lose_num_<n>`` -- ``num_<n>`` minus ``win_num_<n>``

    Rows with no such earlier row are left untouched, so their cells stay
    NaN once the column exists.

    Args:
        df: Frame with ``home_team``, ``diff`` and ``win_result`` columns.
            Its row labels must be 0..len(df)-1 in row order, because the
            function does arithmetic on ``.loc`` labels.  Each team's rows
            must be contiguous; the caller in this file sorts by
            ``home_team`` and ``date`` and resets the index first.
        windows: Window sizes to compute.  Each entry is converted with
            ``int`` and its decimal form is used as the column suffix, so
            both ``5`` and ``'5'`` are accepted.

    Returns:
        None.  ``df`` is modified in place.
    """
    for i in range(df.shape[0]):
        home_team = df.loc[i, 'home_team']
        for window in windows:
            window = int(window)
            suffix = str(window)
            # Walk back to the oldest row still inside the window.  The
            # j >= 0 guard must come before the .loc lookup.
            j = i - 1
            while j >= 0 and i - j <= window and df.loc[j, 'home_team'] == home_team:
                j -= 1
            start = j + 1
            if start == i:
                # No earlier row for this team: nothing to summarise.
                continue
            count = i - start
            # .loc slicing is label based and inclusive on both ends.
            recent = df.loc[start:i - 1]
            mean_diff = recent['diff'].mean()
            wins = recent['win_result'].sum()
            df.loc[i, 'num_' + suffix] = count
            df.loc[i, 'diff_num_' + suffix] = mean_diff
            df.loc[i, 'win_num_' + suffix] = wins
            df.loc[i, 'lose_num_' + suffix] = count - wins
# Order the matches by home team and then by date, so that
# process_home_team_latest_info can summarise each home team's previous
# 5 / 3 / 1 matches (match count, wins, losses, mean goal difference).
df = df.sort_values(by=['home_team', 'date'])
df = df.reset_index()
process_home_team_latest_info(df)
df.head(5)
#主队与当前客队在历史上的比赛场次有可能少于5场、3场、1场。
#获取的12个衍生特征:
# num_team_5:最近5场比赛场次(可能出现不满5场的情况)
# win_num_team_5:最近5场胜场次
# lose_num_team_5:最近5场负场次
# diff_num_team_5:最近5场得分差平均值
# num_team_3:最近3场比赛场次(可能出现不满3场的情况)
# win_num_team_3:最近3场胜场次
# lose_num_team_3:最近3场负场次
# diff_num_team_3:最近3场得分差平均值
# num_team_1:最近1场比赛场次(可能出现不满1场的情况)
# win_num_team_1:最近1场胜场次
# lose_num_team_1:最近1场负场次
# diff_num_team_1:最近1场得分差平均值
def process_home_away_team_latest_info(df, windows=(5, 3, 1)):
    """Add head-to-head form features for each (home, away) pairing, in place.

    For each row and each window size ``n`` the function walks backwards over
    the rows directly above it, for as long as they carry the same
    ``home_team`` and ``away_team`` and no more than ``n`` rows have been
    collected.  When at least one such row exists, these cells are written on
    the current row:

    * ``num_team_<n>``      -- number of earlier rows found (between 1 and n)
    * ``diff_num_team_<n>`` -- mean of ``diff`` over those rows
    * ``win_num_team_<n>``  -- sum of ``win_result`` over those rows

    Afterwards ``lose_num_team_<n>`` is computed for the whole frame as
    ``num_team_<n> - win_num_team_<n>`` (NaN where there is no history).

    Args:
        df: Frame with ``home_team``, ``away_team``, ``diff`` and
            ``win_result`` columns.  Its row labels must be 0..len(df)-1 in
            row order, because the function does arithmetic on ``.loc``
            labels.  Rows of one pairing must be contiguous; the caller in
            this file sorts by ``home_team``, ``away_team`` and ``date`` and
            resets the index first.
        windows: Window sizes to compute.  Each entry is converted with
            ``int`` and its decimal form is used as the column suffix.

    Returns:
        None.  ``df`` is modified in place.
    """
    suffixes = [str(int(window)) for window in windows]

    for i in range(df.shape[0]):
        home_team = df.loc[i, 'home_team']
        away_team = df.loc[i, 'away_team']
        for suffix in suffixes:
            window = int(suffix)
            # Walk back to the oldest row of this pairing inside the window.
            # The j >= 0 guard must come before the .loc lookups.
            j = i - 1
            while (j >= 0 and i - j <= window
                   and df.loc[j, 'home_team'] == home_team
                   and df.loc[j, 'away_team'] == away_team):
                j -= 1
            start = j + 1
            if start == i:
                # No earlier meeting of this pairing.
                continue
            # .loc slicing is label based and inclusive on both ends.
            recent = df.loc[start:i - 1]
            mean_diff = recent['diff'].mean()
            wins = recent['win_result'].sum()
            df.loc[i, 'num_team_' + suffix] = i - start
            df.loc[i, 'diff_num_team_' + suffix] = mean_diff
            df.loc[i, 'win_num_team_' + suffix] = wins

    for suffix in suffixes:
        # The row loop only creates a column once some row has history.
        # Create any missing one as all-NaN so the subtraction below cannot
        # raise KeyError on frames where no pairing ever repeats.
        for prefix in ('num_team_', 'diff_num_team_', 'win_num_team_'):
            if prefix + suffix not in df.columns:
                df[prefix + suffix] = float('nan')
        df['lose_num_team_' + suffix] = (
            df['num_team_' + suffix] - df['win_num_team_' + suffix]
        )
# NOTE(review): unlike the later re-sort steps, no 'level_0' column is dropped
# here -- presumably it does not exist until the reset_index() below; confirm
# before adding a drop.
# Order by home team, away team and date so that earlier meetings of the same
# pairing sit directly above each row.
df = df.sort_values(by=['home_team', 'away_team', 'date'])
df = df.reset_index()
process_home_away_team_latest_info(df)
df.head(5)
#获取主队在过去的15年、7年、3年、2年、1年中的比赛场数、胜利场数、失败场数。
def process_home_year_latest_info(df, years=(15, 7, 3, 2, 1)):
    """Add multi-year form features for the home team of every row, in place.

    For each row and each span ``n`` the function walks backwards over the
    rows directly above it, for as long as they carry the same ``home_team``
    and the calendar-year difference to the current row is at most ``n``
    (inclusive).  When at least one such row exists, these cells are written
    on the current row:

    * ``num_year_<n>``      -- number of earlier rows found
    * ``diff_num_year_<n>`` -- mean of ``diff`` over those rows
    * ``win_num_year_<n>``  -- sum of ``win_result`` over those rows

    Afterwards ``lose_num_year_<n>`` is computed for the whole frame as
    ``num_year_<n> - win_num_year_<n>`` (NaN where there is no history).

    Args:
        df: Frame with ``home_team``, ``date``, ``diff`` and ``win_result``
            columns; ``date`` values must expose a ``.year`` attribute.  Its
            row labels must be 0..len(df)-1 in row order, because the
            function does arithmetic on ``.loc`` labels.  The backward walk
            stops at the first row outside the span, so each team's rows
            must be contiguous and in ascending date order; the caller in
            this file sorts by ``home_team`` and ``date`` and resets the
            index first.
        years: Year spans to compute.  Each entry is converted with ``int``
            and its decimal form is used as the column suffix.

    Returns:
        None.  ``df`` is modified in place.
    """
    suffixes = [str(int(span)) for span in years]

    for i in range(df.shape[0]):
        home_team = df.loc[i, 'home_team']
        match_year = df.loc[i, 'date'].year
        for suffix in suffixes:
            span = int(suffix)
            # Walk back to the oldest row of this team inside the span.
            # The j >= 0 guard must come before the .loc lookups.
            j = i - 1
            while (j >= 0
                   and df.loc[j, 'home_team'] == home_team
                   and match_year - df.loc[j, 'date'].year <= span):
                j -= 1
            start = j + 1
            if start == i:
                # No earlier home match inside this span.
                continue
            # .loc slicing is label based and inclusive on both ends.
            recent = df.loc[start:i - 1]
            mean_diff = recent['diff'].mean()
            wins = recent['win_result'].sum()
            df.loc[i, 'num_year_' + suffix] = i - start
            df.loc[i, 'diff_num_year_' + suffix] = mean_diff
            df.loc[i, 'win_num_year_' + suffix] = wins

    for suffix in suffixes:
        # A span that never had history for any row has no columns yet.
        # Create them as all-NaN so the subtraction below cannot raise
        # KeyError.
        for prefix in ('num_year_', 'diff_num_year_', 'win_num_year_'):
            if prefix + suffix not in df.columns:
                df[prefix + suffix] = float('nan')
        df['lose_num_year_' + suffix] = (
            df['num_year_' + suffix] - df['win_num_year_' + suffix]
        )
# Discard the 'level_0' column, then restore the home-team / date ordering
# before deriving the multi-year features.
df = (
    df.drop(columns=['level_0'])
    .sort_values(by=['home_team', 'date'])
    .reset_index()
)
process_home_year_latest_info(df)
df.head(5)
#获取主客队在过去全部比赛的比赛场数、胜利场数、失败场数、胜率、净进球均值。
# home_num:当前主队作为主场的比赛场次
# home_win_num:当前主队作为主场的比赛胜场次
# home_lose_num:当前主队作为主场的比赛负场次
# home_win_rate:当前主队作为主场的胜率
# away_num:当前客队作为客场的比赛场次
# away_win_num:当前客队作为客场的比赛胜场次
# away_lose_num:当前客队作为客场的比赛负场次
# away_win_rate:当前客队作为客场的胜率
def process_home_away(df_home_team, df_away_team):
    """Add all-time home and away record features to ``df_home_team``, in place.

    For each row of ``df_home_team``:

    * Home side: all rows directly above it with the same ``home_team`` are
      summarised into ``home_num`` (row count), ``home_win_num`` (sum of
      ``win_result``), ``home_lose_num`` (count minus wins) and
      ``home_win_rate`` (wins / count).
    * Away side: the row of ``df_away_team`` whose ``index`` column equals
      this row's ``index`` value is located, and all rows directly above it
      in ``df_away_team`` with the same ``away_team`` are summarised into
      ``away_num``, ``away_win_num``, ``away_lose_num`` and
      ``away_win_rate``, using ``df_away_team``'s own ``win_result``.

    Cells are only written when at least one earlier row exists; otherwise
    they stay NaN once the column exists.

    Args:
        df_home_team: Frame with ``index``, ``home_team``, ``away_team`` and
            ``win_result`` columns.  Its row labels must be 0..len-1 in row
            order and each home team's rows must be contiguous.
        df_away_team: Frame with ``index``, ``away_team`` and ``win_result``
            columns.  Its row labels must be 0..len-1 in row order and each
            away team's rows must be contiguous.  It is only read.

    Returns:
        None.  ``df_home_team`` is modified in place.

    Raises:
        IndexError: If a row's ``index`` value does not occur in
            ``df_away_team['index']``.
    """
    # Map each match id to the label of its first row in df_away_team, once,
    # instead of scanning the whole frame with a boolean mask for every row.
    away_position = {}
    for position, match_id in zip(df_away_team.index, df_away_team['index']):
        away_position.setdefault(match_id, position)

    # Iterate over the frame that was passed in, not a module-level ``df``.
    for i in range(df_home_team.shape[0]):
        home_team = df_home_team.loc[i, 'home_team']
        away_team = df_home_team.loc[i, 'away_team']

        # Home side: every earlier contiguous row of the same home team.
        j = i - 1
        while j >= 0 and df_home_team.loc[j, 'home_team'] == home_team:
            j -= 1
        start = j + 1
        if start < i:
            home_num = i - start
            # .loc slicing is label based and inclusive on both ends.
            home_wins = df_home_team.loc[start:i - 1, 'win_result'].sum()
            df_home_team.loc[i, 'home_num'] = home_num
            df_home_team.loc[i, 'home_win_num'] = home_wins
            df_home_team.loc[i, 'home_lose_num'] = home_num - home_wins
            df_home_team.loc[i, 'home_win_rate'] = home_wins / home_num

        # Away side: find the same match in the away-ordered frame.
        match_id = df_home_team.loc[i, 'index']
        try:
            away_index = away_position[match_id]
        except KeyError:
            # Keep the exception type the former ``.index[0]`` lookup raised.
            raise IndexError(
                'no row in df_away_team with index == %r' % (match_id,)
            ) from None
        k = away_index - 1
        while k >= 0 and df_away_team.loc[k, 'away_team'] == away_team:
            k -= 1
        away_start = k + 1
        if away_start < away_index:
            away_num = away_index - away_start
            away_wins = df_away_team.loc[
                away_start:away_index - 1, 'win_result'
            ].sum()
            df_home_team.loc[i, 'away_num'] = away_num
            df_home_team.loc[i, 'away_win_num'] = away_wins
            df_home_team.loc[i, 'away_lose_num'] = away_num - away_wins
            df_home_team.loc[i, 'away_win_rate'] = away_wins / away_num
df = df.drop(columns=['level_0'])
# Away-side view of the same matches, ordered by away team and date.
process_away_team = df.sort_values(['away_team', 'date']).reset_index()
# From the away side, a win is a negative home-minus-away goal difference.
# Simply inverting the home label would count draws as away wins, because
# win_result == 0 covers both a draw and a loss for the home side.
process_away_team['win_result'] = (process_away_team['diff'] < 0).astype('int64')
# Home-side view, ordered by home team and date; this is the frame that
# receives the new columns.
df = df.sort_values(['home_team', 'date']).reset_index()
process_home_away(df, process_away_team)
df.head(5)
# Month and season in which the match was played.
df = df.drop(columns=['level_0'])
df = df.sort_values(['date']).reset_index()
df['month'] = df['date'].dt.month
# Four 3-month seasons: Dec-Feb -> 0, Mar-May -> 1, Jun-Aug -> 2, Sep-Nov -> 3.
# The earlier int(month / 3) produced five buckets and put December in a
# bucket of its own (4), separate from January/February (0).
df['season'] = (df['month'] % 12) // 3
# Save the processed data.
df.to_csv('footballdata.csv')
该作者数据预处理时采用的Notebook+Python,所选择的kernel是PySpark-2.3.2,选择的规格是CPU:2核 4GB,该规格下运行全部cell大约需要17分钟。
可以看出该作者有较好的pandas使用功底,条理性也很好。
进行这种常规级别的大数据处理,通常只需掌握Python、SQL、Pandas等知识就可以胜任了。
下期介绍建模思路及其代码