Python数据分析与机器学习-足球赛事数据集

最新推荐文章于 2024-08-19 11:04:59 发布

未来，值得期待

最新推荐文章于 2024-08-19 11:04:59 发布

阅读量6k

点赞数 2

分类专栏： python 文章标签： Python数据分析与机器学习

本文链接：https://blog.csdn.net/adam_wzs/article/details/79246533

版权

python 专栏收录该内容

30 篇文章 3 订阅

订阅专栏

import pandas as pd
import numpy as np

pd.set_option('display.height', 9999)
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

df = pd.read_csv('redcard.csv.gz', compression='gzip')
# print(df.shape) #(146028, 28)

# print(df.describe().T)  # 粗略看数据描述  T转置
# print(df.dtypes)  # 看数据类型 object=字符类型
# all_columns =df.columns.tolist()
# print(all_columns)
'''
['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height', 'weight', 'position', 'games', 
'victories', 'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards', 'photoID', 'rater1', 'rater2', 
'refNum', 'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp']
'''
# print(df['height'].mean())
# print(df['weight'].mean())
# print(df.groupby(by='playerShort')['height'].mean())
# print(np.mean(df.groupby(by='playerShort')['height'].mean()))

'''球员数据集'''
player_index = 'playerShort'
player_cols = [  # 'player', # drop player name, we have unique identifier
    'birthday',
    'height',
    'weight',
    'position',
    'photoID',
    'rater1',
    'rater2',
]
all_cols_unique_players = df.groupby(player_index).agg({col: 'nunique' for col in player_cols})


# print(all_cols_unique_players)

# 判断数据集的key值是否有重复
def get_subgroup(dataframe, g_index, g_columns):
    """Helper function that creates a sub-table from the columns and runs a quick uniqueness test."""
    g = dataframe.groupby(g_index).agg({col: 'nunique' for col in g_columns})
    if g[g > 1].dropna().shape[0] != 0:
        print("Warning: you probably assumed this had all unique values but it doesn't.")
    return dataframe.groupby(g_index).agg({col: 'max' for col in g_columns})


# players = get_subgroup(df, player_index, player_cols)
# print(players.head())

# 把处理好的数据存储
def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'):
    save_subgroup_filename = "".join([prefix, subgroup_name, ".csv.gz"])
    dataframe.to_csv(save_subgroup_filename, compression='gzip', encoding='UTF-8')
    test_df = pd.read_csv(save_subgroup_filename, compression='gzip', index_col=g_index, encoding='UTF-8')
    # Test that we recover what we send in
    if dataframe.equals(test_df):
        print("Test-passed: we recover the equivalent subgroup dataframe.")
    else:
        print("Warning -- equivalence test!!! Double-check.")


# players = get_subgroup(df, player_index, player_cols)
# save_subgroup(players, player_index, "players")

# club_index = 'club'
# club_cols = ['leagueCountry']
# clubs = get_subgroup(df, club_index, club_cols)
# save_subgroup(clubs, club_index, "clubs", )

# country_index = 'refCountry'
# country_cols = ['Alpha_3',  # rename this name of country
#                 'meanIAT',
#                 'nIAT',
#                 'seIAT',
#                 'meanExp',
#                 'nExp',
#                 'seExp',
#                 ]
# countries = get_subgroup(df, country_index, country_cols)
# rename_columns = {'Alpha_3': 'countryName', }
# countries = countries.rename(columns=rename_columns)
# print(countries.head())
# save_subgroup(countries, country_index, "countries")

# 球员和裁判之间的关系
dyad_index = ['refNum', 'playerShort']
dyad_cols = ['games',
             'victories',
             'ties',
             'defeats',
             'goals',
             'yellowCards',
             'yellowReds',
             'redCards',
             ]
dyads = get_subgroup(df, g_index=dyad_index, g_columns=dyad_cols)
save_subgroup(dyads, dyad_index, "dyads")

import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

pd.set_option('display.height', 9999)
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)


def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'):
    save_subgroup_filename = "".join([prefix, subgroup_name, ".csv.gz"])
    dataframe.to_csv(save_subgroup_filename, compression='gzip', encoding='UTF-8')
    test_df = pd.read_csv(save_subgroup_filename, compression='gzip', index_col=g_index, encoding='UTF-8')
    # Test that we recover what we send in
    if dataframe.equals(test_df):
        print("Test-passed: we recover the equivalent subgroup dataframe.")
    else:
        print("Warning -- equivalence test!!! Double-check.")


def load_subgroup(filename, index_col=[0]):
    return pd.read_csv(filename, compression='gzip', index_col=index_col, encoding='UTF-8')


players = load_subgroup("raw_players.csv.gz")

'''拿到数据先看数据的概况'''

# print(players.head())
'''
                           photoID              position  height  rater1    birthday  rater2  weight
playerShort                                                                                         
aaron-hughes              3868.jpg           Center Back   182.0    0.25  08.11.1979    0.00    71.0
aaron-hunt               20136.jpg  Attacking Midfielder   183.0    0.00  04.09.1986    0.25    73.0
aaron-lennon             13515.jpg      Right Midfielder   165.0    0.25  16.04.1987    0.25    63.0
aaron-ramsey             94953.jpg     Center Midfielder   178.0    0.00  26.12.1990    0.00    76.0
abdelhamid-el-kaoutari  124913.jpg           Center Back   180.0    0.25  17.03.1990    0.25    73.0
'''

import missingno as msno

# msno.matrix(players.sample(n=500))
# msno.heatmap(players.sample(n=500))
# msno.bar(players.sample(n=500))

# 看 rater1 rater2 两个人打分的规则
# print(pd.crosstab(players['rater1'], players['rater2']))


# fig, ax = plt.subplots()
# sns.heatmap(pd.crosstab(players['rater1'], players['rater2']), cmap='Blues')
# ax.set_title('Correlation between Rater 1 and Rater 2')
# fig.tight_layout()
# plt.show()

# modifying dataframe rater1 rater2平均值
players['skintone'] = players[['rater1', 'rater2']].mean(axis=1)
# players = players.dropna(axis=0)
# # sns.distplot(players['skintone'], kde=False)
# sns.distplot(players.skintone, kde=False)
# plt.show()

# 踢球位置可视化
# MIDSIZE = (12, 8)
# fig, ax = plt.subplots(figsize=MIDSIZE)
# players.position.value_counts(dropna=False, ascending=True).plot(kind='barh', ax=ax)
# ax.set_ylabel("Position")
# ax.set_xlabel("Counts")
# fig.tight_layout()
# plt.show()

# 自定义球场踢球位置
defense = ['Center Back', 'Defensive Midfielder', 'Left Fullback', 'Right Fullback', ]  # 后卫
midfield = ['Right Midfielder', 'Center Midfielder', 'Left Midfielder', ]  # 中场
forward = ['Attacking Midfielder', 'Left Winger', 'Right Winger', 'Center Forward']  # 前锋
keeper = 'Goalkeeper'  # 守门员

# modifying dataframe -- adding the aggregated position categorical position_agg
players.loc[players['position'].isin(defense), 'position_agg'] = "Defense"
players.loc[players['position'].isin(midfield), 'position_agg'] = "Midfield"
players.loc[players['position'].isin(forward), 'position_agg'] = "Forward"
players.loc[players['position'].eq(keeper), 'position_agg'] = "Keeper"
# print(players.head())
#
MIDSIZE = (12, 8)
# fig, ax = plt.subplots(figsize=MIDSIZE)
# players['position_agg'].value_counts(dropna=False, ascending=True).plot(kind='barh', ax=ax)
# ax.set_ylabel("position_agg")
# ax.set_xlabel("Counts")
# fig.tight_layout()
# plt.show()

'''多变量之间的关系'''
from pandas.plotting import scatter_matrix

# fig, ax = plt.subplots()
# scatter_matrix(players[['height', 'weight', 'skintone']])
# plt.show()

# fig, ax = plt.subplots(figsize=MIDSIZE)
# sns.regplot('weight', 'height', data=players, ax=ax)
# ax.set_ylabel("Height [cm]")
# ax.set_xlabel("Weight [kg]")
# fig.tight_layout()
# plt.show()

# 把连续值变为区间值
weight_categories = ["vlow_weight",
                     "low_weight",
                     "mid_weight",
                     "high_weight",
                     "vhigh_weight",
                     ]
players['weightclass'] = pd.qcut(players['weight'], len(weight_categories), weight_categories)

height_categories = ["vlow_height",
                     "low_height",
                     "mid_height",
                     "high_height",
                     "vhigh_height",
                     ]
players['heightclass'] = pd.qcut(players['height'], len(height_categories), height_categories)

# modifying dataframe
players['birth_date'] = pd.to_datetime(players['birthday'], format='%d.%m.%Y')
players['age_years'] = ((pd.to_datetime('2018-02-03') - players['birth_date']).dt.days) / 365.25
# print(players['age_years'])

players_cleaned_variables = players.columns.tolist()
# print(players_cleaned_variables)


players['skintoneclass'] = pd.qcut(players['skintone'], 3)

# 数据项属性选择
players_cleaned_variables = [
    'height',
    'weight',
    'skintone',
    'position_agg',
    'weightclass',
    'heightclass',
    'skintoneclass',
    'age_years']
players[players_cleaned_variables].to_csv("cleaned_players.csv.gz", compression='gzip')