import pandas as pd
import numpy as np
pd.set_option('display.height', 9999)
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)
df = pd.read_csv('redcard.csv.gz', compression='gzip')
# print(df.shape) #(146028, 28)
# print(df.describe().T) # 粗略看数据描述 T转置
# print(df.dtypes) # 看数据类型 object=字符类型
# all_columns =df.columns.tolist()
# print(all_columns)
'''
['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height', 'weight', 'position', 'games',
'victories', 'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards', 'photoID', 'rater1', 'rater2',
'refNum', 'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp']
'''
# print(df['height'].mean())
# print(df['weight'].mean())
# print(df.groupby(by='playerShort')['height'].mean())
# print(np.mean(df.groupby(by='playerShort')['height'].mean()))
'''球员数据集'''
player_index = 'playerShort'
player_cols = [ # 'player', # drop player name, we have unique identifier
'birthday',
'height',
'weight',
'position',
'photoID',
'rater1',
'rater2',
]
all_cols_unique_players = df.groupby(player_index).agg({col: 'nunique' for col in player_cols})
# print(all_cols_unique_players)
# 判断数据集的key值是否有重复
def get_subgroup(dataframe, g_index, g_columns):
"""Helper function that creates a sub-table from the columns and runs a quick uniqueness test."""
g = dataframe.groupby(g_index).agg({col: 'nunique' for col in g_columns})
if g[g > 1].dropna().shape[0] != 0:
print("Warning: you probably assumed this had all unique values but it doesn't.")
return dataframe.groupby(g_index).agg({col: 'max' for col in g_columns})
# players = get_subgroup(df, player_index, player_cols)
# print(players.head())
# 把处理好的数据存储
def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'):
save_subgroup_filename = "".join([prefix, subgroup_name, ".csv.gz"])
dataframe.to_csv(save_subgroup_filename, compression='gzip', encoding='UTF-8')
test_df = pd.read_csv(save_subgroup_filename, compression='gzip', index_col=g_index, encoding='UTF-8')
# Test that we recover what we send in
if dataframe.equals(test_df):
print("Test-passed: we recover the equivalent subgroup dataframe.")
else:
print("Warning -- equivalence test!!! Double-check.")
# players = get_subgroup(df, player_index, player_cols)
# save_subgroup(players, player_index, "players")
# club_index = 'club'
# club_cols = ['leagueCountry']
# clubs = get_subgroup(df, club_index, club_cols)
# save_subgroup(clubs, club_index, "clubs", )
# country_index = 'refCountry'
# country_cols = ['Alpha_3', # rename this name of country
# 'meanIAT',
# 'nIAT',
# 'seIAT',
# 'meanExp',
# 'nExp',
# 'seExp',
# ]
# countries = get_subgroup(df, country_index, country_cols)
# rename_columns = {'Alpha_3': 'countryName', }
# countries = countries.rename(columns=rename_columns)
# print(countries.head())
# save_subgroup(countries, country_index, "countries")
# 球员和裁判之间的关系
dyad_index = ['refNum', 'playerShort']
dyad_cols = ['games',
'victories',
'ties',
'defeats',
'goals',
'yellowCards',
'yellowReds',
'redCards',
]
dyads = get_subgroup(df, g_index=dyad_index, g_columns=dyad_cols)
save_subgroup(dyads, dyad_index, "dyads")
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
pd.set_option('display.height', 9999)
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)
def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'):
save_subgroup_filename = "".join([prefix, subgroup_name, ".csv.gz"])
dataframe.to_csv(save_subgroup_filename, compression='gzip', encoding='UTF-8')
test_df = pd.read_csv(save_subgroup_filename, compression='gzip', index_col=g_index, encoding='UTF-8')
# Test that we recover what we send in
if dataframe.equals(test_df):
print("Test-passed: we recover the equivalent subgroup dataframe.")
else:
print("Warning -- equivalence test!!! Double-check.")
def load_subgroup(filename, index_col=[0]):
return pd.read_csv(filename, compression='gzip', index_col=index_col, encoding='UTF-8')
players = load_subgroup("raw_players.csv.gz")
'''拿到数据先看数据的概况'''
# print(players.head())
'''
photoID position height rater1 birthday rater2 weight
playerShort
aaron-hughes 3868.jpg Center Back 182.0 0.25 08.11.1979 0.00 71.0
aaron-hunt 20136.jpg Attacking Midfielder 183.0 0.00 04.09.1986 0.25 73.0
aaron-lennon 13515.jpg Right Midfielder 165.0 0.25 16.04.1987 0.25 63.0
aaron-ramsey 94953.jpg Center Midfielder 178.0 0.00 26.12.1990 0.00 76.0
abdelhamid-el-kaoutari 124913.jpg Center Back 180.0 0.25 17.03.1990 0.25 73.0
'''
import missingno as msno
# msno.matrix(players.sample(n=500))
# msno.heatmap(players.sample(n=500))
# msno.bar(players.sample(n=500))
# 看 rater1 rater2 两个人打分的规则
# print(pd.crosstab(players['rater1'], players['rater2']))
# fig, ax = plt.subplots()
# sns.heatmap(pd.crosstab(players['rater1'], players['rater2']), cmap='Blues')
# ax.set_title('Correlation between Rater 1 and Rater 2')
# fig.tight_layout()
# plt.show()
# modifying dataframe rater1 rater2平均值
players['skintone'] = players[['rater1', 'rater2']].mean(axis=1)
# players = players.dropna(axis=0)
# # sns.distplot(players['skintone'], kde=False)
# sns.distplot(players.skintone, kde=False)
# plt.show()
# 踢球位置可视化
# MIDSIZE = (12, 8)
# fig, ax = plt.subplots(figsize=MIDSIZE)
# players.position.value_counts(dropna=False, ascending=True).plot(kind='barh', ax=ax)
# ax.set_ylabel("Position")
# ax.set_xlabel("Counts")
# fig.tight_layout()
# plt.show()
# 自定义球场踢球位置
defense = ['Center Back', 'Defensive Midfielder', 'Left Fullback', 'Right Fullback', ] # 后卫
midfield = ['Right Midfielder', 'Center Midfielder', 'Left Midfielder', ] # 中场
forward = ['Attacking Midfielder', 'Left Winger', 'Right Winger', 'Center Forward'] # 前锋
keeper = 'Goalkeeper' # 守门员
# modifying dataframe -- adding the aggregated position categorical position_agg
players.loc[players['position'].isin(defense), 'position_agg'] = "Defense"
players.loc[players['position'].isin(midfield), 'position_agg'] = "Midfield"
players.loc[players['position'].isin(forward), 'position_agg'] = "Forward"
players.loc[players['position'].eq(keeper), 'position_agg'] = "Keeper"
# print(players.head())
#
MIDSIZE = (12, 8)
# fig, ax = plt.subplots(figsize=MIDSIZE)
# players['position_agg'].value_counts(dropna=False, ascending=True).plot(kind='barh', ax=ax)
# ax.set_ylabel("position_agg")
# ax.set_xlabel("Counts")
# fig.tight_layout()
# plt.show()
'''多变量之间的关系'''
from pandas.plotting import scatter_matrix
# fig, ax = plt.subplots()
# scatter_matrix(players[['height', 'weight', 'skintone']])
# plt.show()
# fig, ax = plt.subplots(figsize=MIDSIZE)
# sns.regplot('weight', 'height', data=players, ax=ax)
# ax.set_ylabel("Height [cm]")
# ax.set_xlabel("Weight [kg]")
# fig.tight_layout()
# plt.show()
# 把连续值变为区间值
weight_categories = ["vlow_weight",
"low_weight",
"mid_weight",
"high_weight",
"vhigh_weight",
]
players['weightclass'] = pd.qcut(players['weight'], len(weight_categories), weight_categories)
height_categories = ["vlow_height",
"low_height",
"mid_height",
"high_height",
"vhigh_height",
]
players['heightclass'] = pd.qcut(players['height'], len(height_categories), height_categories)
# modifying dataframe
players['birth_date'] = pd.to_datetime(players['birthday'], format='%d.%m.%Y')
players['age_years'] = ((pd.to_datetime('2018-02-03') - players['birth_date']).dt.days) / 365.25
# print(players['age_years'])
players_cleaned_variables = players.columns.tolist()
# print(players_cleaned_variables)
players['skintoneclass'] = pd.qcut(players['skintone'], 3)
# 数据项属性选择
players_cleaned_variables = [
'height',
'weight',
'skintone',
'position_agg',
'weightclass',
'heightclass',
'skintoneclass',
'age_years']
players[players_cleaned_variables].to_csv("cleaned_players.csv.gz", compression='gzip')