from __future__ import absolute_import,division,print_function import matplotlib as mpl import matplotlib.pyplot as plt from matplotlib.pyplot import GridSpec import seaborn as sns import numpy as np import pandas as pda import os ,sys from tqdm import tqdm import warnings warnings.filterwarnings("ignore") sns.set_context("poster",font_scale=1.3) import missingno as msno import pandas_profiling from sklearn.datasets import make_blobs import time #读入数据 data=pda.read_csv("redcard.csv.gz",compression="gzip") print("==================开始分析数据==================================") def load_subgroup(filename,index_col=[0]): return pda.read_csv(filename,compression="gzip",index_col=index_col,encoding="UTF-8") players=load_subgroup("raw_players.csv.gz") print(players.head()) print(players.shape) msno.matrix(players,figsize=(16,7),width_ratios=(15,1)) msno.bar(players.sample(500),color="r") msno.heatmap(players,figsize=(16,7))#缺失值比例关系 plt.show() print("样本数量:",len(players)) print("rater1缺失数量:",len(players[pda.isnull(players["rater1"])])) print("rater2缺失数量:",len(players[pda.isnull(players["rater2"])])) print("rater1,2都缺失数量:",len(players[pda.isnull(players["rater1"])&pda.isnull(players["rater2"])])) #费缺失值 print("rater1非缺失数量:",len(players[players.rater1.notnull()])) players=players[players.rater1.notnull()] msno.bar(players,color="r") plt.show() fig,ax=plt.subplots(figsize=(12,8)) sns.heatmap(pda.crosstab(players.rater1,players.rater2),cmap="Blues",annot=True,fmt="d",ax=ax) ax.set_title("Correlation between Rater 1 and Rater2\n") fig.tight_layout() plt.show() print("=========================") print(pda.crosstab(players.rater1,players.rater2)) players["skinone"]=players[["rater1","rater2"]].mean(axis=1) print(players.head()) sns.distplot(players["skinone"],kde=True)#直方图 sns.distplot(players["skinone"],kde=False)#直方图 plt.show() # fig,ax=plt.subplots(figsize=(12,10)) players.position.value_counts(dropna=False,ascending=True).plot(kind="barh",ax=ax) ax.set_ylabel("Postion") ax.set_xlabel("Counts") fig.tight_layout() plt.show() position_types=players.position.unique() print(position_types) defense=['Center Back', 'Defensive Midfielder','Left Fullback','Right Fullback'] midfield=[ 'Right Midfielder','Center Midfielder','Left Midfielder'] forword=['Attacking Midfielder','Left Winger', 'Right Winger','Center Forward'] keeper=['Goalkeeper'] players.loc[players["position"].isin(defense),"postion_new"]="Defense" players.loc[players["position"].isin(midfield),"postion_new"]="Midfield" players.loc[players["position"].isin(forword),"postion_new"]="Forword" players.loc[players["position"].isin(keeper),"postion_new"]="Keeper" print(players.head()) fig,ax=plt.subplots(figsize=(12,10)) players.postion_new.value_counts(dropna=False,ascending=True).plot(kind="barh",ax=ax) ax.set_ylabel("postion_new") ax.set_xlabel("Counts") fig.tight_layout() plt.show()