import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the FIFA 2018 player dataset from CSV and preview the first rows.
df = pd.read_csv('./anli/FIFA_2018_player.csv')
df.head()
ID name full_name nationality league club age birth_date height_cm weight_kg eur_value eur_wage
0 20801 Cristiano Ronaldo C. Ronaldo dos Santos Aveiro Portugal Spanish Primera División Real Madrid CF 32 1985-02-05 185.0 80.0 95500000.0 565000.0
1 158023 L. Messi Lionel Messi Argentina Spanish Primera División FC Barcelona 30 1987-06-24 170.0 72.0 105000000.0 565000.0
2 190871 Neymar Neymar da Silva Santos Jr. Brazil French Ligue 1 Paris Saint-Germain 25 1992-02-05 175.0 68.0 123000000.0 280000.0
3 176580 L. Suárez Luis Suárez Uruguay Spanish Primera División FC Barcelona 30 1987-01-24 182.0 86.0 97000000.0 510000.0
4 167495 M. Neuer Manuel Neuer Germany German Bundesliga FC Bayern Munich 31 1986-03-27 193.0 92.0 61000000.0 230000.0
# Inspect summary statistics (count, mean, std, min, quartiles, max)
# for the columns of the dataset.
df.describe()
ID age height_cm weight_kg eur_value eur_wage
count 17741.000000 17741.000000 17741.000000 17741.000000 1.774100e+04 17741.000000
mean 207756.835522 25.088552 181.277944 75.397103 2.405130e+06 11667.887943
std 32421.331072 4.616413 6.692701 6.993980 5.377511e+06 23173.181633
min 16.000000 16.000000 155.000000 49.000000 1.000000e+04 1000.000000
25% 192621.000000 21.000000 177.000000 70.000000 3.250000e+05 2000.000000
50% 214175.000000 25.000000 181.000000 75.000000 7.000000e+05 4000.000000
75% 231624.000000 28.000000 186.000000 80.000000 2.100000e+06 12000.000000
max 241489.000000 47.000000 205.000000 110.000000 1.230000e+08 565000.000000
# Check for missing values: count() reports the number of non-null entries
# per column, so a column with fewer entries than the others contains nulls.
df.count()
ID 17741
name 17741
full_name 17741
nationality 17741
league 17741
club 17741
age 17741
birth_date 17741
height_cm 17741
weight_kg 17741
eur_value 17741
eur_wage 17741
dtype: int64
# Inspect the rows whose league is missing. There are few of them, so
# removing them does not affect the analysis; drop them from df in place.
league_missing = df['league'].isnull()
df[league_missing]
df.drop(df.index[league_missing.values], inplace=True)
# Original author's note: the summary statistics show eur_value with a minimum
# of zero while eur_wage is never zero; such rows can be either filled or
# dropped. Here they are filled.
# First inspect the suspiciously low market values (below 1000).
df[df['eur_value']<1000]
# Replace the zero placeholders with the column mean. Only exact zeros are
# replaced, not every value below 1000.
# The result is assigned back instead of calling replace(..., inplace=True) on
# df['eur_value']: the chained in-place call operates on a temporary Series and
# leaves df unchanged when pandas Copy-on-Write is active.
df['eur_value'] = df['eur_value'].replace(0, df['eur_value'].mean())
# Discrete-data analysis
# Count players per nationality: group by country, count the non-null names,
# and expose the count under the column name 'player_count'.
nationality_data = (
    df.groupby('nationality', as_index=False)[['name']]
    .count()
    .rename(columns={'name': 'player_count'})
)
# Show the countries ordered from most to fewest players.
nationality_data.sort_values('player_count', ascending=False)
# Top ten countries by number of players
44 England 1631
57 Germany 1147
135 Spain 1020
53 France 966
5 Argentina 962
18 Brazil 803
75 Italy 800
29 Colombia 591
78 Japan 471
105 Netherlands 430
# Count players per league: group by league, count the non-null names,
# and expose the count under the column name 'player'.
league_data = (
    df.groupby('league', as_index=False)[['name']]
    .count()
    .rename(columns={'name': 'player'})
)
# Show the leagues ordered from most to fewest players.
league_data.sort_values('player', ascending=False)
# Continuous-data analysis
# Bucket players into 5-year age bands and count the players in each band.
# np.arange excludes its stop value, so a stop of 55 yields edges 15..50 and a
# final band of (45, 50]. With the previous stop of 50 the last band ended at
# 45, and players older than 45 (the reported maximum age is 47) fell outside
# every band and were silently left out of the counts.
bins = np.arange(15,55,5)
bins_data = pd.cut(df['age'],bins)
# observed=False keeps every band in the result, including empty ones,
# regardless of the pandas version's default for categorical groupers.
bins_count = df['age'].groupby(bins_data, observed=False).count()
age
(15, 20] 3300
(20, 25] 6749
(25, 30] 5234
(30, 35] 2192
(35, 40] 258
(40, 45] 7
Name: age, dtype: int64
数据清洗
最新推荐文章于 2025-05-24 22:07:48 发布