数据清洗

最新推荐文章于 2025-05-24 22:07:48 发布

661-

最新推荐文章于 2025-05-24 22:07:48 发布

阅读量121

点赞数

文章标签：数据分析

本文链接：https://blog.csdn.net/N5499_/article/details/112523725

版权

由于博客内容为空，无法提供包含关键信息的摘要。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the FIFA 2018 player table from the local CSV and preview the first rows
df = pd.read_csv(filepath_or_buffer='./anli/FIFA_2018_player.csv')
df.head(n=5)
ID	name	full_name	nationality	league	club	age	birth_date	height_cm	weight_kg	eur_value	eur_wage
0	20801	Cristiano Ronaldo	C. Ronaldo dos Santos Aveiro	Portugal	Spanish Primera División	Real Madrid CF	32	1985-02-05	185.0	80.0	95500000.0	565000.0
1	158023	L. Messi	Lionel Messi	Argentina	Spanish Primera División	FC Barcelona	30	1987-06-24	170.0	72.0	105000000.0	565000.0
2	190871	Neymar	Neymar da Silva Santos Jr.	Brazil	French Ligue 1	Paris Saint-Germain	25	1992-02-05	175.0	68.0	123000000.0	280000.0
3	176580	L. Suárez	Luis Suárez	Uruguay	Spanish Primera División	FC Barcelona	30	1987-01-24	182.0	86.0	97000000.0	510000.0
4	167495	M. Neuer	Manuel Neuer	Germany	German Bundesliga	FC Bayern Munich	31	1986-03-27	193.0	92.0	61000000.0	230000.0
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe(percentiles=[0.25, 0.5, 0.75])
	ID	age	height_cm	weight_kg	eur_value	eur_wage
count	17741.000000	17741.000000	17741.000000	17741.000000	1.774100e+04	17741.000000
mean	207756.835522	25.088552	181.277944	75.397103	2.405130e+06	11667.887943
std	32421.331072	4.616413	6.692701	6.993980	5.377511e+06	23173.181633
min	16.000000	16.000000	155.000000	49.000000	1.000000e+04	1000.000000
25%	192621.000000	21.000000	177.000000	70.000000	3.250000e+05	2000.000000
50%	214175.000000	25.000000	181.000000	75.000000	7.000000e+05	4000.000000
75%	231624.000000	28.000000	186.000000	80.000000	2.100000e+06	12000.000000
max	241489.000000	47.000000	205.000000	110.000000	1.230000e+08	565000.000000
# Check for missing values: a per-column non-null count lower than the
# number of rows would indicate nulls in that column
df.count(axis=0)
ID             17741
name           17741
full_name      17741
nationality    17741
league         17741
club           17741
age            17741
birth_date     17741
height_cm      17741
weight_kg      17741
eur_value      17741
eur_wage       17741
dtype: int64
# Inspect the rows whose league is missing; there are few enough of them
# that removing them does not affect the analysis, so drop them in place
missing_league = df['league'].isnull()
df[missing_league]
df.drop(index=df.index[missing_league], inplace=True)
# Per the original note, the summary statistics showed eur_value with a
# minimum of zero while eur_wage is never zero, so the zero valuations are
# treated as bad data that can be either filled or dropped.
# Inspect the rows with eur_value below 1000, then fill the zeros with the
# column mean.
df[df['eur_value'] < 1000]
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on df['eur_value']: the in-place form operates
# on an intermediate Series, which pandas warns about and which leaves df
# unchanged when Copy-on-Write is active.
# NOTE(review): the mean is still computed with the zero rows included, as in
# the original — confirm whether zeros should be excluded from the mean.
df['eur_value'] = df['eur_value'].replace(0, df['eur_value'].mean())
# Categorical analysis
# Count players per nationality (non-null 'name' entries in each group),
# label the count column 'player_count', and show the largest groups first
nationality_data = (
    df.groupby('nationality', as_index=False)['name']
    .count()
    .rename(columns={'name': 'player_count'})
)
nationality_data.sort_values(by='player_count', ascending=False)
#球员数量最多的前十个国家（各列依次为：行索引、nationality、player_count）
44	England	1631
57	Germany	1147
135	Spain	1020
53	France	966
5	Argentina	962
18	Brazil	803
75	Italy	800
29	Colombia	591
78	Japan	471
105	Netherlands	430
# Count players per league (non-null 'name' entries in each group),
# label the count column 'player', and show the largest leagues first
league_data = (
    df.groupby('league', as_index=False)['name']
    .count()
    .rename(columns={'name': 'player'})
)
league_data.sort_values(by='player', ascending=False)
# Continuous analysis
# Bucket players into 5-year age bands starting at 15 and count each band.
# The top edge is derived from the data: the previous fixed edges stopped at
# 45, so any player older than 45 (the dataset's maximum age is 47) fell
# outside every interval, became NaN in pd.cut and was silently left out of
# the counts.
age_step = 5
bins = np.arange(15, df['age'].max() + age_step, age_step)
bins_data = pd.cut(df['age'], bins)  # right-closed intervals: (15, 20], (20, 25], ...
# observed=False keeps empty age bands in the result with a count of 0
bins_count = df['age'].groupby(bins_data, observed=False).count()
age
(15, 20]    3300
(20, 25]    6749
(25, 30]    5234
(30, 35]    2192
(35, 40]     258
(40, 45]       7
Name: age, dtype: int64