口袋妖怪数据集探索
数据读取
import pandas as pd
import seaborn as sns
import matplotlib. pyplot as plt
# Load the Pokémon table from the CSV file in the working directory, then
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer="pokemon.csv")
df.head(n=5)
abilities against_bug against_dark against_dragon against_electric against_fairy against_fight against_fire against_flying against_ghost ... percentage_male pokedex_number sp_attack sp_defense speed type1 type2 weight_kg generation is_legendary 0 ['Overgrow', 'Chlorophyll'] 1.0 1.0 1.0 0.5 0.5 0.5 2.0 2.0 1.0 ... 88.1 1 65 65 45 grass poison 6.9 1 0 1 ['Overgrow', 'Chlorophyll'] 1.0 1.0 1.0 0.5 0.5 0.5 2.0 2.0 1.0 ... 88.1 2 80 80 60 grass poison 13.0 1 0 2 ['Overgrow', 'Chlorophyll'] 1.0 1.0 1.0 0.5 0.5 0.5 2.0 2.0 1.0 ... 88.1 3 122 120 80 grass poison 100.0 1 0 3 ['Blaze', 'Solar Power'] 0.5 1.0 1.0 1.0 0.5 1.0 0.5 1.0 1.0 ... 88.1 4 60 50 65 fire NaN 8.5 1 0 4 ['Blaze', 'Solar Power'] 0.5 1.0 1.0 1.0 0.5 1.0 0.5 1.0 1.0 ... 88.1 5 80 65 80 fire NaN 19.0 1 0
5 rows × 41 columns
# Print the per-column summary: name, non-null count and dtype, plus the
# overall memory usage of the frame.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 41 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 abilities 801 non-null object
1 against_bug 801 non-null float64
2 against_dark 801 non-null float64
3 against_dragon 801 non-null float64
4 against_electric 801 non-null float64
5 against_fairy 801 non-null float64
6 against_fight 801 non-null float64
7 against_fire 801 non-null float64
8 against_flying 801 non-null float64
9 against_ghost 801 non-null float64
10 against_grass 801 non-null float64
11 against_ground 801 non-null float64
12 against_ice 801 non-null float64
13 against_normal 801 non-null float64
14 against_poison 801 non-null float64
15 against_psychic 801 non-null float64
16 against_rock 801 non-null float64
17 against_steel 801 non-null float64
18 against_water 801 non-null float64
19 attack 801 non-null int64
20 base_egg_steps 801 non-null int64
21 base_happiness 801 non-null int64
22 base_total 801 non-null int64
23 capture_rate 801 non-null object
24 classfication 801 non-null object
25 defense 801 non-null int64
26 experience_growth 801 non-null int64
27 height_m 781 non-null float64
28 hp 801 non-null int64
29 japanese_name 801 non-null object
30 name 801 non-null object
31 percentage_male 703 non-null float64
32 pokedex_number 801 non-null int64
33 sp_attack 801 non-null int64
34 sp_defense 801 non-null int64
35 speed 801 non-null int64
36 type1 801 non-null object
37 type2 417 non-null object
38 weight_kg 781 non-null float64
39 generation 801 non-null int64
40 is_legendary 801 non-null int64
dtypes: float64(21), int64(13), object(7)
memory usage: 256.7+ KB
# Share of missing values per column, in percent: count the nulls in each
# column, scale by 100 and divide by the number of rows.
percent_missing = df.isna().sum() * 100 / df.shape[0]

# One row per source column. The frame is indexed by column name (the index
# comes from the `percent_missing` Series), so the name shows up both as the
# index and as the `column_name` field.
missing_value_df = pd.DataFrame(
    data={
        "column_name": df.columns,
        "percent_missing": percent_missing,
    }
)

# Show the ten columns with the largest share of missing values.
missing_value_df.sort_values("percent_missing", ascending=False).head(10)
column_name percent_missing type2 type2 47.940075 percentage_male percentage_male 12.234707 weight_kg weight_kg 2.496879 height_m height_m 2.496879 name name 0.000000 capture_rate capture_rate 0.000000 classfication classfication 0.000000 defense defense 0.000000 experience_growth experience_growth 0.000000 hp hp 0.000000
# Vertical bar chart of the number of rows per `generation` value, ordered
# from the most to the least frequent.
(
    df["generation"]
    .value_counts()
    .sort_values(ascending=False)
    .plot(kind="bar")
)
# Horizontal bar chart of the number of rows per `type1` value, with the
# counts sorted in ascending order.
(
    df["type1"]
    .value_counts()
    .sort_values(ascending=True)
    .plot(kind="barh")
)
# Correlation heatmap over the numeric columns of the dataset.
# Draw on the axes returned by `plt.subplots` rather than calling
# `plt.axes()` afterwards, which adds a second axes on top of the first.
fig, ax = plt.subplots(figsize=(20, 15))
ax.set_title("Correlation Heatmap")

# Restrict to numeric columns first: the frame also holds object columns
# (e.g. `abilities`, `name`, `type1`), and recent pandas versions raise on
# them in `DataFrame.corr` instead of silently skipping them.
corr = df.select_dtypes(include="number").corr()
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    ax=ax,
)
# The six stat columns that the following cells plot, correlate and sum.
interested = ["hp", "attack", "defense", "sp_attack", "sp_defense", "speed"]

# Grid of pairwise plots for every combination of those columns.
sns.pairplot(data=df[interested])
# Annotated correlation heatmap restricted to the `interested` stat columns.
# Draw on the axes returned by `plt.subplots` rather than calling
# `plt.axes()` afterwards, which adds a second axes on top of the first.
fig, ax = plt.subplots(figsize=(10, 8))
ax.set_title("Correlation Heatmap")

corr = df[interested].corr()
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    annot=True,  # write each coefficient inside its cell
    fmt="f",
    cmap="YlGnBu",
    ax=ax,
)
# Cast every column listed in `interested` to float in a single call.
df = df.astype({column: float for column in interested})

# Row-wise sum of those columns, stored as the new `total_stats` column.
df = df.assign(total_stats=df[interested].sum(axis=1))

# (rows, columns) of the subset whose `total_stats` is at least 525.
df[df["total_stats"] >= 525].shape
(167, 42)
# Histogram of the `total_stats` column, split into 35 bins.
total_stats = df["total_stats"]
plt.hist(x=total_stats, bins=35)

# Axis captions.
plt.xlabel("total_stats")
plt.ylabel("Frequency")
# Open a wide figure, then draw one violin per `type1` value showing the
# distribution of `total_stats`. No axes is passed, so seaborn draws on the
# current one and returns it.
plt.subplots(figsize=(20, 12))
ax = sns.violinplot(
    data=df,
    x="type1",
    y="total_stats",
    palette="muted",
)
# Names of the first ten rows that are not flagged legendary
# (`is_legendary == 0`) and whose `total_stats` is at least 570.
df.loc[(df["total_stats"] >= 570) & (df["is_legendary"] == 0), "name"].head(10)
2 Venusaur
5 Charizard
8 Blastoise
17 Pidgeot
64 Alakazam
79 Slowbro
93 Gengar
114 Kangaskhan
126 Pinsir
129 Gyarados
Name: name, dtype: object
其他分析
# Joint plot of `base_egg_steps` against `experience_growth`.
# Current seaborn releases accept x/y by keyword only, and the figure-size
# argument is called `height` (its former name `size` is no longer accepted).
sns.jointplot(
    x="base_egg_steps",
    y="experience_growth",
    data=df,
    height=5,
    ratio=3,
    color="g",
)
# Kernel-density joint plot of `attack` against `hp`.
# x/y are passed by keyword because current seaborn releases reject them as
# positional arguments.
sns.jointplot(x="attack", y="hp", data=df, kind="kde")
# Heatmap of how often each (type1, type2) combination occurs.
plt.subplots(figsize=(10, 10))

# Missing `type2` values are stored as NaN, so select the rows that have a
# second type with `notna()`; comparing against the string 'None' would not
# exclude any of them.
type_pair_counts = (
    df[df["type2"].notna()]
    .groupby(["type1", "type2"])
    .size()
    .unstack()  # type1 as rows, type2 as columns; unseen pairs become NaN
)

sns.heatmap(
    type_pair_counts,
    linewidths=1,
    annot=True,
    cmap="Blues",
)
plt.xticks(rotation=35)
plt.show()