# pandas随笔 (pandas notes)
import pandas as pd
# Load the food nutrition table (path is relative to the working directory).
food_info = pd.read_csv("food_info.csv")

# Inspect what came back: the container type and each column's dtype.
print(type(food_info))
print(food_info.dtypes)

# help() writes the documentation itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
help(pd.read_csv)

# First and last three rows. Printed explicitly because a bare expression is
# only echoed in a notebook/REPL and is silently discarded in a script.
print(food_info.head(3))
print(food_info.tail(3))

# Column labels and the (rows, columns) shape.
print(food_info.columns)
print(food_info.shape)
# Single row, selected by index label.
print(food_info.loc[0])

# Label slice: .loc includes both endpoints, so this covers labels 3..6.
# Printed explicitly; a bare expression would be discarded in a script.
print(food_info.loc[3:6])

# Several rows at once, selected by a list of labels.
two_five_ten = [2, 5, 10]
print(food_info.loc[two_five_ten])

# A single column comes back as a Series.
ndb_col = food_info["NDB_No"]
print(ndb_col)

# A list of column names comes back as a DataFrame.
columns = ["Zinc_(mg)", "Copper_(mg)"]
zinc_copper = food_info[columns]
print(zinc_copper)
# All column labels as a plain Python list.
col_names = food_info.columns.tolist()
print(col_names)

# Keep only the columns whose label ends with "(g)".
gram_columns = [name for name in col_names if name.endswith("(g)")]

# Sub-frame restricted to those columns; show its first three rows.
gram_df = food_info[gram_columns]
print(gram_df.head(3))
# Arithmetic between a column and a scalar is applied element by element.
print(food_info["Iron_(mg)"])
div_1000 = food_info["Iron_(mg)"] / 1000
print(div_1000)

# Arithmetic between two columns pairs up values row by row.
water_energy = food_info["Water_(g)"] * food_info["Energ_Kcal"]
print(water_energy)

# Add a derived column; the shape is printed before and after so the extra
# column is visible.
iron_grams = food_info["Iron_(mg)"] / 1000
print(food_info.shape)
food_info["Iron_(g)"] = iron_grams
print(food_info.shape)

# Scale columns by their own maximum.
max_calories = food_info["Energ_Kcal"].max()
print(max_calories)
normalized_calories = food_info["Energ_Kcal"] / max_calories
normalized_fat = (
    food_info["Lipid_Tot_(g)"] / food_info["Lipid_Tot_(g)"].max()
)
import pandas as pd
# Reload the table from disk before sorting.
food_info = pd.read_csv("food_info.csv")

# Sort in place by sodium content using the default (ascending) direction.
food_info.sort_values(by="Sodium_(mg)", inplace=True)
print(food_info["Sodium_(mg)"])

# Same sort again, this time in descending order.
food_info.sort_values(by="Sodium_(mg)", ascending=False, inplace=True)
print(food_info["Sodium_(mg)"])
import numpy as np
import pandas as pd
# Load the Titanic passenger table (path is relative to the working directory).
titanic_survival = pd.read_csv("titanic_train.csv")

# Printed explicitly; a bare expression would be discarded in a script.
print(titanic_survival.head())

# Look at the first rows of the Age column (labels 0..10, both inclusive).
age = titanic_survival["Age"]
print(age.loc[0:10])

# Boolean mask that is True wherever Age is missing.
age_is_null = pd.isnull(age)
print(age_is_null)

# Use the mask to keep only the missing entries, then count them.
age_null_true = age[age_is_null]
print(age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)
# Naive mean, kept as a demonstration: the built-in sum() propagates NaN, so
# the result is NaN whenever any Age value is missing.
mean_age = sum(titanic_survival["Age"]) / len(titanic_survival["Age"])
print(mean_age)

# Manual fix: drop the missing entries first. The mask is inverted with ~
# instead of being compared against False.
good_ages = titanic_survival["Age"][~age_is_null]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)

# Built-in fix: Series.mean() skips missing values by default.
correct_mean_age = titanic_survival["Age"].mean()
print(correct_mean_age)
# Mean fare for each passenger class, computed by filtering the rows by hand.
passenger_classes = [1, 2, 3]
fares_by_class = {
    this_class: titanic_survival[titanic_survival["Pclass"] == this_class]["Fare"].mean()
    for this_class in passenger_classes
}
print(fares_by_class)
# Mean fare per passenger class. The aggregation is named by string; passing
# the NumPy callable (np.mean) is deprecated in recent pandas releases.
# NOTE: despite its name, this variable holds fares, not survival rates.
passenger_survival = titanic_survival.pivot_table(
    index="Pclass", values="Fare", aggfunc="mean"
)
print(passenger_survival)

# With no aggfunc given, pivot_table falls back to its default (the mean).
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")
print(passenger_age)

# Totals of fare and survivor count per embarkation port.
port_stats = titanic_survival.pivot_table(
    index="Embarked", values=["Fare", "Survived"], aggfunc="sum"
)
print(port_stats)
# Drop every column that contains at least one missing value.
drop_na_colums = titanic_survival.dropna(axis="columns")
print(drop_na_colums)

# Drop every row that is missing a value in "Age" or "Sex".
new_titanic_survival = titanic_survival.dropna(
    axis="index", subset=["Age", "Sex"]
)
print(new_titanic_survival)

# Single-cell lookups by (row label, column label).
row_index_83_age = titanic_survival.loc[83, "Age"]
print(row_index_83_age)
row_index_766_pclass = titanic_survival.loc[766, "Pclass"]
print(row_index_766_pclass)
# Show every column when printing. The option is spelled out in full: the
# previous 'isplay.max_columns' only worked because pandas matches option
# names as patterns, which is fragile and easy to misread.
pd.set_option('display.max_columns', None)

# Oldest passengers first; the original row labels travel with the rows.
new_titanic_survival = titanic_survival.sort_values("Age", ascending=False)
# Positional slice: the first ten rows of the sorted frame.
print(new_titanic_survival[0:10])

# Throw away the old labels and renumber the rows 0..n-1 in sorted order.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print("----------")
# Label slice on the new index; .loc includes both endpoints (labels 0..10).
print(titanic_reindexed.loc[0:10])
def hundredth_row(column):
    """Return the entry of *column* stored under index label 99.

    Intended for use with ``DataFrame.apply``, which passes one column at a
    time. The lookup is label-based (``.loc``), so this is the hundredth
    entry only when the index is the default 0..n-1 numbering.
    """
    return column.loc[99]
# Run the helper once per column (axis=0 is apply's default).
# NOTE(review): this rebinds the name `hundredth_row` from the function to
# the resulting Series, so the function is no longer callable afterwards.
hundredth_row = titanic_survival.apply(hundredth_row, axis=0)
print(hundredth_row)
def null_count(column):
    """Return how many entries of *column* are missing.

    The column is filtered with its own ``pd.isnull`` mask and the length of
    what remains is returned as a plain ``int``.
    """
    missing_entries = column[pd.isnull(column)]
    return len(missing_entries)
# Missing-value count for every column (axis=0 is apply's default).
column_null_count = titanic_survival.apply(null_count, axis=0)
print(column_null_count)
def which_class(row):
    """Translate a row's ``"Pclass"`` value into a readable label.

    Returns ``"Unknown"`` when the value is missing, ``"First Class"``,
    ``"Second Class"`` or ``"Third Class"`` for 1, 2 and 3, and ``None`` for
    any other value.
    """
    pclass = row["Pclass"]
    if pd.isnull(pclass):
        return "Unknown"

    labels = (
        (1, "First Class"),
        (2, "Second Class"),
        (3, "Third Class"),
    )
    for number, label in labels:
        if pclass == number:
            return label
    return None
# Label every passenger; axis="columns" hands the function one row at a time.
classes = titanic_survival.apply(which_class, axis="columns")
print(classes)
def is_minor(row):
    """Return ``True`` when the row's ``"Age"`` is below 18, else ``False``.

    A missing (NaN) age compares as not-less-than 18 and therefore yields
    ``False``.
    """
    return bool(row["Age"] < 18)
# Flag every passenger; axis="columns" hands the function one row at a time.
minors = titanic_survival.apply(is_minor, axis="columns")
print(minors)
def generate_age_label(row):
    """Classify a row by its ``"Age"`` value.

    Returns ``"unknown"`` when the age is missing, ``"minor"`` when it is
    below 18, and ``"adult"`` otherwise.
    """
    years = row["Age"]
    if pd.isnull(years):
        return "unknown"
    return "minor" if years < 18 else "adult"
# Build one label per passenger; axis="columns" passes one row at a time.
age_labels = titanic_survival.apply(generate_age_label, axis="columns")
print(age_labels)

# Store the labels as a new column so they can be used as a pivot index.
titanic_survival["age_labels"] = age_labels

# Survival aggregated per age group (pivot_table's default aggregation, the
# mean, is used).
age_group_survial = titanic_survival.pivot_table(
    values="Survived", index="age_labels"
)
print(age_group_survial)
import pandas as pd
# Load the movie score comparison table (path relative to working directory).
fandango = pd.read_csv('fandango_score_comparison.csv')

# A single column of a DataFrame is a Series.
series_film = fandango['FILM']
print(type(series_film))
print(series_film[0:5])

series_rt = fandango['RottenTomatoes']
print(series_rt[0:5])

# .values exposes the data underlying the Series; its type is printed below.
film_names = series_film.values
print(type(film_names))
from pandas import Series
# Raw score values taken from the RottenTomatoes column.
rt_scores = series_rt.values
print(rt_scores)

# Build a Series of scores that is indexed by film title instead of 0..n-1.
series_custom = Series(rt_scores, index=film_names)

# Look up several films by title. Printed explicitly; a bare expression would
# be discarded when this runs as a script.
print(series_custom[['Minions (2015)', 'Leviathan (2014)']])

# Integer slices still work positionally on the title-indexed Series.
fiveten = series_custom[5:10]
print(fiveten)
# Manual route: pull out the titles, sort them, and reorder the Series to match.
original_index = series_custom.index.tolist()
print(original_index)
sorted_index = sorted(original_index)
sorted_by_index = series_custom.reindex(sorted_index)
print(sorted_by_index)

# Built-in routes: order by index label (sc2) or by score value (sc3).
sc2 = series_custom.sort_index()
sc3 = series_custom.sort_values()
print(sc2[0:10])
print(sc3[0:10])
import numpy as np
# NumPy functions accept a Series directly. Every result is printed
# explicitly; bare expressions would be discarded when run as a script.
print(np.add(series_custom, series_custom))
print(np.sin(series_custom))
print(np.max(series_custom))

# A comparison produces a boolean Series aligned with the original.
print(series_custom > 50)

# Boolean Series can be used as a filter.
series_greater_than_50 = series_custom[series_custom > 50]

# Combine two conditions with & to keep scores strictly between 50 and 75.
criteria_one = series_custom > 50
criteria_two = series_custom < 75
both_criteria = series_custom[criteria_one & criteria_two]
print(both_criteria)
# Critic and user scores, both indexed by film title.
film_index = fandango['FILM']
rt_critics = Series(fandango['RottenTomatoes'].values, index=film_index)
rt_users = Series(fandango['RottenTomatoes_User'].values, index=film_index)

# Adding two Series aligns them on their index; halve for the per-film mean.
rt_mean = (rt_critics + rt_users) / 2
print(rt_mean)
# Use the film title as the row index; drop=False keeps FILM as a column too.
fandango_films = fandango.set_index('FILM', drop=False)
print(fandango_films.index)

# Label-based row slices, first with plain [] and then with .loc. Results are
# printed explicitly; bare expressions would be discarded in a script.
print(fandango_films["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"])
print(fandango_films.loc["Avengers: Age of Ultron (2015)":"Hot Tub Time Machine 2 (2015)"])

# A single row by title.
print(fandango_films.loc['Kumiko, The Treasure Hunter (2015)'])

# Several rows by a list of titles.
movies = ['Kumiko, The Treasure Hunter (2015)', 'Do You Believe? (2015)', 'Ant-Man (2015)']
print(fandango_films.loc[movies])
import numpy as np
# Pick out the columns whose dtype is float64.
types = fandango_films.dtypes
float_columns = types[types.values == 'float64'].index
float_df = fandango_films[float_columns]

# Standard deviation of each float column (apply works column-wise by default).
deviations = float_df.apply(lambda x: np.std(x))
print(deviations)

# Row-wise standard deviation across the two normalised user scores.
# NOTE(review): 'Metacritic_user_nom' looks like a typo for "..._norm" but is
# kept as written; presumably it matches the CSV header, so verify before
# changing it.
rt_mt_user = float_df[['RT_user_norm', 'Metacritic_user_nom']]
# Printed explicitly; a bare expression would be discarded in a script.
print(rt_mt_user.apply(lambda x: np.std(x), axis=1))