import pandas as pd
import numpy as np
# Small demo frame mixing integer, boolean, float and string columns.
df = pd.DataFrame(
    dict(
        a=[1, 2] * 3,
        b=[True, False] * 3,
        c=[1.0, 2.0] * 3,
        e=['asian', 'white', 'black', 'white', 'asian', 'white'],
        d=['low', 'low', 'low', 'median', 'high', 'high'],
    )
)
df
a b c e d 0 1 True 1.0 asian low 1 2 False 2.0 white low 2 1 True 1.0 black low 3 2 False 2.0 white median 4 1 True 1.0 asian high 5 2 False 2.0 white high
# Show the dtype of every column in df.
df.dtypes
a int64
b bool
c float64
e object
d object
dtype: object
# Convert column 'd' to the pandas categorical dtype in place, then display the frame.
df['d'] = df['d'].astype('category')
df
a b c e d 0 1 True 1.0 asian low 1 2 False 2.0 white low 2 1 True 1.0 black low 3 2 False 2.0 white median 4 1 True 1.0 asian high 5 2 False 2.0 white high
# Re-inspect the dtypes after column 'd' was converted with astype('category').
df.dtypes
a int64
b bool
c float64
e object
d category
dtype: object
# Keep only the boolean columns.
df.select_dtypes(include=['bool'])
b 0 True 1 False 2 True 3 False 4 True 5 False
# Keep only the float64 columns.
df.select_dtypes(include=['float64'])
c 0 1.0 1 2.0 2 1.0 3 2.0 4 1.0 5 2.0
# Keep every numeric column (the 'number' selector).
df.select_dtypes(include=['number'])
a c 0 1 1.0 1 2 2.0 2 1 1.0 3 2 2.0 4 1 1.0 5 2 2.0
# Keep only the categorical columns.
df.select_dtypes(include=['category'])
d 0 low 1 low 2 low 3 median 4 high 5 high
# Keep only the object-dtype columns.
df.select_dtypes(include=['object'])
e 0 asian 1 white 2 black 3 white 4 asian 5 white
# Keep every column except the float64 ones.
df.select_dtypes(exclude='float64')
a b e d 0 1 True asian low 1 2 False white low 2 1 True black low 3 2 False white median 4 1 True asian high 5 2 False white high
# 3x4 frame holding 0..11, with columns labelled A-D.
df = pd.DataFrame(np.arange(12).reshape(3, 4), columns=list('ABCD'))
df
# Drop by column label, then by row label; neither call modifies df itself.
df.drop(columns=['B', 'C'])
df.drop(index=[0, 1])
# String series containing two kinds of missing value: np.nan and None.
s = pd.Series(['a', 'b', np.nan, 'c', None])
print(s)
0 a
1 b
2 NaN
3 c
4 None
dtype: object
# Boolean mask: True where s holds a missing value.
print(s.isnull())
0 False
1 False
2 True
3 False
4 True
dtype: bool
# Select only the missing entries of s via its isnull() mask.
# (The unused assignment to `a` that used to precede this line was a dead
# store: `a` was never read before being reassigned.)
print(s[s.isnull()])
2 NaN
4 None
dtype: object
# Numeric series with two missing entries (np.nan and None).
a = pd.Series([1, 2, np.nan, 3, None])
# Sum of the series; missing values are skipped by default.
a.sum()
6.0
# Nested list with missing values scattered over rows and columns.
a = [
    [1, np.nan, 2],
    [9, None, np.nan],
    [3, 4, None],
    [5, 6, 7],
]
data = pd.DataFrame(data=a)
data
0 1 2 0 1 NaN 2.0 1 9 NaN NaN 2 3 4.0 NaN 3 5 6.0 7.0
# Drop every row that contains at least one missing value.
data.dropna(axis=0)
# Drop every column that contains at least one missing value.
data.dropna(axis='columns')
a = [ [ 1 , np. nan, 2 ] , [ np. nan, None , np. nan] , [ 3 , None , None ] , [ 5 , None , 7 ] ]
data = pd. DataFrame( a)
print ( data)
print ( data. dropna( how= "all" ) )
print ( data. dropna( how= "all" , axis= 1 ) )
0 1 2
0 1.0 NaN 2.0
1 NaN NaN NaN
2 3.0 NaN NaN
3 5.0 NaN 7.0
0 1 2
0 1.0 NaN 2.0
2 3.0 NaN NaN
3 5.0 NaN 7.0
0 2
0 1.0 2.0
1 NaN NaN
2 3.0 NaN
3 5.0 7.0
# Frame with missing values in columns 1 and 2 only.
a = [
    [1, 2, 2],
    [3, None, 6],
    [3, 7, None],
    [5, None, 7],
]
data = pd.DataFrame(data=a)
print(data)
# Replace every missing value with the constant 0.
print(data.fillna(value=0))
0 1 2
0 1 2.0 2.0
1 3 NaN 6.0
2 3 7.0 NaN
3 5 NaN 7.0
0 1 2
0 1 2.0 2.0
1 3 0.0 6.0
2 3 7.0 0.0
3 5 0.0 7.0
# Per-column fill values: column 1 gets 1, column 2 gets 2.
print(data.fillna(value={1: 1, 2: 2}))
# Fill each column's gaps with that column's mean.
print(data.fillna(value=data.mean()))
0 1 2
0 1 2.0 2.0
1 3 1.0 6.0
2 3 7.0 2.0
3 5 1.0 7.0
0 1 2
0 1 2.0 2.0
1 3 4.5 6.0
2 3 7.0 5.0
3 5 4.5 7.0
from sklearn import preprocessing
import numpy as np
# 3x3 float matrix used as input for the scaling example.
# The trailing-dot literals (1., -1., ...) must be written without a space
# before the dot; "1 ." is a syntax error.
X_train = np.array([[1., -1., 2.],
                    [2., 0., 0.],
                    [0., 1., -1.]])
# Standardise X_train with scikit-learn's preprocessing.scale and display the result.
X_scaled = preprocessing.scale(X_train)
X_scaled
array([[ 0. , -1.22474487, 1.33630621],
[ 1.22474487, 0. , -0.26726124],
[-1.22474487, 1.22474487, -1.06904497]])
# Column-wise mean of the scaled matrix.
X_scaled.mean(axis=0)
array([0., 0., 0.])
# Column-wise standard deviation of the scaled matrix.
X_scaled.std(axis=0)
array([1., 1., 1.])
# Ten-row demo frame: a running integer, random normal floats, a random
# letter from A/B/C and a random 0/1 flag. No seed is set, so the random
# columns differ from run to run.
df = pd.DataFrame(
    {
        'col_a': np.arange(10),
        'col_b': np.random.randn(10),
        'col_c': np.random.choice(['A', 'B', 'C'], size=10),
        'col_d': np.random.choice([0, 1], size=10),
    }
)
df
col_a col_b col_c col_d 0 0 2.182928 B 1 1 1 -0.830507 B 0 2 2 -0.497002 B 0 3 3 1.485496 B 0 4 4 1.302028 C 1 5 5 0.480743 A 1 6 6 -0.828251 B 0 7 7 -1.771108 C 0 8 8 -0.607708 A 1 9 9 1.938848 C 1
# Print the (rows, columns) tuple followed by its two components.
print(df.shape, *df.shape)
(10, 4) 10 4
# Column labels of df.
df.columns
Index(['col_a', 'col_b', 'col_c', 'col_d'], dtype='object')
# First five rows, selected by position.
df.iloc[0:5]
col_a col_b col_c col_d 0 0 2.182928 B 1 1 1 -0.830507 B 0 2 2 -0.497002 B 0 3 3 1.485496 B 0 4 4 1.302028 C 1
# Select a subset of columns by label.
df.loc[:, ['col_a', 'col_b']]
col_a col_b 0 0 2.182928 1 1 -0.830507 2 2 -0.497002 3 3 1.485496 4 4 1.302028 5 5 0.480743 6 6 -0.828251 7 7 -1.771108 8 8 -0.607708 9 9 1.938848
# First five rows and first two columns, selected by position.
df.iloc[0:5, 0:2]
col_a col_b 0 0 2.182928 1 1 -0.830507 2 2 -0.497002 3 3 1.485496 4 4 1.302028
# Single scalar at row position 0, column position 1.
df.iat[0, 1]
2.182928374642522
# Rows where col_a exceeds 3 and col_b is negative (element-wise AND of two masks).
df[(df['col_a'] > 3) & (df['col_b'] < 0)]
col_a col_b col_c col_d 6 6 -0.828251 B 0 7 7 -1.771108 C 0 8 8 -0.607708 A 1
# Rows whose col_c value is one of 'A' or 'B'.
df[df['col_c'].isin(['A', 'B'])]
col_a col_b col_c col_d 0 0 2.182928 B 1 1 1 -0.830507 B 0 2 2 -0.497002 B 0 3 3 1.485496 B 0 5 5 0.480743 A 1 6 6 -0.828251 B 0 8 8 -0.607708 A 1
# Add a derived column: element-wise sum of col_a and col_b.
df['col_e'] = df['col_a'] + df['col_b']
df
col_a col_b col_c col_d col_e 0 0 2.182928 B 1 2.182928 1 1 -0.830507 B 0 0.169493 2 2 -0.497002 B 0 1.502998 3 3 1.485496 B 0 4.485496 4 4 1.302028 C 1 5.302028 5 5 0.480743 A 1 5.480743 6 6 -0.828251 B 0 5.171749 7 7 -1.771108 C 0 5.228892 8 8 -0.607708 A 1 7.392292 9 9 1.938848 C 1 10.938848
# Remove col_e again; drop returns a new frame, so rebind df.
df = df.drop(columns=['col_e'])
df
col_a col_b col_c col_d 0 0 2.182928 B 1 1 1 -0.830507 B 0 2 2 -0.497002 B 0 3 3 1.485496 B 0 4 4 1.302028 C 1 5 5 0.480743 A 1 6 6 -0.828251 B 0 7 7 -1.771108 C 0 8 8 -0.607708 A 1 9 9 1.938848 C 1
# Drop whichever column label comes first; df itself is left unchanged.
df.drop(columns=[df.columns[0]])
col_b col_c col_d 0 2.182928 B 1 1 -0.830507 B 0 2 -0.497002 B 0 3 1.485496 B 0 4 1.302028 C 1 5 0.480743 A 1 6 -0.828251 B 0 7 -1.771108 C 0 8 -0.607708 A 1 9 1.938848 C 1
# Transposed view: rows become columns and vice versa.
df.T
0 1 2 3 4 5 6 7 8 9 col_a 0 1 2 3 4 5 6 7 8 9 col_b 2.18293 -0.830507 -0.497002 1.4855 1.30203 0.480743 -0.828251 -1.77111 -0.607708 1.93885 col_c B B B B C A B C A C col_d 1 0 0 0 1 1 0 0 1 1
# Cast col_a to strings (returns a new Series).
df['col_a'].astype(str)
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
Name: col_a, dtype: object
# Build a Categorical from the col_c values.
pd.Categorical(df['col_c'])
[B, B, B, B, C, A, B, C, A, C]
Categories (3, object): [A, B, C]
# Row-wise sum across col_a and col_b.
df[['col_a', 'col_b']].sum(axis='columns')
0 2.182928
1 0.169493
2 1.502998
3 4.485496
4 5.302028
5 5.480743
6 5.171749
7 5.228892
8 7.392292
9 10.938848
dtype: float64
# Mean of each of the two columns.
df[['col_a', 'col_b']].mean(axis='index')
col_a 4.500000
col_b 0.285547
dtype: float64
# Apply a function to each column: its mean shifted by 10.
df[['col_a', 'col_b']].apply(lambda column: column.mean() + 10)
col_a 14.500000
col_b 10.285547
dtype: float64
# Second frame: col_x counts up from 0 to 9, col_y is the same range reversed.
df2 = pd.DataFrame(
    {
        'col_x': np.arange(10),
        'col_y': np.arange(10)[::-1],
    }
)
df2
col_x col_y 0 0 9 1 1 8 2 2 7 3 3 6 4 4 5 5 5 4 6 6 3 7 7 2 8 8 1 9 9 0
# Place df and df2 side by side (column-wise concatenation, aligned on the index).
pd.concat([df, df2], axis='columns')
col_a col_b col_c col_d col_x col_y 0 0 2.182928 B 1 0 9 1 1 -0.830507 B 0 1 8 2 2 -0.497002 B 0 2 7 3 3 1.485496 B 0 3 6 4 4 1.302028 C 1 4 5 5 5 0.480743 A 1 5 4 6 6 -0.828251 B 0 6 3 7 7 -1.771108 C 0 7 2 8 8 -0.607708 A 1 8 1 9 9 1.938848 C 1 9 0
# Two extra rows sharing the col_a..col_d column names.
df3 = pd.DataFrame(
    {
        'col_a': [-1, -2],
        'col_b': [0, 1],
        'col_c': ['B', 'C'],
        'col_d': [1, 0],
    }
)
df3
col_a col_b col_c col_d 0 -1 0 B 1 1 -2 1 C 0
# Stack df3 beneath df; ignore_index=True renumbers the rows from 0.
pd.concat([df, df3], axis='index', ignore_index=True)
col_a col_b col_c col_d 0 0 2.182928 B 1 1 1 -0.830507 B 0 2 2 -0.497002 B 0 3 3 1.485496 B 0 4 4 1.302028 C 1 5 5 0.480743 A 1 6 6 -0.828251 B 0 7 7 -1.771108 C 0 8 8 -0.607708 A 1 9 9 1.938848 C 1 10 -1 0.000000 B 1 11 -2 1.000000 C 0
# Download the diamonds CSV; the first CSV column is used as the row index.
data = pd.read_csv(
    'https://vincentarelbundock.github.io/Rdatasets/csv/ggplot2/diamonds.csv',
    index_col=0,
)
data
carat cut color clarity depth table price x y z 1 0.23 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43 2 0.21 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31 3 0.23 Good E VS1 56.9 65.0 327 4.05 4.07 2.31 4 0.29 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63 5 0.31 Good J SI2 63.3 58.0 335 4.34 4.35 2.75 ... ... ... ... ... ... ... ... ... ... ... 53936 0.72 Ideal D SI1 60.8 57.0 2757 5.75 5.76 3.50 53937 0.72 Good D SI1 63.1 55.0 2757 5.69 5.75 3.61 53938 0.70 Very Good D SI1 62.8 60.0 2757 5.66 5.68 3.56 53939 0.86 Premium H SI2 61.0 58.0 2757 6.15 6.12 3.74 53940 0.75 Ideal D SI2 62.2 55.0 2757 5.83 5.87 3.64
53940 rows × 10 columns
# Pairwise correlation matrix (default method) of the numeric columns.
# The frame also holds string columns (cut, color, clarity); since pandas 2.0
# DataFrame.corr no longer drops non-numeric columns by default and raises on
# them, so restrict to numeric dtypes explicitly.
cor_matrix = data.select_dtypes(include='number').corr()
# Display the stored matrix instead of computing it a second time.
cor_matrix
carat depth table price x y z carat 1.000000 0.028224 0.181618 0.921591 0.975094 0.951722 0.953387 depth 0.028224 1.000000 -0.295779 -0.010647 -0.025289 -0.029341 0.094924 table 0.181618 -0.295779 1.000000 0.127134 0.195344 0.183760 0.150929 price 0.921591 -0.010647 0.127134 1.000000 0.884435 0.865421 0.861249 x 0.975094 -0.025289 0.195344 0.884435 1.000000 0.974701 0.970772 y 0.951722 -0.029341 0.183760 0.865421 0.974701 1.000000 0.952006 z 0.953387 0.094924 0.150929 0.861249 0.970772 0.952006 1.000000
# Correlation of every numeric column with 'price'. Non-numeric columns are
# filtered out first because DataFrame.corr raises on them in pandas >= 2.0.
data.select_dtypes(include='number').corr()['price']
carat 0.921591
depth -0.010647
table 0.127134
price 1.000000
x 0.884435
y 0.865421
z 0.861249
Name: price, dtype: float64
# Correlation between two individual columns (Series.corr, default method).
data['price'].corr(data['x'])
0.8844351610161268
# Spearman rank correlation matrix of the numeric columns. Non-numeric columns
# are filtered out first because DataFrame.corr raises on them in pandas >= 2.0.
data.select_dtypes(include='number').corr(method='spearman')
carat depth table price x y z carat 1.000000 0.030104 0.194980 0.962883 0.996117 0.995572 0.993183 depth 0.030104 1.000000 -0.245061 0.010020 -0.023442 -0.025425 0.103498 table 0.194980 -0.245061 1.000000 0.171784 0.202231 0.195734 0.159878 price 0.962883 0.010020 0.171784 1.000000 0.963196 0.962719 0.957232 x 0.996117 -0.023442 0.202231 0.963196 1.000000 0.997895 0.987355 y 0.995572 -0.025425 0.195734 0.962719 0.997895 1.000000 0.987068 z 0.993183 0.103498 0.159878 0.957232 0.987355 0.987068 1.000000
# Pearson correlation of every numeric column with 'price'. Non-numeric columns
# are filtered out first because DataFrame.corr raises on them in pandas >= 2.0.
data.select_dtypes(include='number').corr(method='pearson')['price']
carat 0.921591
depth -0.010647
table 0.127134
price 1.000000
x 0.884435
y 0.865421
z 0.861249
Name: price, dtype: float64
# Pearson correlation between 'price' and 'x', with the method named explicitly.
data['price'].corr(data['x'], method='pearson')
0.8844351610161268
from numpy. random import rand
from numpy. random import seed
from scipy. stats import spearmanr
# Seed numpy's global RNG (kept from the original cell).
seed(1)
# The two samples compared by the correlation tests.
data1 = data['x']
data2 = data['price']
# spearmanr returns the rank-correlation coefficient and its p-value.
coef, p = spearmanr(data1, data2)
print(f'Spearmans correlation coefficient: {coef:.3f}')
Spearmans correlation coefficient: 0.963
# Significance level against which the p-value is compared.
alpha = 0.05
# A p-value above alpha means the null hypothesis H0 is not rejected.
if p > alpha:
    print(f'Samples are uncorrelated (fail to reject H0) p={p:.3f}')
else:
    print(f'Samples are correlated (reject H0) p={p:.3f}')
Samples are correlated (reject H0) p=0.000
# Show the raw, unrounded p-value.
p
0.0
from scipy. stats import kendalltau
# Seed numpy's global RNG (kept from the original cell).
seed(1)
# kendalltau returns the tau coefficient and its p-value.
coef, p = kendalltau(data1, data2)
print(f'Kendall correlation coefficient: {coef:.3f}')
# Significance level against which the p-value is compared.
alpha = 0.05
# A p-value above alpha means the null hypothesis H0 is not rejected.
if p > alpha:
    print(f'Samples are uncorrelated (fail to reject H0) p={p:.3f}')
else:
    print(f'Samples are correlated (reject H0) p={p:.3f}')
Kendall correlation coefficient: 0.831
Samples are correlated (reject H0) p=0.000
from scipy import stats
from scipy. stats import pearsonr
# Seed numpy's global RNG (kept from the original cell).
seed(1)
# pearsonr returns the linear correlation coefficient and its p-value.
coef, p = pearsonr(data1, data2)
print(f'pearsonr correlation coefficient: {coef:.3f}')
# Significance level against which the p-value is compared.
alpha = 0.05
# A p-value above alpha means the null hypothesis H0 is not rejected.
if p > alpha:
    print(f'Samples are uncorrelated (fail to reject H0) p={p:.3f}')
else:
    print(f'Samples are correlated (reject H0) p={p:.3f}')
pearsonr correlation coefficient: 0.884
Samples are correlated (reject H0) p=0.000