import numpy as np
import pandas as pd
print("=============Dataframe:基本概念及创建=====================")
# DataFrame: basic concept and construction.
# A DataFrame is a two-dimensional, table-shaped structure holding an ordered
# set of columns; a column may contain numbers, strings, booleans, and so on.
# (From the original notes) the data is kept in one or more two-dimensional
# blocks rather than as a list, a dict or a one-dimensional array.
data = {
    'name': ['Jack', 'Tom', 'Mary'],
    'age': [18, 19, 20],
    # Fixed spelling: the key was 'gander', which produced a misspelled column.
    'gender': ['m', 'm', 'w'],
}
# For comparison: the same dict passed to Series instead of DataFrame.
print(pd.Series(data))
frame = pd.DataFrame(data)
print(frame)
# Row labels, column labels and the underlying values, each with its type.
print(frame.index, '\n该数据类型为:', type(frame.index))
print(frame.columns, '\n该数据类型为:', type(frame.columns))
print(frame.values, '\n该数据类型为:', type(frame.values))
print("==========Dataframe创建方法====================")
# Construction method 1: a dict whose values are equal-length lists / arrays.
# Each key becomes a column label.
data1 = {
    'a': [1, 2, 3],
    'b': [3, 4, 5],
    'c': [4, 5, 6],
}
data2 = {
    'one': np.random.rand(3),
    'two': np.random.rand(3),
}
print(data1)
print(data2)
print(pd.DataFrame(data1))
print(pd.DataFrame(data2))
print('---------------------')
# `columns` chooses and orders the columns; 'd' is not a key of data1, so that
# column is filled with NaN.
df1 = pd.DataFrame(data1, columns=['b', 'c', 'a', 'd'])
print(df1)
# A frame built from data2 with columns ['b', 'c'] used to be assigned to df2
# here; it was never displayed and was overwritten below, so it was removed.
# `index` supplies explicit row labels for the three rows.
df2 = pd.DataFrame(data2, index=['f1', 'f2', 'f3'])
print(df2)
print("==========# Dataframe 创建方法二:由Series组成的字典================")
# Construction method 2: a dict of Series.
# data1 uses Series with default integer labels, data2 uses Series with
# explicit labels. The Series have different lengths; missing positions
# become NaN in the resulting frame.
data1 = {
    'one': pd.Series(np.random.rand(2)),
    'two': pd.Series(np.random.rand(3)),
}
# Fixed: this dict used to be bound to `data1` again, which discarded the dict
# above and left `data2` pointing at the ndarray dict of the previous section.
data2 = {
    'one': pd.Series(np.random.rand(2), index=['a', 'b']),
    'two': pd.Series(np.random.rand(3), index=['a', 'b', 'c']),
}
print(data1)
print(data2)
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
print(df1)
print(df2)
print("==================创建方法三:通过二维数组直接创建=======================")
# Construction method 3: wrap a two-dimensional ndarray directly.
ar = np.random.rand(9).reshape(3, 3)
print(ar)
# First without labels, then with explicit row and column labels.
df1, df2 = (
    pd.DataFrame(ar),
    pd.DataFrame(ar, columns=['one', 'two', 'three'], index=['a', 'b', 'c']),
)
for shown in (df1, df2):
    print(shown)
print("==================创建方法四:由字典组成的列表====================")
# Construction method 4: a list of dicts, one dict per row.
# The second record has an extra key, 'three', that the first one lacks.
data = [
    dict(one=1, two=2),
    dict(one=5, two=10, three=20),
]
print(data)

# Default row labels.
df1 = pd.DataFrame(data)
print(df1)

# Explicit row labels.
df2 = pd.DataFrame(data, index=list('ab'))
print(df2)

# Restrict the result to a subset of the keys.
df3 = pd.DataFrame(data, columns=['one', 'two'])
print(df3)
print("==================创建方法五:由字典组成的字典====================")
# Construction method 5: a dict of dicts.
# Outer keys become column labels, inner keys become row labels.
# 'Tom' has no 'art' entry.
data = {
    'Jack': dict(math=90, english=89, art=78),
    'Marry': dict(math=82, english=95, art=92),
    'Tom': dict(math=78, english=67),
}
df1 = pd.DataFrame(data)
print(df1)

# 'Bob' is not an outer key, so that column has no data.
df2 = pd.DataFrame(data, columns=['Jack', 'Tom', 'Bob'])
# Row labels that are not inner keys of the nested dicts.
df3 = pd.DataFrame(data, index=['a', 'b', 'c'])
for shown in (df2, df3):
    print(shown)
print("============Dataframe:索引================")
# DataFrame indexing.
# A DataFrame has both a row index and a column index and can be viewed as a
# dict of Series that share one index.
# Topics: selecting columns / selecting rows / slicing / boolean masks.
df = pd.DataFrame(
    np.random.rand(12).reshape(3, 4) * 100,
    columns=["a", "b", "c", "d"],
    index=["one", "two", "three"],
)
print(df)

# A single label versus a one-element list of labels; the printed types show
# the difference between the two results.
data1 = df["a"]
data2 = df[["a"]]
for picked in (data1, data2):
    print(picked, type(picked))

# Row selection by label: one label, then a list of labels.
data3 = df.loc['one']
data4 = df.loc[['one', 'two']]
for picked in (data3, data4):
    print(picked)
print("================df[] - 选择列=================")
# df[...] with labels selects columns.
df = pd.DataFrame(
    np.random.rand(12).reshape(3, 4) * 100,
    columns=['a', 'b', 'c', 'd'],
    index=['one', 'two', 'three'],
)
print(df)
print('==================')
data1 = df['a']
data2 = df[['b', 'c']]
for picked in (data1, data2):
    print(picked)
# A slice inside df[...] is kept here for reference; the result is not shown.
data3 = df[:1]
print("================df.loc[] - 按index选择行=================")
# df.loc[...] selects rows by index label.
# df1 has string row labels; df2 keeps the default integer labels.
df1 = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100,
                   index=['one', 'two', 'three', 'four'],
                   columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100,
                   columns=['a', 'b', 'c', 'd'])
print(df1)
print(df2)
print("======================")

# Single label: one row.
data1 = df1.loc['one']
data2 = df2.loc[1]
print(data1)
print(data2)
print('单标签索引\n-----')

# List of labels: rows come back in the order requested.
data3 = df2.loc[[3, 2, 1]]
# Fixed: `data4` was printed without being assigned in this section, so a
# stale value taken from an unrelated frame was shown. It now holds the
# string-labelled counterpart of data3.
data4 = df1.loc[['two', 'three']]
print(data3)
print(data4)
print('多标签索引\n======')

# Label slices; the selection runs from the first label to the last one named.
data5 = df1.loc['one':'three']
data6 = df2.loc[1:3]
print(data5)
print(data6)
print('切片索引')
print("============df.iloc[] - 按照整数位置(从轴的0到length-1)选择行===============")
# df.iloc[...] selects rows by integer position (0 .. length-1).
df = pd.DataFrame(np.random.rand(16).reshape(4, 4) * 100,
                  index=['one', 'two', 'three', 'four'],
                  # Fixed: the last label was a second 'b', giving the frame a
                  # duplicated column name; the other frames use a-d.
                  columns=['a', 'b', 'c', 'd'])
print(df)
print("===================")
# Single positions: first row and last row.
print(df.iloc[0])
print(df.iloc[-1])
# Lists of positions; rows come back in the order requested.
print(df.iloc[[0, 2]])
print(df.iloc[[3, 2, 1]])
print('多位置索引\n=========')
# Positional slices: rows 1-2, then every second row.
print(df.iloc[1:3])
print(df.iloc[::2])
print("==============# 布尔型索引================")
# Boolean indexing, preceded by a few label-based row/column selections.
df = pd.DataFrame(
    np.random.rand(16).reshape(4, 4) * 100,
    columns=['a', 'b', 'c', 'd'],
    index=['one', 'two', 'three', 'four'],
)
print(df)

# .loc with both a row selector and a column selector.
for picked in (
    df.loc['one', 'a'],
    df.loc[['one', 'two'], ['a', 'b']],
    df.loc['one':'three', 'a':'c'],
    df.loc[:, 'a':'c'],
):
    print(picked)
print("================")

# Mask computed over the whole frame.
b1 = df.lt(20)
print(b1, type(b1))
print(df[b1])
print("============")

# Mask computed from a single column.
b2 = df['a'].gt(50)
print(b2, type(b2))
print(df[b2])
print(df)

# Mask computed from two columns only.
b3 = df[['a', 'b']].gt(50)
print(b3, type(b3))
print(df[b3])
print("===============")
print(df)

# Mask computed from two rows only.
b4 = df.loc[['one', 'three']].lt(50)
print(b4, type(b4))
print(df[b4])
print('----------------------------------------------------')
print("=======loc/iloc[]补充============")
# More .loc / .iloc examples that pick rows and columns in one call.
df = pd.DataFrame(
    np.random.rand(16).reshape(4, 4) * 100,
    columns=['a', 'b', 'c', 'd'],
    index=['one', 'two', 'three', 'four'],
)
print(df)

# Label-based: scalar, list x list, slice x list, all rows x slice, one row.
by_label = (
    df.loc['one', 'a'],
    df.loc[['one', 'two'], ['c', 'd']],
    df.loc['one':'three', ['b', 'd']],
    df.loc[:, 'b':'c'],
    df.loc['one', :],
)
for picked in by_label:
    print(picked)
print('--------------------')
print(df)

# Position-based: scalar, list x list, slice x slice, all rows x slice.
by_position = (
    df.iloc[1, 1],
    df.iloc[[0, 1], [2, 3]],
    df.iloc[0:2, 2:4],
    df.iloc[:, 2:4],
)
for picked in by_position:
    print(picked)
print("==================多重索引=====================")
# Combining several selection steps in one expression.
df = pd.DataFrame(
    np.random.rand(16).reshape(4, 4) * 100,
    columns=['a', 'b', 'c', 'd'],
    index=['one', 'two', 'three', 'four'],
)
print(df)
rows = ['one', 'three']
# Rows 'one' and 'three' of column 'a', reached three different ways.
print(df.iloc[[0, 2], 0])
print(df.loc[rows, 'a'])
print(df['a'].loc[rows])
# Column subset first, then every second row.
print(df[['b', 'c', 'd']].iloc[::2])
# Boolean row filter first, then at most the first two matches.
small_a = df['a'] < 50
print(df[small_a].iloc[:2])
print("=======================pandas官网学习============================")
# Walk-through of viewing, sorting, selecting, setting and missing-data calls
# on a date-indexed frame. Statement order matters: `df` is modified in place
# step by step.
dates = pd.date_range('20130101', periods=6)
print(dates, type(dates))
df = pd.DataFrame(np.random.randn(6, 4), columns=list('ABCD'), index=dates)
print(df)

# Viewing.
print(df.head(3))
print(df.to_numpy())
print(df.describe())
print(df)

# Sorting by column labels, by row labels, and by the values of column 'B'.
print(df.sort_index(axis=1, ascending=False))
print(df.sort_index(axis=0, ascending=False))
print(df.sort_values(by='B', ascending=False))

# Selecting one row by its date label.
first_day = dates[0]
print(first_day)
print(df.loc[first_day])

# Filtering with isin() on a copy that gets an extra string column.
df2 = df.copy()
print(df2)
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
print(df2)
wanted = ['two', 'four']
print(df2['E'].isin(wanted))
print(df2[df2['E'].isin(wanted)])

# Adding a column from a Series whose dates start one day later than `dates`.
s1 = pd.Series([1, 2, 3, 4, 5, 6],
               index=pd.date_range('20130102', periods=6))
print(s1)
print(df)
df['F'] = s1
print(df)

# Setting single cells by label and by position, then a whole column.
df.at[first_day, 'A'] = 0
print(df)
df.iat[0, 1] = 0
print(df)
df.loc[:, 'D'] = np.array([5] * len(df))
print(df)

# Reindexing to the first four dates plus a new, empty column 'E'.
df1 = df.reindex(index=dates[0:4], columns=[*df.columns, 'E'])
print(df1)
df1.loc[first_day:dates[1], 'E'] = 1
print(df1)

# Missing data: drop rows containing any NaN, or fill NaN with 5.
print(df1.dropna(how='any'))
print(df1.fillna(value=5))