Pandas相当于字典型的NumPy,可以给矩阵的行、列下标命名。
1 Pandas安装
pip3 install pandas
2 主要数据结构
2.1 Series(带索引的一维数组)
Series的字符串表现形式为:索引在左边,值在右边。由于我们没有为数据指定索引,于是会自动创建一个0到N-1(N为长度)的整数型索引。
import numpy as np
import pandas as pd
# np.nan plays the role of a null/missing value; a Series resembles a
# one-dimensional NumPy array whose entries carry index labels.
raw_values = [1, 3, 6, np.nan, 44, 1]
s = pd.Series(raw_values)
print(s)
# Recorded output: default integer labels on the left, values on the right.
'''
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
'''
2.2 DataFrame(带索引的二维数组)
# Six consecutive daily timestamps starting at 2021-01-31; used as row labels.
dates = pd.date_range('20210131', periods=6)
# To inspect the generated dates, run: print(dates)

# 6x4 table of random numbers with the dates as rows and 'a'..'d' as columns.
# No seed is set, so the figures recorded below are only an example run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('abcd'),
)
print(df)
# Recorded output of one run:
'''
a b c d
2021-01-31 -0.040826 0.551323 -0.691311 -0.426433
2021-02-01 0.683486 1.045233 0.013433 0.193675
2021-02-02 0.596419 0.351275 -0.455355 -0.152276
2021-02-03 -3.077742 0.219348 -0.311051 -0.583136
2021-02-04 0.441740 -1.472139 1.324866 -0.025587
2021-02-05 0.471326 0.607313 0.728324 0.436690
'''

# Selecting a single column by its label yields a Series named after it.
column_b = df['b']
print(column_b)
# Recorded output of the same run:
'''
2021-01-31 0.551323
2021-02-01 1.045233
2021-02-02 0.351275
2021-02-03 0.219348
2021-02-04 -1.472139
2021-02-05 0.607313
Freq: D, Name: b, dtype: float64
'''
# Build a DataFrame without supplying any labels: rows and columns both
# receive default integer labels starting at 0.
grid = np.arange(12).reshape(3, 4)
df1 = pd.DataFrame(grid)
# To inspect the table, run: print(df1)
# Recorded output:
'''
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
'''
# Build a DataFrame from a dict: each key becomes a column label. The scalar
# under 'a' and the single Timestamp under 'b' are repeated on every row, as
# the recorded output shows.
column_data = {
    'a': 1,
    'b': pd.Timestamp('20210202'),
    'c': pd.Series(np.array([1, 3, 5, 7]), index=[0, 1, 2, 3], dtype='int32'),
    'd': np.array([3, 3, 3, 3], dtype='int32'),
    'e': pd.Categorical(["are", "you", "ok", "thank you"]),
}
df2 = pd.DataFrame(column_data)
print(df2)
# Recorded output:
'''
a b c d e
0 1 2021-02-02 1 3 are
1 1 2021-02-02 3 3 you
2 1 2021-02-02 5 3 ok
3 1 2021-02-02 7 3 thank you
'''
import numpy as np
import pandas as pd
# Build a DataFrame from a dict: each key becomes a column label. The scalar
# under 'a' and the single Timestamp under 'b' are repeated on every row.
column_data = {
    'a': 1,
    'b': pd.Timestamp('20210202'),
    'c': pd.Series(np.array([1, 3, 5, 7]), index=[0, 1, 2, 3], dtype='int32'),
    'd': np.array([3, 3, 3, 3], dtype='int32'),
    'e': pd.Categorical(["are", "you", "ok", "thank you"]),
}
df2 = pd.DataFrame(column_data)
print(df2)
# Recorded output:
'''
a b c d e
0 1 2021-02-02 1 3 are
1 1 2021-02-02 3 3 you
2 1 2021-02-02 5 3 ok
3 1 2021-02-02 7 3 thank you
'''

# Data type of every column.
column_types = df2.dtypes
print(column_types)
'''
a int64
b datetime64[ns]
c int32
d int32
e category
dtype: object
'''

# Row labels.
# NOTE(review): the recorded repr below looks like it comes from an older
# pandas release; newer releases may print a plain Index here - confirm
# against the installed version.
row_labels = df2.index
print(row_labels)
'''
Int64Index([0, 1, 2, 3], dtype='int64')
'''

# Column labels.
column_labels = df2.columns
print(column_labels)
'''
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
'''

# Only the cell values, without row or column labels.
print(df2.values)
'''
[[1 Timestamp('2021-02-02 00:00:00') 1 3 'are']
[1 Timestamp('2021-02-02 00:00:00') 3 3 'you']
[1 Timestamp('2021-02-02 00:00:00') 5 3 'ok']
[1 Timestamp('2021-02-02 00:00:00') 7 3 'thank you']]
'''

# Summary statistics; the recorded output covers columns a, c and d only.
summary = df2.describe()
print(summary)
'''
a c d
count 4.0 4.000000 4.0
mean 1.0 4.000000 3.0
std 0.0 2.581989 0.0
min 1.0 1.000000 3.0
25% 1.0 2.500000 3.0
50% 1.0 4.000000 3.0
75% 1.0 5.500000 3.0
max 1.0 7.000000 3.0
'''

# Transpose: rows become columns and columns become rows.
print(df2.transpose())
'''
0 ... 3
a 1 ... 1
b 2021-02-02 00:00:00 ... 2021-02-02 00:00:00
c 1 ... 7
d 3 ... 3
e are ... thank you
[5 rows x 4 columns]
'''

# Sort by label along axis=1, i.e. reorder the columns by their names,
# here in descending order (e, d, c, b, a).
print(df2.sort_index(axis=1, ascending=False))
'''
e d c b a
0 are 3 1 2021-02-02 1
1 you 3 3 2021-02-02 1
2 ok 3 5 2021-02-02 1
3 thank you 3 7 2021-02-02 1
'''

# Sort the rows by the values in column 'c', largest first.
print(df2.sort_values(by='c', ascending=False))
'''
a b c d e
3 1 2021-02-02 7 3 thank you
2 1 2021-02-02 5 3 ok
1 1 2021-02-02 3 3 you
0 1 2021-02-02 1 3 are
'''