先马
官方文档:https://pandas.pydata.org/pandas-docs/stable/getting_started/index.html
DataFrame的创建
import pandas as pd
import numpy as np
a=pd.Series([1,3,5])#创建一个一维数组,注意Series大写
#二维数组DataFrame的创建
date=pd.date_range(start='2012',end='2016',periods=5)
col=['A','B','C','D']
sht=pd.DataFrame(np.random.randn(4,5),index=col,columns=date)
sht
Out[14]:
2012-01-01 ... 2016-01-01
A -1.208135 ... 1.209262
B 0.098231 ... 0.145908
C 1.247936 ... 1.124116
D 0.263908 ... -1.422481
[4 rows x 5 columns]
#二维表也可以用字典创建,key值表示columns
a={'TALL':[180,178,188],
'GENDER':'MALE',
'GRADE':np.array([90,95,96])}
b=pd.DataFrame(a,index=['tom','mike','bob'])
DataFrame的简单操作
b.describe()#描述性统计
Out[16]:
TALL GRADE
count 3.000000 3.000000
mean 182.000000 93.666667
std 5.291503 3.214550
min 178.000000 90.000000
25% 179.000000 92.500000
50% 180.000000 95.000000
75% 184.000000 95.500000
max 188.000000 96.000000
b.T#转置
Out[17]:
tom mike bob
TALL 180 178 188
GENDER MALE MALE MALE
GRADE 90 95 96
b.values#显示数值
Out[22]:
array([[180, 'MALE', 90],
[178, 'MALE', 95],
[188, 'MALE', 96]], dtype=object)
b.index#显示行标签
Out[23]: Index(['tom', 'mike', 'bob'], dtype='object')
b.TALL#通过列标签查看此列数据
Out[27]:
tom 180
mike 178
bob 188
Name: TALL, dtype: int64
数据查看
sht
Out[29]:
2012-01-01 ... 2016-01-01
A -1.208135 ... 1.209262
B 0.098231 ... 0.145908
C 1.247936 ... 1.124116
D 0.263908 ... -1.422481
[4 rows x 5 columns]
sht.head(2)#前两行
Out[30]:
2012-01-01 ... 2016-01-01
A -1.208135 ... 1.209262
B 0.098231 ... 0.145908
[2 rows x 5 columns]
sht.tail(1)#后一行
Out[31]:
2012-01-01 ... 2016-01-01
D 0.263908 ... -1.422481
[1 rows x 5 columns]
排序
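# 首先创建一个新的sheet(行标签为人名,列标签为每年年末日期;数值随机生成,每次运行结果不同)
sht=pd.DataFrame(np.random.randn(4,5),index=['BOB','ARON','CARTER','DAVID'],columns=pd.date_range(start='2012-12-31',periods=5,freq=pd.DateOffset(years=1)))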
sht
Out[60]:
2012-12-31 2013-12-31 2014-12-31 2015-12-31 2016-12-31
BOB -0.804521 0.163748 0.321372 0.321230 -1.568735
ARON -1.782169 0.471212 1.211516 -0.002611 1.971601
CARTER 0.490189 2.341276 -1.872003 0.422649 1.700321
DAVID 0.082172 1.387305 -0.142821 -0.616505 1.235456
sht.sort_index(axis=1,ascending=False)#列标签排序,降序
Out[61]:
2016-12-31 2015-12-31 2014-12-31 2013-12-31 2012-12-31
BOB -1.568735 0.321230 0.321372 0.163748 -0.804521
ARON 1.971601 -0.002611 1.211516 0.471212 -1.782169
CARTER 1.700321 0.422649 -1.872003 2.341276 0.490189
DAVID 1.235456 -0.616505 -0.142821 1.387305 0.082172
sht.sort_index(axis=0)#行标签排序,升序
Out[62]:
2012-12-31 2013-12-31 2014-12-31 2015-12-31 2016-12-31
ARON -1.782169 0.471212 1.211516 -0.002611 1.971601
BOB -0.804521 0.163748 0.321372 0.321230 -1.568735
CARTER 0.490189 2.341276 -1.872003 0.422649 1.700321
DAVID 0.082172 1.387305 -0.142821 -0.616505 1.235456
sht.sort_values(by='2012-12-31')#按某列的值进行排序
Out[65]:
2012-12-31 2013-12-31 2014-12-31 2015-12-31 2016-12-31
ARON -1.782169 0.471212 1.211516 -0.002611 1.971601
BOB -0.804521 0.163748 0.321372 0.321230 -1.568735
DAVID 0.082172 1.387305 -0.142821 -0.616505 1.235456
CARTER 0.490189 2.341276 -1.872003 0.422649 1.700321
sht.sort_values(by='ARON',axis=1)#按某行的值进行排序
Out[68]:
2012-12-31 2015-12-31 2013-12-31 2014-12-31 2016-12-31
BOB -0.804521 0.321230 0.163748 0.321372 -1.568735
ARON -1.782169 -0.002611 0.471212 1.211516 1.971601
CARTER 0.490189 0.422649 2.341276 -1.872003 1.700321
DAVID 0.082172 -0.616505 1.387305 -0.142821 1.235456
loc/iloc索引器(DataFrame的灵活截取)
按照行标签/序数索引
sht.loc['BOB':'ARON']#标签索引
Out[72]:
2012-12-31 2013-12-31 2014-12-31 2015-12-31 2016-12-31
BOB -0.804521 0.163748 0.321372 0.321230 -1.568735
ARON -1.782169 0.471212 1.211516 -0.002611 1.971601
sht.iloc[2:3]#序数索引
Out[69]:
2012-12-31 2013-12-31 2014-12-31 2015-12-31 2016-12-31
CARTER 0.490189 2.341276 -1.872003 0.422649 1.700321
sht.iloc[0:2,1:3]#前两行及二三列
Out[82]:
2013-12-31 2014-12-31
BOB 0.163748 0.321372
ARON 0.471212 1.211516
sht.iloc[0:2,[0,1,3]]#前两行以及一二四列
Out[84]:
2012-12-31 2013-12-31 2015-12-31
BOB -0.804521 0.163748 0.321230
ARON -1.782169 0.471212 -0.002611
精确索引改变目标数值
sht.iloc[1,1]=10
sht
Out[98]:
2012-12-31 2013-12-31 2014-12-31 2015-12-31 2016-12-31
BOB -0.804521 0.163748 0.321372 0.321230 -1.568735
ARON -1.782169 10.000000 1.211516 -0.002611 1.971601
CARTER 0.490189 2.341276 -1.872003 0.422649 1.700321
DAVID 0.082172 1.387305 -0.142821 -0.616505 1.235456