1import pandas as pd
2import numpy as np
索引
In [2]:1
s = pd.Series(np.random.rand(5), index =list('abcde'))2
s
Out[2]:
a 0.143653
b 0.648518
c 0.782733
d 0.739794
e 0.005048
dtype: float64
In [3]:1
s.index
Out[3]:
Index(['a','b','c','d','e'], dtype='object')
In [4]:1
s.index.name ='alpha'# 给索引命名2
s
Out[4]:
alpha
a 0.143653
b 0.648518
c 0.782733
d 0.739794
e 0.005048
dtype: float64
In [5]:1
s = pd.DataFrame(np.random.randn(4,3), columns =['one','two','three'])2
s
Out[5]:
one two three
00.6504810.7298760.6798251-0.985725-0.048903-2.19698321.1432420.624922-0.8180833-0.3797131.7784750.898793
In [6]:1
s.index
Out[6]:
RangeIndex(start=0, stop=4, step=1)
In [7]:1
s.columns
Out[7]:
Index(['one','two','three'], dtype='object')
In [8]:1
s.index.name ='row'2
s.columns.name ='col'3
s
Out[8]:
col one two three
row
00.6504810.7298760.6798251-0.985725-0.048903-2.19698321.1432420.624922-0.8180833-0.3797131.7784750.898793
重复索引
In [9]:1
s = pd.Series(np.arange(6), index =list('abcbda'))2
s
Out[9]:
a 0
b 1
c 2
b 3
d 4
a 5
dtype: int32
In [10]:1
s['a']# 重复索引,返回的是一个Series
Out[10]:
a 0
a 5
dtype: int32
In [11]:1
s['c']# 返回的只是一个数字
Out[11]:2
In [12]:1
s.index.is_unique # 判断有没有重复索引
Out[12]:False
In [13]:1
s.index.unique()# 返回只包含唯一索引的列表
Out[13]:
Index(['a','b','c','d'], dtype='object')
In [14]:1
s.groupby(s.index).sum()# 对重复索引的值求和
Out[14]:
a 5
b 4
c 2
d 4
dtype: int32
In [15]:1
s.groupby(s.index).first()# 对重复索引的值只取第一项
Out[15]:
a 0
b 1
c 2
d 4
dtype: int32
In [16]:1
s.groupby(s.index).mean()# 对重复索引的值取平均值
Out[16]:
a 2.5
b 2.0
c 2.0
d 4.0
dtype: float64
多级索引
In [17]:1
a =[['a','a','a','b','b','c','c'],[1,2,3,1,2,2,3]]2
t =list(zip(*a))3
t
Out[17]:[('a',1),('a',2),('a',3),('b',1),('b',2),('c',2),('c',3)]
In [19]:1
index = pd.MultiIndex.from_tuples(t, names =['level1','level2'])2
index
Out[19]:
MultiIndex([('a',1),('a',2),('a',3),('b',1),('b',2),('c',2),('c',3)],
names=['level1','level2'])
In [20]:1
s = pd.Series(np.random.rand(7), index = index)2
s
Out[20]:
level1 level2
a 10.25156120.74652530.762872
b 10.15409320.980960
c 20.29642530.178885
dtype: float64
In [21]:1
s['b']# 返回以level2为索引的Series
Out[21]:
level2
10.15409320.980960
dtype: float64
In [22]:1
s['b':'c']
Out[22]:
level1 level2
b 10.15409320.980960
c 20.29642530.178885
dtype: float64
In [23]:1
s[['a','c']]
Out[23]:
level1 level2
a 10.25156120.74652530.762872
c 20.29642530.178885
dtype: float64
In [24]:1
s[:,2]# 第一级索引为任意值,第二级索引为2
Out[24]:
level1
a 0.746525
b 0.980960
c 0.296425
dtype: float64
DateFrame的多级索引
In [27]:1
df = pd.DataFrame(np.random.randint(1,10,(4,3)), index =[['a','a','b','b'],[1,2,1,2]],2
columns =[['one','one','two'],['blue','red','blue']])3
df.index.names =['row_1','row_2']4
df.columns.names =['col_1','col_2']5
df
Out[27]:
col_1 one two
col_2 blue red blue
row_1 row_2
a 12142893
b 18252788
In [28]:1
df.loc['a']
Out[28]:
col_1 one two
col_2 blue red blue
row_2
12142893
In [29]:1
df.loc['a',1]# 返回一个Series
Out[29]:
col_1 col_2
one blue 2
red 1
two blue 4
Name:(a,1), dtype: int32
In [30]:1
df.loc['a'].loc[1]
Out[30]:
col_1 col_2
one blue 2
red 1
two blue 4
Name:1, dtype: int32
索引的交换
In [31]:1
df2 = df.swaplevel('row_1','row_2')2
df2
Out[31]:
col_1 one two
col_2 blue red blue
row_2 row_1
1 a 2142 a 8931 b 8252 b 788
In [38]:1
df2.sort_index(0)# 按照1级索引排序
Out[38]:
col_1 one two
col_2 blue red blue
row_2 row_1
1 a 214
b 8252 a 893
b 788
In [39]:1
df2.sort_index(1)# 按照2级索引排序
Out[39]:
col_1 one two
col_2 blue red blue
row_2 row_1
1 a 2142 a 8931 b 8252 b 788
In [40]:1
df
Out[40]:
col_1 one two
col_2 blue red blue
row_1 row_2
a 12142893
b 18252788
In [41]:1
df.sum(level =0)# 根据第一级索引求和
Out[41]:
col_1 one two
col_2 blue red blue
row_1
a 10107
b 151013
In [42]:1
df.sum(level =1)# 根据第二级索引求和
Out[42]:
col_1 one two
col_2 blue red blue
row_2
110392151711
把列数据转换成索引
In [43]:1
df = pd.DataFrame({'a':np.arange(7),2'b':np.arange(7,0,-1),3'c':['one','one','one','two','two','two','two'],4'd':[0,1,2,0,1,2,3]})5
df
Out[43]:
a b c d
007 one 0116 one 1225 one 2334 two 0443 two 1552 two 2661 two 3
In [45]:1
df.set_index('c')# 把c这一列设置成索引值
Out[45]:
a b d
c
one 070
one 161
one 252
two 340
two 431
two 522
two 613
In [46]:1
df.set_index(['c','d'])# 把c和d设置成二级索引
Out[46]:
a b
c d
one 007116225
two 034143252361
In [47]:1
df2 = df.set_index(['c','d'])2
df2
Out[47]:
a b
c d
one 007116225
two 034143252361
In [48]:1
df2.reset_index()# 把多级索引重新转化成一级索引
Out[48]:
c d a b
0 one 0071 one 1162 one 2253 two 0344 two 1435 two 2526 two 361
In [49]:1
df2.reset_index().sort_index('columns')# 根据列索引排序
Out[49]:
a b c d
007 one 0116 one 1225 one 2334 two 0443 two 1552 two 2661 two 3
In []:1
2 分组计算
1import pandas as pd
2import numpy as np
In [2]:1
df = pd.DataFrame({'key1':['a','a','b','b','a'],2'key2':['one','two','one','two','one'],3'data1':np.random.randint(1,10,5),4'data2':np.random.randint(1,10,5)})5
df
Out[2]:
key1 key2 data1 data2
0 a one 591 a two 562 b one 543 b two 364 a one 62
In [3]:1
df['data1']
Out[3]:0515253346
Name: data1, dtype: int32
In [4]:1
df['data1'].groupby(df['key1'])
Out[4]:<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000004BC4731630>
In [5]:1
df['data1'].groupby(df['key1']).mean()
Out[5]:
key1
a 5.333333
b 4.000000
Name: data1, dtype: float64
In [6]:1
df
Out[6]:
key1 key2 data1 data2
0 a one 591 a two 562 b one 543 b two 364 a one 62
In [7]:1# 也可以自己创建key2
key =[1,2,1,1,2]3
df['data1'].groupby(key).mean()# 传入的key是一个列表
Out[7]:14.33333325.500000
Name: data1, dtype: float64
多级分组
In [8]:1# 也可以是多级索引2
df['data1'].groupby([df['key1'], df['key2']]).sum()
Out[8]:
key1 key2
a one 11
two 5
b one 5
two 3
Name: data1, dtype: int32
In [9]:1
df['data1'].groupby([df['key1'], df['key2']]).size()# 各个分组的数据个数
Out[9]:
key1 key2
a one 2
two 1
b one 1
two 1
Name: data1, dtype: int64
In [10]:1
df.groupby('key1').sum()2# 结果中key2没有出现,因为key2的元素不是数值,所以在求和的时候忽略
Out[10]:
data1 data2
key1
a 1617
b 810
In [11]:1
df
Out[11]:
key1 key2 data1 data2
0 a one 591 a two 562 b one 543 b two 364 a one 62
In [12]:1
df.groupby('key1').sum()['data1']
Out[12]:
key1
a 16
b 8
Name: data1, dtype: int32
多级索引的分组
In [13]:1
mean = df.groupby(['key1','key2']).sum()2
mean
Out[13]:
data1 data2
key1 key2
a one 1111
two 56
b one 54
two 36
In [14]:1type(mean)
Out[14]:
pandas.core.frame.DataFrame
In [15]:1
mean.unstack()# 转换成dataframe
Out[15]:
data1 data2
key2 one two one two
key1
a 115116
b 5346
In [16]:1# groupby支持迭代器协议2for name, group in df.groupby('key1'):3print(name)4print(group)
a
key1 key2 data1 data2
0 a one 591 a two 564 a one 62
b
key1 key2 data1 data2
2 b one 543 b two 36
In [17]:1
df
Out[17]:
key1 key2 data1 data2
0 a one 591 a two 562 b one 543 b two 364 a one 62
In [18]:1# 可以转换成列表2list(df.groupby('key1'))
Out[18]:[('a', key1 key2 data1 data2
0 a one 591 a two 564 a one 62),('b', key1 key2 data1 data2
2 b one 543 b two 36)]
In [19]:1type(list(df.groupby('key1'))[0][0])
Out[19]:str
In [20]:1# 也可以转换成字典2dict(list(df.groupby('key1')))
Out[20]:{'a': key1 key2 data1 data2
0 a one 591 a two 564 a one 62,'b': key1 key2 data1 data2
2 b one 543 b two 36}
In [21]:1
df.dtypes # 返回每一列的数据类型
Out[21]:
key1 object
key2 object
data1 int32
data2 int32
dtype:object
In [22]:1type(df.dtypes)
Out[22]:
pandas.core.series.Series
In [23]:1
df
Out[23]:
key1 key2 data1 data2
0 a one 591 a two 562 b one 543 b two 364 a one 62
In [24]:1
df.groupby(df.dtypes, axis =1).sum()
Out[24]:
int32 object014 aone
111 atwo
29 bone
39 btwo
48 aone
通过字典分组
In [25]:1
df = pd.DataFrame(np.random.randint(1,10,(5,5)), columns =list('abcde'),2
index =['alice','bob','jack','marry','brand'])3
df.iloc[1,1:3]= np.NaN
4
df
Out[25]:
a b c d e
alice 41.02.079
bob 9 NaN NaN 45
jack 45.03.035
marry 58.03.042
brand 34.05.029
In [26]:1
mapping ={'a':'red','b':'red','c':'blue','d':'orange','e':'blue'}2
grouped = df.groupby(mapping, axis =1)3
grouped.sum()4# 从结果可看出,nan被当成0处理
Out[26]:
blue orange red
alice 11.07.05.0
bob 5.04.09.0
jack 8.03.09.0
marry 5.04.013.0
brand 14.02.07.0
In [27]:1
grouped.size()
Out[27]:
blue 2
orange 1
red 2
dtype: int64
In [28]:1
grouped.count()
Out[28]:
blue orange red
alice 212
bob 111
jack 212
marry 212
brand 212
通过函数分组
In [51]:1def_group_key(idx):2print(idx)3return idx
4
5
df.groupby(_group_key)
alice
bob
jack
marry
brand
Out[51]:<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000004BC47B8550>
In [54]:1type(df.index)
Out[54]:
pandas.core.indexes.base.Index
In [44]:1
df
Out[44]:
a b c d e
alice 41.02.079
bob 9 NaN NaN 45
jack 45.03.035
marry 58.03.042
brand 34.05.029
In [45]:1def_group_key(idx):2print(idx)3return idx
4
5
df.groupby(_group_key).size()
alice
bob
jack
marry
brand
Out[45]:
alice 1
bob 1
brand 1
jack 1
marry 1
dtype: int64
In [48]:1def_group_key(idx):2print(idx)3returnlen(idx)4
5
df.groupby(_group_key).size
alice
bob
jack
marry
brand
Out[48]:<bound method GroupBy.size of <pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000004BC4800C88>>
In [49]:1
df.groupby(len).size()
Out[49]:314153
dtype: int64
In [50]:1
df
Out[50]:
a b c d e
alice 41.02.079
bob 9 NaN NaN 45
jack 45.03.035
marry 58.03.042
brand 34.05.029
In [52]:1
df.groupby(len).sum()
Out[52]:
a b c d e
390.00.045445.03.03551213.010.01320
多级索引的分组
In [34]:1# columns = pd.MultiIndex.from_arrays([['China', 'USA', 'China', 'USA', 'China'], ['a', 'a', 'b', 'c', 'b']],2# names = ['country', 'index'])3# df = pd.DataFrame(np.random.randint(0, 10, (5, 5)), columns = columns)4# df
In [35]:1# 按照country分组2# df.groupby(level = 'country', axis = 1).sum() # 默认按照行分组,所以要指定axis=1
In [36]:1# 按照index
3 聚合运算
内置聚合函数
In [1]:1import pandas as pd
2import numpy as np
In [6]:1
df = pd.DataFrame({'key1':['a','a','b','b','a'],2'key2':['one','two','one','two','one'],3'data1':np.random.randint(1,10,5),4'data2':np.random.randint(1,10,5)})5
df
Out[6]:
key1 key2 data1 data2
0 a one 951 a two 212 b one 463 b two 554 a one 23
In [7]:1
df.groupby('key1').sum()
Out[7]:
data1 data2
key1
a 139
b 911
In [8]:1
df.groupby('key1').mean()
Out[8]:
data1 data2
key1
a 4.3333333.0
b 4.5000005.5
In [9]:1
df.groupby('key1').min()
Out[9]:
key2 data1 data2
key1
a one 21
b one 45
In [10]:1
df.groupby('key1').max()
Out[10]:
key2 data1 data2
key1
a two 95
b two 56
In [11]:1
df.groupby('key1').describe()
Out[11]:
data1 data2
count mean std min25%50%75%max count mean std min25%50%75%max
key1
a 3.04.3333334.0414522.02.002.05.509.03.03.02.0000001.02.003.04.005.0
b 2.04.5000000.7071074.04.254.54.755.02.05.50.7071075.05.255.55.756.0
自定义聚合函数
In [12]:1
grouped = df.groupby('key1')2
grouped
Out[12]:<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000A6A6987F60>
In [15]:1
df
Out[15]:
key1 key2 data1 data2
0 a one 951 a two 212 b one 463 b two 554 a one 23
In [17]:1defpeak_range(s):2print(type(s))3return s.max()- s.min()4
5
grouped.agg(peak_range)<class'pandas.core.series.Series'><class'pandas.core.series.Series'><class'pandas.core.series.Series'><class'pandas.core.series.Series'><class'pandas.core.series.Series'><class'pandas.core.series.Series'>
Out[17]:
data1 data2
key1
a 74
b 11
In [18]:1# 一次应用多个聚合函数2
grouped.agg(['std','mean','sum', peak_range])<class'pandas.core.series.Series'><class'pandas.core.series.Series'><class'pandas.core.series.Series'><class'pandas.core.series.Series'>
Out[18]:
data1 data2
std mean sum peak_range std mean sum peak_range
key1
a 4.0414524.3333331372.0000003.094
b 0.7071074.500000910.7071075.5111
In [19]:1# 修改自定义的聚合函数的列名称2
grouped.agg(['std','mean','sum',('range', peak_range)])<class'pandas.core.series.Series'><class'pandas.core.series.Series'><class'pandas.core.series.Series'><class'pandas.core.series.Series'>
Out[19]:
data1 data2
std mean sumrange std mean sumrange
key1
a 4.0414524.3333331372.0000003.094
b 0.7071074.500000910.7071075.5111
对不同的列应用不同的聚合函数
In [22]:1
d ={'data1':['mean',('range', peak_range)],'data2':'sum'}2
grouped.agg(d)<class'pandas.core.series.Series'><class'pandas.core.series.Series'>
Out[22]:
data1 data2
mean rangesum
key1
a 4.33333379
b 4.500000111
In [23]:1# 将key1作为数据而不是索引2
grouped.agg(d).reset_index()<class'pandas.core.series.Series'><class'pandas.core.series.Series'>
Out[23]:
key1 data1 data2
mean rangesum0 a 4.333333791 b 4.500000111
In [24]:1
df.groupby('key1', as_index =False).agg(d)# 将key1作为数据而不是索引<class'pandas.core.series.Series'><class'pandas.core.series.Series'>
Out[24]:
key1 data1 data2
mean rangesum0 a 4.333333791 b 4.500000111
分组运算的高级应用
In [25]:1
df = pd.DataFrame({'key1':['a','a','b','b','a'],2'key2':['one','two','one','two','one'],3'data1':np.random.randint(1,10,5),4'data2':np.random.randint(1,10,5)})5
df
Out[25]:
key1 key2 data1 data2
0 a one 761 a two 362 b one 123 b two 154 a one 92
In [28]:1
k1_mean = df.groupby('key1').mean()2
k1_mean
Out[28]:
data1 data2
key1
a 6.3333334.666667
b 1.0000003.500000
In [29]:1# 更改k1_mean的列名称2
k1_mean = df.groupby('key1').mean().add_prefix('mean_')3
k1_mean
Out[29]:
mean_data1 mean_data2
key1
a 6.3333334.666667
b 1.0000003.500000
In [30]:1# 如何将k1_mean与df合并2
pd.merge(df, k1_mean, left_on ='key1', right_index =True)
Out[30]:
key1 key2 data1 data2 mean_data1 mean_data2
0 a one 766.3333334.6666671 a two 366.3333334.6666674 a one 926.3333334.6666672 b one 121.0000003.5000003 b two 151.0000003.500000
In [32]:1# 另一种实现方法2
k1_mean = df.groupby('key1').transform(np.mean).add_prefix('mean_')3
k1_mean
Out[32]:
mean_data1 mean_data2
06.3333334.66666716.3333334.66666721.0000003.50000031.0000003.50000046.3333334.666667
In [33]:1
df[k1_mean.columns]= k1_mean
2
df
Out[33]:
key1 key2 data1 data2 mean_data1 mean_data2
0 a one 766.3333334.6666671 a two 366.3333334.6666672 b one 121.0000003.5000003 b two 151.0000003.5000004 a one 926.3333334.666667
用自定义函数调用transform
In [34]:1
df = pd.DataFrame(np.random.randint(1,10,(5,5)), index =['alice','bob','marry','jack','bran'],2
columns =list('abcde'))3
df
Out[34]:
a b c d e
alice 13611
bob 13855
marry 96882
jack 54729
bran 72728
In [37]:1defdemean(s):2return s - s.mean()3
4
key =['one','one','two','one','two']5
demeaned = df.groupby(key).transform(demean)6
demeaned
Out[37]:
a b c d e
alice -1.333333-0.333333-1.0-1.666667-4.0
bob -1.333333-0.3333331.02.3333330.0
marry 1.0000002.0000000.53.000000-3.0
jack 2.6666670.6666670.0-0.6666674.0
bran -1.000000-2.000000-0.5-3.0000003.0apply()方法
In [39]:1
df = pd.DataFrame({'key1':list('aabbababaa'),2'key2':['one','two','one','two','one','one','two','one','one','two'],3'data1':np.random.randint(1,10,10),4'data2':np.random.randint(1,10,10)})5
df
Out[39]:
key1 key2 data1 data2
0 a one 981 a two 692 b one 143 b two 624 a one 425 b one 936 a two 567 b one 498 a one 129 a two 89
In [42]:1# 输出按照data1排序的最大的前两列2deftop(g, n =2, columns ='data1'):3return g.sort_values(by = columns, ascending =False)[:n]4
top(df)
Out[42]:
key1 key2 data1 data2
0 a one 985 b one 93
In [43]:1
top(df, n =3)
Out[43]:
key1 key2 data1 data2
0 a one 985 b one 939 a two 89
In [44]:1
df.groupby('key1').apply(top)
Out[44]:
key1 key2 data1 data2
key1
a 0 a one 989 a two 89
b 5 b one 933 b two 62
In [47]:1
states =['e1','e2','e3','e4','w1','w2','w3','w4']2
group_key =['east']*4+['west']*43
data = pd.Series(np.random.randn(8), index = states)4
data[['e1','e3','w4']]= np.NaN # 同data['e1', 'e3', 'w4'] = np.NaN5
data
Out[47]:
e1 NaN
e2 -0.218916
e3 NaN
e4 -0.939582
w1 0.713938
w2 0.572105
w3 0.330894
w4 NaN
dtype: float64
In [48]:1# 要求east分组里的nan用east分组的平均值填充,west分组的nan一样2# 先求出east分组和west分组的平均值3
data.groupby(group_key).mean()
Out[48]:
east -0.579249
west 0.538979
dtype: float64
In [50]:1
data.groupby(group_key).apply(lambda g : g.fillna(g.mean()))
Out[50]:
e1 -0.579249
e2 -0.218916
e3 -0.579249
e4 -0.939582
w1 0.713938
w2 0.572105
w3 0.330894
w4 0.538979
dtype: float64
In []:1
4 时间日期
python里的datetime
In [1]:1import pandas as pd
2import numpy as np
In [2]:1from datetime import datetime
2from datetime import timedelta
In [3]:1
now = datetime.now()# 返回现在的时间2
now
Out[3]:
datetime.datetime(2020,4,10,9,23,31,801144)
In [4]:1
now.year
Out[4]:2020
In [5]:1type(now.year)
Out[5]:int
In [6]:1
now.year, now.month, now.day
Out[6]:(2020,4,10)
timedelta
In [7]:1
date1 = datetime(2020,4,20)2
date2 = datetime(2020,4,16)3
delta = date1 - date2
4
delta
Out[7]:
datetime.timedelta(days=4)
In [8]:1
delta.days
Out[8]:4
In [9]:1
delta.total_seconds()
Out[9]:345600.0
In [10]:1
date2 + delta
Out[10]:
datetime.datetime(2020,4,20,0,0)
In [11]:1
date2 + timedelta(4.5)# date2加上4天半
Out[11]:
datetime.datetime(2020,4,20,12,0)
字符串与datetime之间的转换
In [12]:1# datetime转换成字符串2
date = datetime(2020,4,9,17,17,17)3
date
Out[12]:
datetime.datetime(2020,4,9,17,17,17)
In [13]:1str(date)
Out[13]:'2020-04-09 17:17:17'
In [14]:1# 将datetime格式化为字符串2
date.strftime('%Y/%m/%d %H:%M:%S')# 年月日用/连接,时分秒用:连接
Out[14]:'2020/04/09 17:17:17'
In [15]:1
date.strftime('%y/%m/%d %H:%M:%S')# 大写的Y表示四位数的年份,小写的y表示两位数的年份
Out[15]:'20/04/09 17:17:17'
In [16]:1# 字符串转换成datetime2
date.strptime('2020-4-9 17:24:24','%Y-%m-%d %H:%M:%S')
Out[16]:
datetime.datetime(2020,4,9,17,24,24)
python里的时间序列
In [17]:1
dates =[datetime(2020,4,1), datetime(2020,4,2), datetime(2020,4,3), datetime(2020,4,4)]2
s = pd.Series(np.random.randn(4), index = dates)3
s
Out[17]:2020-04-01-0.8435032020-04-02-0.1359302020-04-03-0.2909672020-04-04-1.474397
dtype: float64
In [18]:1type(s.index)
Out[18]:
pandas.core.indexes.datetimes.DatetimeIndex
In [19]:1type(s.index[0])
Out[19]:
pandas._libs.tslibs.timestamps.Timestamp
date_range生成时间序列
In [20]:1
pd.date_range('20200401','20200406')
Out[20]:
DatetimeIndex(['2020-04-01','2020-04-02','2020-04-03','2020-04-04','2020-04-05','2020-04-06'],
dtype='datetime64[ns]', freq='D')
In [21]:1
pd.date_range('20200401', periods =10)# 生成10个
Out[21]:
DatetimeIndex(['2020-04-01','2020-04-02','2020-04-03','2020-04-04','2020-04-05','2020-04-06','2020-04-07','2020-04-08','2020-04-09','2020-04-10'],
dtype='datetime64[ns]', freq='D')
In [22]:1
pd.date_range('20200401 17:42:42', periods =10)
Out[22]:
DatetimeIndex(['2020-04-01 17:42:42','2020-04-02 17:42:42','2020-04-03 17:42:42','2020-04-04 17:42:42','2020-04-05 17:42:42','2020-04-06 17:42:42','2020-04-07 17:42:42','2020-04-08 17:42:42','2020-04-09 17:42:42','2020-04-10 17:42:42'],
dtype='datetime64[ns]', freq='D')
In [23]:1# 假设不需要时分秒2
pd.date_range('20200401 17:42:42', periods =10, normalize =True)
Out[23]:
DatetimeIndex(['2020-04-01','2020-04-02','2020-04-03','2020-04-04','2020-04-05','2020-04-06','2020-04-07','2020-04-08','2020-04-09','2020-04-10'],
dtype='datetime64[ns]', freq='D')
In [24]:1
pd.date_range('20200401', periods =10, freq ='M')# 以月为单位
Out[24]:
DatetimeIndex(['2020-04-30','2020-05-31','2020-06-30','2020-07-31','2020-08-31','2020-09-30','2020-10-31','2020-11-30','2020-12-31','2021-01-31'],
dtype='datetime64[ns]', freq='M')
In [25]:1
pd.date_range('20200401', periods =10, freq ='4H')# 以4小时为单位
Out[25]:
DatetimeIndex(['2020-04-01 00:00:00','2020-04-01 04:00:00','2020-04-01 08:00:00','2020-04-01 12:00:00','2020-04-01 16:00:00','2020-04-01 20:00:00','2020-04-02 00:00:00','2020-04-02 04:00:00','2020-04-02 08:00:00','2020-04-02 12:00:00'],
dtype='datetime64[ns]', freq='4H')
时期(某一个时间范围)
In [26]:1
p = pd.Period(2020)2
p
3# 结果中A表示年,DEC表示12月,表示这一年以12月结束
Out[26]:
Period('2020','A-DEC')
In [27]:1
p +2
Out[27]:
Period('2022','A-DEC')
In [28]:1
p = pd.Period(2020, freq ='M')# 指定单位为月份2
p
Out[28]:
Period('2020-01','M')
In [30]:1
p +2
Out[30]:
Period('2020-03','M')
时期序列 period_range
In [33]:1
pd.period_range('2020-04', periods =10, freq ='M')
Out[33]:
PeriodIndex(['2020-04','2020-05','2020-06','2020-07','2020-08','2020-09','2020-10','2020-11','2020-12','2021-01'],
dtype='period[M]', freq='M')
In [34]:1# 创建季度序列2
pd.period_range('2020Q1', periods =10, freq ='Q')
Out[34]:
PeriodIndex(['2020Q1','2020Q2','2020Q3','2020Q4','2021Q1','2021Q2','2021Q3','2021Q4','2022Q1','2022Q2'],
dtype='period[Q-DEC]', freq='Q-DEC')
时间单位的转化
In [35]:1
s = pd.Period('2020')# 以年为单位2
s
Out[35]:
Period('2020','A-DEC')
In [36]:1
s.asfreq('M')# 以月为单位
Out[36]:
Period('2020-12','M')
In [37]:1
s.asfreq('M', how ='start')
Out[37]:
Period('2020-01','M')
In [38]:1
p = pd.Period('2020-04', freq ='M')2
p
Out[38]:
Period('2020-04','M')
In [39]:1
p.asfreq('A-DEC')# 转化为以年为单位,并且以12月为结束
Out[39]:
Period('2020','A-DEC')
In [40]:1
p.asfreq('A-MAR')# 转化为以年为单位,并且以3月份为结束。所以2020-04属于2021年
Out[40]:
Period('2021','A-MAR')
In [41]:1
p = pd.Period('2020Q4', freq ='Q-JAN')# 以季度为单位,并且以一月份为结束2
p
Out[41]:
Period('2020Q4','Q-JAN')
In [42]:1
p.asfreq('M', how ='start'), p.asfreq('M', how ='end')
Out[42]:(Period('2019-11','M'), Period('2020-01','M'))
In [52]:1# 获取该季度倒数第二个工作日下午4点20分2(p.asfreq('B', how ='end')-1).asfreq('T', how ='start')+16*60+20
Out[52]:
Period('2020-01-30 16:20','T')
In [49]:1
p.asfreq('B')
Out[49]:
Period('2020-01-31','B')
In []:1
5 时间重采样
timestamp和period的相互转化
In [2]:1import pandas as pd
2import numpy as np
In [3]:1
s = pd.Series(np.random.randn(5), index = pd.date_range('20200401', periods =5, freq ='M'))2
s
Out[3]:2020-04-300.2503422020-05-310.3934922020-06-30-0.8331732020-07-311.5616022020-08-31-2.354146
Freq: M, dtype: float64
In [4]:1# 将s转化成时期的时间序列2
s.to_period()
Out[4]:2020-040.2503422020-050.3934922020-06-0.8331732020-071.5616022020-08-2.354146
Freq: M, dtype: float64
In [6]:1
ts = pd.Series(np.random.randn(5), index = pd.date_range('20161229', periods =5, freq ='D'))2
ts
Out[6]:2016-12-290.3284732016-12-300.4600582016-12-31-0.0910542017-01-010.0984862017-01-02-0.642376
Freq: D, dtype: float64
In [33]:1type(ts)
Out[33]:
pandas.core.series.Series
In [7]:1
ts.to_period()
Out[7]:2016-12-290.3284732016-12-300.4600582016-12-31-0.0910542017-01-010.0984862017-01-02-0.642376
Freq: D, dtype: float64
In [9]:1
pts = ts.to_period(freq ='M')2
pts
Out[9]:2016-120.3284732016-120.4600582016-12-0.0910542017-010.0984862017-01-0.642376
Freq: M, dtype: float64
In [12]:1
pts.index
Out[12]:
PeriodIndex(['2016-12','2016-12','2016-12','2017-01','2017-01'], dtype='period[M]', freq='M')
In [13]:1# 根据月份分组求和2
pts.groupby(level =0).sum()
Out[13]:2016-120.6974782017-01-0.543890
Freq: M, dtype: float64
In [16]:1# 将pts转化回基于时间戳的时间序列2
pts.to_timestamp()
Out[16]:2016-12-010.3284732016-12-010.4600582016-12-01-0.0910542017-01-010.0984862017-01-01-0.642376
dtype: float64
In [17]:1
pts.to_timestamp(how ='end')
Out[17]:2016-12-3123:59:59.9999999990.3284732016-12-3123:59:59.9999999990.4600582016-12-3123:59:59.999999999-0.0910542017-01-3123:59:59.9999999990.0984862017-01-3123:59:59.999999999-0.642376
dtype: float64
重采样
In [18]:1
ts = pd.Series(np.random.randint(0,50,60),2
index = pd.date_range('20200410 09:30:00', periods =60, freq ='T'))3
ts
Out[18]:2020-04-1009:30:00102020-04-1009:31:00292020-04-1009:32:00142020-04-1009:33:00452020-04-1009:34:00222020-04-1009:35:0032020-04-1009:36:00292020-04-1009:37:00192020-04-1009:38:00322020-04-1009:39:00442020-04-1009:40:0052020-04-1009:41:0012020-04-1009:42:00132020-04-1009:43:00452020-04-1009:44:00362020-04-1009:45:00232020-04-1009:46:0092020-04-1009:47:00202020-04-1009:48:00302020-04-1009:49:00412020-04-1009:50:00412020-04-1009:51:00142020-04-1009:52:00352020-04-1009:53:00482020-04-1009:54:00192020-04-1009:55:00142020-04-1009:56:00112020-04-1009:57:00332020-04-1009:58:00472020-04-1009:59:00362020-04-1010:00:00152020-04-1010:01:00332020-04-1010:02:00342020-04-1010:03:00352020-04-1010:04:00282020-04-1010:05:0042020-04-1010:06:0052020-04-1010:07:00332020-04-1010:08:00302020-04-1010:09:00112020-04-1010:10:00192020-04-1010:11:00152020-04-1010:12:00402020-04-1010:13:00172020-04-1010:14:0032020-04-1010:15:00282020-04-1010:16:00392020-04-1010:17:00392020-04-1010:18:00442020-04-1010:19:00472020-04-1010:20:00452020-04-1010:21:00272020-04-1010:22:00352020-04-1010:23:00382020-04-1010:24:00272020-04-1010:25:00222020-04-1010:26:00442020-04-1010:27:00242020-04-1010:28:00492020-04-1010:29:008
Freq: T, dtype: int32
In [23]:1# 每五分钟重新采一次样,并求和2
ts.resample('5min').sum()# 默认以时间段起始时间为行索引
Out[23]:2020-04-1009:30:001202020-04-1009:35:001272020-04-1009:40:001002020-04-1009:45:001232020-04-1009:50:001572020-04-1009:55:001412020-04-1010:00:001452020-04-1010:05:00832020-04-1010:10:00942020-04-1010:15:001972020-04-1010:20:001722020-04-1010:25:00147
Freq: 5T, dtype: int32
In [24]:1
ts.resample('5min', label ='right').sum()# 以时间段结束时间为行索引
Out[24]:2020-04-1009:35:001202020-04-1009:40:001272020-04-1009:45:001002020-04-1009:50:001232020-04-1009:55:001572020-04-1010:00:001412020-04-1010:05:001452020-04-1010:10:00832020-04-1010:15:00942020-04-1010:20:001972020-04-1010:25:001722020-04-1010:30:00147
Freq: 5T, dtype: int32
In [26]:1
ts.resample('5min').ohlc()# o:开盘价;h:最高价;l:最低价;c:收盘价
Out[26]:open high low close
2020-04-1009:30:00104510222020-04-1009:35:003443442020-04-1009:40:005451362020-04-1009:45:0023419412020-04-1009:50:00414814192020-04-1009:55:00144711362020-04-1010:00:00153515282020-04-1010:05:004334112020-04-1010:10:001940332020-04-1010:15:00284728472020-04-1010:20:00454527272020-04-1010:25:00224988
通过groupby重采样
In [27]:1
ts = pd.Series(np.random.randint(0,50,100), index = pd.date_range('20200401', periods =100, freq ='D'))2
ts
Out[27]:2020-04-01172020-04-02372020-04-03292020-04-04192020-04-0524..2020-07-05192020-07-06482020-07-07202020-07-08372020-07-096
Freq: D, Length:100, dtype: int32
通过月份重采样
In [28]:1# 方法一2
ts.groupby(lambda x : x.month).sum()
Out[28]:4725569566407215
dtype: int32
In [30]:1# 方法二2
ts.groupby(ts.index.to_period('M')).sum()
Out[30]:2020-047252020-056952020-066402020-07215
Freq: M, dtype: int32
In [32]:1type(ts.index)
Out[32]:
pandas.core.indexes.datetimes.DatetimeIndex
In [35]:1
df = pd.DataFrame(np.random.randint(0,50,2), index = pd.date_range('20160422', periods =2, freq ='W-FRI'))# W-FRI表示以星期为单位,并且周五为一周的结束2
df
Out[35]:02016-04-22432016-04-297
In [37]:1# df.resample('D', fill_method = 'ffill')---------------------------------------------------------------------------
TypeError Traceback (most recent call last)<ipython-input-37-1600fd4048a6>in<module>---->1 df.resample('D', fill_method ='ffill')
TypeError: resample() got an unexpected keyword argument 'fill_method'
时期的重采样
In [39]:1
df = pd.DataFrame(np.random.randint(2,30,(24,4)),2
index = pd.period_range('202001','202112', freq ='M'), columns =list('ABCD'))3
df
Out[39]:
A B C D
2020-0127712102020-022282232020-032981462020-04211314102020-053191982020-06191616272020-071542762020-0822114282020-09281715282020-1024523142020-1119291792020-125614102021-0151417182021-0291929192021-033177162021-0410161442021-05292726162021-06142619282021-0711218152021-08467242021-09171211132021-10232213162021-1117321152021-121081321
In [41]:1
df.resample('A-DEC').mean()
Out[41]:
A B C D
202017.83333312.75000017.25000013.250000202112.66666715.91666715.41666717.083333
In [42]:1
df.resample('A-MAR').mean()
Out[42]:
A B C D
202026.007.66666716.0000006.333333202112.7515.00000017.66666716.083333202215.0015.66666714.66666716.888889
In []:1