import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
# Build a Series of 5 uniform random floats in [0, 1), labelled 'a'..'e'.
# (A stray bare `s` that was evaluated before this assignment has been removed.)
s = pd.Series(np.random.rand(5), index=list('abcde'))
s
a 0.272040
b 0.867085
c 0.850881
d 0.467187
e 0.115594
dtype: float64
s.index
s.index
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
# Name the index itself; the name is printed above the labels when the
# Series is displayed.
s.index.name = 'alpha'
s
s
alpha
a 0.272040
b 0.867085
c 0.850881
d 0.467187
e 0.115594
dtype: float64
# 4x3 DataFrame of standard-normal samples with named columns and the
# default integer row index.
df = pd.DataFrame(np.random.randn(4, 3), columns=['one', 'two', 'three'])
df
one two three
0 -1.756171 1.211440 0.721272
1 -0.240596 0.885698 0.430981
2 -1.337347 -0.296998 -0.012915
3 -1.688144 0.821678 0.419868
df.index
df.index
RangeIndex(start=0, stop=4, step=1)
df.columns
df.columns
Index(['one', 'two', 'three'], dtype='object')
# Name both axes so the labels show up in the DataFrame's printed header.
df.index.name = 'row'
df.columns.name = 'col'
df
col one two three
row
0 -1.756171 1.211440 0.721272
1 -0.240596 0.885698 0.430981
2 -1.337347 -0.296998 -0.012915
3 -1.688144 0.821678 0.419868
# Series whose index deliberately contains duplicate labels ('a' and 'b'
# each appear twice).
s = pd.Series(np.arange(6), index=list('abcbda'))
s
s
a 0
b 1
c 2
b 3
d 4
a 5
dtype: int32
a
s['a']
a 0
a 5
dtype: int32
s.index.is_unique
False
s.index.unique()
Index(['a', 'b', 'c', 'd'], dtype='object')
s.unique()
array([0, 1, 2, 3, 4, 5], dtype=int64)
# 求和 (Sum)
# Group the values that share an index label, then sum each group.
by_label = s.groupby(s.index)
by_label.sum()
a 5
b 4
c 2
d 4
dtype: int32
# 分组计算 (Grouped computation)
# Sample frame for the group-by examples: two categorical key columns plus
# two columns of random integers drawn from [1, 10).
df = pd.DataFrame(
    dict(
        key1=list('aabba'),
        key2=['one', 'two'] * 2 + ['one'],
        data1=np.random.randint(1, 10, size=5),
        data2=np.random.randint(1, 10, size=5),
    )
)
f
df
key1 key2 data1 data2
0 a one 9 1
1 a two 3 9
2 b one 5 8
3 b two 1 1
4 a one 5 8
df['data1'].groupby(df['key1']).mean()
key1
a 5.666667
b 3.000000
Name: data1, dtype: float64
# Iterating a GroupBy yields (group key, sub-DataFrame) pairs; print each.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
a
key1 key2 data1 data2
0 a one 9 1
1 a two 3 9
4 a one 5 8
b
key1 key2 data1 data2
2 b one 5 8
3 b two 1 1
# Materialise the (key, sub-DataFrame) pairs and map each key1 value to its
# sub-DataFrame.
dict(list(df.groupby('key1')))
{'a': key1 key2 data1 data2
0 a one 9 1
1 a two 3 9
4 a one 5 8, 'b': key1 key2 data1 data2
2 b one 5 8
3 b two 1 1}
)
list(df.groupby('key1'))
[('a', key1 key2 data1 data2
0 a one 9 1
1 a two 3 9
4 a one 5 8), ('b', key1 key2 data1 data2
2 b one 5 8
3 b two 1 1)]
# 聚合函数 (Aggregation functions)
def peak_range(values):
    """Return the spread (maximum minus minimum) of *values*."""
    return values.max() - values.min()


# NOTE(review): `grouped` was not defined anywhere in the visible file; it is
# assumed to be `df` grouped by key1. Only the numeric columns are selected so
# that std/mean are not applied to the string column key2 — confirm.
grouped = df.groupby('key1')[['data1', 'data2']]

# Apply several aggregations at once; a (label, function) tuple sets the name
# of the resulting column for a custom function.
grouped.agg(['std', 'mean', 'sum', ('range', peak_range)])

# Use a different aggregation function for each column.
d = {'data1': 'mean',
     'data2': 'sum'}
grouped.agg(d)
# 时间序列 (Time series)
from datetime import timedelta
from datetime import datetime
from datetime import timedelta
now = datetime.now()
now
datetime.datetime(2020, 8, 21, 17, 43, 51, 104762)
now.year,now.month,now.day
now.year,now.month,now.day
(2020, 8, 21)
# Time remaining between `now` and midnight on 2020-10-16, as a timedelta.
date1 = datetime(2020, 10, 16) - now
date1
date1
datetime.timedelta(days=55, seconds=22568, microseconds=895238)
date1
str(date1)
'55 days, 6:16:08.895238'
now.strftime('%Y/%m/%d %H:%M:%S')
now.strftime('%Y/%m/%d %H:%M:%S')
'2020/08/21 17:43:51'
pd.period_range('2016-01','2016-12',freq='M')
pd.period_range('2016-01','2016-12',freq='M')
PeriodIndex(['2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06',
'2016-07', '2016-08', '2016-09', '2016-10', '2016-11', '2016-12'],
dtype='period[M]', freq='M')
Q
pd.period_range('2016Q1',periods=10,freq='Q')
PeriodIndex(['2016Q1', '2016Q2', '2016Q3', '2016Q4', '2017Q1', '2017Q2',
'2017Q3', '2017Q4', '2018Q1', '2018Q2'],
dtype='period[Q-DEC]', freq='Q-DEC')
import datetime as datetime
import numpy as np
import pandas as pd
import numpy as np
# 60 random integers in [0, 50) sampled once per minute starting at 09:30.
# 'min' is used instead of the deprecated 'T' alias, matching the '5min'
# spelling used when this series is resampled.
ts = pd.Series(
    np.random.randint(0, 50, 60),
    index=pd.date_range('2016-04-25 09:30', periods=60, freq='min'),
)
ts
2016-04-25 09:30:00 45
2016-04-25 09:31:00 37
2016-04-25 09:32:00 39
2016-04-25 09:33:00 47
2016-04-25 09:34:00 1
2016-04-25 09:35:00 46
2016-04-25 09:36:00 31
2016-04-25 09:37:00 8
2016-04-25 09:38:00 2
2016-04-25 09:39:00 40
2016-04-25 09:40:00 43
2016-04-25 09:41:00 19
2016-04-25 09:42:00 0
2016-04-25 09:43:00 17
2016-04-25 09:44:00 7
2016-04-25 09:45:00 45
2016-04-25 09:46:00 3
2016-04-25 09:47:00 23
2016-04-25 09:48:00 35
2016-04-25 09:49:00 38
2016-04-25 09:50:00 18
2016-04-25 09:51:00 2
2016-04-25 09:52:00 42
2016-04-25 09:53:00 39
2016-04-25 09:54:00 1
2016-04-25 09:55:00 26
2016-04-25 09:56:00 8
2016-04-25 09:57:00 35
2016-04-25 09:58:00 34
2016-04-25 09:59:00 18
2016-04-25 10:00:00 37
2016-04-25 10:01:00 28
2016-04-25 10:02:00 45
2016-04-25 10:03:00 20
2016-04-25 10:04:00 46
2016-04-25 10:05:00 30
2016-04-25 10:06:00 14
2016-04-25 10:07:00 23
2016-04-25 10:08:00 12
2016-04-25 10:09:00 7
2016-04-25 10:10:00 29
2016-04-25 10:11:00 0
2016-04-25 10:12:00 1
2016-04-25 10:13:00 15
2016-04-25 10:14:00 20
2016-04-25 10:15:00 20
2016-04-25 10:16:00 6
2016-04-25 10:17:00 30
2016-04-25 10:18:00 48
2016-04-25 10:19:00 28
2016-04-25 10:20:00 45
2016-04-25 10:21:00 15
2016-04-25 10:22:00 24
2016-04-25 10:23:00 14
2016-04-25 10:24:00 45
2016-04-25 10:25:00 3
2016-04-25 10:26:00 38
2016-04-25 10:27:00 46
2016-04-25 10:28:00 29
2016-04-25 10:29:00 44
Freq: T, dtype: int32
# Resample: regroup the series into 5-minute bins and sum each bin.
five_minute_bins = ts.resample('5min')
five_minute_bins.sum()
2016-04-25 09:30:00 169
2016-04-25 09:35:00 127
2016-04-25 09:40:00 86
2016-04-25 09:45:00 144
2016-04-25 09:50:00 102
2016-04-25 09:55:00 121
2016-04-25 10:00:00 176
2016-04-25 10:05:00 86
2016-04-25 10:10:00 65
2016-04-25 10:15:00 132
2016-04-25 10:20:00 143
2016-04-25 10:25:00 160
Freq: 5T, dtype: int32
# 数据可视化 (Data visualization)
nline
%matplotlib inline
# Random walk: running total of 1000 standard-normal steps, one per day
# starting 2000-01-01.
daily_index = pd.date_range('2000/1/1', periods=1000)
steps = pd.Series(np.random.randn(1000), index=daily_index)
ts = steps.cumsum()
ts.describe()
count 1000.000000
mean 7.292909
std 9.028347
min -15.164886
25% 1.531374
50% 9.228288
75% 14.624091
max 23.938509
dtype: float64
# Plot the series as a solid red line, titled 'cumsum', with figsize (10, 6).
ts.plot(title='cumsum', style='r-', figsize=(10, 6))
<matplotlib.axes._subplots.AxesSubplot at 0x21647116e10>