import pandas as pd
import numpy as np
df=pd.read_excel('./pandas中的数据分组与透视表/data.xlsx')
df
| 日期 | 名称 | 类别 | 单价 | 数量 | 金额 |
---|
0 | 2018-07-01 | 商品A | 服装 | 20 | 2 | 40 |
1 | 2018-07-02 | 商品B | 服装 | 200 | 3 | 600 |
2 | 2018-07-03 | 商品C | 食品 | 1200 | 4 | 4800 |
3 | 2018-07-04 | 商品A | 服装 | 22 | 5 | 110 |
4 | 2018-07-05 | 商品B | 服装 | 220 | 6 | 1320 |
5 | 2018-07-06 | 商品C | 食品 | 1000 | 7 | 7000 |
6 | 2018-07-07 | 商品A | 服装 | 30 | 3 | 90 |
7 | 2018-07-08 | 商品A | 服装 | 800 | 1 | 800 |
8 | 2018-07-09 | 商品C | 食品 | 1300 | 4 | 5200 |
9 | 2018-07-10 | 商品B | 服装 | 230 | 3 | 690 |
10 | 2018-07-11 | 商品A | 服装 | 28 | 1 | 28 |
grouped=df.groupby('类别')
grouped
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021B20F06888>
for name, data in grouped:
print(name)
print(data)
服装
日期 名称 类别 单价 数量 金额
0 2018-07-01 商品A 服装 20 2 40
1 2018-07-02 商品B 服装 200 3 600
3 2018-07-04 商品A 服装 22 5 110
4 2018-07-05 商品B 服装 220 6 1320
6 2018-07-07 商品A 服装 30 3 90
7 2018-07-08 商品A 服装 800 1 800
9 2018-07-10 商品B 服装 230 3 690
10 2018-07-11 商品A 服装 28 1 28
食品
日期 名称 类别 单价 数量 金额
2 2018-07-03 商品C 食品 1200 4 4800
5 2018-07-06 商品C 食品 1000 7 7000
8 2018-07-09 商品C 食品 1300 4 5200
grouped['数量','金额'].sum()
grouped['单价'].max()
类别
服装 800
食品 1300
Name: 单价, dtype: int64
grouped=df.groupby(['类别','名称'])
for i,k in grouped:
print(i,k)
('服装', '商品A') 日期 名称 类别 单价 数量 金额
0 2018-07-01 商品A 服装 20 2 40
3 2018-07-04 商品A 服装 22 5 110
6 2018-07-07 商品A 服装 30 3 90
7 2018-07-08 商品A 服装 800 1 800
10 2018-07-11 商品A 服装 28 1 28
('服装', '商品B') 日期 名称 类别 单价 数量 金额
1 2018-07-02 商品B 服装 200 3 600
4 2018-07-05 商品B 服装 220 6 1320
9 2018-07-10 商品B 服装 230 3 690
('食品', '商品C') 日期 名称 类别 单价 数量 金额
2 2018-07-03 商品C 食品 1200 4 4800
5 2018-07-06 商品C 食品 1000 7 7000
8 2018-07-09 商品C 食品 1300 4 5200
grouped['单价'].max()
类别 名称
服装 商品A 800
商品B 230
食品 商品C 1300
Name: 单价, dtype: int64
grouped[['单价']].mean()
| | 单价 |
---|
类别 | 名称 | |
---|
服装 | 商品A | 180.000000 |
商品B | 216.666667 |
食品 | 商品C | 1166.666667 |
grouped=df.groupby('类别')
for name, data in grouped:
print(name)
print(data['名称'].unique())
服装
['商品A' '商品B']
食品
['商品C']
time datetime
import time
time.time()
1585909836.4156868
m=time.localtime(1585835029)
l=time.strftime('%Y--%m--%d %H:%M:%S',time.localtime(1585835029))
time.mktime(m)
1585835029.0
from datetime import datetime
datetime.now()
datetime.datetime(2020, 4, 3, 18, 30, 36, 462421)
datetime.now().year
2020
m=datetime.now().strftime('%Y-%m-%d')
type(m)
str
l=datetime.strptime('2020-04-02','%Y-%m-%d')
datetime.fromtimestamp(1585835029.0)
datetime.datetime(2020, 4, 2, 21, 43, 49)
datetime.now().timestamp()
1585909836.51427
time.mktime(m)
1585835029.0
datetime.now().timestamp()
1585910834.269616
delta=datetime.now()-datetime(2018,5,21)
delta.days
683
pamdas 时间序列
pd.date_range('2020-3-1','2020-4-3')
DatetimeIndex(['2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04',
'2020-03-05', '2020-03-06', '2020-03-07', '2020-03-08',
'2020-03-09', '2020-03-10', '2020-03-11', '2020-03-12',
'2020-03-13', '2020-03-14', '2020-03-15', '2020-03-16',
'2020-03-17', '2020-03-18', '2020-03-19', '2020-03-20',
'2020-03-21', '2020-03-22', '2020-03-23', '2020-03-24',
'2020-03-25', '2020-03-26', '2020-03-27', '2020-03-28',
'2020-03-29', '2020-03-30', '2020-03-31', '2020-04-01',
'2020-04-02', '2020-04-03'],
dtype='datetime64[ns]', freq='D')
pd.date_range('2020-3-1','2020-4-3',freq='w')
DatetimeIndex(['2020-03-01', '2020-03-08', '2020-03-15', '2020-03-22',
'2020-03-29'],
dtype='datetime64[ns]', freq='W-SUN')
pd.date_range('2020-3-1',freq='Q',periods=10)
DatetimeIndex(['2020-03-31', '2020-06-30', '2020-09-30', '2020-12-31',
'2021-03-31', '2021-06-30', '2021-09-30', '2021-12-31',
'2022-03-31', '2022-06-30'],
dtype='datetime64[ns]', freq='Q-DEC')
data={
'time':pd.date_range('2019-1-1',freq='T',periods=200000),
'cpu':np.random.randn(200000) + 10
}
df=pd.DataFrame(data,columns= ['time','cpu'])
df
| time | cpu |
---|
0 | 2019-01-01 00:00:00 | 10.608832 |
1 | 2019-01-01 00:01:00 | 10.107891 |
2 | 2019-01-01 00:02:00 | 9.502407 |
3 | 2019-01-01 00:03:00 | 10.453344 |
4 | 2019-01-01 00:04:00 | 10.025292 |
... | ... | ... |
199995 | 2019-05-19 21:15:00 | 9.354853 |
199996 | 2019-05-19 21:16:00 | 10.311234 |
199997 | 2019-05-19 21:17:00 | 10.764722 |
199998 | 2019-05-19 21:18:00 | 10.994048 |
199999 | 2019-05-19 21:19:00 | 10.775006 |
200000 rows × 2 columns
df.head()
| time | cpu |
---|
0 | 2019-01-01 00:00:00 | 10.608832 |
1 | 2019-01-01 00:01:00 | 10.107891 |
2 | 2019-01-01 00:02:00 | 9.502407 |
3 | 2019-01-01 00:03:00 | 10.453344 |
4 | 2019-01-01 00:04:00 | 10.025292 |
df.tail()
| time | cpu |
---|
199995 | 2019-05-19 21:15:00 | 9.354853 |
199996 | 2019-05-19 21:16:00 | 10.311234 |
199997 | 2019-05-19 21:17:00 | 10.764722 |
199998 | 2019-05-19 21:18:00 | 10.994048 |
199999 | 2019-05-19 21:19:00 | 10.775006 |
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 2 columns):
time 200000 non-null datetime64[ns]
cpu 200000 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 3.1 MB
s=pd.to_datetime(df.time)
df.index=s
df.head()
| time | cpu |
---|
time | | |
---|
2019-01-01 00:00:00 | 2019-01-01 00:00:00 | 10.608832 |
2019-01-01 00:01:00 | 2019-01-01 00:01:00 | 10.107891 |
2019-01-01 00:02:00 | 2019-01-01 00:02:00 | 9.502407 |
2019-01-01 00:03:00 | 2019-01-01 00:03:00 | 10.453344 |
2019-01-01 00:04:00 | 2019-01-01 00:04:00 | 10.025292 |
df.drop('time',axis=1)
| cpu |
---|
time | |
---|
2019-01-01 00:00:00 | 10.608832 |
2019-01-01 00:01:00 | 10.107891 |
2019-01-01 00:02:00 | 9.502407 |
2019-01-01 00:03:00 | 10.453344 |
2019-01-01 00:04:00 | 10.025292 |
... | ... |
2019-05-19 21:15:00 | 9.354853 |
2019-05-19 21:16:00 | 10.311234 |
2019-05-19 21:17:00 | 10.764722 |
2019-05-19 21:18:00 | 10.994048 |
2019-05-19 21:19:00 | 10.775006 |
200000 rows × 1 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 200000 entries, 2019-01-01 00:00:00 to 2019-05-19 21:19:00
Data columns (total 2 columns):
time 200000 non-null datetime64[ns]
cpu 200000 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 9.6 MB
df['2019-02-01 08:00:00':'2019-02-01 09:10:00']
| time | cpu |
---|
time | | |
---|
2019-02-01 08:00:00 | 2019-02-01 08:00:00 | 10.016300 |
2019-02-01 08:01:00 | 2019-02-01 08:01:00 | 11.500588 |
2019-02-01 08:02:00 | 2019-02-01 08:02:00 | 9.639299 |
2019-02-01 08:03:00 | 2019-02-01 08:03:00 | 9.365297 |
2019-02-01 08:04:00 | 2019-02-01 08:04:00 | 9.016025 |
... | ... | ... |
2019-02-01 09:06:00 | 2019-02-01 09:06:00 | 10.236108 |
2019-02-01 09:07:00 | 2019-02-01 09:07:00 | 10.644256 |
2019-02-01 09:08:00 | 2019-02-01 09:08:00 | 10.251236 |
2019-02-01 09:09:00 | 2019-02-01 09:09:00 | 10.343902 |
2019-02-01 09:10:00 | 2019-02-01 09:10:00 | 11.026916 |
71 rows × 2 columns
df['2019-01-01']
| time | cpu |
---|
time | | |
---|
2019-01-01 00:00:00 | 2019-01-01 00:00:00 | 10.608832 |
2019-01-01 00:01:00 | 2019-01-01 00:01:00 | 10.107891 |
2019-01-01 00:02:00 | 2019-01-01 00:02:00 | 9.502407 |
2019-01-01 00:03:00 | 2019-01-01 00:03:00 | 10.453344 |
2019-01-01 00:04:00 | 2019-01-01 00:04:00 | 10.025292 |
... | ... | ... |
2019-01-01 23:55:00 | 2019-01-01 23:55:00 | 9.533162 |
2019-01-01 23:56:00 | 2019-01-01 23:56:00 | 10.536986 |
2019-01-01 23:57:00 | 2019-01-01 23:57:00 | 11.243809 |
2019-01-01 23:58:00 | 2019-01-01 23:58:00 | 8.733646 |
2019-01-01 23:59:00 | 2019-01-01 23:59:00 | 9.542733 |
1440 rows × 2 columns
df.groupby(df.index.date).mean()
| cpu |
---|
2019-01-01 | 9.957422 |
2019-01-02 | 9.935621 |
2019-01-03 | 10.005468 |
2019-01-04 | 9.992855 |
2019-01-05 | 9.995959 |
... | ... |
2019-05-15 | 10.031266 |
2019-05-16 | 9.973673 |
2019-05-17 | 10.045151 |
2019-05-18 | 10.023991 |
2019-05-19 | 9.994609 |
139 rows × 1 columns
df.groupby(df.index.hour).mean()
| cpu |
---|
time | |
---|
0 | 10.007654 |
1 | 9.992322 |
2 | 9.987562 |
3 | 9.987208 |
4 | 10.008922 |
5 | 10.022125 |
6 | 9.978566 |
7 | 9.997908 |
8 | 10.012145 |
9 | 9.987106 |
10 | 9.988722 |
11 | 9.990424 |
12 | 10.002575 |
13 | 10.002844 |
14 | 10.001017 |
15 | 10.006581 |
16 | 9.993640 |
17 | 10.015496 |
18 | 10.001964 |
19 | 9.995674 |
20 | 9.972376 |
21 | 9.998824 |
22 | 10.000342 |
23 | 9.998308 |
df.groupby(df.index.week).mean()
| cpu |
---|
time | |
---|
1 | 9.977973 |
2 | 9.997662 |
3 | 9.998770 |
4 | 9.994222 |
5 | 9.999088 |
6 | 9.997922 |
7 | 9.995254 |
8 | 10.010069 |
9 | 10.004777 |
10 | 10.007423 |
11 | 10.001689 |
12 | 10.001328 |
13 | 9.999866 |
14 | 9.997371 |
15 | 9.995390 |
16 | 10.003722 |
17 | 9.991836 |
18 | 9.988586 |
19 | 9.979943 |
20 | 10.013066 |
df.resample('5T').mean()
| cpu |
---|
time | |
---|
2019-01-01 00:00:00 | 10.139553 |
2019-01-01 00:05:00 | 9.544466 |
2019-01-01 00:10:00 | 10.198618 |
2019-01-01 00:15:00 | 10.135603 |
2019-01-01 00:20:00 | 9.613786 |
... | ... |
2019-05-19 20:55:00 | 10.348141 |
2019-05-19 21:00:00 | 10.104422 |
2019-05-19 21:05:00 | 9.483435 |
2019-05-19 21:10:00 | 9.783999 |
2019-05-19 21:15:00 | 10.439973 |
40000 rows × 1 columns
df.groupby(df.index.date)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000021B265E5748>