groupby, unique, date_range, to_datetime, resample
# hello pandas
"""
Series : 列
DataFrame : 表
"""
import numpy as np
import pandas as pd
#
df_excel = pd.read_excel('./pandasFile/data.xlsx')
print('\n12:')
print(df_excel)
"""
日期 名称 类别 单价 数量 金额
0 2020-07-21 商品A 服装 30 3 90
1 2020-07-22 商品B 服装 200 2 400
2 2020-07-23 商品C 食品 1000 4 4000
3 2020-07-24 商品A 服装 35 2 70
4 2020-07-25 商品C 食品 1100 5 5500
5 2020-07-26 商品B 服装 180 5 900
"""
# 对类别进行分组
grouped = df_excel.groupby('类别')
print('\n26:')
# <pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000023E52EF5190>
print(grouped)
# 查看类别中的去重的 名称 unique
for name, data in grouped:
print('\n32:')
print(name)
print(data['名称'].unique())
"""
服装
['商品A' '商品B']
食品
['商品C']
"""
# 得到类别组
for name, data in grouped:
print('\n45:')
print(name) # 服装 食品
print(data)
# 得到的类别组 求 数量和金额之和
grouped_sum = grouped[['金额', '数量']].sum()
print('\n51:')
print(grouped_sum)
"""
金额 数量
类别
服装 1460 12
食品 9500 9
"""
# 求类别中单价最高的
grouped = df_excel.groupby(['类别', '名称'])
grouped_max = grouped[['单价']].max()
print('\n63:')
print(grouped_max) # 平均值:mean
"""
单价
类别 名称
服装 商品A 35
商品B 200
食品 商品C 1100
"""
# pandas 时间序列
df_time = pd.date_range('2020-12-1', '2021-1-20') # 初始化时间序列
print('\n76:')
print(df_time)
"""
DatetimeIndex(['2020-12-01', '2020-12-02', '2020-12-03', '2020-12-04',
'2020-12-05', '2020-12-06', '2020-12-07', '2020-12-08',
'2020-12-09', '2020-12-10', '2020-12-11', '2020-12-12',
'2020-12-13', '2020-12-14', '2020-12-15', '2020-12-16',
'2020-12-17', '2020-12-18', '2020-12-19', '2020-12-20',
'2020-12-21', '2020-12-22', '2020-12-23', '2020-12-24',
'2020-12-25', '2020-12-26', '2020-12-27', '2020-12-28',
'2020-12-29', '2020-12-30', '2020-12-31', '2021-01-01',
'2021-01-02', '2021-01-03', '2021-01-04', '2021-01-05',
'2021-01-06', '2021-01-07', '2021-01-08', '2021-01-09',
'2021-01-10', '2021-01-11', '2021-01-12', '2021-01-13',
'2021-01-14', '2021-01-15', '2021-01-16', '2021-01-17',
'2021-01-18', '2021-01-19', '2021-01-20'],
dtype='datetime64[ns]', freq='D')
freq:频率 D代表天的意思
"""
# 以w周为频率查询
df_time2 = pd.date_range('2020-12-1', '2021-1-20', freq='w')
print('\n97:')
print(df_time2) # M:月; Q:季度; H:小时; T:分; S:秒
"""
DatetimeIndex(['2020-12-06', '2020-12-13', '2020-12-20', '2020-12-27',
'2021-01-03', '2021-01-10', '2021-01-17'],
dtype='datetime64[ns]', freq='W-SUN')
"""
# 从2020年12-1 开始 返回10个周的 周期 periods
df_time3 = pd.date_range('2020-12-1', freq='w', periods=10)
print('\n108:')
print(df_time3)
"""
DatetimeIndex(['2020-12-06', '2020-12-13', '2020-12-20', '2020-12-27',
'2021-01-03', '2021-01-10', '2021-01-17', '2021-01-24',
'2021-01-31', '2021-02-07'],
dtype='datetime64[ns]', freq='W-SUN')
"""
data = {
'time': pd.date_range('2021-1-1', periods=10000, freq='T'),
'cpu': np.random.randn(10000) + 10 # 随机数 都很平均
}
df = pd.DataFrame(data, columns=['time', 'cpu'])
print('\n124:')
print(df.head(5))
"""
time cpu
0 2021-01-01 00:00:00 8.902294
1 2021-01-01 00:01:00 10.094323
2 2021-01-01 00:02:00 11.266734
3 2021-01-01 00:03:00 11.289507
4 2021-01-01 00:04:00 10.089253
"""
print('\n134:')
print(df.tail(5))
"""
time cpu
9995 2021-01-07 22:35:00 7.666639
9996 2021-01-07 22:36:00 9.253340
9997 2021-01-07 22:37:00 9.262185
9998 2021-01-07 22:38:00 10.740172
9999 2021-01-07 22:39:00 11.003444
"""
print('\n145:')
print(df.info())
"""
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 time 10000 non-null datetime64[ns]
1 cpu 10000 non-null float64
dtypes: datetime64[ns](1), float64(1)
memory usage: 156.4 KB
"""
# 将索引变成时间参数 改成5分钟为一个间隔 (时间序列变成时间索引)
s = pd.to_datetime(df.time)
df.index = s
df = df.drop('time', axis=1)
print('\n165:')
print(df.head())
"""
cpu
time
2021-01-01 00:00:00 10.236112
2021-01-01 00:01:00 9.514566
2021-01-01 00:02:00 9.204780
2021-01-01 00:03:00 9.778110
2021-01-01 00:04:00 11.129042
"""
# 筛选出 10分钟的 数据
# df['2021-01-01 00:00:00':'2021-01-01 00:10:00']
# 筛选出一天的数据
# df['2021-01-01']
print('\n183:')
print(df.groupby(df.index.date).mean()) # 以日期进行分组 index.hour index.week
"""
cpu
2021-01-01 9.992389
2021-01-02 9.989297
2021-01-03 10.027705
2021-01-04 10.002606
2021-01-05 10.023792
2021-01-06 10.030702
2021-01-07 9.985970
"""
# 重采样 resample 之前是按照一分钟一次采集数据 这次按照五小时采集
df_5H = df.resample('5H').mean()
print('\n200:')
print(df_5H.head())
"""
cpu
time
2021-01-01 00:00:00 9.920996
2021-01-01 05:00:00 10.019273
2021-01-01 10:00:00 9.999199
2021-01-01 15:00:00 9.933910
2021-01-01 20:00:00 9.886203
"""