重采样:将时间序列从一个频率转换为另一个频率的过程,转换时通常需要对数据进行聚合(降采样)或插值(升采样)
- 降采样:高频数据 → 低频数据,eg.以天为频率的数据转为以月为频率的数据
- 升采样:低频数据 → 高频数据,eg.以年为频率的数据转为以月为频率的数据
一、Timestamp重采样【rng = pd.date_range()】
1、降采样
1.1 OHLC采样(金融领域)
OHLC:金融领域的时间序列聚合方式 → open开盘、high最大值、low最小值、close收盘
import numpy as np
import pandas as pd
# Resampling demo: .resample() on a Series with a daily DatetimeIndex.
# Build 12 consecutive daily observations, then regroup them into 5-day bins.
rng = pd.date_range('20170101', periods=12)
ts = pd.Series(np.arange(12), index=rng)
wide_rule = '-' * 200
narrow_rule = '-' * 50
print("ts = \n", ts)
print(wide_rule)

# The first argument of .resample() is the target frequency ('5D' = 5 days).
# Calling .resample() alone only returns a Resampler object; an aggregation
# such as .sum() has to be chained to obtain the new, aggregated Series.
ts_re = ts.resample('5D')
ts_re2 = ts.resample('5D').sum()
print("ts_re = ts.resample('5D') = \n{0} \ntype(ts_re) = {1}".format(ts_re, type(ts_re)))
print(narrow_rule)
print("ts_re2 = ts.resample('5D').sum() = \n{0} \ntype(ts_re2) = {1}".format(ts_re2, type(ts_re2)))
print(wide_rule)

# Per-bin aggregations. OHLC is the aggregation used for financial series:
# open (first value), high (max), low (min), close (last value).
bin_first = ts.resample('5D').first()
print("第一个值: ts.resample('5D').first() = \n{0}".format(bin_first))
print(narrow_rule)
bin_max = ts.resample('5D').max()
print("最大值: ts.resample('5D').max() = \n{0}".format(bin_max))
print(narrow_rule)
bin_mean = ts.resample('5D').mean()
print("平均值: ts.resample('5D').mean() = \n{0}".format(bin_mean))
print(narrow_rule)
bin_min = ts.resample('5D').min()
print("最小值: ts.resample('5D').min() = \n{0}".format(bin_min))
print(narrow_rule)
bin_median = ts.resample('5D').median()
print("中值: ts.resample('5D').median() = \n{0}".format(bin_median))
print(narrow_rule)
bin_last = ts.resample('5D').last()
print("最后一个值: ts.resample('5D').last() = \n{0}".format(bin_last))
print(narrow_rule)
bin_ohlc = ts.resample('5D').ohlc()
print("OHLC重采样: ts.resample('5D').ohlc() = \n{0}".format(bin_ohlc))
打印结果:
ts =
2017-01-01 0
2017-01-02 1
2017-01-03 2
2017-01-04 3
2017-01-05 4
2017-01-06 5
2017-01-07 6
2017-01-08 7
2017-01-09 8
2017-01-10 9
2017-01-11 10
2017-01-12 11
Freq: D, dtype: int32
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
ts_re = ts.resample('5D') =
DatetimeIndexResampler [freq=<5 * Days>, axis=0, closed=left, label=left, convention=start, origin=start_day]
type(ts_re) = <class 'pandas.core.resample.DatetimeIndexResampler'>
--------------------------------------------------
ts_re2 = ts.resample('5D').sum() =
2017-01-01 10
2017-01-06 35
2017-01-11 21
Freq: 5D, dtype: int32
type(ts_re2) = <class 'pandas.core.series.Series'>
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
第一个值: ts.resample('5D').first() =
2017-01-01 0
2017-01-06 5
2017-01-11 10
Freq: 5D, dtype: int32
--------------------------------------------------
最大值: ts.resample('5D').max() =
2017-01-01 4
2017-01-06 9
2017-01-11 11
Freq: 5D, dtype: int32
--------------------------------------------------
平均值: ts.resample('5D').mean() =
2017-01-01 2.0
2017-01-06 7.0
2017-01-11 10.5
Freq: 5D, dtype: float64
--------------------------------------------------
最小值: ts.resample('5D').min() =
2017-01-01 0
2017-01-06 5
2017-01-11 10
Freq: 5D, dtype: int32
--------------------------------------------------
中值: ts.resample('5D').median() =
2017-01-01 2.0
2017-01-06 7.0
2017-01-11 10.5
Freq: 5D, dtype: float64
--------------------------------------------------
最后一个值: ts.resample('5D').last() =
2017-01-01 4
2017-01-06 9
2017-01-11 11
Freq: 5D, dtype: int32
--------------------------------------------------
OHLC重采样: ts.resample('5D').ohlc() =
open high low close
2017-01-01 0 4 0 4
2017-01-06 5 9 5 9
2017-01-11 10 11 10 11
Process finished with exit code 0
1.2 closed、label参数
import numpy as np
import pandas as pd
# Downsampling demo for the `closed` and `label` arguments of .resample().
rng = pd.date_range('20170101', periods=12)
ts = pd.Series(np.arange(1, 13), index=rng)
wide_rule = '-' * 200
narrow_rule = '-' * 50
print("ts = \n", ts)
print(wide_rule)

# closed: which edge of each bin is inclusive.
# With no `closed` argument the '5D' bins below come out the same as with
# closed='left' (left edge included, right edge excluded).
# The values are 1..12, so the bins are [1,2,3,4,5], [6,7,8,9,10], [11,12].
ts_re = ts.resample('5D').sum()
print("ts_re = ts.resample('5D').sum() = \n{0} \ntype(ts_re) = {1}".format(ts_re, type(ts_re)))
print(narrow_rule)

# closed='left': left edge inclusive -> [1,2,3,4,5], [6,7,8,9,10], [11,12]
ts_re_left = ts.resample('5D', closed='left').sum()
print("ts_re_left = ts.resample('5D', closed='left').sum() = \n{0} \ntype(ts_re_left) = {1}".format(ts_re_left, type(ts_re_left)))
print(narrow_rule)

# closed='right': right edge inclusive -> [1], [2,3,4,5,6], [7,8,9,10,11], [12]
# (an extra leading bin appears because 2017-01-01 is the right edge of a bin)
ts_re_right = ts.resample('5D', closed='right').sum()
print("ts_re_right = ts.resample('5D', closed='right').sum() = \n{0} \ntype(ts_re_right) = {1}".format(ts_re_right, type(ts_re_right)))
print(wide_rule)

# label: which bin edge is used as the index of the aggregated value.
# Omitting it gives the same index as label='left' here; `closed` is left
# at its default in both calls, so only the labels differ, not the sums.
ts_re_left_label = ts.resample('5D', label='left').sum()
ts_re_right_label = ts.resample('5D', label='right').sum()
print("ts_re_left_label = ts.resample('5D', label='left').sum() = \n{0}".format(ts_re_left_label))
print(narrow_rule)
print("ts_re_right_label = ts.resample('5D', label='right').sum() = \n{0}".format(ts_re_right_label))
打印结果:
ts =
2017-01-01 1
2017-01-02 2
2017-01-03 3
2017-01-04 4
2017-01-05 5
2017-01-06 6
2017-01-07 7
2017-01-08 8
2017-01-09 9
2017-01-10 10
2017-01-11 11
2017-01-12 12
Freq: D, dtype: int32
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
ts_re = ts.resample('5D').sum() =
2017-01-01 15
2017-01-06 40
2017-01-11 23
Freq: 5D, dtype: int32
type(ts_re) = <class 'pandas.core.series.Series'>
--------------------------------------------------
ts_re_left = ts.resample('5D', closed='left').sum() =
2017-01-01 15
2017-01-06 40
2017-01-11 23
Freq: 5D, dtype: int32
type(ts_re_left) = <class 'pandas.core.series.Series'>
--------------------------------------------------
ts_re_right = ts.resample('5D', closed='right').sum() =
2016-12-27 1
2017-01-01 20
2017-01-06 45
2017-01-11 12
Freq: 5D, dtype: int32
type(ts_re_right) = <class 'pandas.core.series.Series'>
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
ts_re_left_label = ts.resample('5D', label='left').sum() =
2017-01-01 15
2017-01-06 40
2017-01-11 23
Freq: 5D, dtype: int32
--------------------------------------------------
ts_re_right_label = ts.resample('5D', label='right').sum() =
2017-01-06 15
2017-01-11 40
2017-01-16 23
Freq: 5D, dtype: int32
Process finished with exit code 0
2、升采样
低频转高频,主要是如何插值
- .asfreq():不做填充,新增的时间点返回NaN
- .ffill():向前填充(用前一个有效值填充其后的空缺)
- .bfill():向后填充(用后一个有效值填充其前的空缺)
import numpy as np
import pandas as pd
# Upsampling demo: hourly rows are expanded onto a 15-minute grid and the
# newly created rows are filled with three different strategies.
#
# The frequency strings use the current pandas spellings 'h' (hour) and
# 'min' (minute) rather than the legacy 'H' / 'T' aliases, which recent
# pandas releases deprecate. The print labels deliberately keep the
# original '15T' text so the output still matches previously recorded runs;
# '15T' and '15min' denote the same 15-minute frequency.
rng = pd.date_range('2017/1/1 0:0:0', periods=5, freq='h')
ts = pd.DataFrame(np.arange(15).reshape(5, 3),
                  index=rng,
                  columns=['a', 'b', 'c'])
print("ts = \n", ts)
print('-' * 200)

# Going from a low to a high frequency is mostly a question of how to fill
# the rows that did not exist before:
#   .asfreq(): no filling, new rows are NaN
#   .ffill():  forward fill, each new row copies the previous original row
#   .bfill():  backward fill, each new row copies the next original row
ts_re1 = ts.resample('15min').asfreq()
print("升采样,不填充:ts_re1 = ts.resample('15T').asfreq() = \n", ts_re1)
print('-' * 50)
ts_re2 = ts.resample('15min').ffill()
print("升采样,向上填充:ts_re2 = ts.resample('15T').ffill() = \n", ts_re2)
print('-' * 50)
ts_re3 = ts.resample('15min').bfill()
print("升采样,向下填充:ts_re3 = ts.resample('15T').bfill() = \n", ts_re3)
print('-' * 200)
打印结果:
ts =
a b c
2017-01-01 00:00:00 0 1 2
2017-01-01 01:00:00 3 4 5
2017-01-01 02:00:00 6 7 8
2017-01-01 03:00:00 9 10 11
2017-01-01 04:00:00 12 13 14
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
升采样,不填充:ts_re1 = ts.resample('15T').asfreq() =
a b c
2017-01-01 00:00:00 0.0 1.0 2.0
2017-01-01 00:15:00 NaN NaN NaN
2017-01-01 00:30:00 NaN NaN NaN
2017-01-01 00:45:00 NaN NaN NaN
2017-01-01 01:00:00 3.0 4.0 5.0
2017-01-01 01:15:00 NaN NaN NaN
2017-01-01 01:30:00 NaN NaN NaN
2017-01-01 01:45:00 NaN NaN NaN
2017-01-01 02:00:00 6.0 7.0 8.0
2017-01-01 02:15:00 NaN NaN NaN
2017-01-01 02:30:00 NaN NaN NaN
2017-01-01 02:45:00 NaN NaN NaN
2017-01-01 03:00:00 9.0 10.0 11.0
2017-01-01 03:15:00 NaN NaN NaN
2017-01-01 03:30:00 NaN NaN NaN
2017-01-01 03:45:00 NaN NaN NaN
2017-01-01 04:00:00 12.0 13.0 14.0
--------------------------------------------------
升采样,向上填充:ts_re2 = ts.resample('15T').ffill() =
a b c
2017-01-01 00:00:00 0 1 2
2017-01-01 00:15:00 0 1 2
2017-01-01 00:30:00 0 1 2
2017-01-01 00:45:00 0 1 2
2017-01-01 01:00:00 3 4 5
2017-01-01 01:15:00 3 4 5
2017-01-01 01:30:00 3 4 5
2017-01-01 01:45:00 3 4 5
2017-01-01 02:00:00 6 7 8
2017-01-01 02:15:00 6 7 8
2017-01-01 02:30:00 6 7 8
2017-01-01 02:45:00 6 7 8
2017-01-01 03:00:00 9 10 11
2017-01-01 03:15:00 9 10 11
2017-01-01 03:30:00 9 10 11
2017-01-01 03:45:00 9 10 11
2017-01-01 04:00:00 12 13 14
--------------------------------------------------
升采样,向下填充:ts_re3 = ts.resample('15T').bfill() =
a b c
2017-01-01 00:00:00 0 1 2
2017-01-01 00:15:00 3 4 5
2017-01-01 00:30:00 3 4 5
2017-01-01 00:45:00 3 4 5
2017-01-01 01:00:00 3 4 5
2017-01-01 01:15:00 6 7 8
2017-01-01 01:30:00 6 7 8
2017-01-01 01:45:00 6 7 8
2017-01-01 02:00:00 6 7 8
2017-01-01 02:15:00 9 10 11
2017-01-01 02:30:00 9 10 11
2017-01-01 02:45:00 9 10 11
2017-01-01 03:00:00 9 10 11
2017-01-01 03:15:00 12 13 14
2017-01-01 03:30:00 12 13 14
2017-01-01 03:45:00 12 13 14
2017-01-01 04:00:00 12 13 14
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Process finished with exit code 0
二、Period(时期)重采样【prng = pd.period_range()】
import numpy as np
import pandas as pd
# Resampling a Series indexed by Periods (PeriodIndex) instead of Timestamps.
# NOTE(review): resampling on a PeriodIndex behaves differently across pandas
# releases - confirm against the installed version before relying on it.
# Monthly periods from 2016-01 through 2017-01 inclusive (13 entries).
prng = pd.period_range('2016', '2017', freq='M')
ts = pd.Series(np.arange(len(prng)), index=prng)
print("ts = \n", ts)
print('-' * 200)
# Downsample: monthly periods -> yearly periods, summing the values per year
ts_re_down = ts.resample('Y').sum()
print("ts_re_down = ts.resample('Y').sum() = \n", ts_re_down)
print('-' * 50)
# Upsample: monthly periods -> one entry every 15 days, forward-filled
ts_re_up = ts.resample('15D').ffill()
print("ts_re_up = ts.resample('15D').ffill() = \n", ts_re_up)
print('-' * 200)
打印结果:
ts =
2016-01 0
2016-02 1
2016-03 2
2016-04 3
2016-05 4
2016-06 5
2016-07 6
2016-08 7
2016-09 8
2016-10 9
2016-11 10
2016-12 11
2017-01 12
Freq: M, dtype: int32
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
ts_re_down = ts.resample('Y').sum() =
2016 66
2017 12
Freq: A-DEC, dtype: int32
--------------------------------------------------
ts_re_up = ts.resample('15D').ffill() =
2016-01-01 0
2016-01-16 0
2016-01-31 0
2016-02-15 1
2016-03-01 2
2016-03-16 2
2016-03-31 2
2016-04-15 3
2016-04-30 3
2016-05-15 4
2016-05-30 4
2016-06-14 5
2016-06-29 5
2016-07-14 6
2016-07-29 6
2016-08-13 7
2016-08-28 7
2016-09-12 8
2016-09-27 8
2016-10-12 9
2016-10-27 9
2016-11-11 10
2016-11-26 10
2016-12-11 11
2016-12-26 11
2017-01-10 12
2017-01-25 12
Freq: 15D, dtype: int32
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Process finished with exit code 0
三、DataFrame降采样-案例
resample.py
import os
import pandas as pd
from pathlib import Path
# Batch job: for every "holding-register*.csv" under ../raw_files, compute
# the per-minute and per-hour mean of all columns and write each result to
# its own CSV in a dedicated output directory.
csv_input_dir = Path("../raw_files")
csv_files = csv_input_dir.glob("holding-register*.csv")
output_dir_1min = "resample_result_1min"
output_dir_1h = "resample_result_1h"


def save_resampled_mean(frame: pd.DataFrame, rule: str, out_dir: str, file_name: str) -> None:
    """Downsample *frame* to *rule* with mean() and write it as a CSV.

    Args:
        frame: DataFrame with a datetime index; it must contain the helper
            columns 'Unnamed: 0' and '时间', which are removed from the output.
        rule: Target frequency passed to ``DataFrame.resample`` (e.g. '1min').
        out_dir: Directory for the output file; created if it is missing.
        file_name: Input file stem, used to build the output file name
            ``resample_result<file_name>.csv``.

    Raises:
        KeyError: If 'Unnamed: 0' or '时间' is not a column of the result.
    """
    os.makedirs(out_dir, exist_ok=True)
    result = frame.resample(rule).mean()
    # The row counter and the raw time column are meaningless once averaged.
    result = result.drop(['Unnamed: 0', '时间'], axis=1)
    result.to_csv(out_dir + "/resample_result" + file_name + ".csv")


for csv_file in csv_files:
    # Use the stem so names containing extra dots are not truncated.
    file_name = csv_file.stem
    print("\nfile_name = ", file_name)
    # Read the globbed path itself instead of rebuilding it from the name.
    pd_file = pd.read_csv(csv_file, encoding='utf-8')
    # Parse the '时间' column (values shaped like 20211215235920, per the
    # original note - verify against the data) into datetimes for the index.
    pd_file['Time_index'] = pd.to_datetime(pd_file['时间'], format='%Y%m%d%H%M%S')
    pd_file.set_index("Time_index", inplace=True)
    # 1-minute means (the original mistakenly used '1h' here as well).
    save_resampled_mean(pd_file, '1min', output_dir_1min, file_name)
    # 1-hour means.
    save_resampled_mean(pd_file, '1h', output_dir_1h, file_name)
resample.ipynb
import pandas as pd
import time
# Load the raw CSV file with pandas
pd_file = pd.read_csv("raw_data.csv", encoding='utf-8')
# Per-column dtypes, non-null counts and memory usage
pd_file.info()
# Per-column summary statistics (max, min, standard deviation, quantiles, ...)
pd_file.describe()
# Display the loaded data
pd_file
# Parse the '时间' column (values shaped like 20211215235920, per the original
# note - verify against the data) into datetimes stored in a new column
parsed_time = pd.to_datetime(pd_file['时间'], format='%Y%m%d%H%M%S')
pd_file['Time_index'] = parsed_time
pd_file.info()
pd_file
# Make Time_index the index so the frame can be resampled by time
pd_file = pd_file.set_index("Time_index")
pd_file.info()
pd_file
# Downsample to 1-minute and 1-hour bins, aggregating each bin with mean()
result_min = pd_file.resample('1min').mean()
result_h = pd_file.resample('1h').mean()
result_min
result_h
# Remove the columns that are not needed: "Unnamed: 0" and "时间"
unwanted_columns = ['Unnamed: 0', '时间']
result_min = result_min.drop(columns=unwanted_columns)
result_h = result_h.drop(columns=unwanted_columns)
result_min
result_h
# Save the downsampled data
result_min.to_csv("result_min_resampled.csv")
result_h.to_csv("result_h_resampled.csv")
参考资料:
python在resample后用agg对多列使用自定义函数
python(18)-pandas 重采样-与降采样-resample-PeriodIndex