第四课 Pandas时序型数据分析
第五节 时序数据重采样
import pandas as pd
import numpy as np
data_df = pd.read_csv('./datasets/day_stats.csv', index_col='date', parse_dates=True, dayfirst=True)
data_df.head()
city PM_China PM_US Post Polluted State CH Polluted State US
date
2013-10-01 beijing 67.416667 71.458333 light light
2013-10-10 beijing 74.041667 81.583333 light medium
2013-10-11 beijing 59.694444 59.291667 light light
2013-10-12 beijing 69.236111 70.458333 light light
2013-10-13 beijing 54.513889 57.333333 light light
beijing_data = data_df[data_df['city'] == 'beijing'].copy()
降采样
# 将数据聚合到月
month_sum_sample = beijing_data['PM_China'].resample('M').sum()
month_mean_sample = beijing_data['PM_China'].resample('M').mean()
month_ohlc_sample = beijing_data['PM_China'].resample('M').ohlc()
print('降采样: sum')
print(month_sum_sample.head())
降采样: sum
date
2013-03-31 3305.051329
2013-04-30 1851.145984
2013-05-31 2762.017753
2013-06-30 2545.845238
2013-07-31 2407.417044
Freq: M, Name: PM_China, dtype: float64
print('降采样: mean')
print(month_mean_sample.head())
降采样: mean
date
2013-03-31 122.409308
2013-04-30 61.704866
2013-05-31 89.097347
2013-06-30 110.688923
2013-07-31 77.658614
Freq: M, Name: PM_China, dtype: float64
print('降采样: ohlc')
print(month_ohlc_sample.head())
降采样: ohlc
open high low close
date
2013-03-31 187.666667 305.060606 20.444444 183.246377
2013-04-30 86.236111 183.666667 12.013889 20.152778
2013-05-31 38.791667 231.666667 14.569444 95.888889
2013-06-30 77.202899 331.541667 27.291667 176.871795
2013-07-31 161.190476 165.250000 14.736111 70.888889
升采样
df = pd.DataFrame(np.random.randn(5, 3),
index=pd.date_range('20190101', periods=5, freq='W-MON'),
columns=['S1', 'S2', 'S3'])
df
S1 S2 S3
2019-01-07 -0.975204 1.960144 0.518937
2019-01-14 -0.805442 0.056005 -0.174179
2019-01-21 2.703792 -1.827181 -0.543472
2019-01-28 -0.765458 -1.787616 -0.382259
2019-02-04 -0.099521 1.013889 0.606310
# 直接重采样会产生空值
df.resample('D').asfreq()
S1 S2 S3
2019-01-07 -0.975204 1.960144 0.518937
2019-01-08 NaN NaN NaN
2019-01-09 NaN NaN NaN
2019-01-10 NaN NaN NaN
2019-01-11 NaN NaN NaN
2019-01-12 NaN NaN NaN
2019-01-13 NaN NaN NaN
2019-01-14 -0.805442 0.056005 -0.174179
2019-01-15 NaN NaN NaN
2019-01-16 NaN NaN NaN
2019-01-17 NaN NaN NaN
2019-01-18 NaN NaN NaN
2019-01-19 NaN NaN NaN
2019-01-20 NaN NaN NaN
2019-01-21 2.703792 -1.827181 -0.543472
2019-01-22 NaN NaN NaN
2019-01-23 NaN NaN NaN
2019-01-24 NaN NaN NaN
2019-01-25 NaN NaN NaN
2019-01-26 NaN NaN NaN
2019-01-27 NaN NaN NaN
2019-01-28 -0.765458 -1.787616 -0.382259
2019-01-29 NaN NaN NaN
2019-01-30 NaN NaN NaN
2019-01-31 NaN NaN NaN
2019-02-01 NaN NaN NaN
2019-02-02 NaN NaN NaN
2019-02-03 NaN NaN NaN
2019-02-04 -0.099521 1.013889 0.606310
# ffill 拿前面的值填充两个空值
df.resample('D').ffill(2)
S1 S2 S3
2019-01-07 -0.975204 1.960144 0.518937
2019-01-08 -0.975204 1.960144 0.518937
2019-01-09 -0.975204 1.960144 0.518937
2019-01-10 NaN NaN NaN
2019-01-11 NaN NaN NaN
2019-01-12 NaN NaN NaN
2019-01-13 NaN NaN NaN
2019-01-14 -0.805442 0.056005 -0.174179
2019-01-15 -0.805442 0.056005 -0.174179
2019-01-16 -0.805442 0.056005 -0.174179
2019-01-17 NaN NaN NaN
2019-01-18 NaN NaN NaN
2019-01-19 NaN NaN NaN
2019-01-20 NaN NaN NaN
2019-01-21 2.703792 -1.827181 -0.543472
2019-01-22 2.703792 -1.827181 -0.543472
2019-01-23 2.703792 -1.827181 -0.543472
2019-01-24 NaN NaN NaN
2019-01-25 NaN NaN NaN
2019-01-26 NaN NaN NaN
2019-01-27 NaN NaN NaN
2019-01-28 -0.765458 -1.787616 -0.382259
2019-01-29 -0.765458 -1.787616 -0.382259
2019-01-30 -0.765458 -1.787616 -0.382259
2019-01-31 NaN NaN NaN
2019-02-01 NaN NaN NaN
2019-02-02 NaN NaN NaN
2019-02-03 NaN NaN NaN
2019-02-04 -0.099521 1.013889 0.606310
# bfill 后面往前填充
df.resample('D').bfill()
S1 S2 S3
2019-01-07 -0.975204 1.960144 0.518937
2019-01-08 -0.805442 0.056005 -0.174179
2019-01-09 -0.805442 0.056005 -0.174179
2019-01-10 -0.805442 0.056005 -0.174179
2019-01-11 -0.805442 0.056005 -0.174179
2019-01-12 -0.805442 0.056005 -0.174179
2019-01-13 -0.805442 0.056005 -0.174179
2019-01-14 -0.805442 0.056005 -0.174179
2019-01-15 2.703792 -1.827181 -0.543472
2019-01-16 2.703792 -1.827181 -0.543472
2019-01-17 2.703792 -1.827181 -0.543472
...
2019-01-31 -0.099521 1.013889 0.606310
2019-02-01 -0.099521 1.013889 0.606310
2019-02-02 -0.099521 1.013889 0.606310
2019-02-03 -0.099521 1.013889 0.606310
2019-02-04 -0.099521 1.013889 0.606310
# 线性插值
df.resample('D').interpolate('linear')
S1 S2 S3
2019-01-07 -0.975204 1.960144 0.518937
2019-01-08 -0.950952 1.688125 0.419920
2019-01-09 -0.926701 1.416105 0.320904
2019-01-10 -0.902449 1.144085 0.221887
2019-01-11 -0.878197 0.872065 0.122871
2019-01-12 -0.853946 0.600045 0.023854
2019-01-13 -0.829694 0.328025 -0.075162
2019-01-14 -0.805442 0.056005 -0.174179
2019-01-15 -0.304123 -0.213021 -0.226935
2019-01-16 0.197196 -0.482048 -0.279691
...
2019-01-28 -0.765458 -1.787616 -0.382259
2019-01-29 -0.670324 -1.387401 -0.241035
2019-01-30 -0.575190 -0.987186 -0.099811
2019-01-31 -0.480056 -0.586971 0.041413
2019-02-01 -0.384922 -0.186756 0.182637
2019-02-02 -0.289789 0.213459 0.323861
2019-02-03 -0.194655 0.613674 0.465085
2019-02-04 -0.099521 1.013889 0.606310