按日期筛选数据
- 读取并整理数据
# -*- coding: utf-8 -*-
import pandas as pd
# 读取csv数据
with open('date.csv', 'r') as f:
df = pd.read_csv(f, header=None) # 数据中不含header,故第一行不设为haeder
# 添加列名
df.columns = ['date', "number"]
# 将date装化成日期格式
df['date'] = pd.to_datetime(df['date'])
# 将date列设置为 索引列
df = df.set_index('date')
# 显示前两行数据
print(df.head(2))
# 显示最后两行数据
print(df.tail(2))
# 显示dataframe的几行几列
print(df.shape)
# number
# date
# 2013-10-24 3
# 2013-10-25 4
# number
# date
# 2015-01-29 465
# 2015-01-30 466
# (464, 1)
print(type(df))
print(df.index) # 显示dataframe的索引列
print(type(df.index))
# <class 'pandas.core.frame.DataFrame'>
# DatetimeIndex(['2013-10-24', '2013-10-25', '2013-10-26', '2013-10-27',
# '2013-10-28', '2013-10-29', '2013-10-30', '2013-10-31',
# '2013-11-01', '2013-11-02',
# ...
# '2015-01-21', '2015-01-22', '2015-01-23', '2015-01-24',
# '2015-01-25', '2015-01-26', '2015-01-27', '2015-01-28',
# '2015-01-29', '2015-01-30'],
# dtype='datetime64[ns]', name='date', length=464, freq=None)
# <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
- 构造Series数据结构
s = pd.Series(df['number'], index=df.index)
print(type(s))
print(s.shape)
print(s.head(2))
# <class 'pandas.core.series.Series'>
# (464,)
# date
# 2013-10-24 3
# 2013-10-25 4
# Name: number, dtype: int64
- 按日期筛选
1) 按年度获取数据
# 获取2013年至2014年的数据
print(df['2013': '2014'].head(2))
print(df['2013': '2014'].tail(2))
# number
# date
# 2013-10-24 3
# 2013-10-25 4
# number
# date
# 2014-12-30 435
# 2014-12-31 436
2)获取某月的数据
# 获取某月的数据
print(df['2014-10'])
# number
# date
# 2014-10-01 345
# 2014-10-02 346
# 2014-10-03 347
# 2014-10-04 348
# 2014-10-05 349
# 2014-10-06 350
# 2014-10-07 351
# 2014-10-08 352
# 2014-10-09 353
# 2014-10-10 354
# 2014-10-11 355
# 2014-10-12 356
# 2014-10-13 357
# 2014-10-14 358
# 2014-10-15 359
# 2014-10-16 360
# 2014-10-17 361
# 2014-10-18 362
# 2014-10-19 363
# 2014-10-20 364
# 2014-10-21 365
# 2014-10-22 366
# 2014-10-23 367
# 2014-10-24 368
# 2014-10-25 369
# 2014-10-26 370
# 2014-10-27 371
# 2014-10-28 372
# 2014-10-29 373
# 2014-10-30 374
# 2014-10-31 375
3)获取具体某天的数据
对于Series可以直接通过某天来获取其数据;但是对于dataframe来说会报错,需要用区间来获取某天数据
print(s['2014-01-28'])
print(df['2014-01-28':'2014-01-28'])
# 99
# number
# date
# 2014-01-28 99
dataframe 的 truncate 函数可以获取某个时期之前或之后的数据,或者某个时间区间的数据,但一般建议直接用切片(slice),这样更为直观,方便
# 获取某个日期之前的数据
print(df.truncate(after='2013-11'))
# number
# date
# 2013-10-24 3
# 2013-10-25 4
# 2013-10-26 5
# 2013-10-27 6
# 2013-10-28 7
# 2013-10-29 8
# 2013-10-30 9
# 2013-10-31 10
# 2013-11-01 11
# 获取某个日期之后的数据
print(df.truncate(before="2015-01'"))
# number
# date
# 2015-01-01 437
# 2015-01-02 438
# 2015-01-03 439
# 2015-01-04 440
# 2015-01-05 441
# 2015-01-06 442
# 2015-01-07 443
# 2015-01-08 444
# 2015-01-09 445
# 2015-01-10 446
# 2015-01-11 447
# 2015-01-12 448
# 2015-01-13 449
# 2015-01-14 450
# 2015-01-15 451
# 2015-01-16 452
# 2015-01-17 453
# 2015-01-18 454
# 2015-01-19 455
# 2015-01-20 456
# 2015-01-21 457
# 2015-01-22 458
# 2015-01-23 459
# 2015-01-24 460
# 2015-01-25 461
# 2015-01-26 462
# 2015-01-27 463
# 2015-01-28 464
# 2015-01-29 465
# 2015-01-30 466
按日期显示数据
- to_period()方法, 其返回的数据类型是PeriodIndex; df.index的数据类型是DatetimeIndex
按月显示,但不统计
df_period = df.to_period('M')
print(df_period)
print(type(df_period))
print(type(df_period.index))
# number
# date
# 2013-10 3
# 2013-10 4
# 2013-10 5
# 2013-10 6
# 2013-10 7
# ... ...
# 2015-01 462
# 2015-01 463
# 2015-01 464
# 2015-01 465
# 2015-01 466
#
# [464 rows x 1 columns]
# <class 'pandas.core.frame.DataFrame'>
# <class 'pandas.core.indexes.period.PeriodIndex
按季度显示,但不统计
print(df.to_period('Q'))
# number
# date
# 2013Q4 3
# 2013Q4 4
# 2013Q4 5
# 2013Q4 6
# 2013Q4 7
# ... ...
# 2015Q1 462
# 2015Q1 463
# 2015Q1 464
# 2015Q1 465
# 2015Q1 466
#
# [464 rows x 1 columns]
按年度显示,但不统计
print(df.to_period('A'))
# number
# date
# 2013 3
# 2013 4
# 2013 5
# 2013 6
# 2013 7
# ... ...
# 2015 462
# 2015 463
# 2015 464
# 2015 465
# 2015 466
#
# [464 rows x 1 columns]
- asfreq()方法
按年度频率显示
# 'A' 默认'A-DEC',其 他 如 'A-JAN'
print(df_period.index.asfreq('A'))
# PeriodIndex(['2013', '2013', '2013', '2013', '2013', '2013', '2013', '2013',
# '2013', '2013',
# ...
# '2015', '2015', '2015', '2015', '2015', '2015', '2015', '2015',
# '2015', '2015'],
# dtype='period[A-DEC]', name='date', length=464, freq='A-DEC')
print(df_period.index.asfreq('A-JAN'))
# PeriodIndex(['2014', '2014', '2014', '2014', '2014', '2014', '2014', '2014',
# '2014', '2014',
# ...
# '2015', '2015', '2015', '2015', '2015', '2015', '2015', '2015',
# '2015', '2015'],
# dtype='period[A-JAN]', name='date', length=464, freq='A-JAN')
按季度显示
print(df_period.index.asfreq('Q'))
# PeriodIndex(['2013Q4', '2013Q4', '2013Q4', '2013Q4', '2013Q4', '2013Q4',
# '2013Q4', '2013Q4', '2013Q4', '2013Q4',
# ...
# '2015Q1', '2015Q1', '2015Q1', '2015Q1', '2015Q1', '2015Q1',
# '2015Q1', '2015Q1', '2015Q1', '2015Q1'],
# dtype='period[Q-DEC]', name='date', length=464, freq='Q-DEC')
print(df_period.index.asfreq('Q-SEP'))
# PeriodIndex(['2014Q1', '2014Q1', '2014Q1', '2014Q1', '2014Q1', '2014Q1',
# '2014Q1', '2014Q1', '2014Q1', '2014Q1',
# ...
# '2015Q2', '2015Q2', '2015Q2', '2015Q2', '2015Q2', '2015Q2',
# '2015Q2', '2015Q2', '2015Q2', '2015Q2'],
# dtype='period[Q-SEP]', name='date', length=464, freq='Q-SEP')
按季度频率在不同情形下的显示,可参考下图所示:
按月度频率显示
print(df_period.index.asfreq('M'))
# PeriodIndex(['2013-10', '2013-10', '2013-10', '2013-10', '2013-10', '2013-10',
# '2013-10', '2013-10', '2013-11', '2013-11',
# ...
# '2015-01', '2015-01', '2015-01', '2015-01', '2015-01', '2015-01',
# '2015-01', '2015-01', '2015-01', '2015-01'],
# dtype='period[M]', name='date', length=464, freq='M')
按工作日显示
print(df_period.index.asfreq('B', how="start"))
# PeriodIndex(['2013-10-01', '2013-10-01', '2013-10-01', '2013-10-01',
# '2013-10-01', '2013-10-01', '2013-10-01', '2013-10-01',
# '2013-11-01', '2013-11-01',
# ...
# '2015-01-01', '2015-01-01', '2015-01-01', '2015-01-01',
# '2015-01-01', '2015-01-01', '2015-01-01', '2015-01-01',
# '2015-01-01', '2015-01-01'],
# dtype='period[B]', name='date', length=464, freq='B')
print(df_period.index.asfreq('B', how='end'))
# PeriodIndex(['2013-10-31', '2013-10-31', '2013-10-31', '2013-10-31',
# '2013-10-31', '2013-10-31', '2013-10-31', '2013-10-31',
# '2013-11-29', '2013-11-29',
# ...
# '2015-01-30', '2015-01-30', '2015-01-30', '2015-01-30',
# '2015-01-30', '2015-01-30', '2015-01-30', '2015-01-30',
# '2015-01-30', '2015-01-30'],
# dtype='period[B]', name='date', length=464, freq='B')
按日期统计数据
- 按周统计数据
print(df.resample('w').sum().head())
# number
# date
# 2013-10-27 18
# 2013-11-03 70
# 2013-11-10 119
# 2013-11-17 168
# 2013-11-24 217
- 按月统计数据
print(df.resample('M').sum().head())
# number
# date
# 2013-10-31 52
# 2013-11-30 765
# 2013-12-31 1736
# 2014-01-31 2697
# 2014-02-28 3262
按月度统计并显示
print(df.resample('M').sum().to_period('M').head())
# number
# date
# 2013-10 52
# 2013-11 765
# 2013-12 1736
# 2014-01 2697
# 2014-02 3262
- 按季度统计数据
print(df.resample('Q').sum().head())
# number
# date
# 2013-12-31 2553
# 2014-03-31 10485
# 2014-06-30 18837
# 2014-09-30 27462
# 2014-12-31 35926
按季度统计并显示
print(df.resample('Q').sum().to_period('Q').head())
# number
# date
# 2013Q4 2553
# 2014Q1 10485
# 2014Q2 18837
# 2014Q3 27462
# 2014Q4 35926
- 按年统计数据
# ”AS” 是 每 年 第 一 天 为 开 始 日 期 , ”A 是 每 年 最 后 一 天
print(df.resample('A').sum().head())
# number
# date
# 2013-12-31 2553
# 2014-12-31 92710
# 2015-12-31 13545
print(df.resample('AS').sum().head())
# number
# date
# 2013-01-01 2553
# 2014-01-01 92710
# 2015-01-01 13545
按年度统计并显示
print(df.resample('AS').sum().to_period('A').head())
# number
# date
# 2013 2553
# 2014 92710
# 2015 13545
关于日期的类型,按参考下图所示来选择合适的分期频率:
根据条件创建新的列
new_df_1 = pd.DataFrame({'team_A': ['Spain', 'Germany', 'Brazil', 'France'],
'team_B': ['USA', 'Argentina', 'Mexico', 'Belgium'],
'score_A': [5, 3, 2, 0],
'score_B': [4, 0, 3, 0]},
columns = ['team_A', 'team_B', 'score_A', 'score_B'])
print(new_df_1)
# team_A team_B score_A score_B
# 0 Spain USA 5 4
# 1 Germany Argentina 3 0
# 2 Brazil Mexico 2 3
# 3 France Belgium 0 0
问题:
从上面数据中创建新的一个数据列,用来存储获胜队伍的名称。即,根据 “score_A” 与 “score_B” 比较的结果,来获取相应的结果。例如,第一行,“Spain”:“USA” 为 5:4 ,“Spain” 获胜,新创建的列中存储的结果为 “Spain”。下面介绍两种方法来实现上述要求。
- 遍历每行数据来筛选
winner = []
for i, row in new_df_1.iterrows():
if row['score_A'] > row['score_B']:
winner.append(row['team_A'])
elif row['score_A'] < row['score_B']:
winner.append((row['team_B']))
else:
winner.append(('Draw'))
new_df_1['win_team'] = winner
print(new_df_1)
# team_A team_B score_A score_B win_team
# 0 Spain USA 5 4 Spain
# 1 Germany Argentina 3 0 Germany
# 2 Brazil Mexico 2 3 Mexico
# 3 France Belgium 0 0 Draw
- 利用 Pandas 中 DataFrame 的条件选择功能来实现
new_df_2 = new_df_1.copy()
print(new_df_2)
# team_A team_B score_A score_B
# 0 Spain USA 5 4
# 1 Germany Argentina 3 0
# 2 Brazil Mexico 2 3
# 3 France Belgium 0 0
new_df_2['win_team'] = ''
cond = new_df_2['score_A'] - new_df_2['score_B']
new_df_2.loc[cond > 0, 'win_team'] = new_df_2.loc[cond > 0, 'team_A']
new_df_2.loc[cond < 0, 'win_team'] = new_df_2.loc[cond < 0, 'team_B']
new_df_2.loc[cond == 0, 'win_team'] = ['Draw']
print(new_df_2)
# team_A team_B score_A score_B win_team
# 0 Spain USA 5 4 Spain
# 1 Germany Argentina 3 0 Germany
# 2 Brazil Mexico 2 3 Mexico
# 3 France Belgium 0 0 Draw