3.2 基本数据操作
- 3.2.1 索引操作
- 3.2.2 赋值操作
- 3.2.3 排序
读取数据
import pandas as pd
data = pd.read_excel('stock.xls', header=0, index_col=0, encoding='utf-8')
data
| close | open | high | low | pre_close | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | | |
---|
20200313 | 2887.4265 | 2804.2322 | 2910.8812 | 2799.9841 | 2923.4856 | -36.0591 | -1.2334 | 366450436.0 | 3.930197e+08 |
---|
20200312 | 2923.4856 | 2936.0163 | 2944.4651 | 2906.2838 | 2968.5174 | -45.0318 | -1.5170 | 307778457.0 | 3.282092e+08 |
---|
20200311 | 2968.5174 | 3001.7616 | 3010.0286 | 2968.5174 | 2996.7618 | -28.2444 | -0.9425 | 352470970.0 | 3.787666e+08 |
---|
20200310 | 2996.7618 | 2918.9347 | 3000.2963 | 2904.7989 | 2943.2907 | 53.4711 | 1.8167 | 393296648.0 | 4.250172e+08 |
---|
20200309 | 2943.2907 | 2987.1805 | 2989.2051 | 2940.7138 | 3034.5113 | -91.2206 | -3.0061 | 414560736.0 | 4.381439e+08 |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
19910719 | 136.7000 | 137.6600 | 138.5400 | 136.6600 | 137.1700 | -0.4700 | -0.3426 | 10823.0 | 5.242826e+03 |
---|
19910718 | 137.1700 | 137.1700 | 137.1700 | 135.8100 | 135.8100 | 1.3600 | 1.0014 | 847.0 | 4.644160e+02 |
---|
19910717 | 135.8100 | 135.8100 | 135.8100 | 135.3900 | 134.4700 | 1.3400 | 0.9965 | 660.0 | 3.975240e+02 |
---|
19910716 | 134.4700 | 134.3900 | 134.4700 | 133.1400 | 133.1400 | 1.3300 | 0.9989 | 2796.0 | 1.328502e+03 |
---|
19910715 | 133.1400 | 133.9000 | 134.1000 | 131.8700 | 132.8000 | 0.3400 | 0.2560 | 11938.0 | 5.534900e+03 |
---|
7002 rows × 9 columns
data.drop(labels=['pre_close'], axis=1, inplace=True)
data
| close | open | high | low | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | |
---|
20200313 | 2887.4265 | 2804.2322 | 2910.8812 | 2799.9841 | -36.0591 | -1.2334 | 366450436.0 | 3.930197e+08 |
---|
20200312 | 2923.4856 | 2936.0163 | 2944.4651 | 2906.2838 | -45.0318 | -1.5170 | 307778457.0 | 3.282092e+08 |
---|
20200311 | 2968.5174 | 3001.7616 | 3010.0286 | 2968.5174 | -28.2444 | -0.9425 | 352470970.0 | 3.787666e+08 |
---|
20200310 | 2996.7618 | 2918.9347 | 3000.2963 | 2904.7989 | 53.4711 | 1.8167 | 393296648.0 | 4.250172e+08 |
---|
20200309 | 2943.2907 | 2987.1805 | 2989.2051 | 2940.7138 | -91.2206 | -3.0061 | 414560736.0 | 4.381439e+08 |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
19910719 | 136.7000 | 137.6600 | 138.5400 | 136.6600 | -0.4700 | -0.3426 | 10823.0 | 5.242826e+03 |
---|
19910718 | 137.1700 | 137.1700 | 137.1700 | 135.8100 | 1.3600 | 1.0014 | 847.0 | 4.644160e+02 |
---|
19910717 | 135.8100 | 135.8100 | 135.8100 | 135.3900 | 1.3400 | 0.9965 | 660.0 | 3.975240e+02 |
---|
19910716 | 134.4700 | 134.3900 | 134.4700 | 133.1400 | 1.3300 | 0.9989 | 2796.0 | 1.328502e+03 |
---|
19910715 | 133.1400 | 133.9000 | 134.1000 | 131.8700 | 0.3400 | 0.2560 | 11938.0 | 5.534900e+03 |
---|
7002 rows × 8 columns
3.2.1 索引操作
- 直接索引(直接使用行列索引——先列后行)
- 按名字索引(按标签索引)
- 按数字索引(按位置索引)
- 组合索引(标签和位置混用)——已经被弃用
- DataFrame.ix[0:4, [‘open’, ‘close’, ‘high’, ‘low’]]
1. 直接索引(直接使用行列索引——先列后行)
data.head()
| close | open | high | low | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | |
---|
20200313 | 2887.4265 | 2804.2322 | 2910.8812 | 2799.9841 | -36.0591 | -1.2334 | 366450436.0 | 393019665.2 |
---|
20200312 | 2923.4856 | 2936.0163 | 2944.4651 | 2906.2838 | -45.0318 | -1.5170 | 307778457.0 | 328209202.4 |
---|
20200311 | 2968.5174 | 3001.7616 | 3010.0286 | 2968.5174 | -28.2444 | -0.9425 | 352470970.0 | 378766619.0 |
---|
20200310 | 2996.7618 | 2918.9347 | 3000.2963 | 2904.7989 | 53.4711 | 1.8167 | 393296648.0 | 425017184.8 |
---|
20200309 | 2943.2907 | 2987.1805 | 2989.2051 | 2940.7138 | -91.2206 | -3.0061 | 414560736.0 | 438143854.6 |
---|
data['open'][20200313]
2804.2322
2. 按名字(标签)索引
data.loc[20200313]['open']
2804.2322
data.loc[20200313, 'open']
2804.2322
3. 按数字索引(按位置索引)
data.iloc[0, 1]
2804.2322
4. 组合索引
- ix已经被废弃
- 可以使用loc和iloc来达到相同的效果
- DataFrame.columns.get_indexer():获得列标签索引对应的位置
- DataFrame.index.get_indexer():获得行标签索引对应的位置
data.loc[data.index[0:4], ['open', 'close', 'high', 'low']]
| open | close | high | low |
---|
trade_date | | | | |
---|
20200313 | 2804.2322 | 2887.4265 | 2910.8812 | 2799.9841 |
---|
20200312 | 2936.0163 | 2923.4856 | 2944.4651 | 2906.2838 |
---|
20200311 | 3001.7616 | 2968.5174 | 3010.0286 | 2968.5174 |
---|
20200310 | 2918.9347 | 2996.7618 | 3000.2963 | 2904.7989 |
---|
data.iloc[0:4, data.columns.get_indexer(['open', 'close', 'high', 'low'])]
| open | close | high | low |
---|
trade_date | | | | |
---|
20200313 | 2804.2322 | 2887.4265 | 2910.8812 | 2799.9841 |
---|
20200312 | 2936.0163 | 2923.4856 | 2944.4651 | 2906.2838 |
---|
20200311 | 3001.7616 | 2968.5174 | 3010.0286 | 2968.5174 |
---|
20200310 | 2918.9347 | 2996.7618 | 3000.2963 | 2904.7989 |
---|
data.index.get_indexer([20200310])
array([3], dtype=int64)
data.columns.get_indexer(['open'])
array([1], dtype=int64)
3.2.2 赋值操作
data.head()
| close | open | high | low | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | |
---|
20200313 | 2887.4265 | 2804.2322 | 2910.8812 | 2799.9841 | -36.0591 | -1.2334 | 366450436.0 | 393019665.2 |
---|
20200312 | 2923.4856 | 2936.0163 | 2944.4651 | 2906.2838 | -45.0318 | -1.5170 | 307778457.0 | 328209202.4 |
---|
20200311 | 2968.5174 | 3001.7616 | 3010.0286 | 2968.5174 | -28.2444 | -0.9425 | 352470970.0 | 378766619.0 |
---|
20200310 | 2996.7618 | 2918.9347 | 3000.2963 | 2904.7989 | 53.4711 | 1.8167 | 393296648.0 | 425017184.8 |
---|
20200309 | 2943.2907 | 2987.1805 | 2989.2051 | 2940.7138 | -91.2206 | -3.0061 | 414560736.0 | 438143854.6 |
---|
data['open']
trade_date
20200313 2804.2322
20200312 2936.0163
20200311 3001.7616
20200310 2918.9347
20200309 2987.1805
...
19910719 137.6600
19910718 137.1700
19910717 135.8100
19910716 134.3900
19910715 133.9000
Name: open, Length: 7002, dtype: float64
data.open
trade_date
20200313 2804.2322
20200312 2936.0163
20200311 3001.7616
20200310 2918.9347
20200309 2987.1805
...
19910719 137.6600
19910718 137.1700
19910717 135.8100
19910716 134.3900
19910715 133.9000
Name: open, Length: 7002, dtype: float64
data.open = 100
data.head()
| close | open | high | low | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | |
---|
20200313 | 2887.4265 | 100 | 2910.8812 | 2799.9841 | -36.0591 | -1.2334 | 366450436.0 | 393019665.2 |
---|
20200312 | 2923.4856 | 100 | 2944.4651 | 2906.2838 | -45.0318 | -1.5170 | 307778457.0 | 328209202.4 |
---|
20200311 | 2968.5174 | 100 | 3010.0286 | 2968.5174 | -28.2444 | -0.9425 | 352470970.0 | 378766619.0 |
---|
20200310 | 2996.7618 | 100 | 3000.2963 | 2904.7989 | 53.4711 | 1.8167 | 393296648.0 | 425017184.8 |
---|
20200309 | 2943.2907 | 100 | 2989.2051 | 2940.7138 | -91.2206 | -3.0061 | 414560736.0 | 438143854.6 |
---|
data.iloc[0, 1] = 222
data.head()
| close | open | high | low | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | |
---|
20200313 | 2887.4265 | 222 | 2910.8812 | 2799.9841 | -36.0591 | -1.2334 | 366450436.0 | 393019665.2 |
---|
20200312 | 2923.4856 | 100 | 2944.4651 | 2906.2838 | -45.0318 | -1.5170 | 307778457.0 | 328209202.4 |
---|
20200311 | 2968.5174 | 100 | 3010.0286 | 2968.5174 | -28.2444 | -0.9425 | 352470970.0 | 378766619.0 |
---|
20200310 | 2996.7618 | 100 | 3000.2963 | 2904.7989 | 53.4711 | 1.8167 | 393296648.0 | 425017184.8 |
---|
20200309 | 2943.2907 | 100 | 2989.2051 | 2940.7138 | -91.2206 | -3.0061 | 414560736.0 | 438143854.6 |
---|
3.2.3 排序
排序有两种形式,一种对内容进行排序,一种对索引进行排序
data.head()
| close | open | high | low | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | |
---|
20200313 | 2887.4265 | 222 | 2910.8812 | 2799.9841 | -36.0591 | -1.2334 | 366450436.0 | 393019665.2 |
---|
20200312 | 2923.4856 | 100 | 2944.4651 | 2906.2838 | -45.0318 | -1.5170 | 307778457.0 | 328209202.4 |
---|
20200311 | 2968.5174 | 100 | 3010.0286 | 2968.5174 | -28.2444 | -0.9425 | 352470970.0 | 378766619.0 |
---|
20200310 | 2996.7618 | 100 | 3000.2963 | 2904.7989 | 53.4711 | 1.8167 | 393296648.0 | 425017184.8 |
---|
20200309 | 2943.2907 | 100 | 2989.2051 | 2940.7138 | -91.2206 | -3.0061 | 414560736.0 | 438143854.6 |
---|
DataFrame
- 使用df.sort_values(by=, ascending=True)对内容进行排序
- 单个键或多个键进行排序,默认升序
- ascending=False:降序
- ascending=True:升序
按照单个字段排序
data.sort_values(by='high').head()
| close | open | high | low | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | |
---|
19910715 | 133.14 | 100 | 134.10 | 131.87 | 0.34 | 0.2560 | 11938.0 | 5534.900 |
---|
19910716 | 134.47 | 100 | 134.47 | 133.14 | 1.33 | 0.9989 | 2796.0 | 1328.502 |
---|
19910717 | 135.81 | 100 | 135.81 | 135.39 | 1.34 | 0.9965 | 660.0 | 397.524 |
---|
19910718 | 137.17 | 100 | 137.17 | 135.81 | 1.36 | 1.0014 | 847.0 | 464.416 |
---|
19910722 | 138.07 | 100 | 138.07 | 136.70 | 1.37 | 1.0022 | 2764.0 | 1423.205 |
---|
data.sort_values(by='high', ascending=False).head()
| close | open | high | low | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | |
---|
20071016 | 6092.057 | 100 | 6124.044 | 6040.713 | 61.971 | 1.0277 | 88152159.0 | 1.635477e+08 |
---|
20071017 | 6036.281 | 100 | 6088.892 | 5982.203 | -55.776 | -0.9156 | 71312588.0 | 1.337010e+08 |
---|
20071018 | 5825.282 | 100 | 6055.473 | 5804.973 | -210.999 | -3.4955 | 69805425.0 | 1.303470e+08 |
---|
20071015 | 6030.086 | 100 | 6039.042 | 5866.131 | 126.822 | 2.1483 | 98707442.0 | 1.934696e+08 |
---|
20071101 | 5914.285 | 100 | 6005.131 | 5912.502 | -40.480 | -0.6798 | 64885103.0 | 1.107685e+08 |
---|
按照多个字段排序
先按照第一个字段排序,对第一个字段相同的,再按照第二个字段进行排序
data.sort_values(by=['high', 'pct_chg'], ascending=False).head()
| close | open | high | low | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | |
---|
20071016 | 6092.057 | 100 | 6124.044 | 6040.713 | 61.971 | 1.0277 | 88152159.0 | 1.635477e+08 |
---|
20071017 | 6036.281 | 100 | 6088.892 | 5982.203 | -55.776 | -0.9156 | 71312588.0 | 1.337010e+08 |
---|
20071018 | 5825.282 | 100 | 6055.473 | 5804.973 | -210.999 | -3.4955 | 69805425.0 | 1.303470e+08 |
---|
20071015 | 6030.086 | 100 | 6039.042 | 5866.131 | 126.822 | 2.1483 | 98707442.0 | 1.934696e+08 |
---|
20071101 | 5914.285 | 100 | 6005.131 | 5912.502 | -40.480 | -0.6798 | 64885103.0 | 1.107685e+08 |
---|
对索引进行排序
data.head()
| close | open | high | low | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | |
---|
20200313 | 2887.4265 | 222 | 2910.8812 | 2799.9841 | -36.0591 | -1.2334 | 366450436.0 | 393019665.2 |
---|
20200312 | 2923.4856 | 100 | 2944.4651 | 2906.2838 | -45.0318 | -1.5170 | 307778457.0 | 328209202.4 |
---|
20200311 | 2968.5174 | 100 | 3010.0286 | 2968.5174 | -28.2444 | -0.9425 | 352470970.0 | 378766619.0 |
---|
20200310 | 2996.7618 | 100 | 3000.2963 | 2904.7989 | 53.4711 | 1.8167 | 393296648.0 | 425017184.8 |
---|
20200309 | 2943.2907 | 100 | 2989.2051 | 2940.7138 | -91.2206 | -3.0061 | 414560736.0 | 438143854.6 |
---|
data.sort_index().head()
| close | open | high | low | change | pct_chg | vol | amount |
---|
trade_date | | | | | | | | |
---|
19910715 | 133.14 | 100 | 134.10 | 131.87 | 0.34 | 0.2560 | 11938.0 | 5534.900 |
---|
19910716 | 134.47 | 100 | 134.47 | 133.14 | 1.33 | 0.9989 | 2796.0 | 1328.502 |
---|
19910717 | 135.81 | 100 | 135.81 | 135.39 | 1.34 | 0.9965 | 660.0 | 397.524 |
---|
19910718 | 137.17 | 100 | 137.17 | 135.81 | 1.36 | 1.0014 | 847.0 | 464.416 |
---|
19910719 | 136.70 | 100 | 138.54 | 136.66 | -0.47 | -0.3426 | 10823.0 | 5242.826 |
---|
Series
- 使用Series.sort_values(ascending=True)对内容进行排序
对Series排序时,只有一列,不需要参数
sr = data['pct_chg']
sr
trade_date
20200313 -1.2334
20200312 -1.5170
20200311 -0.9425
20200310 1.8167
20200309 -3.0061
...
19910719 -0.3426
19910718 1.0014
19910717 0.9965
19910716 0.9989
19910715 0.2560
Name: pct_chg, Length: 7002, dtype: float64
sr.sort_values()
trade_date
19950523 -16.3937
19931220 -13.0764
19940809 -12.6749
19930322 -11.7507
19921027 -11.1769
...
19940805 21.3711
19921125 29.9041
19950518 30.9870
19940801 33.4571
19920521 105.2691
Name: pct_chg, Length: 7002, dtype: float64
sr.sort_values(ascending=False)
trade_date
19920521 105.2691
19940801 33.4571
19950518 30.9870
19921125 29.9041
19940805 21.3711
...
19921027 -11.1769
19930322 -11.7507
19940809 -12.6749
19931220 -13.0764
19950523 -16.3937
Name: pct_chg, Length: 7002, dtype: float64
sr.sort_index()
trade_date
19910715 0.2560
19910716 0.9989
19910717 0.9965
19910718 1.0014
19910719 -0.3426
...
20200309 -3.0061
20200310 1.8167
20200311 -0.9425
20200312 -1.5170
20200313 -1.2334
Name: pct_chg, Length: 7002, dtype: float64