3 pandas

1 Introduction to pandas

Built on NumPy, pandas borrows NumPy's high performance for numerical computation.

It integrates with matplotlib, so plotting is straightforward.

It provides its own specialized data structures (Series and DataFrame).

1.1 Example

import pandas as pd
import numpy as np
stock_change = np.random.normal(0, 1, (10, 5))
stock_change
array([[-1.0098501 ,  0.91625883,  0.10632482,  0.79909581,  1.02338447],
       [ 0.10041493,  0.85587085, -0.22404526, -0.61776538, -0.62323357],
       [ 0.70700457,  1.62502952, -0.97157201,  0.1988851 ,  0.79445365],
       [ 1.75233775, -1.17758937,  1.49103167,  0.21690815,  2.89886016],
       [ 0.05850193,  0.03554848, -0.48183573,  0.59494259,  1.08469151],
       [-0.99142601,  0.96349396,  0.26639733,  0.80381255, -0.36345471],
       [ 0.21497035, -0.53030381,  0.22778879,  1.74725041,  0.20228056],
       [ 0.1410646 ,  0.69803466, -1.24066274,  0.10367728, -1.41174687],
       [-0.3225248 ,  0.4897488 , -0.9790225 , -0.76514549,  2.00229418],
       [-0.02618594, -0.59289681,  0.89586171,  1.10466512, -0.53421044]])
# Use a pandas data structure
stock_day_rise = pd.DataFrame(stock_change)
stock_day_rise
          0         1         2         3         4
0 -1.009850  0.916259  0.106325  0.799096  1.023384
1  0.100415  0.855871 -0.224045 -0.617765 -0.623234
2  0.707005  1.625030 -0.971572  0.198885  0.794454
3  1.752338 -1.177589  1.491032  0.216908  2.898860
4  0.058502  0.035548 -0.481836  0.594943  1.084692
5 -0.991426  0.963494  0.266397  0.803813 -0.363455
6  0.214970 -0.530304  0.227789  1.747250  0.202281
7  0.141065  0.698035 -1.240663  0.103677 -1.411747
8 -0.322525  0.489749 -0.979023 -0.765145  2.002294
9 -0.026186 -0.592897  0.895862  1.104665 -0.534210
# Add a row index
stock_code = ['股票' + str(i+1) for i in range(stock_day_rise.shape[0])]
data = pd.DataFrame(stock_change, index=stock_code)
data
              0         1         2         3         4
股票1  -1.009850  0.916259  0.106325  0.799096  1.023384
股票2   0.100415  0.855871 -0.224045 -0.617765 -0.623234
股票3   0.707005  1.625030 -0.971572  0.198885  0.794454
股票4   1.752338 -1.177589  1.491032  0.216908  2.898860
股票5   0.058502  0.035548 -0.481836  0.594943  1.084692
股票6  -0.991426  0.963494  0.266397  0.803813 -0.363455
股票7   0.214970 -0.530304  0.227789  1.747250  0.202281
股票8   0.141065  0.698035 -1.240663  0.103677 -1.411747
股票9  -0.322525  0.489749 -0.979023 -0.765145  2.002294
股票10 -0.026186 -0.592897  0.895862  1.104665 -0.534210
# Add a column index; freq='B' (business days) skips weekends
date = pd.date_range('2017-01-01', periods=stock_day_rise.shape[1], freq='B')
data = pd.DataFrame(stock_change, index=stock_code, columns=date)
data
       2017-01-02  2017-01-03  2017-01-04  2017-01-05  2017-01-06
股票1    -1.009850    0.916259    0.106325    0.799096    1.023384
股票2     0.100415    0.855871   -0.224045   -0.617765   -0.623234
股票3     0.707005    1.625030   -0.971572    0.198885    0.794454
股票4     1.752338   -1.177589    1.491032    0.216908    2.898860
股票5     0.058502    0.035548   -0.481836    0.594943    1.084692
股票6    -0.991426    0.963494    0.266397    0.803813   -0.363455
股票7     0.214970   -0.530304    0.227789    1.747250    0.202281
股票8     0.141065    0.698035   -1.240663    0.103677   -1.411747
股票9    -0.322525    0.489749   -0.979023   -0.765145    2.002294
股票10   -0.026186   -0.592897    0.895862    1.104665   -0.534210

1.2 DataFrame attributes

data.shape
(10, 5)
data.index
Index(['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10'], dtype='object')
data.columns
DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04', '2017-01-05',
               '2017-01-06'],
              dtype='datetime64[ns]', freq='B')
data.values
array([[-1.0098501 ,  0.91625883,  0.10632482,  0.79909581,  1.02338447],
       [ 0.10041493,  0.85587085, -0.22404526, -0.61776538, -0.62323357],
       [ 0.70700457,  1.62502952, -0.97157201,  0.1988851 ,  0.79445365],
       [ 1.75233775, -1.17758937,  1.49103167,  0.21690815,  2.89886016],
       [ 0.05850193,  0.03554848, -0.48183573,  0.59494259,  1.08469151],
       [-0.99142601,  0.96349396,  0.26639733,  0.80381255, -0.36345471],
       [ 0.21497035, -0.53030381,  0.22778879,  1.74725041,  0.20228056],
       [ 0.1410646 ,  0.69803466, -1.24066274,  0.10367728, -1.41174687],
       [-0.3225248 ,  0.4897488 , -0.9790225 , -0.76514549,  2.00229418],
       [-0.02618594, -0.59289681,  0.89586171,  1.10466512, -0.53421044]])
data.T
                 股票1       股票2       股票3       股票4       股票5       股票6       股票7       股票8       股票9      股票10
2017-01-02 -1.009850  0.100415  0.707005  1.752338  0.058502 -0.991426  0.214970  0.141065 -0.322525 -0.026186
2017-01-03  0.916259  0.855871  1.625030 -1.177589  0.035548  0.963494 -0.530304  0.698035  0.489749 -0.592897
2017-01-04  0.106325 -0.224045 -0.971572  1.491032 -0.481836  0.266397  0.227789 -1.240663 -0.979023  0.895862
2017-01-05  0.799096 -0.617765  0.198885  0.216908  0.594943  0.803813  1.747250  0.103677 -0.765145  1.104665
2017-01-06  1.023384 -0.623234  0.794454  2.898860  1.084692 -0.363455  0.202281 -1.411747  2.002294 -0.534210
# The first 5 rows by default
data.head()
      2017-01-02  2017-01-03  2017-01-04  2017-01-05  2017-01-06
股票1   -1.009850    0.916259    0.106325    0.799096    1.023384
股票2    0.100415    0.855871   -0.224045   -0.617765   -0.623234
股票3    0.707005    1.625030   -0.971572    0.198885    0.794454
股票4    1.752338   -1.177589    1.491032    0.216908    2.898860
股票5    0.058502    0.035548   -0.481836    0.594943    1.084692
data.tail()
      2017-01-02  2017-01-03  2017-01-04  2017-01-05  2017-01-06
股票6   -0.991426    0.963494    0.266397    0.803813   -0.363455
股票7    0.214970   -0.530304    0.227789    1.747250    0.202281
股票8    0.141065    0.698035   -1.240663    0.103677   -1.411747
股票9   -0.322525    0.489749   -0.979023   -0.765145    2.002294
股票10  -0.026186   -0.592897    0.895862    1.104665   -0.534210

1.3 Setting the DataFrame index

stock_code = ['股票_' + str(i + 1) for i in range(stock_day_rise.shape[0])]
# The index must be replaced as a whole; individual labels cannot be modified in place (see the rename sketch below)
data.index = stock_code
data
        2017-01-02  2017-01-03  2017-01-04  2017-01-05  2017-01-06
股票_1    -1.009850    0.916259    0.106325    0.799096    1.023384
股票_2     0.100415    0.855871   -0.224045   -0.617765   -0.623234
股票_3     0.707005    1.625030   -0.971572    0.198885    0.794454
股票_4     1.752338   -1.177589    1.491032    0.216908    2.898860
股票_5     0.058502    0.035548   -0.481836    0.594943    1.084692
股票_6    -0.991426    0.963494    0.266397    0.803813   -0.363455
股票_7     0.214970   -0.530304    0.227789    1.747250    0.202281
股票_8     0.141065    0.698035   -1.240663    0.103677   -1.411747
股票_9    -0.322525    0.489749   -0.979023   -0.765145    2.002294
股票_10   -0.026186   -0.592897    0.895862    1.104665   -0.534210
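If only a few labels need to change, rename can be used instead of replacing the whole index; a minimal sketch (the new label name is purely illustrative):

# Rename a single index label; returns a new DataFrame unless inplace=True
data.rename(index={'股票_1': '股票_A'})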
# Reset the index
data.reset_index()
    index  2017-01-02 00:00:00  2017-01-03 00:00:00  2017-01-04 00:00:00  2017-01-05 00:00:00  2017-01-06 00:00:00
0    股票_1            -1.009850             0.916259             0.106325             0.799096             1.023384
1    股票_2             0.100415             0.855871            -0.224045            -0.617765            -0.623234
2    股票_3             0.707005             1.625030            -0.971572             0.198885             0.794454
3    股票_4             1.752338            -1.177589             1.491032             0.216908             2.898860
4    股票_5             0.058502             0.035548            -0.481836             0.594943             1.084692
5    股票_6            -0.991426             0.963494             0.266397             0.803813            -0.363455
6    股票_7             0.214970            -0.530304             0.227789             1.747250             0.202281
7    股票_8             0.141065             0.698035            -1.240663             0.103677            -1.411747
8    股票_9            -0.322525             0.489749            -0.979023            -0.765145             2.002294
9   股票_10            -0.026186            -0.592897             0.895862             1.104665            -0.534210
# drop=True discards the old index instead of keeping it as a column
data.reset_index(drop=True)
   2017-01-02  2017-01-03  2017-01-04  2017-01-05  2017-01-06
0   -1.009850    0.916259    0.106325    0.799096    1.023384
1    0.100415    0.855871   -0.224045   -0.617765   -0.623234
2    0.707005    1.625030   -0.971572    0.198885    0.794454
3    1.752338   -1.177589    1.491032    0.216908    2.898860
4    0.058502    0.035548   -0.481836    0.594943    1.084692
5   -0.991426    0.963494    0.266397    0.803813   -0.363455
6    0.214970   -0.530304    0.227789    1.747250    0.202281
7    0.141065    0.698035   -1.240663    0.103677   -1.411747
8   -0.322525    0.489749   -0.979023   -0.765145    2.002294
9   -0.026186   -0.592897    0.895862    1.104665   -0.534210
# Use a column's values as the new index
df = pd.DataFrame({'month': [1, 4, 7, 10],
                    'year': [2012, 2012, 2013, 2013],
                    'sale':[55, 40, 84, 31]})
df
   month  year  sale
0      1  2012    55
1      4  2012    40
2      7  2013    84
3     10  2013    31
df.set_index('month')
       year  sale
month
1      2012    55
4      2012    40
7      2013    84
10     2013    31
# Set a multi-level index (MultiIndex)
new_df = df.set_index(['year', 'month'])
new_df
            sale
year month
2012 1        55
     4        40
2013 7        84
     10       31

1.4 MultiIndex and Panel

# MultiIndex
new_df.index.names
FrozenList(['year', 'month'])
new_df.index.levels
FrozenList([[2012, 2013], [1, 4, 7, 10]])
new_df['sale']
year  month
2012  1        55
      4        40
2013  7        84
      10       31
Name: sale, dtype: int64
new_df['sale'][2012][1]
55
# Panel is deprecated and has been removed in newer pandas versions
# p = pd.Panel(np.arange(24).reshape(4,3,2),
#             items=list('ABCD'),
#             major_axis=pd.date_range('20130101', periods=3),
#             minor_axis=['first', 'second'])
# p
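Since Panel is gone (removed in pandas 1.0), three-dimensional data is usually represented either as a DataFrame with a MultiIndex or with the separate xarray library. A minimal MultiIndex sketch of the same 4 x 3 x 2 layout (the level names are illustrative):

# Equivalent layout using a MultiIndex DataFrame instead of Panel
idx = pd.MultiIndex.from_product([list('ABCD'), pd.date_range('20130101', periods=3)],
                                 names=['item', 'date'])
pd.DataFrame(np.arange(24).reshape(12, 2), index=idx, columns=['first', 'second'])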

1.5 The Series structure

type(data['2017-01-02'])
pandas.core.series.Series
data['2017-01-02']['股票_1']
-1.0098501036527945
# Create a Series
pd.Series(np.arange(10))
0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64
pd.Series([6.7,5.6,3,10,2], index=[1,2,3,4,5])
1     6.7
2     5.6
3     3.0
4    10.0
5     2.0
dtype: float64
s = pd.Series({'red':100, 'blue': 200, 'green': 500, 'yellow': 1000})
s.index
Index(['red', 'blue', 'green', 'yellow'], dtype='object')
s.values
array([ 100,  200,  500, 1000])

2 Basic data operations

data = pd.read_csv('./data/stock_day.csv')
data
             open   high  close    low     volume  price_change  p_change     ma5    ma10    ma20      v_ma5     v_ma10     v_ma20  turnover
2018-02-27  23.53  25.88  24.16  23.53   95578.03          0.63      2.68  22.942  22.142  22.875   53782.64   46738.65   55576.11      2.39
2018-02-26  22.80  23.78  23.53  22.80   60985.11          0.69      3.02  22.406  21.955  22.942   40827.52   42736.34   56007.50      1.53
2018-02-23  22.88  23.37  22.82  22.71   52914.01          0.54      2.42  21.938  21.929  23.022   35119.58   41871.97   56372.85      1.32
2018-02-22  22.25  22.76  22.28  22.02   36105.01          0.36      1.64  21.446  21.909  23.137   35397.58   39904.78   60149.60      0.90
2018-02-14  21.49  21.99  21.92  21.48   23331.04          0.44      2.05  21.366  21.923  23.253   33590.21   42935.74   61716.11      0.58
...           ...    ...    ...    ...        ...           ...       ...     ...     ...     ...        ...        ...        ...       ...
2015-03-06  13.17  14.48  14.28  13.13  179831.72          1.12      8.51  13.112  13.112  13.112  115090.18  115090.18  115090.18      6.16
2015-03-05  12.88  13.45  13.16  12.87   93180.39          0.26      2.02  12.820  12.820  12.820   98904.79   98904.79   98904.79      3.19
2015-03-04  12.80  12.92  12.90  12.61   67075.44          0.20      1.57  12.707  12.707  12.707  100812.93  100812.93  100812.93      2.30
2015-03-03  12.52  13.06  12.70  12.52  139071.61          0.18      1.44  12.610  12.610  12.610  117681.67  117681.67  117681.67      4.76
2015-03-02  12.25  12.67  12.52  12.20   96291.73          0.32      2.62  12.520  12.520  12.520   96291.73   96291.73   96291.73      3.30

643 rows × 14 columns

data = data.drop(['ma5', 'ma10', 'ma20', 'v_ma5', 'v_ma10', 'v_ma20'], axis=1)
data
             open   high  close    low     volume  price_change  p_change  turnover
2018-02-27  23.53  25.88  24.16  23.53   95578.03          0.63      2.68      2.39
2018-02-26  22.80  23.78  23.53  22.80   60985.11          0.69      3.02      1.53
2018-02-23  22.88  23.37  22.82  22.71   52914.01          0.54      2.42      1.32
2018-02-22  22.25  22.76  22.28  22.02   36105.01          0.36      1.64      0.90
2018-02-14  21.49  21.99  21.92  21.48   23331.04          0.44      2.05      0.58
...           ...    ...    ...    ...        ...           ...       ...       ...
2015-03-06  13.17  14.48  14.28  13.13  179831.72          1.12      8.51      6.16
2015-03-05  12.88  13.45  13.16  12.87   93180.39          0.26      2.02      3.19
2015-03-04  12.80  12.92  12.90  12.61   67075.44          0.20      1.57      2.30
2015-03-03  12.52  13.06  12.70  12.52  139071.61          0.18      1.44      4.76
2015-03-02  12.25  12.67  12.52  12.20   96291.73          0.32      2.62      3.30

643 rows × 8 columns

2.1 Indexing

# Direct bracket indexing (column first, then row)
data['open']['2018-02-27']
23.53
# Indexing with loc (by label) or iloc (by position)
data.loc['2018-02-27': '2018-02-22', 'open']
2018-02-27    23.53
2018-02-26    22.80
2018-02-23    22.88
2018-02-22    22.25
Name: open, dtype: float64
data.iloc[0:100, 0:1].head()
             open
2018-02-27  23.53
2018-02-26  22.80
2018-02-23  22.88
2018-02-22  22.25
2018-02-14  21.49
# Mixed label/positional indexing with ix (deprecated)
data.ix[0:4, ['open', 'close', 'high', 'low']]
/Users/admin/.virtualenvs/ai/lib/python3.7/site-packages/ipykernel_launcher.py:2: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  
/Users/admin/.virtualenvs/ai/lib/python3.7/site-packages/pandas/core/indexing.py:822: FutureWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
             open  close   high    low
2018-02-27  23.53  24.16  25.88  23.53
2018-02-26  22.80  23.53  23.78  22.80
2018-02-23  22.88  22.82  23.37  22.71
2018-02-22  22.25  22.28  22.76  22.02
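Since .ix has been removed from newer pandas releases, the same selection can be written with iloc/loc; a minimal equivalent sketch:

# Positional rows combined with label-based columns, without .ix
data.iloc[0:4][['open', 'close', 'high', 'low']]
# or resolve the positions to labels explicitly
data.loc[data.index[0:4], ['open', 'close', 'high', 'low']]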

2.2 Assignment

data['close'] = 1
data
             open   high  close    low     volume  price_change  p_change  turnover
2018-02-27  23.53  25.88      1  23.53   95578.03          0.63      2.68      2.39
2018-02-26  22.80  23.78      1  22.80   60985.11          0.69      3.02      1.53
2018-02-23  22.88  23.37      1  22.71   52914.01          0.54      2.42      1.32
2018-02-22  22.25  22.76      1  22.02   36105.01          0.36      1.64      0.90
2018-02-14  21.49  21.99      1  21.48   23331.04          0.44      2.05      0.58
...           ...    ...    ...    ...        ...           ...       ...       ...
2015-03-06  13.17  14.48      1  13.13  179831.72          1.12      8.51      6.16
2015-03-05  12.88  13.45      1  12.87   93180.39          0.26      2.02      3.19
2015-03-04  12.80  12.92      1  12.61   67075.44          0.20      1.57      2.30
2015-03-03  12.52  13.06      1  12.52  139071.61          0.18      1.44      4.76
2015-03-02  12.25  12.67      1  12.20   96291.73          0.32      2.62      3.30

643 rows × 8 columns

data.close = 2
data
             open   high  close    low     volume  price_change  p_change  turnover
2018-02-27  23.53  25.88      2  23.53   95578.03          0.63      2.68      2.39
2018-02-26  22.80  23.78      2  22.80   60985.11          0.69      3.02      1.53
2018-02-23  22.88  23.37      2  22.71   52914.01          0.54      2.42      1.32
2018-02-22  22.25  22.76      2  22.02   36105.01          0.36      1.64      0.90
2018-02-14  21.49  21.99      2  21.48   23331.04          0.44      2.05      0.58
...           ...    ...    ...    ...        ...           ...       ...       ...
2015-03-06  13.17  14.48      2  13.13  179831.72          1.12      8.51      6.16
2015-03-05  12.88  13.45      2  12.87   93180.39          0.26      2.02      3.19
2015-03-04  12.80  12.92      2  12.61   67075.44          0.20      1.57      2.30
2015-03-03  12.52  13.06      2  12.52  139071.61          0.18      1.44      4.76
2015-03-02  12.25  12.67      2  12.20   96291.73          0.32      2.62      3.30

643 rows × 8 columns
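Note that attribute assignment (data.close = 2) only works because the column already exists; creating a new column requires bracket syntax. A one-line sketch (the column name is illustrative):

# Bracket syntax creates a new column; attribute syntax on a new name would only set a Python attribute
data['flag'] = 0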

2.3 Sorting

data.sort_values(by='p_change', ascending=False).head()
             open   high  close    low     volume  price_change  p_change  turnover
2015-08-28  15.40  16.46      2  15.00  117827.60          1.50     10.03      4.03
2015-05-21  27.50  28.22      2  26.50  121190.11          2.57     10.02      4.15
2016-12-22  18.50  20.42      2  18.45  150470.83          1.86     10.02      3.77
2015-08-04  16.20  17.35      2  15.80   94292.63          1.58     10.02      3.23
2016-07-07  18.66  18.66      2  18.41   48756.55          1.70     10.02      1.67
data.sort_values(by=['open', 'high']).head()
             open   high  close    low     volume  price_change  p_change  turnover
2015-03-02  12.25  12.67      2  12.20   96291.73          0.32      2.62      3.30
2015-09-02  12.30  14.11      2  12.30   70201.74         -1.10     -8.17      2.40
2015-03-03  12.52  13.06      2  12.52  139071.61          0.18      1.44      4.76
2015-03-04  12.80  12.92      2  12.61   67075.44          0.20      1.57      2.30
2015-03-05  12.88  13.45      2  12.87   93180.39          0.26      2.02      3.19
data.sort_index()
             open   high  close    low     volume  price_change  p_change  turnover
2015-03-02  12.25  12.67      2  12.20   96291.73          0.32      2.62      3.30
2015-03-03  12.52  13.06      2  12.52  139071.61          0.18      1.44      4.76
2015-03-04  12.80  12.92      2  12.61   67075.44          0.20      1.57      2.30
2015-03-05  12.88  13.45      2  12.87   93180.39          0.26      2.02      3.19
2015-03-06  13.17  14.48      2  13.13  179831.72          1.12      8.51      6.16
...           ...    ...    ...    ...        ...           ...       ...       ...
2018-02-14  21.49  21.99      2  21.48   23331.04          0.44      2.05      0.58
2018-02-22  22.25  22.76      2  22.02   36105.01          0.36      1.64      0.90
2018-02-23  22.88  23.37      2  22.71   52914.01          0.54      2.42      1.32
2018-02-26  22.80  23.78      2  22.80   60985.11          0.69      3.02      1.53
2018-02-27  23.53  25.88      2  23.53   95578.03          0.63      2.68      2.39

643 rows × 8 columns

data['p_change'].sort_values().head()
2015-09-01   -10.03
2015-09-14   -10.02
2016-01-11   -10.02
2015-07-15   -10.02
2015-08-26   -10.01
Name: p_change, dtype: float64
data['p_change'].sort_index().head()
2015-03-02    2.62
2015-03-03    1.44
2015-03-04    1.57
2015-03-05    2.02
2015-03-06    8.51
Name: p_change, dtype: float64
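sort_values also accepts a list for ascending, one flag per sort key; a short sketch:

# Sort by open ascending, breaking ties by high descending
data.sort_values(by=['open', 'high'], ascending=[True, False]).head()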

3 DataFrame operations

data['open'].add(1)
2018-02-27    24.53
2018-02-26    23.80
2018-02-23    23.88
2018-02-22    23.25
2018-02-14    22.49
              ...  
2015-03-06    14.17
2015-03-05    13.88
2015-03-04    13.80
2015-03-03    13.52
2015-03-02    13.25
Name: open, Length: 643, dtype: float64
close = data['close']
open1 = data['open']
data['m_price_change'] = close.sub(open1)
data
             open   high  close    low     volume  price_change  p_change  turnover  m_price_change
2018-02-27  23.53  25.88      2  23.53   95578.03          0.63      2.68      2.39          -21.53
2018-02-26  22.80  23.78      2  22.80   60985.11          0.69      3.02      1.53          -20.80
2018-02-23  22.88  23.37      2  22.71   52914.01          0.54      2.42      1.32          -20.88
2018-02-22  22.25  22.76      2  22.02   36105.01          0.36      1.64      0.90          -20.25
2018-02-14  21.49  21.99      2  21.48   23331.04          0.44      2.05      0.58          -19.49
...           ...    ...    ...    ...        ...           ...       ...       ...             ...
2015-03-06  13.17  14.48      2  13.13  179831.72          1.12      8.51      6.16          -11.17
2015-03-05  12.88  13.45      2  12.87   93180.39          0.26      2.02      3.19          -10.88
2015-03-04  12.80  12.92      2  12.61   67075.44          0.20      1.57      2.30          -10.80
2015-03-03  12.52  13.06      2  12.52  139071.61          0.18      1.44      4.76          -10.52
2015-03-02  12.25  12.67      2  12.20   96291.73          0.32      2.62      3.30          -10.25

643 rows × 9 columns
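The arithmetic methods also have ordinary operator equivalents, so the same difference can be written with - directly; a one-line sketch:

# Equivalent to close.sub(open1)
(data['close'] - data['open']).head()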

3.1 Logical operators

data['p_change'] > 2
2018-02-27     True
2018-02-26     True
2018-02-23     True
2018-02-22    False
2018-02-14     True
              ...  
2015-03-06     True
2015-03-05     True
2015-03-04    False
2015-03-03    False
2015-03-02     True
Name: p_change, Length: 643, dtype: bool
# Boolean filtering
data[data['p_change'] > 2]
             open   high  close    low     volume  price_change  p_change  turnover  m_price_change
2018-02-27  23.53  25.88      2  23.53   95578.03          0.63      2.68      2.39          -21.53
2018-02-26  22.80  23.78      2  22.80   60985.11          0.69      3.02      1.53          -20.80
2018-02-23  22.88  23.37      2  22.71   52914.01          0.54      2.42      1.32          -20.88
2018-02-14  21.49  21.99      2  21.48   23331.04          0.44      2.05      0.58          -19.49
2018-02-12  20.70  21.40      2  20.63   32445.39          0.82      4.03      0.81          -18.70
...           ...    ...    ...    ...        ...           ...       ...       ...             ...
2015-03-13  14.13  14.50      2  14.08   61342.22          0.36      2.55      2.10          -12.13
2015-03-10  14.20  14.80      2  14.01  101213.51          0.34      2.38      3.46          -12.20
2015-03-06  13.17  14.48      2  13.13  179831.72          1.12      8.51      6.16          -11.17
2015-03-05  12.88  13.45      2  12.87   93180.39          0.26      2.02      3.19          -10.88
2015-03-02  12.25  12.67      2  12.20   96291.73          0.32      2.62      3.30          -10.25

183 rows × 9 columns

# Multiple conditions
data[(data['p_change'] > 2) & (data['open'] > 15)]
             open   high  close    low     volume  price_change  p_change  turnover  m_price_change
2018-02-27  23.53  25.88      2  23.53   95578.03          0.63      2.68      2.39          -21.53
2018-02-26  22.80  23.78      2  22.80   60985.11          0.69      3.02      1.53          -20.80
2018-02-23  22.88  23.37      2  22.71   52914.01          0.54      2.42      1.32          -20.88
2018-02-14  21.49  21.99      2  21.48   23331.04          0.44      2.05      0.58          -19.49
2018-02-12  20.70  21.40      2  20.63   32445.39          0.82      4.03      0.81          -18.70
...           ...    ...    ...    ...        ...           ...       ...       ...             ...
2015-04-13  19.60  21.30      2  19.50  171822.69          1.70      8.75      5.88          -17.60
2015-04-09  18.28  19.89      2  18.02  183119.05          1.20      6.51      6.27          -16.28
2015-04-08  17.60  18.53      2  17.60  157725.97          0.88      5.02      5.40          -15.60
2015-04-07  16.54  17.98      2  16.50  122471.85          0.88      5.28      4.19          -14.54
2015-03-30  15.99  16.63      2  15.99   85090.45          0.65      4.12      2.91          -13.99

161 rows × 9 columns
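An OR condition uses | instead of &; each sub-condition still needs its own parentheses. A short sketch:

# Rows where either condition holds
data[(data['p_change'] > 2) | (data['turnover'] > 5)].head()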

3.2 Logical functions

data.query('p_change > 2 & turnover > 5').head()
             open   high  close    low     volume  price_change  p_change  turnover  m_price_change
2017-11-14  28.00  29.89      2  27.68  243773.23          1.10      3.90      6.10          -26.00
2017-10-31  32.62  35.22      2  32.20  361660.88          2.38      7.42      9.05          -30.62
2017-10-27  31.45  33.20      2  31.45  333824.31          0.70      2.16      8.35          -29.45
2017-10-26  29.30  32.70      2  28.92  501915.41          2.68      9.01     12.56          -27.30
2017-10-25  27.86  30.45      2  27.54  328947.31          1.68      5.99      8.23          -25.86
data[data['turnover'].isin([4.91, 2.39])]
             open   high  close    low    volume  price_change  p_change  turnover  m_price_change
2018-02-27  23.53  25.88      2  23.53  95578.03          0.63      2.68      2.39          -21.53
2016-09-28  19.88  20.98      2  19.71  95580.75          0.98      4.93      2.39          -17.88

3.3 Statistical operations

# Summary statistics
data.describe()
             open        high  close         low         volume  price_change    p_change    turnover  m_price_change
count  643.000000  643.000000  643.0  643.000000     643.000000    643.000000  643.000000  643.000000      643.000000
mean    21.272706   21.900513    2.0   20.771836   99905.519114      0.018802    0.190280    2.936190      -19.272706
std      3.930973    4.077578    0.0    3.791968   73879.119354      0.898476    4.079698    2.079375        3.930973
min     12.250000   12.670000    2.0   12.200000    1158.120000     -3.520000  -10.030000    0.040000      -32.990000
25%     19.000000   19.500000    2.0   18.525000   48533.210000     -0.390000   -1.850000    1.360000      -21.400000
50%     21.440000   21.970000    2.0   20.980000   83175.930000      0.050000    0.260000    2.500000      -19.440000
75%     23.400000   24.065000    2.0   22.850000  127580.055000      0.455000    2.305000    3.915000      -17.000000
max     34.990000   36.350000    2.0   34.010000  501915.410000      3.030000   10.030000   12.560000      -10.250000
# Common statistical functions
# Sum
print(data['open'].sum())
# Mean
print(data['open'].mean())
# Median
print(data['open'].median())
# Minimum
print(data['open'].min())
# Maximum
print(data['open'].max())
# Mode
print(data['open'].mode())
# Absolute value
print(data['open'].abs().head(2))
# Product
print(data['open'].head().prod())
# Standard deviation
print(data['open'].std())
# Variance
print(data['open'].var())
# Index of the maximum element
print(data['open'].idxmax())
# Index of the minimum element
print(data['open'].idxmin())
13678.35
21.27270606531882
21.44
12.25
34.99
0    20.85
1    21.30
2    24.10
dtype: float64
2018-02-27    23.53
2018-02-26    22.80
Name: open, dtype: float64
5869204.273732801
3.930972922511381
15.45254811751767
2015-06-15
2015-03-02

3.4 Cumulative statistics

# Sort by index so the dates are ascending
data = data.sort_index()
data
             open   high  close    low     volume  price_change  p_change  turnover  m_price_change
2015-03-02  12.25  12.67      2  12.20   96291.73          0.32      2.62      3.30          -10.25
2015-03-03  12.52  13.06      2  12.52  139071.61          0.18      1.44      4.76          -10.52
2015-03-04  12.80  12.92      2  12.61   67075.44          0.20      1.57      2.30          -10.80
2015-03-05  12.88  13.45      2  12.87   93180.39          0.26      2.02      3.19          -10.88
2015-03-06  13.17  14.48      2  13.13  179831.72          1.12      8.51      6.16          -11.17
...           ...    ...    ...    ...        ...           ...       ...       ...             ...
2018-02-14  21.49  21.99      2  21.48   23331.04          0.44      2.05      0.58          -19.49
2018-02-22  22.25  22.76      2  22.02   36105.01          0.36      1.64      0.90          -20.25
2018-02-23  22.88  23.37      2  22.71   52914.01          0.54      2.42      1.32          -20.88
2018-02-26  22.80  23.78      2  22.80   60985.11          0.69      3.02      1.53          -20.80
2018-02-27  23.53  25.88      2  23.53   95578.03          0.63      2.68      2.39          -21.53

643 rows × 9 columns

# Cumulative sum
rise = data['p_change']
rise.cumsum()
2015-03-02      2.62
2015-03-03      4.06
2015-03-04      5.63
2015-03-05      7.65
2015-03-06     16.16
               ...  
2018-02-14    112.59
2018-02-22    114.23
2018-02-23    116.65
2018-02-26    119.67
2018-02-27    122.35
Name: p_change, Length: 643, dtype: float64
# A plot makes the trend easier to see
import matplotlib.pyplot as plt

rise.cumsum().plot()
plt.show()

[Figure: line plot of the cumulative p_change]

# Cumulative maximum
rise.cummax()
2015-03-02     2.62
2015-03-03     2.62
2015-03-04     2.62
2015-03-05     2.62
2015-03-06     8.51
              ...  
2018-02-14    10.03
2018-02-22    10.03
2018-02-23    10.03
2018-02-26    10.03
2018-02-27    10.03
Name: p_change, Length: 643, dtype: float64
# Cumulative minimum
rise.cummin()
2015-03-02     2.62
2015-03-03     1.44
2015-03-04     1.44
2015-03-05     1.44
2015-03-06     1.44
              ...  
2018-02-14   -10.03
2018-02-22   -10.03
2018-02-23   -10.03
2018-02-26   -10.03
2018-02-27   -10.03
Name: p_change, Length: 643, dtype: float64
# Cumulative product
rise.head().cumprod()
2015-03-02      2.620000
2015-03-03      3.772800
2015-03-04      5.923296
2015-03-05     11.965058
2015-03-06    101.822643
Name: p_change, dtype: float64

3.5 Custom operations

data[['open', 'turnover']].apply(lambda x: x.max() - x.min(), axis=0)
open        22.74
turnover    12.52
dtype: float64
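With axis=1 the same mechanism applies the function to each row instead of each column; a minimal sketch:

# Row-wise apply: daily price range for every trading day
data[['high', 'low']].apply(lambda row: row['high'] - row['low'], axis=1).head()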

4 Plotting with pandas

data.plot(figsize=(20, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x115454d10>

[Figure: line plot of all columns against the date index]

data.plot(y='open', figsize=(20, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x1185caa90>

[Figure: line plot of the open column]

data.plot(x='turnover', y='open', kind='scatter', figsize=(20, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x118b23850>

[Figure: scatter plot of open against turnover]

data.loc[:'2015-03-2', 'open'].plot(kind='bar', figsize=(20, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x119060d10>

[Figure: bar chart of open prices]

data.loc[:'2015-03-2', 'open'].plot(kind='barh', figsize=(20, 8))
<matplotlib.axes._subplots.AxesSubplot at 0x1190e9350>

[Figure: horizontal bar chart of open prices]
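Other plot kinds work the same way through the kind argument (line, bar, barh, hist, kde, scatter); for example, a histogram of the daily percent change:

data['p_change'].plot(kind='hist', bins=30, figsize=(20, 8))
plt.show()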

5 Reading and writing files

5.1 CSV

data = pd.read_csv('./data/stock_day.csv', usecols=['open', 'close'])
data
             open  close
2018-02-27  23.53  24.16
2018-02-26  22.80  23.53
2018-02-23  22.88  22.82
2018-02-22  22.25  22.28
2018-02-14  21.49  21.92
...           ...    ...
2015-03-06  13.17  14.28
2015-03-05  12.88  13.16
2015-03-04  12.80  12.90
2015-03-03  12.52  12.70
2015-03-02  12.25  12.52

643 rows × 2 columns

data[:10].to_csv('./data/test.csv', columns=['open'])
pd.read_csv('./data/test.csv')
   Unnamed: 0   open
0  2018-02-27  23.53
1  2018-02-26  22.80
2  2018-02-23  22.88
3  2018-02-22  22.25
4  2018-02-14  21.49
5  2018-02-13  21.40
6  2018-02-12  20.70
7  2018-02-09  21.20
8  2018-02-08  21.79
9  2018-02-07  22.69
# index=False keeps the index from being written out as a data column
data[:10].to_csv('./data/test.csv', columns=['open'], index=False)
pd.read_csv('./data/test.csv')
    open
0  23.53
1  22.80
2  22.88
3  22.25
4  21.49
5  21.40
6  20.70
7  21.20
8  21.79
9  22.69
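Alternatively, keep the index when writing and restore it with index_col when reading back, so the dates return as the index rather than an 'Unnamed: 0' column; a sketch assuming the same file path:

data[:10].to_csv('./data/test.csv', columns=['open'])
pd.read_csv('./data/test.csv', index_col=0)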

5.2 HDF5

day = pd.read_hdf('./data/day_maxupordown.h5')
day
      000001.SZ  000002.SZ  000004.SZ  000005.SZ  000006.SZ  000007.SZ  000008.SZ  000009.SZ  000010.SZ  000011.SZ  ...  001965.SZ  603283.SH  002920.SZ  002921.SZ  300684.SZ  002922.SZ  300735.SZ  603329.SH  603655.SH  603080.SH
0      1   1   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0   0
1      0   0   0   1   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0   0
2      0  -1   0   0  -1   0   0  -1   0  -1  ...   0   0   0   0   0   0   0   0   0   0
3      0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0   0
4      0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0   0
...   ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..  ..  ..  ..  ..
2673   0   0   0   0   0   0   0   0   1   0  ...   0   1   1   1   1   1   1   1   1   1
2674   0   0   0   0   0   0   0   0   0   0  ...   0   1   0   1   1   1   1   1   1   1
2675   0   0   0   0   0   0   0   0   0   0  ...   0   1   0   1   1   1   1   1   1   1
2676   0   0   0   0   0   0   0   0   0   0  ...   0   1   0   0   1   0   0   1   1   1
2677   0   0   0   0   0   0   0   0   0   0  ...   0   1   0   0   1   0   0   0   1   1

2678 rows × 3562 columns

day.to_hdf('./data/test.h5', key='one')
pd.read_hdf('./data/test.h5', key='one')
      000001.SZ  000002.SZ  000004.SZ  000005.SZ  000006.SZ  000007.SZ  000008.SZ  000009.SZ  000010.SZ  000011.SZ  ...  001965.SZ  603283.SH  002920.SZ  002921.SZ  300684.SZ  002922.SZ  300735.SZ  603329.SH  603655.SH  603080.SH
0      1   1   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0   0
1      0   0   0   1   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0   0
2      0  -1   0   0  -1   0   0  -1   0  -1  ...   0   0   0   0   0   0   0   0   0   0
3      0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0   0
4      0   0   0   0   0   0   0   0   0   0  ...   0   0   0   0   0   0   0   0   0   0
...   ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..  ..  ..  ..  ..
2673   0   0   0   0   0   0   0   0   1   0  ...   0   1   1   1   1   1   1   1   1   1
2674   0   0   0   0   0   0   0   0   0   0  ...   0   1   0   1   1   1   1   1   1   1
2675   0   0   0   0   0   0   0   0   0   0  ...   0   1   0   1   1   1   1   1   1   1
2676   0   0   0   0   0   0   0   0   0   0  ...   0   1   0   0   1   0   0   1   1   1
2677   0   0   0   0   0   0   0   0   0   0  ...   0   1   0   0   1   0   0   0   1   1

2678 rows × 3562 columns
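to_hdf and read_hdf rely on the PyTables package (tables). A single .h5 file can hold several objects stored under different keys; a short sketch (the second key name is illustrative):

day.to_hdf('./data/test.h5', key='two')
pd.read_hdf('./data/test.h5', key='two')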

5.3 JSON

json_read = pd.read_json('./data/columns.json', orient='records')
json_read
                                        article_link                                            headline  is_sarcastic
0  https://www.huffingtonpost.com/entry/versace-b...  former versace store clerk sues over secret 'b...             0
1  https://www.huffingtonpost.com/entry/roseanne-...  the 'roseanne' revival catches up to our thorn...             0
2  https://local.theonion.com/mom-starting-to-fea...  mom starting to fear son's web series closest ...             1
3  https://politics.theonion.com/boehner-just-wan...  boehner just wants wife to listen, not come up...             1
4  https://www.huffingtonpost.com/entry/jk-rowlin...  j.k. rowling wishes snape happy birthday in th...             0
5  https://www.huffingtonpost.com/entry/advancing...                        advancing the world's women             0
6  https://www.huffingtonpost.com/entry/how-meat-...     the fascinating case for eating lab-grown meat             0
7  https://www.huffingtonpost.com/entry/boxed-col...  this ceo will send your kids to school, if you...             0
8  https://politics.theonion.com/top-snake-handle...  top snake handler leaves sinking huckabee camp...             1
9  https://www.huffingtonpost.com/entry/fridays-m...  friday's morning email: inside trump's presser...             0
json_read.to_json('./data/test.json', orient='records', lines=True)
pd.read_json('./data/test.json', lines=True)
                                        article_link                                            headline  is_sarcastic
0  https://www.huffingtonpost.com/entry/versace-b...  former versace store clerk sues over secret 'b...             0
1  https://www.huffingtonpost.com/entry/roseanne-...  the 'roseanne' revival catches up to our thorn...             0
2  https://local.theonion.com/mom-starting-to-fea...  mom starting to fear son's web series closest ...             1
3  https://politics.theonion.com/boehner-just-wan...  boehner just wants wife to listen, not come up...             1
4  https://www.huffingtonpost.com/entry/jk-rowlin...  j.k. rowling wishes snape happy birthday in th...             0
5  https://www.huffingtonpost.com/entry/advancing...                        advancing the world's women             0
6  https://www.huffingtonpost.com/entry/how-meat-...     the fascinating case for eating lab-grown meat             0
7  https://www.huffingtonpost.com/entry/boxed-col...  this ceo will send your kids to school, if you...             0
8  https://politics.theonion.com/top-snake-handle...  top snake handler leaves sinking huckabee camp...             1
9  https://www.huffingtonpost.com/entry/fridays-m...  friday's morning email: inside trump's presser...             0

6 Handling missing values

6.1 Detecting missing values

movie = pd.read_csv('./data/movies.csv')
movie
     Rank                     Title                     Genre                                        Description              Director                                             Actors  Year  Runtime (Minutes)  Rating   Votes  Revenue (Millions)  Metascore
0       1   Guardians of the Galaxy   Action,Adventure,Sci-Fi  A group of intergalactic criminals are forced ...            James Gunn  Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...  2014                121     8.1  757074              333.13       76.0
1       2                Prometheus  Adventure,Mystery,Sci-Fi  Following clues to the origin of mankind, a te...          Ridley Scott  Noomi Rapace, Logan Marshall-Green, Michael Fa...  2012                124     7.0  485820              126.46       65.0
2       3                     Split           Horror,Thriller  Three girls are kidnapped by a man with a diag...    M. Night Shyamalan  James McAvoy, Anya Taylor-Joy, Haley Lu Richar...  2016                117     7.3  157606              138.12       62.0
3       4                      Sing   Animation,Comedy,Family  In a city of humanoid animals, a hustling thea...  Christophe Lourdelet  Matthew McConaughey,Reese Witherspoon, Seth Ma...  2016                108     7.2   60545              270.32       59.0
4       5             Suicide Squad  Action,Adventure,Fantasy  A secret government agency recruits some of th...            David Ayer  Will Smith, Jared Leto, Margot Robbie, Viola D...  2016                123     6.2  393727              325.02       40.0
..    ...                       ...                       ...                                                ...                   ...                                                ...   ...                ...     ...     ...                 ...        ...
995   996      Secret in Their Eyes       Crime,Drama,Mystery  A tight-knit team of rising investigators, alo...             Billy Ray  Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...  2015                111     6.2   27585                 NaN       45.0
996   997           Hostel: Part II                    Horror  Three American college students studying abroa...              Eli Roth  Lauren German, Heather Matarazzo, Bijou Philli...  2007                 94     5.5   73152               17.54       46.0
997   998    Step Up 2: The Streets       Drama,Music,Romance  Romantic sparks occur between two dance studen...            Jon M. Chu  Robert Hoffman, Briana Evigan, Cassie Ventura,...  2008                 98     6.2   70699               58.01       50.0
998   999              Search Party          Adventure,Comedy  A pair of friends embark on a mission to reuni...        Scot Armstrong  Adam Pally, T.J. Miller, Thomas Middleditch,Sh...  2014                 93     5.6    4881                 NaN       22.0
999  1000                Nine Lives     Comedy,Family,Fantasy  A stuffy businessman finds himself trapped ins...      Barry Sonnenfeld  Kevin Spacey, Jennifer Garner, Robbie Amell,Ch...  2016                 87     5.3   12435               19.64       11.0

1000 rows × 12 columns

pd.notnull(movie)
     Rank  Title  Genre  Description  Director  Actors  Year  Runtime (Minutes)  Rating  Votes  Revenue (Millions)  Metascore
0    True   True   True         True      True    True  True               True    True   True                True       True
1    True   True   True         True      True    True  True               True    True   True                True       True
2    True   True   True         True      True    True  True               True    True   True                True       True
3    True   True   True         True      True    True  True               True    True   True                True       True
4    True   True   True         True      True    True  True               True    True   True                True       True
..    ...    ...    ...          ...       ...     ...   ...                ...     ...    ...                 ...        ...
995  True   True   True         True      True    True  True               True    True   True               False       True
996  True   True   True         True      True    True  True               True    True   True                True       True
997  True   True   True         True      True    True  True               True    True   True                True       True
998  True   True   True         True      True    True  True               True    True   True               False       True
999  True   True   True         True      True    True  True               True    True   True                True       True

1000 rows × 12 columns

np.all(movie.isnull())
False
np.any(movie.isnull())
True
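A per-column count of missing values is often more informative than a single True/False; a short sketch:

# Number of NaN values in each column
movie.isnull().sum()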

6.2 Handling NaN

# dropna does not modify the original data; it returns a new DataFrame
data = movie.dropna()
data
     Rank                      Title                     Genre                                        Description              Director                                             Actors  Year  Runtime (Minutes)  Rating   Votes  Revenue (Millions)  Metascore
0       1    Guardians of the Galaxy   Action,Adventure,Sci-Fi  A group of intergalactic criminals are forced ...            James Gunn  Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...  2014                121     8.1  757074              333.13       76.0
1       2                 Prometheus  Adventure,Mystery,Sci-Fi  Following clues to the origin of mankind, a te...          Ridley Scott  Noomi Rapace, Logan Marshall-Green, Michael Fa...  2012                124     7.0  485820              126.46       65.0
2       3                      Split           Horror,Thriller  Three girls are kidnapped by a man with a diag...    M. Night Shyamalan  James McAvoy, Anya Taylor-Joy, Haley Lu Richar...  2016                117     7.3  157606              138.12       62.0
3       4                       Sing   Animation,Comedy,Family  In a city of humanoid animals, a hustling thea...  Christophe Lourdelet  Matthew McConaughey,Reese Witherspoon, Seth Ma...  2016                108     7.2   60545              270.32       59.0
4       5              Suicide Squad  Action,Adventure,Fantasy  A secret government agency recruits some of th...            David Ayer  Will Smith, Jared Leto, Margot Robbie, Viola D...  2016                123     6.2  393727              325.02       40.0
..    ...                        ...                       ...                                                ...                   ...                                                ...   ...                ...     ...     ...                 ...        ...
993   994   Resident Evil: Afterlife   Action,Adventure,Horror  While still out to destroy the evil Umbrella C...    Paul W.S. Anderson  Milla Jovovich, Ali Larter, Wentworth Miller,K...  2010                 97     5.9  140900               60.13       37.0
994   995                  Project X                    Comedy  3 high school seniors throw a birthday party t...       Nima Nourizadeh  Thomas Mann, Oliver Cooper, Jonathan Daniel Br...  2012                 88     6.7  164088               54.72       48.0
996   997            Hostel: Part II                    Horror  Three American college students studying abroa...              Eli Roth  Lauren German, Heather Matarazzo, Bijou Philli...  2007                 94     5.5   73152               17.54       46.0
997   998     Step Up 2: The Streets       Drama,Music,Romance  Romantic sparks occur between two dance studen...            Jon M. Chu  Robert Hoffman, Briana Evigan, Cassie Ventura,...  2008                 98     6.2   70699               58.01       50.0
999  1000                 Nine Lives     Comedy,Family,Fantasy  A stuffy businessman finds himself trapped ins...      Barry Sonnenfeld  Kevin Spacey, Jennifer Garner, Robbie Amell,Ch...  2016                 87     5.3   12435               19.64       11.0

838 rows × 12 columns

# Fill missing values instead of dropping rows
for i in movie.columns:
    if np.any(pd.isnull(movie[i])):
        print(i)
        movie[i].fillna(movie[i].mean(), inplace=True)
Revenue (Millions)
Metascore
movie
     Rank                     Title                     Genre                                        Description              Director                                             Actors  Year  Runtime (Minutes)  Rating   Votes  Revenue (Millions)  Metascore
0       1   Guardians of the Galaxy   Action,Adventure,Sci-Fi  A group of intergalactic criminals are forced ...            James Gunn  Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...  2014                121     8.1  757074          333.130000       76.0
1       2                Prometheus  Adventure,Mystery,Sci-Fi  Following clues to the origin of mankind, a te...          Ridley Scott  Noomi Rapace, Logan Marshall-Green, Michael Fa...  2012                124     7.0  485820          126.460000       65.0
2       3                     Split           Horror,Thriller  Three girls are kidnapped by a man with a diag...    M. Night Shyamalan  James McAvoy, Anya Taylor-Joy, Haley Lu Richar...  2016                117     7.3  157606          138.120000       62.0
3       4                      Sing   Animation,Comedy,Family  In a city of humanoid animals, a hustling thea...  Christophe Lourdelet  Matthew McConaughey,Reese Witherspoon, Seth Ma...  2016                108     7.2   60545          270.320000       59.0
4       5             Suicide Squad  Action,Adventure,Fantasy  A secret government agency recruits some of th...            David Ayer  Will Smith, Jared Leto, Margot Robbie, Viola D...  2016                123     6.2  393727          325.020000       40.0
..    ...                       ...                       ...                                                ...                   ...                                                ...   ...                ...     ...     ...                 ...        ...
995   996      Secret in Their Eyes       Crime,Drama,Mystery  A tight-knit team of rising investigators, alo...             Billy Ray  Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...  2015                111     6.2   27585           82.956376       45.0
996   997           Hostel: Part II                    Horror  Three American college students studying abroa...              Eli Roth  Lauren German, Heather Matarazzo, Bijou Philli...  2007                 94     5.5   73152           17.540000       46.0
997   998    Step Up 2: The Streets       Drama,Music,Romance  Romantic sparks occur between two dance studen...            Jon M. Chu  Robert Hoffman, Briana Evigan, Cassie Ventura,...  2008                 98     6.2   70699           58.010000       50.0
998   999              Search Party          Adventure,Comedy  A pair of friends embark on a mission to reuni...        Scot Armstrong  Adam Pally, T.J. Miller, Thomas Middleditch,Sh...  2014                 93     5.6    4881           82.956376       22.0
999  1000                Nine Lives     Comedy,Family,Fantasy  A stuffy businessman finds himself trapped ins...      Barry Sonnenfeld  Kevin Spacey, Jennifer Garner, Robbie Amell,Ch...  2016                 87     5.3   12435           19.640000       11.0

1000 rows × 12 columns

# 1. Iterate over all column names
for column in movie.columns:
    # 2. Check whether the column has any missing values and, if so, fill them
    if np.any(movie[column].isnull()):
        # Fill string columns with the mode
        if movie[column].dtype == object:
            movie[column].fillna(movie[column].mode()[0], inplace=True)
        else:
            # Otherwise fill with the column mean
            movie[column].fillna(movie[column].mean(), inplace=True)

6.3 Handling missing values that are not NaN

import ssl
ssl._create_default_https_context = ssl._create_unverified_context
wis = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data")
wis
     1000025   5   1  1.1  1.2   2  1.3   3  1.4  1.5  2.1
0    1002945   5   4    4    5   7   10   3    2    1    2
1    1015425   3   1    1    1   2    2   3    1    1    2
2    1016277   6   8    8    1   3    4   3    7    1    2
3    1017023   4   1    1    3   2    1   3    1    1    2
4    1017122   8  10   10    8   7   10   9    7    1    4
..       ...  ..  ..  ...  ...  ..  ...  ..  ...  ...  ...
693   776715   3   1    1    1   3    2   1    1    1    2
694   841769   2   1    1    1   2    1   1    1    1    2
695   888820   5  10   10    3   7    3   8   10    2    4
696   897471   4   8    6    4   3    4  10    6    1    4
697   897471   4   8    8    5   4    5  10    4    1    4

698 rows × 11 columns

# Replace '?' with NaN
wis = wis.replace(to_replace='?', value=np.nan)
np.any(wis.isnull())
True
# Drop the rows that contain NaN
wis = wis.dropna()
wis
     1000025   5   1  1.1  1.2   2  1.3   3  1.4  1.5  2.1
0    1002945   5   4    4    5   7   10   3    2    1    2
1    1015425   3   1    1    1   2    2   3    1    1    2
2    1016277   6   8    8    1   3    4   3    7    1    2
3    1017023   4   1    1    3   2    1   3    1    1    2
4    1017122   8  10   10    8   7   10   9    7    1    4
..       ...  ..  ..  ...  ...  ..  ...  ..  ...  ...  ...
693   776715   3   1    1    1   3    2   1    1    1    2
694   841769   2   1    1    1   2    1   1    1    1    2
695   888820   5  10   10    3   7    3   8   10    2    4
696   897471   4   8    6    4   3    4  10    6    1    4
697   897471   4   8    8    5   4    5  10    4    1    4

682 rows × 11 columns

np.any(wis.isnull())
False
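The same result can be reached at load time by telling read_csv which markers mean "missing"; a sketch assuming the same URL:

wis = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    na_values=['?'])
wis = wis.dropna()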

7 Data discretization

data = pd.read_csv('./data/stock_day.csv')
p_change = data['p_change']
# Let pandas choose the bins (quantile-based grouping)
qcut = pd.qcut(p_change, 10)
qcut
2018-02-27    (1.738, 2.938]
2018-02-26     (2.938, 5.27]
2018-02-23    (1.738, 2.938]
2018-02-22     (0.94, 1.738]
2018-02-14    (1.738, 2.938]
                   ...      
2015-03-06     (5.27, 10.03]
2015-03-05    (1.738, 2.938]
2015-03-04     (0.94, 1.738]
2015-03-03     (0.94, 1.738]
2015-03-02    (1.738, 2.938]
Name: p_change, Length: 643, dtype: category
Categories (10, interval[float64]): [(-10.030999999999999, -4.836] < (-4.836, -2.444] < (-2.444, -1.352] < (-1.352, -0.462] ... (0.94, 1.738] < (1.738, 2.938] < (2.938, 5.27] < (5.27, 10.03]]
# Count how many samples fall into each bin
qcut.value_counts()
(5.27, 10.03]                    65
(0.26, 0.94]                     65
(-0.462, 0.26]                   65
(-10.030999999999999, -4.836]    65
(2.938, 5.27]                    64
(1.738, 2.938]                   64
(-1.352, -0.462]                 64
(-2.444, -1.352]                 64
(-4.836, -2.444]                 64
(0.94, 1.738]                    63
Name: p_change, dtype: int64
# Custom bin edges
bins = [-100, -7, -5, -3 , 0, 3, 5, 7, 100]
p_counts = pd.cut(p_change, bins)
p_counts
2018-02-27      (0, 3]
2018-02-26      (3, 5]
2018-02-23      (0, 3]
2018-02-22      (0, 3]
2018-02-14      (0, 3]
                ...   
2015-03-06    (7, 100]
2015-03-05      (0, 3]
2015-03-04      (0, 3]
2015-03-03      (0, 3]
2015-03-02      (0, 3]
Name: p_change, Length: 643, dtype: category
Categories (8, interval[int64]): [(-100, -7] < (-7, -5] < (-5, -3] < (-3, 0] < (0, 3] < (3, 5] < (5, 7] < (7, 100]]
p_counts.value_counts()
(0, 3]        215
(-3, 0]       188
(3, 5]         57
(-5, -3]       51
(7, 100]       35
(5, 7]         35
(-100, -7]     34
(-7, -5]       28
Name: p_change, dtype: int64
# One-hot encoding
dummies = pd.get_dummies(p_counts, prefix='ries')
dummies
            ries_(-100, -7]  ries_(-7, -5]  ries_(-5, -3]  ries_(-3, 0]  ries_(0, 3]  ries_(3, 5]  ries_(5, 7]  ries_(7, 100]
2018-02-27                0              0              0             0            1            0            0              0
2018-02-26                0              0              0             0            0            1            0              0
2018-02-23                0              0              0             0            1            0            0              0
2018-02-22                0              0              0             0            1            0            0              0
2018-02-14                0              0              0             0            1            0            0              0
...                     ...            ...            ...           ...          ...          ...          ...            ...
2015-03-06                0              0              0             0            0            0            0              1
2015-03-05                0              0              0             0            1            0            0              0
2015-03-04                0              0              0             0            1            0            0              0
2015-03-03                0              0              0             0            1            0            0              0
2015-03-02                0              0              0             0            1            0            0              0

643 rows × 8 columns

8 Merging

8.1 pd.concat

pd.concat([data, dummies], axis=1)
             open   high  close    low     volume  price_change  p_change     ma5    ma10    ma20  ...     v_ma20  turnover  ries_(-100, -7]  ries_(-7, -5]  ries_(-5, -3]  ries_(-3, 0]  ries_(0, 3]  ries_(3, 5]  ries_(5, 7]  ries_(7, 100]
2018-02-27  23.53  25.88  24.16  23.53   95578.03          0.63      2.68  22.942  22.142  22.875  ...   55576.11      2.39                0              0              0             0            1            0            0              0
2018-02-26  22.80  23.78  23.53  22.80   60985.11          0.69      3.02  22.406  21.955  22.942  ...   56007.50      1.53                0              0              0             0            0            1            0              0
2018-02-23  22.88  23.37  22.82  22.71   52914.01          0.54      2.42  21.938  21.929  23.022  ...   56372.85      1.32                0              0              0             0            1            0            0              0
2018-02-22  22.25  22.76  22.28  22.02   36105.01          0.36      1.64  21.446  21.909  23.137  ...   60149.60      0.90                0              0              0             0            1            0            0              0
2018-02-14  21.49  21.99  21.92  21.48   23331.04          0.44      2.05  21.366  21.923  23.253  ...   61716.11      0.58                0              0              0             0            1            0            0              0
...           ...    ...    ...    ...        ...           ...       ...     ...     ...     ...  ...        ...       ...              ...            ...            ...           ...          ...          ...          ...            ...
2015-03-06  13.17  14.48  14.28  13.13  179831.72          1.12      8.51  13.112  13.112  13.112  ...  115090.18      6.16                0              0              0             0            0            0            0              1
2015-03-05  12.88  13.45  13.16  12.87   93180.39          0.26      2.02  12.820  12.820  12.820  ...   98904.79      3.19                0              0              0             0            1            0            0              0
2015-03-04  12.80  12.92  12.90  12.61   67075.44          0.20      1.57  12.707  12.707  12.707  ...  100812.93      2.30                0              0              0             0            1            0            0              0
2015-03-03  12.52  13.06  12.70  12.52  139071.61          0.18      1.44  12.610  12.610  12.610  ...  117681.67      4.76                0              0              0             0            1            0            0              0
2015-03-02  12.25  12.67  12.52  12.20   96291.73          0.32      2.62  12.520  12.520  12.520  ...   96291.73      3.30                0              0              0             0            1            0            0              0

643 rows × 22 columns

8.2 pd.merge

left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                        'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})

right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                        'key2': ['K0', 'K0', 'K0', 'K0'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']})
# Inner join (the default)
pd.merge(left, right, on=['key1', 'key2'])
  key1 key2   A   B   C   D
0   K0   K0  A0  B0  C0  D0
1   K1   K0  A2  B2  C1  D1
2   K1   K0  A2  B2  C2  D2
# Left join
pd.merge(left, right, on=['key1', 'key2'], how='left')
  key1 key2   A   B    C    D
0   K0   K0  A0  B0   C0   D0
1   K0   K1  A1  B1  NaN  NaN
2   K1   K0  A2  B2   C1   D1
3   K1   K0  A2  B2   C2   D2
4   K2   K1  A3  B3  NaN  NaN
# Right join
pd.merge(left, right, on=['key1', 'key2'], how='right')
  key1 key2    A    B   C   D
0   K0   K0   A0   B0  C0  D0
1   K1   K0   A2   B2  C1  D1
2   K1   K0   A2   B2  C2  D2
3   K2   K0  NaN  NaN  C3  D3
# Outer join
pd.merge(left, right, on=['key1', 'key2'], how='outer')
  key1 key2    A    B    C    D
0   K0   K0   A0   B0   C0   D0
1   K0   K1   A1   B1  NaN  NaN
2   K1   K0   A2   B2   C1   D1
3   K1   K0   A2   B2   C2   D2
4   K2   K1   A3   B3  NaN  NaN
5   K2   K0  NaN  NaN   C3   D3

9 Crosstabs and pivot tables

# Add a week column (day of the week)
date = pd.to_datetime(data.index).weekday
data['week'] = date
data
             open   high  close    low     volume  price_change  p_change     ma5    ma10    ma20      v_ma5     v_ma10     v_ma20  turnover  week
2018-02-27  23.53  25.88  24.16  23.53   95578.03          0.63      2.68  22.942  22.142  22.875   53782.64   46738.65   55576.11      2.39     1
2018-02-26  22.80  23.78  23.53  22.80   60985.11          0.69      3.02  22.406  21.955  22.942   40827.52   42736.34   56007.50      1.53     0
2018-02-23  22.88  23.37  22.82  22.71   52914.01          0.54      2.42  21.938  21.929  23.022   35119.58   41871.97   56372.85      1.32     4
2018-02-22  22.25  22.76  22.28  22.02   36105.01          0.36      1.64  21.446  21.909  23.137   35397.58   39904.78   60149.60      0.90     3
2018-02-14  21.49  21.99  21.92  21.48   23331.04          0.44      2.05  21.366  21.923  23.253   33590.21   42935.74   61716.11      0.58     2
...           ...    ...    ...    ...        ...           ...       ...     ...     ...     ...        ...        ...        ...       ...   ...
2015-03-06  13.17  14.48  14.28  13.13  179831.72          1.12      8.51  13.112  13.112  13.112  115090.18  115090.18  115090.18      6.16     4
2015-03-05  12.88  13.45  13.16  12.87   93180.39          0.26      2.02  12.820  12.820  12.820   98904.79   98904.79   98904.79      3.19     3
2015-03-04  12.80  12.92  12.90  12.61   67075.44          0.20      1.57  12.707  12.707  12.707  100812.93  100812.93  100812.93      2.30     2
2015-03-03  12.52  13.06  12.70  12.52  139071.61          0.18      1.44  12.610  12.610  12.610  117681.67  117681.67  117681.67      4.76     1
2015-03-02  12.25  12.67  12.52  12.20   96291.73          0.32      2.62  12.520  12.520  12.520   96291.73   96291.73   96291.73      3.30     0

643 rows × 15 columns

# Mark whether p_change is positive: 1 if above 0, otherwise 0
data['posi_neg'] = np.where(data['p_change'] > 0, 1, 0)
data
             open   high  close    low     volume  price_change  p_change     ma5    ma10    ma20      v_ma5     v_ma10     v_ma20  turnover  week  posi_neg
2018-02-27  23.53  25.88  24.16  23.53   95578.03          0.63      2.68  22.942  22.142  22.875   53782.64   46738.65   55576.11      2.39     1         1
2018-02-26  22.80  23.78  23.53  22.80   60985.11          0.69      3.02  22.406  21.955  22.942   40827.52   42736.34   56007.50      1.53     0         1
2018-02-23  22.88  23.37  22.82  22.71   52914.01          0.54      2.42  21.938  21.929  23.022   35119.58   41871.97   56372.85      1.32     4         1
2018-02-22  22.25  22.76  22.28  22.02   36105.01          0.36      1.64  21.446  21.909  23.137   35397.58   39904.78   60149.60      0.90     3         1
2018-02-14  21.49  21.99  21.92  21.48   23331.04          0.44      2.05  21.366  21.923  23.253   33590.21   42935.74   61716.11      0.58     2         1
...           ...    ...    ...    ...        ...           ...       ...     ...     ...     ...        ...        ...        ...       ...   ...       ...
2015-03-06  13.17  14.48  14.28  13.13  179831.72          1.12      8.51  13.112  13.112  13.112  115090.18  115090.18  115090.18      6.16     4         1
2015-03-05  12.88  13.45  13.16  12.87   93180.39          0.26      2.02  12.820  12.820  12.820   98904.79   98904.79   98904.79      3.19     3         1
2015-03-04  12.80  12.92  12.90  12.61   67075.44          0.20      1.57  12.707  12.707  12.707  100812.93  100812.93  100812.93      2.30     2         1
2015-03-03  12.52  13.06  12.70  12.52  139071.61          0.18      1.44  12.610  12.610  12.610  117681.67  117681.67  117681.67      4.76     1         1
2015-03-02  12.25  12.67  12.52  12.20   96291.73          0.32      2.62  12.520  12.520  12.520   96291.73   96291.73   96291.73      3.30     0         1

643 rows × 16 columns

9.1 Crosstab

count = pd.crosstab(data['week'], data['posi_neg'])
count
posi_neg   0   1
week
0         63  62
1         55  76
2         61  71
3         63  65
4         59  68
# Compute the proportion within each weekday
sum1 = count.sum(axis=1).astype(np.float32)
sum1
week
0    125.0
1    131.0
2    132.0
3    128.0
4    127.0
dtype: float32
pro = count.div(sum1, axis=0)
pro
posi_neg         0         1
week
0         0.504000  0.496000
1         0.419847  0.580153
2         0.462121  0.537879
3         0.492188  0.507812
4         0.464567  0.535433
pro.plot(kind='bar', stacked=True)
plt.show()

[Figure: stacked bar chart of rise/fall proportions for each weekday]

9.2 Pivot table

# A pivot table makes the whole calculation much simpler
data.pivot_table(['posi_neg'], index='week')
      posi_neg
week
0     0.496000
1     0.580153
2     0.537879
3     0.507812
4     0.535433
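pivot_table aggregates with the mean by default; other aggregations can be requested through aggfunc. A short sketch:

# Count of rising days per weekday instead of the proportion
data.pivot_table(['posi_neg'], index='week', aggfunc='sum')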

10 Grouping and aggregation

col = pd.DataFrame({'color': ['white', 'red', 'green', 'red', 'green'],
                    'object': ['pen', 'pencil', 'pencil', 'ashtray', 'pen'],
                    'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
                    'price2': [4.75, 4.12, 1.60, 0.75, 3.15]})
col
   color   object  price1  price2
0  white      pen    5.56    4.75
1    red   pencil    4.20    4.12
2  green   pencil    1.30    1.60
3    red  ashtray    0.56    0.75
4  green      pen    2.75    3.15
# Group by color and aggregate price1
col.groupby(['color'])['price1'].mean()
color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64
col['price1'].groupby(col['color']).mean()
color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64
# as_index=False keeps the grouping key as an ordinary column, so the result stays a DataFrame
col.groupby(['color'], as_index=False)['price1'].mean()
   color  price1
0  green   2.025
1    red   2.380
2  white   5.560
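groupby can also compute several statistics at once with agg; a short sketch:

# Mean and max of price1 for each color
col.groupby('color')['price1'].agg(['mean', 'max'])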
# Starbucks example
starbucks = pd.read_csv('./data/directory.csv')
starbucks
           Brand  Store Number        Store Name  Ownership Type                                     Street Address                   City  State/Province  Country  Postcode  Phone Number                        Timezone  Longitude  Latitude
0      Starbucks  47370-257954     Meritxell, 96        Licensed                                  Av. Meritxell, 96       Andorra la Vella               7       AD     AD500     376818720         GMT+1:00 Europe/Andorra       1.53     42.51
1      Starbucks  22331-212325  Ajman Drive Thru        Licensed                               1 Street 69, Al Jarf                  Ajman              AJ       AE       NaN           NaN            GMT+04:00 Asia/Dubai      55.47     25.42
2      Starbucks  47089-256771         Dana Mall        Licensed                       Sheikh Khalifa Bin Zayed St.                  Ajman              AJ       AE       NaN           NaN            GMT+04:00 Asia/Dubai      55.47     25.39
3      Starbucks  22126-218024        Twofour 54        Licensed                                    Al Salam Street              Abu Dhabi              AZ       AE       NaN           NaN            GMT+04:00 Asia/Dubai      54.38     24.48
4      Starbucks  17127-178586      Al Ain Tower        Licensed                    Khaldiya Area, Abu Dhabi Island              Abu Dhabi              AZ       AE       NaN           NaN            GMT+04:00 Asia/Dubai      54.54     24.51
...          ...           ...               ...             ...                                                ...                    ...             ...      ...       ...           ...                             ...        ...       ...
25595  Starbucks  21401-212072               Rex        Licensed  141 Nguyễn Huệ, Quận 1, Góc đường Pasteur và L...  Thành Phố Hồ Chí Minh              SG       VN     70000  08 3824 4668          GMT+000000 Asia/Saigon     106.70     10.78
25596  Starbucks  24010-226985          Panorama        Licensed  SN-44, Tòa Nhà Panorama, 208 Trần Văn Trà, Quận 7  Thành Phố Hồ Chí Minh              SG       VN     70000  08 5413 8292          GMT+000000 Asia/Saigon     106.71     10.72
25597  Starbucks  47608-253804     Rosebank Mall        Licensed          Cnr Tyrwhitt and Cradock Avenue, Rosebank           Johannesburg              GT       ZA      2194   27873500159  GMT+000000 Africa/Johannesburg      28.04    -26.15
25598  Starbucks  47640-253809      Menlyn Maine        Licensed  Shop 61B, Central Square, Cnr Aramist & Coroba...                 Menlyn              GT       ZA       181           NaN  GMT+000000 Africa/Johannesburg      28.28    -25.79
25599  Starbucks  47609-253286    Mall of Africa        Licensed             Shop 2077, Upper Level, Waterfall City                Midrand              GT       ZA      1682   27873500215  GMT+000000 Africa/Johannesburg      28.11    -26.02

25600 rows × 13 columns

count = starbucks.groupby(['Country']).count()
count
         Brand  Store Number  Store Name  Ownership Type  Street Address   City  State/Province  Postcode  Phone Number  Timezone  Longitude  Latitude
Country
AD           1             1           1               1               1      1               1         1             1         1          1         1
AE         144           144         144             144             144    144             144        24            78       144        144       144
AR         108           108         108             108             108    108             108       100            29       108        108       108
AT          18            18          18              18              18     18              18        18            17        18         18        18
AU          22            22          22              22              22     22              22        22             0        22         22        22
...        ...           ...         ...             ...             ...    ...             ...       ...           ...       ...        ...       ...
TT           3             3           3               3               3      3               3         3             0         3          3         3
TW         394           394         394             394             394    394             394       365            39       394        394       394
US       13608         13608       13608           13608           13608  13608           13608     13607         13122     13608      13608     13608
VN          25            25          25              25              25     25              25        25            23        25         25        25
ZA           3             3           3               3               3      3               3         3             2         3          3         3

73 rows × 12 columns

count['Brand'].sort_values(ascending=False)[:20].plot(kind='bar', figsize=(20, 8), fontsize=20)
plt.show()

[Figure: bar chart of the 20 countries with the most stores]

starbucks.groupby(['Country', 'State/Province']).count().head(20)
                        Brand  Store Number  Store Name  Ownership Type  Street Address  City  Postcode  Phone Number  Timezone  Longitude  Latitude
Country State/Province
AD      7                   1             1           1               1               1     1         1             1         1          1         1
AE      AJ                  2             2           2               2               2     2         0             0         2          2         2
        AZ                 48            48          48              48              48    48         7            20        48         48        48
        DU                 82            82          82              82              82    82        16            50        82         82        82
        FU                  2             2           2               2               2     2         1             0         2          2         2
        RK                  3             3           3               3               3     3         0             3         3          3         3
        SH                  6             6           6               6               6     6         0             5         6          6         6
        UQ                  1             1           1               1               1     1         0             0         1          1         1
AR      B                  21            21          21              21              21    21        18             5        21         21        21
        C                  73            73          73              73              73    73        71            24        73         73        73
        M                   5             5           5               5               5     5         2             0         5          5         5
        S                   3             3           3               3               3     3         3             0         3          3         3
        X                   6             6           6               6               6     6         6             0         6          6         6
AT      3                   1             1           1               1               1     1         1             1         1          1         1
        5                   3             3           3               3               3     3         3             3         3          3         3
        9                  14            14          14              14              14    14        14            13        14         14        14
AU      NSW                 9             9           9               9               9     9         9             0         9          9         9
        QLD                 8             8           8               8               8     8         8             0         8          8         8
        VIC                 5             5           5               5               5     5         5             0         5          5         5
AW      AW                  3             3           3               3               3     3         0             3         3          3         3