pandas

import pandas as pd
import numpy as np
import re
s1 = np.random.normal(0,1,(10,5))

s1
array([[-0.20687054, -0.32658765,  0.08331198, -0.35200115, -0.47287752],
       [-1.12880273, -0.60517378, -1.58661317,  0.01774439, -0.1918528 ],
       [ 1.58558494,  0.05270011,  1.08498907,  0.52493911, -0.38949185],
       [-0.22147045,  1.3795081 , -1.04626624,  0.99894976, -2.07890203],
       [ 0.20435856, -0.07516624,  1.70947873, -2.51118659, -0.21273919],
       [ 0.14601014, -0.72608304,  0.20719284,  0.30527482,  0.86076936],
       [ 0.41551343,  0.38332934,  1.9463179 ,  1.31499648, -1.07326932],
       [-0.39235779,  0.06411874,  0.06089033,  0.54348207, -0.59586276],
       [-0.07680699, -1.56634337,  0.03776305,  0.37830907,  0.29248695],
       [-0.28961965,  0.17700808,  0.02997782,  0.89239819,  0.12259162]])
a = pd.DataFrame(s1)

a
          0         1         2         3         4
0 -0.206871 -0.326588  0.083312 -0.352001 -0.472878
1 -1.128803 -0.605174 -1.586613  0.017744 -0.191853
2  1.585585  0.052700  1.084989  0.524939 -0.389492
3 -0.221470  1.379508 -1.046266  0.998950 -2.078902
4  0.204359 -0.075166  1.709479 -2.511187 -0.212739
5  0.146010 -0.726083  0.207193  0.305275  0.860769
6  0.415513  0.383329  1.946318  1.314996 -1.073269
7 -0.392358  0.064119  0.060890  0.543482 -0.595863
8 -0.076807 -1.566343  0.037763  0.378309  0.292487
9 -0.289620  0.177008  0.029978  0.892398  0.122592
a.shape[0]
10
st = ["股票{}".format(i+1) for i in range(a.shape[0])]

st
['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10']
pd.DataFrame(s1,index=st)  # row index
               0         1         2         3         4
股票1   -0.206871 -0.326588  0.083312 -0.352001 -0.472878
股票2   -1.128803 -0.605174 -1.586613  0.017744 -0.191853
股票3    1.585585  0.052700  1.084989  0.524939 -0.389492
股票4   -0.221470  1.379508 -1.046266  0.998950 -2.078902
股票5    0.204359 -0.075166  1.709479 -2.511187 -0.212739
股票6    0.146010 -0.726083  0.207193  0.305275  0.860769
股票7    0.415513  0.383329  1.946318  1.314996 -1.073269
股票8   -0.392358  0.064119  0.060890  0.543482 -0.595863
股票9   -0.076807 -1.566343  0.037763  0.378309  0.292487
股票10  -0.289620  0.177008  0.029978  0.892398  0.122592

pd.date_range() generates a continuous sequence of dates

date_range(start=None, end=None, periods=None, freq='B')

  • start : start date
  • end : end date
  • periods : number of periods to generate
  • freq : step size; the default is one calendar day, and 'B' (business day) skips weekends
date = pd.date_range(start='20190403',periods=a.shape[1],freq='B')

date
DatetimeIndex(['2019-04-03', '2019-04-04', '2019-04-05', '2019-04-08',
               '2019-04-09'],
              dtype='datetime64[ns]', freq='B')
pd.DataFrame(s1,index=st,columns=date)
        2019-04-03  2019-04-04  2019-04-05  2019-04-08  2019-04-09
股票1    -0.206871   -0.326588    0.083312   -0.352001   -0.472878
股票2    -1.128803   -0.605174   -1.586613    0.017744   -0.191853
股票3     1.585585    0.052700    1.084989    0.524939   -0.389492
股票4    -0.221470    1.379508   -1.046266    0.998950   -2.078902
股票5     0.204359   -0.075166    1.709479   -2.511187   -0.212739
股票6     0.146010   -0.726083    0.207193    0.305275    0.860769
股票7     0.415513    0.383329    1.946318    1.314996   -1.073269
股票8    -0.392358    0.064119    0.060890    0.543482   -0.595863
股票9    -0.076807   -1.566343    0.037763    0.378309    0.292487
股票10   -0.289620    0.177008    0.029978    0.892398    0.122592

DataFrame attributes and methods

df=pd.DataFrame(s1,index=st,columns=date)
df
        2019-04-03  2019-04-04  2019-04-05  2019-04-08  2019-04-09
股票1    -0.206871   -0.326588    0.083312   -0.352001   -0.472878
股票2    -1.128803   -0.605174   -1.586613    0.017744   -0.191853
股票3     1.585585    0.052700    1.084989    0.524939   -0.389492
股票4    -0.221470    1.379508   -1.046266    0.998950   -2.078902
股票5     0.204359   -0.075166    1.709479   -2.511187   -0.212739
股票6     0.146010   -0.726083    0.207193    0.305275    0.860769
股票7     0.415513    0.383329    1.946318    1.314996   -1.073269
股票8    -0.392358    0.064119    0.060890    0.543482   -0.595863
股票9    -0.076807   -1.566343    0.037763    0.378309    0.292487
股票10   -0.289620    0.177008    0.029978    0.892398    0.122592
df.shape
(10, 5)
df.index
Index(['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10'], dtype='object')
df.columns
DatetimeIndex(['2019-04-03', '2019-04-04', '2019-04-05', '2019-04-08',
               '2019-04-09'],
              dtype='datetime64[ns]', freq='B')
df.values
array([[-0.20687054, -0.32658765,  0.08331198, -0.35200115, -0.47287752],
       [-1.12880273, -0.60517378, -1.58661317,  0.01774439, -0.1918528 ],
       [ 1.58558494,  0.05270011,  1.08498907,  0.52493911, -0.38949185],
       [-0.22147045,  1.3795081 , -1.04626624,  0.99894976, -2.07890203],
       [ 0.20435856, -0.07516624,  1.70947873, -2.51118659, -0.21273919],
       [ 0.14601014, -0.72608304,  0.20719284,  0.30527482,  0.86076936],
       [ 0.41551343,  0.38332934,  1.9463179 ,  1.31499648, -1.07326932],
       [-0.39235779,  0.06411874,  0.06089033,  0.54348207, -0.59586276],
       [-0.07680699, -1.56634337,  0.03776305,  0.37830907,  0.29248695],
       [-0.28961965,  0.17700808,  0.02997782,  0.89239819,  0.12259162]])
df.T
                 股票1       股票2       股票3       股票4       股票5       股票6       股票7       股票8       股票9      股票10
2019-04-03 -0.206871 -1.128803  1.585585 -0.221470  0.204359  0.146010  0.415513 -0.392358 -0.076807 -0.289620
2019-04-04 -0.326588 -0.605174  0.052700  1.379508 -0.075166 -0.726083  0.383329  0.064119 -1.566343  0.177008
2019-04-05  0.083312 -1.586613  1.084989 -1.046266  1.709479  0.207193  1.946318  0.060890  0.037763  0.029978
2019-04-08 -0.352001  0.017744  0.524939  0.998950 -2.511187  0.305275  1.314996  0.543482  0.378309  0.892398
2019-04-09 -0.472878 -0.191853 -0.389492 -2.078902 -0.212739  0.860769 -1.073269 -0.595863  0.292487  0.122592
df.head(5)
        2019-04-03  2019-04-04  2019-04-05  2019-04-08  2019-04-09
股票1    -0.206871   -0.326588    0.083312   -0.352001   -0.472878
股票2    -1.128803   -0.605174   -1.586613    0.017744   -0.191853
股票3     1.585585    0.052700    1.084989    0.524939   -0.389492
股票4    -0.221470    1.379508   -1.046266    0.998950   -2.078902
股票5     0.204359   -0.075166    1.709479   -2.511187   -0.212739
df.tail(5)
        2019-04-03  2019-04-04  2019-04-05  2019-04-08  2019-04-09
股票6     0.146010   -0.726083    0.207193    0.305275    0.860769
股票7     0.415513    0.383329    1.946318    1.314996   -1.073269
股票8    -0.392358    0.064119    0.060890    0.543482   -0.595863
股票9    -0.076807   -1.566343    0.037763    0.378309    0.292487
股票10   -0.289620    0.177008    0.029978    0.892398    0.122592

Setting the index

st = ["股票_{}".format(i+1) for i in range(a.shape[0])]

st
['股票_1',
 '股票_2',
 '股票_3',
 '股票_4',
 '股票_5',
 '股票_6',
 '股票_7',
 '股票_8',
 '股票_9',
 '股票_10']
df.index=st
df
         2019-04-03  2019-04-04  2019-04-05  2019-04-08  2019-04-09
股票_1    -0.206871   -0.326588    0.083312   -0.352001   -0.472878
股票_2    -1.128803   -0.605174   -1.586613    0.017744   -0.191853
股票_3     1.585585    0.052700    1.084989    0.524939   -0.389492
股票_4    -0.221470    1.379508   -1.046266    0.998950   -2.078902
股票_5     0.204359   -0.075166    1.709479   -2.511187   -0.212739
股票_6     0.146010   -0.726083    0.207193    0.305275    0.860769
股票_7     0.415513    0.383329    1.946318    1.314996   -1.073269
股票_8    -0.392358    0.064119    0.060890    0.543482   -0.595863
股票_9    -0.076807   -1.566343    0.037763    0.378309    0.292487
股票_10   -0.289620    0.177008    0.029978    0.892398    0.122592

Resetting the index

df.reset_index()   # reset the index; the old index becomes a regular column
    index  2019-04-03 00:00:00  2019-04-04 00:00:00  2019-04-05 00:00:00  2019-04-08 00:00:00  2019-04-09 00:00:00
0   股票_1            -0.206871            -0.326588             0.083312            -0.352001            -0.472878
1   股票_2            -1.128803            -0.605174            -1.586613             0.017744            -0.191853
2   股票_3             1.585585             0.052700             1.084989             0.524939            -0.389492
3   股票_4            -0.221470             1.379508            -1.046266             0.998950            -2.078902
4   股票_5             0.204359            -0.075166             1.709479            -2.511187            -0.212739
5   股票_6             0.146010            -0.726083             0.207193             0.305275             0.860769
6   股票_7             0.415513             0.383329             1.946318             1.314996            -1.073269
7   股票_8            -0.392358             0.064119             0.060890             0.543482            -0.595863
8   股票_9            -0.076807            -1.566343             0.037763             0.378309             0.292487
9  股票_10            -0.289620             0.177008             0.029978             0.892398             0.122592
df
         2019-04-03  2019-04-04  2019-04-05  2019-04-08  2019-04-09
股票_1    -0.206871   -0.326588    0.083312   -0.352001   -0.472878
股票_2    -1.128803   -0.605174   -1.586613    0.017744   -0.191853
股票_3     1.585585    0.052700    1.084989    0.524939   -0.389492
股票_4    -0.221470    1.379508   -1.046266    0.998950   -2.078902
股票_5     0.204359   -0.075166    1.709479   -2.511187   -0.212739
股票_6     0.146010   -0.726083    0.207193    0.305275    0.860769
股票_7     0.415513    0.383329    1.946318    1.314996   -1.073269
股票_8    -0.392358    0.064119    0.060890    0.543482   -0.595863
股票_9    -0.076807   -1.566343    0.037763    0.378309    0.292487
股票_10   -0.289620    0.177008    0.029978    0.892398    0.122592
df.reset_index(drop=True)
   2019-04-03  2019-04-04  2019-04-05  2019-04-08  2019-04-09
0   -0.206871   -0.326588    0.083312   -0.352001   -0.472878
1   -1.128803   -0.605174   -1.586613    0.017744   -0.191853
2    1.585585    0.052700    1.084989    0.524939   -0.389492
3   -0.221470    1.379508   -1.046266    0.998950   -2.078902
4    0.204359   -0.075166    1.709479   -2.511187   -0.212739
5    0.146010   -0.726083    0.207193    0.305275    0.860769
6    0.415513    0.383329    1.946318    1.314996   -1.073269
7   -0.392358    0.064119    0.060890    0.543482   -0.595863
8   -0.076807   -1.566343    0.037763    0.378309    0.292487
9   -0.289620    0.177008    0.029978    0.892398    0.122592

Setting a column's values as the new index

df = pd.DataFrame({'month':[1,2,3,4],
                  'year':[2012,2014,2015,2016],
                  'sale':[35,40,84,31]})

df
   month  year  sale
0      1  2012    35
1      2  2014    40
2      3  2015    84
3      4  2016    31
df.set_index(keys=['year'])  # use the values of the 'year' column as the index
      month  sale
year
2012      1    35
2014      2    40
2015      3    84
2016      4    31
df.set_index(keys=["year","month"])  # 将yaer和 month 列的值作为索引

sale
yearmonth
2012135
2014240
2015384
2016431
stock_data = pd.read_csv("./课件/data/stock_day.csv")
stock_data.head()
             open   high  close    low    volume  price_change  p_change     ma5    ma10    ma20     v_ma5    v_ma10    v_ma20  turnover
2018-02-27  23.53  25.88  24.16  23.53  95578.03          0.63      2.68  22.942  22.142  22.875  53782.64  46738.65  55576.11      2.39
2018-02-26  22.80  23.78  23.53  22.80  60985.11          0.69      3.02  22.406  21.955  22.942  40827.52  42736.34  56007.50      1.53
2018-02-23  22.88  23.37  22.82  22.71  52914.01          0.54      2.42  21.938  21.929  23.022  35119.58  41871.97  56372.85      1.32
2018-02-22  22.25  22.76  22.28  22.02  36105.01          0.36      1.64  21.446  21.909  23.137  35397.58  39904.78  60149.60      0.90
2018-02-14  21.49  21.99  21.92  21.48  23331.04          0.44      2.05  21.366  21.923  23.253  33590.21  42935.74  61716.11      0.58
stock_data = stock_data.drop(["ma5","ma10","ma20","v_ma5","v_ma10","v_ma20"], axis=1)
stock_data.head()
             open   high  close    low    volume  price_change  p_change  turnover
2018-02-27  23.53  25.88  24.16  23.53  95578.03          0.63      2.68      2.39
2018-02-26  22.80  23.78  23.53  22.80  60985.11          0.69      3.02      1.53
2018-02-23  22.88  23.37  22.82  22.71  52914.01          0.54      2.42      1.32
2018-02-22  22.25  22.76  22.28  22.02  36105.01          0.36      1.64      0.90
2018-02-14  21.49  21.99  21.92  21.48  23331.04          0.44      2.05      0.58

pandas indexing

stock_data["open"]["2018-02-27"]
23.53
# stock_data["2018-02-27"]["open"]  # 不可以先行后列
# stock_data[:1, :2]

loc selects rows and columns by label

iloc selects rows and columns by integer position

ix mixed label/position indexing (deprecated)


stock_data.loc["2018-02-27":"2018-02-23", "high"]
2018-02-27    25.88
2018-02-26    23.78
2018-02-23    23.37
Name: high, dtype: float64
stock_data.iloc[:3,:2]
             open   high
2018-02-27  23.53  25.88
2018-02-26  22.80  23.78
2018-02-23  22.88  23.37
stock_data.ix[:5,('high','low')]
D:\Anaconda\an\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
             high    low
2018-02-27  25.88  23.53
2018-02-26  23.78  22.80
2018-02-23  23.37  22.71
2018-02-22  22.76  22.02
2018-02-14  21.99  21.48
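The .ix indexer has since been removed from pandas entirely. A rough equivalent of the selection above on current pandas versions (a sketch, assuming the same stock_data frame) is:

stock_data.iloc[:5][['high', 'low']]   # first 5 rows by position, then columns by label

# or turn the column labels into positions first and stay fully positional:
cols = stock_data.columns.get_indexer(['high', 'low'])
stock_data.iloc[:5, cols]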
stock_data.columns
Index(['open', 'high', 'close', 'low', 'volume', 'price_change', 'p_change',
       'turnover'],
      dtype='object')

get_indexer()

stock_data.columns.get_indexer(["open", "low"])  # get the integer positions of these column labels
array([0, 3], dtype=int64)
stock_data.index.get_indexer(['2018-02-27'])  # get the integer position of this row label
array([0], dtype=int64)

Assignment

stock_data["volume"] = 100
stock_data.volume = 1000
stock_data.head()
openhighcloselowvolumeprice_changep_changeturnover
2018-02-2723.5325.8824.1623.5310000.632.682.39
2018-02-2622.8023.7823.5322.8010000.693.021.53
2018-02-2322.8823.3722.8222.7110000.542.421.32
2018-02-2222.2522.7622.2822.0210000.361.640.90
2018-02-1421.4921.9921.9221.4810000.442.050.58

Sorting

  • df.sort_values(by=, ascending=)
    • sort by a single key or a list of keys; ascending by default
    • ascending=False sorts in descending order

Sorting a DataFrame

stock_data.sort_values(by="open", ascending=True).head()
             open   high  close    low  volume  price_change  p_change  turnover
2015-03-02  12.25  12.67  12.52  12.20    1000          0.32      2.62      3.30
2015-09-02  12.30  14.11  12.36  12.30    1000         -1.10     -8.17      2.40
2015-03-03  12.52  13.06  12.70  12.52    1000          0.18      1.44      4.76
2015-03-04  12.80  12.92  12.90  12.61    1000          0.20      1.57      2.30
2015-03-05  12.88  13.45  13.16  12.87    1000          0.26      2.02      3.19
stock_data.sort_values(by=["open", "high"]).head()
             open   high  close    low  volume  price_change  p_change  turnover
2015-03-02  12.25  12.67  12.52  12.20    1000          0.32      2.62      3.30
2015-09-02  12.30  14.11  12.36  12.30    1000         -1.10     -8.17      2.40
2015-03-03  12.52  13.06  12.70  12.52    1000          0.18      1.44      4.76
2015-03-04  12.80  12.92  12.90  12.61    1000          0.20      1.57      2.30
2015-03-05  12.88  13.45  13.16  12.87    1000          0.26      2.02      3.19
stock_data.sort_index().head()  # sort by the index
             open   high  close    low  volume  price_change  p_change  turnover
2015-03-02  12.25  12.67  12.52  12.20    1000          0.32      2.62      3.30
2015-03-03  12.52  13.06  12.70  12.52    1000          0.18      1.44      4.76
2015-03-04  12.80  12.92  12.90  12.61    1000          0.20      1.57      2.30
2015-03-05  12.88  13.45  13.16  12.87    1000          0.26      2.02      3.19
2015-03-06  13.17  14.48  14.28  13.13    1000          1.12      8.51      6.16

Sorting a Series

stock_data["open"].sort_values(ascending=False).head()
2015-06-15    34.99
2015-06-12    34.69
2015-06-10    34.10
2017-11-01    33.85
2015-06-11    33.17
Name: open, dtype: float64
stock_data["open"].sort_index(ascending=False).head()  # 降序
2018-02-27    23.53
2018-02-26    22.80
2018-02-23    22.88
2018-02-22    22.25
2018-02-14    21.49
Name: open, dtype: float64
stock_data["open"].sort_index().head()  # 升序
2015-03-02    12.25
2015-03-03    12.52
2015-03-04    12.80
2015-03-05    12.88
2015-03-06    13.17
Name: open, dtype: float64

Arithmetic operations

  • add()
  • sub()
a1=pd.Series(range(10,13))
a2=pd.Series(range(20,25))
a1+a2
0    30.0
1    32.0
2    34.0
3     NaN
4     NaN
dtype: float64
a1.add(a2)  

# the two Series are aligned by index first;
# aligned positions are added, and positions present in only one Series become NaN
0    30.0
1    32.0
2    34.0
3     NaN
4     NaN
dtype: float64
a1.add(a2,fill_value=0)

# fill_value substitutes this value for the missing side before adding, so no NaN is produced
0    30.0
1    32.0
2    34.0
3    23.0
4    24.0
dtype: float64
stock_data['close'].add(10).head()
2018-02-27    34.16
2018-02-26    33.53
2018-02-23    32.82
2018-02-22    32.28
2018-02-14    31.92
Name: close, dtype: float64
stock_data['close']+100
2018-02-27    124.16
2018-02-26    123.53
2018-02-23    122.82
2018-02-22    122.28
2018-02-14    121.92
               ...  
2015-03-06    114.28
2015-03-05    113.16
2015-03-04    112.90
2015-03-03    112.70
2015-03-02    112.52
Name: close, Length: 643, dtype: float64
stock_data.head()
             open   high  close    low  volume  price_change  p_change  turnover
2018-02-27  23.53  25.88  24.16  23.53    1000          0.63      2.68      2.39
2018-02-26  22.80  23.78  23.53  22.80    1000          0.69      3.02      1.53
2018-02-23  22.88  23.37  22.82  22.71    1000          0.54      2.42      1.32
2018-02-22  22.25  22.76  22.28  22.02    1000          0.36      1.64      0.90
2018-02-14  21.49  21.99  21.92  21.48    1000          0.44      2.05      0.58
close = stock_data['close']
open1 = stock_data['open']

stock_data['m_price_change'] = close.sub(open1)  # close - open = m_price_change
stock_data.head()
             open   high  close    low  volume  price_change  p_change  turnover  m_price_change
2018-02-27  23.53  25.88  24.16  23.53    1000          0.63      2.68      2.39            0.63
2018-02-26  22.80  23.78  23.53  22.80    1000          0.69      3.02      1.53            0.73
2018-02-23  22.88  23.37  22.82  22.71    1000          0.54      2.42      1.32           -0.06
2018-02-22  22.25  22.76  22.28  22.02    1000          0.36      1.64      0.90            0.03
2018-02-14  21.49  21.99  21.92  21.48    1000          0.44      2.05      0.58            0.43

Boolean operators: <, >, |, &

stock_data[stock_data['open']>20].head()
             open   high  close    low  volume  price_change  p_change  turnover  m_price_change
2018-02-27  23.53  25.88  24.16  23.53    1000          0.63      2.68      2.39            0.63
2018-02-26  22.80  23.78  23.53  22.80    1000          0.69      3.02      1.53            0.73
2018-02-23  22.88  23.37  22.82  22.71    1000          0.54      2.42      1.32           -0.06
2018-02-22  22.25  22.76  22.28  22.02    1000          0.36      1.64      0.90            0.03
2018-02-14  21.49  21.99  21.92  21.48    1000          0.44      2.05      0.58            0.43
data=stock_data
data[(data["open"] > 23) & (data["open"] < 24)].head()
             open   high  close    low  volume  price_change  p_change  turnover  m_price_change
2018-02-27  23.53  25.88  24.16  23.53    1000          0.63      2.68      2.39            0.63
2018-02-01  23.71  23.86  22.42  22.22    1000         -1.30     -5.48      1.66           -1.29
2018-01-31  23.85  23.98  23.72  23.31    1000         -0.11     -0.46      1.23           -0.13
2018-01-30  23.71  24.08  23.83  23.70    1000          0.05      0.21      0.81            0.12
2018-01-16  23.40  24.60  24.40  23.30    1000          0.96      4.10      2.54            1.00

Boolean helper functions

  • data.query()
  • isin() checks whether each value is in a given set of values

See also: Pandas study notes 10 — the high-performance eval and query methods

data.query("open<24 & open>23").head()
             open   high  close    low  volume  price_change  p_change  turnover  m_price_change
2018-02-27  23.53  25.88  24.16  23.53    1000          0.63      2.68      2.39            0.63
2018-02-01  23.71  23.86  22.42  22.22    1000         -1.30     -5.48      1.66           -1.29
2018-01-31  23.85  23.98  23.72  23.31    1000         -0.11     -0.46      1.23           -0.13
2018-01-30  23.71  24.08  23.83  23.70    1000          0.05      0.21      0.81            0.12
2018-01-16  23.40  24.60  24.40  23.30    1000          0.96      4.10      2.54            1.00
data.query("open>23 & open<24").head()
             open   high  close    low  volume  price_change  p_change  turnover  m_price_change
2018-02-27  23.53  25.88  24.16  23.53    1000          0.63      2.68      2.39            0.63
2018-02-01  23.71  23.86  22.42  22.22    1000         -1.30     -5.48      1.66           -1.29
2018-01-31  23.85  23.98  23.72  23.31    1000         -0.11     -0.46      1.23           -0.13
2018-01-30  23.71  24.08  23.83  23.70    1000          0.05      0.21      0.81            0.12
2018-01-16  23.40  24.60  24.40  23.30    1000          0.96      4.10      2.54            1.00
# check whether 'turnover' equals 4.19 or 2.39
# isin() takes a list of candidate values and returns a boolean mask used for filtering

data[data['turnover'].isin([4.19,2.39])]
             open   high  close    low  volume  price_change  p_change  turnover  m_price_change
2018-02-27  23.53  25.88  24.16  23.53    1000          0.63      2.68      2.39            0.63
2017-07-25  23.07  24.20  23.70  22.64    1000          0.67      2.91      4.19            0.63
2016-09-28  19.88  20.98  20.86  19.71    1000          0.98      4.93      2.39            0.98
2015-04-07  16.54  17.98  17.54  16.50    1000          0.88      5.28      4.19            1.00


## Statistical operations
- data.describe() returns summary statistics: count, mean, std (standard deviation), min, quartiles, max
data.describe()
             open        high       close         low  volume  price_change    p_change    turnover  m_price_change
count  643.000000  643.000000  643.000000  643.000000   643.0    643.000000  643.000000  643.000000      643.000000
mean    21.272706   21.900513   21.336267   20.771835  1000.0      0.018802    0.190280    2.936190        0.063561
std      3.930973    4.077578    3.942806    3.791968     0.0      0.898476    4.079698    2.079375        0.800565
min     12.250000   12.670000   12.360000   12.200000  1000.0     -3.520000  -10.030000    0.040000       -3.300000
25%     19.000000   19.500000   19.045000   18.525000  1000.0     -0.390000   -1.850000    1.360000       -0.300000
50%     21.440000   21.970000   21.450000   20.980000  1000.0      0.050000    0.260000    2.500000        0.080000
75%     23.400000   24.065000   23.415000   22.850000  1000.0      0.455000    2.305000    3.915000        0.450000
max     34.990000   36.350000   35.210000   34.010000  1000.0      3.030000   10.030000   12.560000        3.410000

Statistical functions

  • sum

  • mean

  • median : middle value

  • min

  • max

  • mode : most frequent value

  • abs : absolute value

  • prod : product of the values along an axis

  • std : standard deviation

  • var : variance

  • idxmax : index label of the maximum value

  • idxmin

  • describe : summary statistics

  • argmax : position of the maximum value
    Note: argmax() is deprecated for this use; prefer idxmax().

A few of the less common ones (mode, var, abs, idxmin, prod) are shown in the short sketch right after this list.
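A minimal sketch on a throwaway frame (the frame and its column names are made up purely for illustration):

tmp = pd.DataFrame({"a": [1, 2, 2, -4], "b": [3.0, 3.0, 1.0, 2.0]})

tmp.mode()     # most frequent value per column
tmp.var()      # variance per column
tmp.abs()      # element-wise absolute value
tmp.idxmin()   # index label of the minimum per column
tmp.prod()     # product of the values in each column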

data.sum(axis=1)  # sum across each row (over the columns)
2018-02-27    1103.43
2018-02-26    1098.88
2018-02-23    1096.00
2018-02-22    1092.24
2018-02-14    1090.38
               ...   
2015-03-06    1071.96
2015-03-05    1058.11
2015-03-04    1055.40
2015-03-03    1057.36
2015-03-02    1056.15
Length: 643, dtype: float64
data.sum(axis=0)  # sum each column (over the rows)
open               13678.35
high               14082.03
close              13719.22
low                13356.29
volume            643000.00
price_change          12.09
p_change             122.35
turnover            1887.97
m_price_change        40.87
dtype: float64
df = pd.DataFrame({'col1':[2,3,5,4,6,9],
                  'col2':[1,2,2,3,6,7]})

df
   col1  col2
0     2     1
1     3     2
2     5     2
3     4     3
4     6     6
5     9     7
df.median()
col1    4.5
col2    2.5
dtype: float64
df.idxmax()
col1    5
col2    5
dtype: int64
df['col1'].argmax()
D:\Anaconda\an\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: 
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  """Entry point for launching an IPython kernel.





5

Cumulative statistics

  • cumsum : running sum of the first 1, 2, 3, …, n values
  • cummax : running maximum of the first 1, 2, 3, …, n values
  • cummin : running minimum of the first 1, 2, 3, …, n values
  • cumprod : running product of the first 1, 2, 3, …, n values
data = data.sort_index()

data.head()
             open   high  close    low  volume  price_change  p_change  turnover  m_price_change
2015-03-02  12.25  12.67  12.52  12.20    1000          0.32      2.62      3.30            0.27
2015-03-03  12.52  13.06  12.70  12.52    1000          0.18      1.44      4.76            0.18
2015-03-04  12.80  12.92  12.90  12.61    1000          0.20      1.57      2.30            0.10
2015-03-05  12.88  13.45  13.16  12.87    1000          0.26      2.02      3.19            0.28
2015-03-06  13.17  14.48  14.28  13.13    1000          1.12      8.51      6.16            1.11
stick=data['p_change']

stick.head()
2015-03-02    2.62
2015-03-03    1.44
2015-03-04    1.57
2015-03-05    2.02
2015-03-06    8.51
Name: p_change, dtype: float64
stick.cumsum()
2015-03-02      2.62
2015-03-03      4.06
2015-03-04      5.63
2015-03-05      7.65
2015-03-06     16.16
               ...  
2018-02-14    112.59
2018-02-22    114.23
2018-02-23    116.65
2018-02-26    119.67
2018-02-27    122.35
Name: p_change, Length: 643, dtype: float64
stick1=stick.cumsum()
stick.sum()
122.35000000000001
import matplotlib.pyplot as plt

stick1.plot()
plt.show()

[Figure: line plot of the cumulative p_change series (output_98_0.png)]


data.head()
             open   high  close    low  volume  price_change  p_change  turnover  m_price_change
2015-03-02  12.25  12.67  12.52  12.20    1000          0.32      2.62      3.30            0.27
2015-03-03  12.52  13.06  12.70  12.52    1000          0.18      1.44      4.76            0.18
2015-03-04  12.80  12.92  12.90  12.61    1000          0.20      1.57      2.30            0.10
2015-03-05  12.88  13.45  13.16  12.87    1000          0.26      2.02      3.19            0.28
2015-03-06  13.17  14.48  14.28  13.13    1000          1.12      8.51      6.16            1.11

Custom operations

  • apply(func, axis=0)
    • func : a user-defined function
    • axis=0 : default, applies the function to each column; axis=1 applies it to each row

See also: pandas function application — the apply, applymap and map methods

data[['open','close']].apply(lambda x:x.max()-x.min(),axis=0)
open     22.74
close    22.85
dtype: float64
data[['open','close']].apply(lambda x:x[0]-x[1],axis=0)
open    -0.27
close   -0.18
dtype: float64
data[['open','close']].apply(lambda x:x.max()-x.min(),axis=1)
2015-03-02    0.27
2015-03-03    0.18
2015-03-04    0.10
2015-03-05    0.28
2015-03-06    1.11
              ... 
2018-02-14    0.43
2018-02-22    0.03
2018-02-23    0.06
2018-02-26    0.73
2018-02-27    0.63
Length: 643, dtype: float64

map()

See also: the three pandas data-processing workhorses — map, apply and applymap explained

See also: a detailed guide to pandas groupby

boolean=[True,False]
gender=["男","女"]
color=["white","black","yellow"]
data=pd.DataFrame({
    "height":np.random.randint(150,190,100),
    "weight":np.random.randint(40,90,100),
    "smoker":[boolean[x] for x in np.random.randint(0,2,100)],
    "gender":[gender[x] for x in np.random.randint(0,2,100)],
    "age":np.random.randint(15,90,100),
    "color":[color[x] for x in np.random.randint(0,len(color),100) ]
}
)
data.head()
   height  weight  smoker gender  age   color
0     168      47    True      男   82   black
1     185      62   False      男   39  yellow
2     153      71   False      女   77   white
3     168      80    True      男   28   black
4     182      55    True      男   49   white
# map 男 (male) to 1 and 女 (female) to 0

data['gender']=data['gender'].map({'男':1,'女':0})

data.head()
   height  weight  smoker  gender  age   color
0     168      47    True       1   82   black
1     185      62   False       1   39  yellow
2     153      71   False       0   77   white
3     168      80    True       1   28   black
4     182      55    True       1   49   white
def apply_age(x,bias):
    return x+bias

# extra arguments are passed to the function as a tuple via args
data["age"] = data["age"].apply(apply_age,args=(-3,))
data.head()
   height  weight  smoker  gender  age   color
0     168      47    True       1   79   black
1     185      62   False       1   36  yellow
2     153      71   False       0   74   white
3     168      80    True       1   25   black
4     182      55    True       1   46   white
# sum along axis 0 (column totals)
data[["height","weight","age"]].apply(np.sum, axis=0)
height    17025
weight     6575
age        4924
dtype: int64
# take the natural log of every value (applied column by column along axis 0)
data[["height","weight","age"]].apply(np.log, axis=0)
      height    weight       age
0   5.123964  3.850148  4.369448
1   5.220356  4.127134  3.583519
2   5.030438  4.262680  4.304065
3   5.123964  4.382027  3.218876
4   5.204007  4.007333  3.828641
..       ...       ...       ...
95  5.117994  3.912023  4.158883
96  5.159055  4.465908  4.043051
97  5.181784  4.234107  4.219508
98  5.017280  4.060443  2.772589
99  5.198497  4.219508  3.970292

100 rows × 3 columns

Displaying every value in a DataFrame, or a single column, with two decimal places

## applymap
df = pd.DataFrame(
    {
        "A":np.random.randn(5),
        "B":np.random.randn(5),
        "C":np.random.randn(5),
        "D":np.random.randn(5),
        "E":np.random.randn(5),
    }
)
df.head()
          A         B         C         D         E
0  0.396596  2.651814  1.072639 -1.852616 -0.295315
1 -1.165494 -0.538050 -0.975762 -0.146247 -0.927800
2 -1.159884  0.415855  1.870558 -1.328274  0.487950
3 -0.249029  1.578621 -1.034431 -1.828904  0.554523
4 -0.012742  0.541232  0.065608  1.643557  0.055912
# format every value in the DataFrame to two decimal places

df.applymap(lambda x:"%.2f" % x)
       A      B      C      D      E
0   0.40   2.65   1.07  -1.85  -0.30
1  -1.17  -0.54  -0.98  -0.15  -0.93
2  -1.16   0.42   1.87  -1.33   0.49
3  -0.25   1.58  -1.03  -1.83   0.55
4  -0.01   0.54   0.07   1.64   0.06
df['E']=df['E'].map(lambda x:"%.2f" % x)  # format only column 'E'
df
          A         B         C         D      E
0  0.396596  2.651814  1.072639 -1.852616  -0.30
1 -1.165494 -0.538050 -0.975762 -0.146247  -0.93
2 -1.159884  0.415855  1.870558 -1.328274   0.49
3 -0.249029  1.578621 -1.034431 -1.828904   0.55
4 -0.012742  0.541232  0.065608  1.643557   0.06

Plotting with pandas

  • DataFrame.plot(x=None, y=None, kind='line')

    • x : values for the x axis
    • y : values for the y axis
    • kind : str
      • 'line': line plot (default)
      • 'bar': vertical bar chart
      • 'barh': horizontal bar chart
      • 'hist': histogram
      • 'pie': pie chart
      • 'scatter': scatter plot

data['open'].plot(kind='line')
<matplotlib.axes._subplots.AxesSubplot at 0x22afcafa8c8>

[Figure: line plot of the 'open' column (output_124_1.png)]

data['open'].plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x22afe0fc148>

[Figure: bar chart of the 'open' column (output_125_1.png)]

pd.read_csv()

import pandas as pd

data = pd.read_csv("./课件/data/stock_day.csv", usecols=["open", "high"])

data.head()
             open   high
2018-02-27  23.53  25.88
2018-02-26  22.80  23.78
2018-02-23  22.88  23.37
2018-02-22  22.25  22.76
2018-02-14  21.49  21.99

to_csv()

df.to_csv('askci.csv', encoding='utf_8_sig')

DataFrame.to_csv(path_or_buf=None, sep=',', columns=None, header=True, index=True,
mode='w', encoding=None)

  • path_or_buf : file path or buffer to write to
  • sep : field separator
  • columns : subset of columns to write
  • mode : 'w' overwrites the file, 'a' appends to it
  • index : whether to write the row index
  • header : whether to write the column names

A short example using several of these parameters follows below.
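A minimal sketch (assuming the stock_data frame from above; the output file name is arbitrary):

# write only 'open' and 'close', keep the date index, then append more rows without repeating the header
stock_data.to_csv("stock_subset.csv", columns=["open", "close"], index=True, header=True, mode="w")
stock_data.to_csv("stock_subset.csv", columns=["open", "close"], index=True, header=False, mode="a")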

Handling missing values

  • pd.isnull()
  • pd.notnull()
  • dropna(axis='rows') drops rows that contain missing values
  • fillna(value, inplace=True) replaces missing values with a given value

Working with the movie dataset

movie = pd.read_csv("./课件/data/IMDB-Movie-Data.csv")

movie.head()
   Rank | Title                   | Genre                    | Description                                       | Director             | Actors                                            | Year | Runtime (Minutes) | Rating | Votes  | Revenue (Millions) | Metascore
0     1 | Guardians of the Galaxy | Action,Adventure,Sci-Fi  | A group of intergalactic criminals are forced ... | James Gunn           | Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... | 2014 |               121 |    8.1 | 757074 |             333.13 |      76.0
1     2 | Prometheus              | Adventure,Mystery,Sci-Fi | Following clues to the origin of mankind, a te... | Ridley Scott         | Noomi Rapace, Logan Marshall-Green, Michael Fa... | 2012 |               124 |    7.0 | 485820 |             126.46 |      65.0
2     3 | Split                   | Horror,Thriller          | Three girls are kidnapped by a man with a diag... | M. Night Shyamalan   | James McAvoy, Anya Taylor-Joy, Haley Lu Richar... | 2016 |               117 |    7.3 | 157606 |             138.12 |      62.0
3     4 | Sing                    | Animation,Comedy,Family  | In a city of humanoid animals, a hustling thea... | Christophe Lourdelet | Matthew McConaughey,Reese Witherspoon, Seth Ma... | 2016 |               108 |    7.2 |  60545 |             270.32 |      59.0
4     5 | Suicide Squad           | Action,Adventure,Fantasy | A secret government agency recruits some of th... | David Ayer           | Will Smith, Jared Leto, Margot Robbie, Viola D... | 2016 |               123 |    6.2 | 393727 |             325.02 |      40.0
np.any(pd.isnull(movie))  # True if there is at least one missing value anywhere
True
np.all(pd.isnull(movie))  # True only if every value were missing, so this is False here
False
data = movie.dropna()
np.any(pd.isnull(data))  # after dropna() no missing values remain
False
# fill missing values with the column mean

movie['Revenue (Millions)'].fillna(value=movie['Revenue (Millions)'].mean()) # inplace=True
0      333.130000
1      126.460000
2      138.120000
3      270.320000
4      325.020000
          ...    
995     82.956376
996     17.540000
997     58.010000
998     82.956376
999     19.640000
Name: Revenue (Millions), Length: 1000, dtype: float64
# fill missing values in every column that contains any

for i in movie.columns:
    if np.all(pd.notnull(movie[i])) == False:
        print(i)
        movie[i].fillna(movie[i].mean(),inplace=True)
Revenue (Millions)
Metascore
wis = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data")
wis.head()
   1000025  5   1  1.1  1.2  2  1.3  3  1.4  1.5  2.1
0  1002945  5   4    4    5  7   10  3    2    1    2
1  1015425  3   1    1    1  2    2  3    1    1    2
2  1016277  6   8    8    1  3    4  3    7    1    2
3  1017023  4   1    1    3  2    1  3    1    1    2
4  1017122  8  10   10    8  7   10  9    7    1    4

replace()

wis = wis.replace(to_replace="?", value=np.NaN)

wis.head()
   1000025  5   1  1.1  1.2  2  1.3  3  1.4  1.5  2.1
0  1002945  5   4    4    5  7   10  3    2    1    2
1  1015425  3   1    1    1  2    2  3    1    1    2
2  1016277  6   8    8    1  3    4  3    7    1    2
3  1017023  4   1    1    3  2    1  3    1    1    2
4  1017122  8  10   10    8  7   10  9    7    1    4
np.any(pd.isnull(wis))  # True if any missing value remains
True
wis = wis.dropna()
np.any(pd.isnull(wis))  # after dropna() no missing values remain
False

Data discretization (binning)

A toy example: continuous heights are first binned and then one-hot encoded (the three unlabeled
columns below are the resulting height groups); a sketch of producing such a table with pd.cut
and pd.get_dummies follows below.

ID  height
1   180
2   189
3   170.5
4   150

ID  group 1  group 2  group 3
1         1        0        0
2         1        0        0
3         0        1        0
4         0        0        1
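A minimal sketch of how such a table could be produced; the bin edges and labels here are illustrative assumptions, not taken from the notes:

heights = pd.Series([180, 189, 170.5, 150], index=[1, 2, 3, 4], name="height")

# three illustrative bins: short (<=160), medium (160-175], tall (>175)
groups = pd.cut(heights, bins=[0, 160, 175, 250], labels=["short", "medium", "tall"])

# one-hot encode the resulting categories
pd.get_dummies(groups)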
data = pd.read_csv("./课件/data/stock_day.csv")
data_p = data['p_change']

data_p.head()
2018-02-27    2.68
2018-02-26    3.02
2018-02-23    2.42
2018-02-22    1.64
2018-02-14    2.05
Name: p_change, dtype: float64

pd.qcut(data, q)

  • splits the data into q quantile-based bins (roughly equal-sized groups); the result is usually combined with value_counts() to count how many values fall into each bin
  • series.value_counts()
# let qcut pick the bin edges itself (10 quantile bins)
qcut_r = pd.qcut(data_p,q=10)
qcut_r
2018-02-27    (1.738, 2.938]
2018-02-26     (2.938, 5.27]
2018-02-23    (1.738, 2.938]
2018-02-22     (0.94, 1.738]
2018-02-14    (1.738, 2.938]
                   ...      
2015-03-06     (5.27, 10.03]
2015-03-05    (1.738, 2.938]
2015-03-04     (0.94, 1.738]
2015-03-03     (0.94, 1.738]
2015-03-02    (1.738, 2.938]
Name: p_change, Length: 643, dtype: category
Categories (10, interval[float64]): [(-10.030999999999999, -4.836] < (-4.836, -2.444] < (-2.444, -1.352] < (-1.352, -0.462] ... (0.94, 1.738] < (1.738, 2.938] < (2.938, 5.27] < (5.27, 10.03]]
# count how many values fall into each bin
qcut_r.value_counts()
(5.27, 10.03]                    65
(0.26, 0.94]                     65
(-0.462, 0.26]                   65
(-10.030999999999999, -4.836]    65
(2.938, 5.27]                    64
(1.738, 2.938]                   64
(-1.352, -0.462]                 64
(-2.444, -1.352]                 64
(-4.836, -2.444]                 64
(0.94, 1.738]                    63
Name: p_change, dtype: int64
data.head()
             open   high  close    low    volume  price_change  p_change     ma5    ma10    ma20     v_ma5    v_ma10    v_ma20  turnover
2018-02-27  23.53  25.88  24.16  23.53  95578.03          0.63      2.68  22.942  22.142  22.875  53782.64  46738.65  55576.11      2.39
2018-02-26  22.80  23.78  23.53  22.80  60985.11          0.69      3.02  22.406  21.955  22.942  40827.52  42736.34  56007.50      1.53
2018-02-23  22.88  23.37  22.82  22.71  52914.01          0.54      2.42  21.938  21.929  23.022  35119.58  41871.97  56372.85      1.32
2018-02-22  22.25  22.76  22.28  22.02  36105.01          0.36      1.64  21.446  21.909  23.137  35397.58  39904.78  60149.60      0.90
2018-02-14  21.49  21.99  21.92  21.48  23331.04          0.44      2.05  21.366  21.923  23.253  33590.21  42935.74  61716.11      0.58

Binning with custom intervals

  • pd.cut(data, bins)
import pandas as pd
import numpy as np

bins = [-100,-7,-5,-3,0,3,5,7,100]

p_counts = pd.cut(data_p,bins=bins)

p_counts
2018-02-27      (0, 3]
2018-02-26      (3, 5]
2018-02-23      (0, 3]
2018-02-22      (0, 3]
2018-02-14      (0, 3]
                ...   
2015-03-06    (7, 100]
2015-03-05      (0, 3]
2015-03-04      (0, 3]
2015-03-03      (0, 3]
2015-03-02      (0, 3]
Name: p_change, Length: 643, dtype: category
Categories (8, interval[int64]): [(-100, -7] < (-7, -5] < (-5, -3] < (-3, 0] < (0, 3] < (3, 5] < (5, 7] < (7, 100]]
p_counts.value_counts()
(0, 3]        215
(-3, 0]       188
(3, 5]         57
(-5, -3]       51
(7, 100]       35
(5, 7]         35
(-100, -7]     34
(-7, -5]       28
Name: p_change, dtype: int64
bins = [-100,-7,-5,-3,0,3,5,7,100]

cut_r = pd.cut(data_p,bins=bins)
cut_r.value_counts()
(0, 3]        215
(-3, 0]       188
(3, 5]         57
(-5, -3]       51
(7, 100]       35
(5, 7]         35
(-100, -7]     34
(-7, -5]       28
Name: p_change, dtype: int64
cut_r.head()
2018-02-27    (0, 3]
2018-02-26    (3, 5]
2018-02-23    (0, 3]
2018-02-22    (0, 3]
2018-02-14    (0, 3]
Name: p_change, dtype: category
Categories (8, interval[int64]): [(-100, -7] < (-7, -5] < (-5, -3] < (-3, 0] < (0, 3] < (3, 5] < (5, 7] < (7, 100]]

One-hot encoding

pd.get_dummies(cut_r,prefix="python38_best")
            python38_best_(-100, -7]  python38_best_(-7, -5]  python38_best_(-5, -3]  python38_best_(-3, 0]  python38_best_(0, 3]  python38_best_(3, 5]  python38_best_(5, 7]  python38_best_(7, 100]
2018-02-27                          0                       0                       0                      0                     1                     0                     0                       0
2018-02-26                          0                       0                       0                      0                     0                     1                     0                       0
2018-02-23                          0                       0                       0                      0                     1                     0                     0                       0
2018-02-22                          0                       0                       0                      0                     1                     0                     0                       0
2018-02-14                          0                       0                       0                      0                     1                     0                     0                       0
...                               ...                     ...                     ...                    ...                   ...                   ...                   ...                     ...
2015-03-06                          0                       0                       0                      0                     0                     0                     0                       1
2015-03-05                          0                       0                       0                      0                     1                     0                     0                       0
2015-03-04                          0                       0                       0                      0                     1                     0                     0                       0
2015-03-03                          0                       0                       0                      0                     1                     0                     0                       0
2015-03-02                          0                       0                       0                      0                     1                     0                     0                       0

643 rows × 8 columns






Merging and concatenation

Concatenating data with pd.concat

data
             open   high  close    low     volume  price_change  p_change     ma5    ma10    ma20      v_ma5     v_ma10     v_ma20  turnover
2018-02-27  23.53  25.88  24.16  23.53   95578.03          0.63      2.68  22.942  22.142  22.875   53782.64   46738.65   55576.11      2.39
2018-02-26  22.80  23.78  23.53  22.80   60985.11          0.69      3.02  22.406  21.955  22.942   40827.52   42736.34   56007.50      1.53
2018-02-23  22.88  23.37  22.82  22.71   52914.01          0.54      2.42  21.938  21.929  23.022   35119.58   41871.97   56372.85      1.32
2018-02-22  22.25  22.76  22.28  22.02   36105.01          0.36      1.64  21.446  21.909  23.137   35397.58   39904.78   60149.60      0.90
2018-02-14  21.49  21.99  21.92  21.48   23331.04          0.44      2.05  21.366  21.923  23.253   33590.21   42935.74   61716.11      0.58
...           ...    ...    ...    ...        ...           ...       ...     ...     ...     ...        ...        ...        ...       ...
2015-03-06  13.17  14.48  14.28  13.13  179831.72          1.12      8.51  13.112  13.112  13.112  115090.18  115090.18  115090.18      6.16
2015-03-05  12.88  13.45  13.16  12.87   93180.39          0.26      2.02  12.820  12.820  12.820   98904.79   98904.79   98904.79      3.19
2015-03-04  12.80  12.92  12.90  12.61   67075.44          0.20      1.57  12.707  12.707  12.707  100812.93  100812.93  100812.93      2.30
2015-03-03  12.52  13.06  12.70  12.52  139071.61          0.18      1.44  12.610  12.610  12.610  117681.67  117681.67  117681.67      4.76
2015-03-02  12.25  12.67  12.52  12.20   96291.73          0.32      2.62  12.520  12.520  12.520   96291.73   96291.73   96291.73      3.30

643 rows × 14 columns


data1=pd.get_dummies(cut_r,prefix="python38_best")
data1
            python38_best_(-100, -7]  python38_best_(-7, -5]  python38_best_(-5, -3]  python38_best_(-3, 0]  python38_best_(0, 3]  python38_best_(3, 5]  python38_best_(5, 7]  python38_best_(7, 100]
2018-02-27                          0                       0                       0                      0                     1                     0                     0                       0
2018-02-26                          0                       0                       0                      0                     0                     1                     0                       0
2018-02-23                          0                       0                       0                      0                     1                     0                     0                       0
2018-02-22                          0                       0                       0                      0                     1                     0                     0                       0
2018-02-14                          0                       0                       0                      0                     1                     0                     0                       0
...                               ...                     ...                     ...                    ...                   ...                   ...                   ...                     ...
2015-03-06                          0                       0                       0                      0                     0                     0                     0                       1
2015-03-05                          0                       0                       0                      0                     1                     0                     0                       0
2015-03-04                          0                       0                       0                      0                     1                     0                     0                       0
2015-03-03                          0                       0                       0                      0                     1                     0                     0                       0
2015-03-02                          0                       0                       0                      0                     1                     0                     0                       0

643 rows × 8 columns


pd.concat([data,data1],axis=1)
             open   high  close    low     volume  price_change  p_change     ma5    ma10    ma20  ...     v_ma20  turnover
2018-02-27  23.53  25.88  24.16  23.53   95578.03          0.63      2.68  22.942  22.142  22.875  ...   55576.11      2.39
2018-02-26  22.80  23.78  23.53  22.80   60985.11          0.69      3.02  22.406  21.955  22.942  ...   56007.50      1.53
2018-02-23  22.88  23.37  22.82  22.71   52914.01          0.54      2.42  21.938  21.929  23.022  ...   56372.85      1.32
2018-02-22  22.25  22.76  22.28  22.02   36105.01          0.36      1.64  21.446  21.909  23.137  ...   60149.60      0.90
2018-02-14  21.49  21.99  21.92  21.48   23331.04          0.44      2.05  21.366  21.923  23.253  ...   61716.11      0.58
...           ...    ...    ...    ...        ...           ...       ...     ...     ...     ...  ...        ...       ...
2015-03-06  13.17  14.48  14.28  13.13  179831.72          1.12      8.51  13.112  13.112  13.112  ...  115090.18      6.16
2015-03-05  12.88  13.45  13.16  12.87   93180.39          0.26      2.02  12.820  12.820  12.820  ...   98904.79      3.19
2015-03-04  12.80  12.92  12.90  12.61   67075.44          0.20      1.57  12.707  12.707  12.707  ...  100812.93      2.30
2015-03-03  12.52  13.06  12.70  12.52  139071.61          0.18      1.44  12.610  12.610  12.610  ...  117681.67      4.76
2015-03-02  12.25  12.67  12.52  12.20   96291.73          0.32      2.62  12.520  12.520  12.520  ...   96291.73      3.30

            python38_best_(-100, -7]  python38_best_(-7, -5]  python38_best_(-5, -3]  python38_best_(-3, 0]  python38_best_(0, 3]  python38_best_(3, 5]  python38_best_(5, 7]  python38_best_(7, 100]
2018-02-27                          0                       0                       0                      0                     1                     0                     0                       0
2018-02-26                          0                       0                       0                      0                     0                     1                     0                       0
2018-02-23                          0                       0                       0                      0                     1                     0                     0                       0
2018-02-22                          0                       0                       0                      0                     1                     0                     0                       0
2018-02-14                          0                       0                       0                      0                     1                     0                     0                       0
...                               ...                     ...                     ...                    ...                   ...                   ...                   ...                     ...
2015-03-06                          0                       0                       0                      0                     0                     0                     0                       1
2015-03-05                          0                       0                       0                      0                     1                     0                     0                       0
2015-03-04                          0                       0                       0                      0                     1                     0                     0                       0
2015-03-03                          0                       0                       0                      0                     1                     0                     0                       0
2015-03-02                          0                       0                       0                      0                     1                     0                     0                       0

643 rows × 22 columns

pd.merge(left, right, on=['key1','key2'])

left = pd.DataFrame({'key1':['k0','k0','k1','k2'],
                    'key2':['k0','k1','k0','k1'],
                    'A':['A0','A1','A2','A3'],
                    'B':['B0','B1','B2','B3']})


right = pd.DataFrame({'key1':['k0','k1','k1','k2'],
                    'key2':['k0','k0','k0','k0'],
                    'C':['C0','C1','C2','C3'],
                    'D':['D0','D1','D2','D3']})
left
  key1 key2   A   B
0   k0   k0  A0  B0
1   k0   k1  A1  B1
2   k1   k0  A2  B2
3   k2   k1  A3  B3
right
  key1 key2   C   D
0   k0   k0  C0  D0
1   k1   k0  C1  D1
2   k1   k0  C2  D2
3   k2   k0  C3  D3

pd.merge(left,right,on=['key1','key2']) 

# on : join on the 'key1' and 'key2' columns
  key1 key2   A   B   C   D
0   k0   k0  A0  B0  C0  D0
1   k1   k0  A2  B2  C1  D1
2   k1   k0  A2  B2  C2  D2
When merging on key1 and key2, a row is kept only when both tables contain the same (key1, key2) pair;
if either key differs, that row is dropped (this default behaviour is an inner join).
pd.merge(left,right,on=['key1','key2'],how="inner") 
  key1 key2   A   B   C   D
0   k0   k0  A0  B0  C0  D0
1   k1   k0  A2  B2  C1  D1
2   k1   k0  A2  B2  C2  D2
pd.merge(left,right,on=['key1','key2'],how='outer') 
  key1 key2    A    B    C    D
0   k0   k0   A0   B0   C0   D0
1   k0   k1   A1   B1  NaN  NaN
2   k1   k0   A2   B2   C1   D1
3   k1   k0   A2   B2   C2   D2
4   k2   k1   A3   B3  NaN  NaN
5   k2   k0  NaN  NaN   C3   D3
pd.merge(left,right,on=['key1','key2'],how='left') 
  key1 key2   A   B    C    D
0   k0   k0  A0  B0   C0   D0
1   k0   k1  A1  B1  NaN  NaN
2   k1   k0  A2  B2   C1   D1
3   k1   k0  A2  B2   C2   D2
4   k2   k1  A3  B3  NaN  NaN

Crosstab and pivot table

Exploring the relationship between two columns

  • pd.crosstab()
    returns the raw counts
  • DataFrame.pivot_table()
    returns proportions (here the mean of a 0/1 column, i.e. the share of rising days)
data.index
Index(['2018-02-27', '2018-02-26', '2018-02-23', '2018-02-22', '2018-02-14',
       '2018-02-13', '2018-02-12', '2018-02-09', '2018-02-08', '2018-02-07',
       ...
       '2015-03-13', '2015-03-12', '2015-03-11', '2015-03-10', '2015-03-09',
       '2015-03-06', '2015-03-05', '2015-03-04', '2015-03-03', '2015-03-02'],
      dtype='object', length=643)
time = pd.to_datetime(data.index)
time.day  # day of the month
Int64Index([27, 26, 23, 22, 14, 13, 12,  9,  8,  7,
            ...
            13, 12, 11, 10,  9,  6,  5,  4,  3,  2],
           dtype='int64', length=643)
time.week  # week of the year
Int64Index([ 9,  9,  8,  8,  7,  7,  7,  6,  6,  6,
            ...
            11, 11, 11, 11, 11, 10, 10, 10, 10, 10],
           dtype='int64', length=643)
time.weekday  # day of the week (Monday=0)
Int64Index([1, 0, 4, 3, 2, 1, 0, 4, 3, 2,
            ...
            4, 3, 2, 1, 0, 4, 3, 2, 1, 0],
           dtype='int64', length=643)
data['week'] = time.weekday
data.head()
             open   high  close    low    volume  price_change  p_change     ma5    ma10    ma20     v_ma5    v_ma10    v_ma20  turnover  week
2018-02-27  23.53  25.88  24.16  23.53  95578.03          0.63      2.68  22.942  22.142  22.875  53782.64  46738.65  55576.11      2.39     1
2018-02-26  22.80  23.78  23.53  22.80  60985.11          0.69      3.02  22.406  21.955  22.942  40827.52  42736.34  56007.50      1.53     0
2018-02-23  22.88  23.37  22.82  22.71  52914.01          0.54      2.42  21.938  21.929  23.022  35119.58  41871.97  56372.85      1.32     4
2018-02-22  22.25  22.76  22.28  22.02  36105.01          0.36      1.64  21.446  21.909  23.137  35397.58  39904.78  60149.60      0.90     3
2018-02-14  21.49  21.99  21.92  21.48  23331.04          0.44      2.05  21.366  21.923  23.253  33590.21  42935.74  61716.11      0.58     2
data['p_n'] = np.where(data['p_change']>0,1,0)  # 1 if the day closed up, 0 otherwise
data.head()
             open   high  close    low    volume  price_change  p_change     ma5    ma10    ma20     v_ma5    v_ma10    v_ma20  turnover  week  p_n
2018-02-27  23.53  25.88  24.16  23.53  95578.03          0.63      2.68  22.942  22.142  22.875  53782.64  46738.65  55576.11      2.39     1    1
2018-02-26  22.80  23.78  23.53  22.80  60985.11          0.69      3.02  22.406  21.955  22.942  40827.52  42736.34  56007.50      1.53     0    1
2018-02-23  22.88  23.37  22.82  22.71  52914.01          0.54      2.42  21.938  21.929  23.022  35119.58  41871.97  56372.85      1.32     4    1
2018-02-22  22.25  22.76  22.28  22.02  36105.01          0.36      1.64  21.446  21.909  23.137  35397.58  39904.78  60149.60      0.90     3    1
2018-02-14  21.49  21.99  21.92  21.48  23331.04          0.44      2.05  21.366  21.923  23.253  33590.21  42935.74  61716.11      0.58     2    1

pd.crosstab(data['week'],data['p_n'])
p_n    0   1
week
0     63  62
1     55  76
2     61  71
3     63  65
4     59  68
count = pd.crosstab(data['week'],data['p_n'])
count
p_n    0   1
week
0     63  62
1     55  76
2     61  71
3     63  65
4     59  68
sum = count.sum(axis=1)

sum
week
0    125
1    131
2    132
3    128
4    127
dtype: int64
per = count.div(sum,axis=0)

per
p_n          0         1
week
0     0.504000  0.496000
1     0.419847  0.580153
2     0.462121  0.537879
3     0.492188  0.507812
4     0.464567  0.535433
per.plot(kind='bar')
<matplotlib.axes._subplots.AxesSubplot at 0x165c14dd548>

[Figure: grouped bar chart of the share of up/down days per weekday (output_205_1.png)]

per.plot(kind='bar',stacked=True)  # stacked=True stacks the bars on top of each other
<matplotlib.axes._subplots.AxesSubplot at 0x165c157c508>

[Figure: stacked bar chart of the share of up/down days per weekday (output_206_1.png)]

Pivot table: pivot_table()

data.pivot_table(['p_n'], index='week')
           p_n
week
0     0.496000
1     0.580153
2     0.537879
3     0.507812
4     0.535433

Grouping and aggregation

col = pd.DataFrame({'color': ['white','red','green','red','green'],
                    'object': ['pen','pencil','pencil','ashtray','pen'],
                    'price1': [5.56,4.20,1.30,0.56,2.75],
                    'price2': [4.75,4.12,1.60,0.75,3.15]})

col
   color   object  price1  price2
0  white      pen    5.56    4.75
1    red   pencil    4.20    4.12
2  green   pencil    1.30    1.60
3    red  ashtray    0.56    0.75
4  green      pen    2.75    3.15
col.groupby(['color'])['price1'].mean()
color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64
col['price1'].groupby(col['color']).mean()
color
green    2.025
red      2.380
white    5.560
Name: price1, dtype: float64
# col['price1'].groupby(col['color'], as_index=False).mean()
col.groupby(['color'], as_index=False)['price1'].mean()

# as_index=False keeps 'color' as a regular column and uses a fresh integer index
   color  price1
0  green   2.025
1    red   2.380
2  white   5.560

agg() on grouped data

See also: a detailed guide to pandas groupby

company=["A","B","C"]
data=pd.DataFrame({
    "company":[company[x] for x in np.random.randint(0,len(company),10)],
    "salary":np.random.randint(5,50,10),
    "age":np.random.randint(15,50,10)
}
)
data.head()
  company  salary  age
0       A       8   33
1       C      28   46
2       C      31   43
3       C       8   26
4       A      25   20
# compute the average salary and average age for each company

data.groupby("company").agg('mean')
            salary   age
company
A        14.666667  23.0
B        38.000000  24.0
C        19.166667  36.0
# To aggregate different columns differently, e.g. the median salary and the mean age per company,
# pass a dict to agg:
data.groupby('company').agg({'salary':'median','age':'mean'})
         salary  age
company
A          11.0   23
B          38.0   24
C          18.5   36

transform

# With agg we computed each company's average salary. Suppose we now want a new column avg_salary
# that gives every employee the average salary of their own company (so employees of the same
# company share the same value). Without transform, we would first compute the per-company means
# and then map them back onto the rows through the company column:
avg_salary_dict = data.groupby('company')['salary'].mean().to_dict()
avg_salary_dict
{'A': 39.0, 'B': 40.75, 'C': 38.0}
data['company'].map(avg_salary_dict)
0    40.75
1    39.00
2    40.75
3    38.00
4    38.00
5    39.00
6    40.75
7    38.00
8    40.75
9    39.00
Name: company, dtype: float64
data['avg_salary'] = data['company'].map(avg_salary_dict)
data
  company  salary  age  avg_salary
0       B      25   31       40.75
1       A      47   21       39.00
2       B      42   36       40.75
3       C      49   33       38.00
4       C      30   48       38.00
5       A      37   31       39.00
6       B      47   27       40.75
7       C      35   49       38.00
8       B      49   36       40.75
9       A      33   40       39.00
# With transform, the same result takes a single line:
company=["A","B","C"]
data=pd.DataFrame({
    "company":[company[x] for x in np.random.randint(0,len(company),10)],
    "salary":np.random.randint(5,50,10),
    "age":np.random.randint(15,50,10)
}
)
data.head()
  company  salary  age
0       B      25   31
1       A      47   21
2       B      42   36
3       C      49   33
4       C      30   48
data['avg_salary'] = data.groupby('company')['salary'].transform('mean')

data
  company  salary  age  avg_salary
0       B      25   31       40.75
1       A      47   21       39.00
2       B      42   36       40.75
3       C      49   33       38.00
4       C      30   48       38.00
5       A      37   31       39.00
6       B      47   27       40.75
7       C      35   49       38.00
8       B      49   36       40.75
9       A      33   40       39.00
# Get the data of the oldest employee in each company.
# Note that .max() takes the maximum of every column independently, so it does not return one employee's row:
data.groupby('company').max()
         salary  age  avg_salary
company
A            47   40       39.00
B            49   36       40.75
C            49   49       38.00

apply after groupby

# apply on a grouped object passes each group's sub-DataFrame to the function, so its unit of
# operation is a DataFrame, whereas the apply introduced earlier operates on a Series.
# A small example of apply after groupby: return the oldest employee of each company.
def get_oldest_staff(x):
    df = x.sort_values(by = 'age',ascending=True)
    return df.iloc[-1,:]
oldest_staff = data.groupby('company',as_index=False).apply(get_oldest_staff)

oldest_staff
  company  salary  age  avg_salary
0       A      33   40       39.00
1       B      49   36       40.75
2       C      35   49       38.00







Time series

See also: pandas data processing — working with time series data

pd.to_datetime()

pd.set_index()

pd.resample()

.dt

A small sketch tying these together is shown below.
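A minimal sketch, assuming the same stock_day.csv file used throughout these notes (the column name 'date' is introduced here only for illustration):

df = pd.read_csv("./课件/data/stock_day.csv")

# move the date index into a real column, parse it, and inspect it with the .dt accessor
df = df.reset_index().rename(columns={"index": "date"})
df["date"] = pd.to_datetime(df["date"])          # pd.to_datetime()
df["date"].dt.weekday.head()                     # .dt works on datetime-typed columns

# use the parsed dates as the index, then resample daily prices to month-end frequency
df = df.set_index("date")                        # set_index()
monthly_close = df["close"].resample("M").last() # resample() requires a DatetimeIndex
monthly_close.head()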

