pandas
import pandas as pd
import numpy as np
import re
s1 = np. random. normal( 0 , 1 , ( 10 , 5 ) )
s1
array([[-0.20687054, -0.32658765, 0.08331198, -0.35200115, -0.47287752],
[-1.12880273, -0.60517378, -1.58661317, 0.01774439, -0.1918528 ],
[ 1.58558494, 0.05270011, 1.08498907, 0.52493911, -0.38949185],
[-0.22147045, 1.3795081 , -1.04626624, 0.99894976, -2.07890203],
[ 0.20435856, -0.07516624, 1.70947873, -2.51118659, -0.21273919],
[ 0.14601014, -0.72608304, 0.20719284, 0.30527482, 0.86076936],
[ 0.41551343, 0.38332934, 1.9463179 , 1.31499648, -1.07326932],
[-0.39235779, 0.06411874, 0.06089033, 0.54348207, -0.59586276],
[-0.07680699, -1.56634337, 0.03776305, 0.37830907, 0.29248695],
[-0.28961965, 0.17700808, 0.02997782, 0.89239819, 0.12259162]])
a = pd. DataFrame( s1)
a
0 1 2 3 4 0 1.217958 -2.025388 1.009486 -0.146502 1.846820 1 -0.182713 -2.179872 -0.925558 0.550637 0.859040 2 0.717373 -0.173222 -0.998724 1.218391 0.374692 3 0.600547 -1.381877 1.333913 1.069011 0.962694 4 -0.716170 0.487692 -0.501950 1.724558 -0.670558 5 -1.354031 0.874799 -1.361690 1.456327 1.566802 6 -0.977612 0.640754 -1.463054 -0.390595 1.840542 7 -0.806473 -0.568746 -1.063838 0.251186 -0.226839 8 0.344598 -0.816298 0.798262 1.604684 -0.450517 9 0.889421 -0.505147 1.728332 1.241280 -0.028786
a. shape[ 0 ]
10
st = [ "股票{}" . format ( i+ 1 ) for i in range ( a. shape[ 0 ] ) ]
st
['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10']
pd. DataFrame( s1, index= st)
0 1 2 3 4 股票1 -0.206871 -0.326588 0.083312 -0.352001 -0.472878 股票2 -1.128803 -0.605174 -1.586613 0.017744 -0.191853 股票3 1.585585 0.052700 1.084989 0.524939 -0.389492 股票4 -0.221470 1.379508 -1.046266 0.998950 -2.078902 股票5 0.204359 -0.075166 1.709479 -2.511187 -0.212739 股票6 0.146010 -0.726083 0.207193 0.305275 0.860769 股票7 0.415513 0.383329 1.946318 1.314996 -1.073269 股票8 -0.392358 0.064119 0.060890 0.543482 -0.595863 股票9 -0.076807 -1.566343 0.037763 0.378309 0.292487 股票10 -0.289620 0.177008 0.029978 0.892398 0.122592
pd.date_range()生成一组连续的时间序列
date_range(start=None, end=None, periods=None, freq='B')
start : 开始时间 end : 结束时间 periods : 时间天数 freq : 递进单位,默认为1天,'B'默认略过周末
date = pd. date_range( start= '20190403' , periods= a. shape[ 1 ] , freq= 'B' )
date
DatetimeIndex(['2019-04-03', '2019-04-04', '2019-04-05', '2019-04-08',
'2019-04-09'],
dtype='datetime64[ns]', freq='B')
pd. DataFrame( s1, index= st, columns= date)
2019-04-03 2019-04-04 2019-04-05 2019-04-08 2019-04-09 股票1 -0.206871 -0.326588 0.083312 -0.352001 -0.472878 股票2 -1.128803 -0.605174 -1.586613 0.017744 -0.191853 股票3 1.585585 0.052700 1.084989 0.524939 -0.389492 股票4 -0.221470 1.379508 -1.046266 0.998950 -2.078902 股票5 0.204359 -0.075166 1.709479 -2.511187 -0.212739 股票6 0.146010 -0.726083 0.207193 0.305275 0.860769 股票7 0.415513 0.383329 1.946318 1.314996 -1.073269 股票8 -0.392358 0.064119 0.060890 0.543482 -0.595863 股票9 -0.076807 -1.566343 0.037763 0.378309 0.292487 股票10 -0.289620 0.177008 0.029978 0.892398 0.122592
DataFrame
df= pd. DataFrame( s1, index= st, columns= date)
df
2019-04-03 2019-04-04 2019-04-05 2019-04-08 2019-04-09 股票1 -0.206871 -0.326588 0.083312 -0.352001 -0.472878 股票2 -1.128803 -0.605174 -1.586613 0.017744 -0.191853 股票3 1.585585 0.052700 1.084989 0.524939 -0.389492 股票4 -0.221470 1.379508 -1.046266 0.998950 -2.078902 股票5 0.204359 -0.075166 1.709479 -2.511187 -0.212739 股票6 0.146010 -0.726083 0.207193 0.305275 0.860769 股票7 0.415513 0.383329 1.946318 1.314996 -1.073269 股票8 -0.392358 0.064119 0.060890 0.543482 -0.595863 股票9 -0.076807 -1.566343 0.037763 0.378309 0.292487 股票10 -0.289620 0.177008 0.029978 0.892398 0.122592
df. shape
(10, 5)
df. index
Index(['股票1', '股票2', '股票3', '股票4', '股票5', '股票6', '股票7', '股票8', '股票9', '股票10'], dtype='object')
df. columns
DatetimeIndex(['2019-04-03', '2019-04-04', '2019-04-05', '2019-04-08',
'2019-04-09'],
dtype='datetime64[ns]', freq='B')
df. values
array([[-0.20687054, -0.32658765, 0.08331198, -0.35200115, -0.47287752],
[-1.12880273, -0.60517378, -1.58661317, 0.01774439, -0.1918528 ],
[ 1.58558494, 0.05270011, 1.08498907, 0.52493911, -0.38949185],
[-0.22147045, 1.3795081 , -1.04626624, 0.99894976, -2.07890203],
[ 0.20435856, -0.07516624, 1.70947873, -2.51118659, -0.21273919],
[ 0.14601014, -0.72608304, 0.20719284, 0.30527482, 0.86076936],
[ 0.41551343, 0.38332934, 1.9463179 , 1.31499648, -1.07326932],
[-0.39235779, 0.06411874, 0.06089033, 0.54348207, -0.59586276],
[-0.07680699, -1.56634337, 0.03776305, 0.37830907, 0.29248695],
[-0.28961965, 0.17700808, 0.02997782, 0.89239819, 0.12259162]])
df. T
股票1 股票2 股票3 股票4 股票5 股票6 股票7 股票8 股票9 股票10 2019-04-03 -0.206871 -1.128803 1.585585 -0.221470 0.204359 0.146010 0.415513 -0.392358 -0.076807 -0.289620 2019-04-04 -0.326588 -0.605174 0.052700 1.379508 -0.075166 -0.726083 0.383329 0.064119 -1.566343 0.177008 2019-04-05 0.083312 -1.586613 1.084989 -1.046266 1.709479 0.207193 1.946318 0.060890 0.037763 0.029978 2019-04-08 -0.352001 0.017744 0.524939 0.998950 -2.511187 0.305275 1.314996 0.543482 0.378309 0.892398 2019-04-09 -0.472878 -0.191853 -0.389492 -2.078902 -0.212739 0.860769 -1.073269 -0.595863 0.292487 0.122592
df. head( 5 )
2019-04-03 2019-04-04 2019-04-05 2019-04-08 2019-04-09 股票1 -0.206871 -0.326588 0.083312 -0.352001 -0.472878 股票2 -1.128803 -0.605174 -1.586613 0.017744 -0.191853 股票3 1.585585 0.052700 1.084989 0.524939 -0.389492 股票4 -0.221470 1.379508 -1.046266 0.998950 -2.078902 股票5 0.204359 -0.075166 1.709479 -2.511187 -0.212739
df. tail( 5 )
2019-04-03 2019-04-04 2019-04-05 2019-04-08 2019-04-09 股票6 0.146010 -0.726083 0.207193 0.305275 0.860769 股票7 0.415513 0.383329 1.946318 1.314996 -1.073269 股票8 -0.392358 0.064119 0.060890 0.543482 -0.595863 股票9 -0.076807 -1.566343 0.037763 0.378309 0.292487 股票10 -0.289620 0.177008 0.029978 0.892398 0.122592
设置索引
st = [ "股票_{}" . format ( i+ 1 ) for i in range ( a. shape[ 0 ] ) ]
st
['股票_1',
'股票_2',
'股票_3',
'股票_4',
'股票_5',
'股票_6',
'股票_7',
'股票_8',
'股票_9',
'股票_10']
df. index= st
df
2019-04-03 2019-04-04 2019-04-05 2019-04-08 2019-04-09 股票_1 -0.206871 -0.326588 0.083312 -0.352001 -0.472878 股票_2 -1.128803 -0.605174 -1.586613 0.017744 -0.191853 股票_3 1.585585 0.052700 1.084989 0.524939 -0.389492 股票_4 -0.221470 1.379508 -1.046266 0.998950 -2.078902 股票_5 0.204359 -0.075166 1.709479 -2.511187 -0.212739 股票_6 0.146010 -0.726083 0.207193 0.305275 0.860769 股票_7 0.415513 0.383329 1.946318 1.314996 -1.073269 股票_8 -0.392358 0.064119 0.060890 0.543482 -0.595863 股票_9 -0.076807 -1.566343 0.037763 0.378309 0.292487 股票_10 -0.289620 0.177008 0.029978 0.892398 0.122592
重置索引
df. reset_index( )
index 2019-04-03 00:00:00 2019-04-04 00:00:00 2019-04-05 00:00:00 2019-04-08 00:00:00 2019-04-09 00:00:00 0 股票_1 -0.206871 -0.326588 0.083312 -0.352001 -0.472878 1 股票_2 -1.128803 -0.605174 -1.586613 0.017744 -0.191853 2 股票_3 1.585585 0.052700 1.084989 0.524939 -0.389492 3 股票_4 -0.221470 1.379508 -1.046266 0.998950 -2.078902 4 股票_5 0.204359 -0.075166 1.709479 -2.511187 -0.212739 5 股票_6 0.146010 -0.726083 0.207193 0.305275 0.860769 6 股票_7 0.415513 0.383329 1.946318 1.314996 -1.073269 7 股票_8 -0.392358 0.064119 0.060890 0.543482 -0.595863 8 股票_9 -0.076807 -1.566343 0.037763 0.378309 0.292487 9 股票_10 -0.289620 0.177008 0.029978 0.892398 0.122592
df
2019-04-03 2019-04-04 2019-04-05 2019-04-08 2019-04-09 股票_1 -0.206871 -0.326588 0.083312 -0.352001 -0.472878 股票_2 -1.128803 -0.605174 -1.586613 0.017744 -0.191853 股票_3 1.585585 0.052700 1.084989 0.524939 -0.389492 股票_4 -0.221470 1.379508 -1.046266 0.998950 -2.078902 股票_5 0.204359 -0.075166 1.709479 -2.511187 -0.212739 股票_6 0.146010 -0.726083 0.207193 0.305275 0.860769 股票_7 0.415513 0.383329 1.946318 1.314996 -1.073269 股票_8 -0.392358 0.064119 0.060890 0.543482 -0.595863 股票_9 -0.076807 -1.566343 0.037763 0.378309 0.292487 股票_10 -0.289620 0.177008 0.029978 0.892398 0.122592
df. reset_index( drop= True )
2019-04-03 2019-04-04 2019-04-05 2019-04-08 2019-04-09 0 -0.206871 -0.326588 0.083312 -0.352001 -0.472878 1 -1.128803 -0.605174 -1.586613 0.017744 -0.191853 2 1.585585 0.052700 1.084989 0.524939 -0.389492 3 -0.221470 1.379508 -1.046266 0.998950 -2.078902 4 0.204359 -0.075166 1.709479 -2.511187 -0.212739 5 0.146010 -0.726083 0.207193 0.305275 0.860769 6 0.415513 0.383329 1.946318 1.314996 -1.073269 7 -0.392358 0.064119 0.060890 0.543482 -0.595863 8 -0.076807 -1.566343 0.037763 0.378309 0.292487 9 -0.289620 0.177008 0.029978 0.892398 0.122592
以某列的值设置新的索引
df = pd. DataFrame( { 'month' : [ 1 , 2 , 3 , 4 ] ,
'year' : [ 2012 , 2014 , 2015 , 2016 ] ,
'sale' : [ 35 , 40 , 84 , 31 ] } )
df
month year sale 0 1 2012 35 1 2 2014 40 2 3 2015 84 3 4 2016 31
df. set_index( keys= [ 'year' ] )
month sale year 2012 1 35 2014 2 40 2015 3 84 2016 4 31
df. set_index( keys= [ "year" , "month" ] )
sale year month 2012 1 35 2014 2 40 2015 3 84 2016 4 31
stock_data = pd. read_csv( "./课件/data/stock_day.csv" )
stock_data. head( )
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58
stock_data = stock_data. drop( [ "ma5" , "ma10" , "ma20" , "v_ma5" , "v_ma10" , "v_ma20" ] , axis= 1 )
stock_data. head( )
open high close low volume price_change p_change turnover 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 2.39 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 1.53 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 1.32 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 0.90 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 0.58
pandas索引
stock_data[ "open" ] [ "2018-02-27" ]
23.53
loc 按照行列索引的名字
iloc 索引的下标值
ix 混合索引
stock_data. loc[ "2018-02-27" : "2018-02-23" , "high" ]
2018-02-27 25.88
2018-02-26 23.78
2018-02-23 23.37
Name: high, dtype: float64
stock_data. iloc[ : 3 , : 2 ]
open high 2018-02-27 23.53 25.88 2018-02-26 22.80 23.78 2018-02-23 22.88 23.37
stock_data. ix[ : 5 , ( 'high' , 'low' ) ]
D:\Anaconda\an\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
"""Entry point for launching an IPython kernel.
D:\Anaconda\an\lib\site-packages\pandas\core\indexing.py:961: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
return getattr(section, self.name)[new_key]
high low 2018-02-27 25.88 23.53 2018-02-26 23.78 22.80 2018-02-23 23.37 22.71 2018-02-22 22.76 22.02 2018-02-14 21.99 21.48
stock_data. columns
Index(['open', 'high', 'close', 'low', 'volume', 'price_change', 'p_change',
'turnover'],
dtype='object')
get_indexer()
stock_data. columns. get_indexer( [ "open" , "low" ] )
array([0, 3], dtype=int64)
stock_data. index. get_indexer( [ '2018-02-27' ] )
array([0], dtype=int64)
赋值操作
stock_data[ "volume" ] = 100
stock_data. volume = 1000
stock_data. head( )
open high close low volume price_change p_change turnover 2018-02-27 23.53 25.88 24.16 23.53 1000 0.63 2.68 2.39 2018-02-26 22.80 23.78 23.53 22.80 1000 0.69 3.02 1.53 2018-02-23 22.88 23.37 22.82 22.71 1000 0.54 2.42 1.32 2018-02-22 22.25 22.76 22.28 22.02 1000 0.36 1.64 0.90 2018-02-14 21.49 21.99 21.92 21.48 1000 0.44 2.05 0.58
排序
df.sort_values(by=..., ascending=...)
DataFrame的排序
stock_data. sort_values( by= "open" , ascending= True ) . head( )
open high close low volume price_change p_change turnover 2015-03-02 12.25 12.67 12.52 12.20 1000 0.32 2.62 3.30 2015-09-02 12.30 14.11 12.36 12.30 1000 -1.10 -8.17 2.40 2015-03-03 12.52 13.06 12.70 12.52 1000 0.18 1.44 4.76 2015-03-04 12.80 12.92 12.90 12.61 1000 0.20 1.57 2.30 2015-03-05 12.88 13.45 13.16 12.87 1000 0.26 2.02 3.19
stock_data. sort_values( by= [ "open" , "high" ] ) . head( )
open high close low volume price_change p_change turnover 2015-03-02 12.25 12.67 12.52 12.20 1000 0.32 2.62 3.30 2015-09-02 12.30 14.11 12.36 12.30 1000 -1.10 -8.17 2.40 2015-03-03 12.52 13.06 12.70 12.52 1000 0.18 1.44 4.76 2015-03-04 12.80 12.92 12.90 12.61 1000 0.20 1.57 2.30 2015-03-05 12.88 13.45 13.16 12.87 1000 0.26 2.02 3.19
stock_data. sort_index( ) . head( )
open high close low volume price_change p_change turnover 2015-03-02 12.25 12.67 12.52 12.20 1000 0.32 2.62 3.30 2015-03-03 12.52 13.06 12.70 12.52 1000 0.18 1.44 4.76 2015-03-04 12.80 12.92 12.90 12.61 1000 0.20 1.57 2.30 2015-03-05 12.88 13.45 13.16 12.87 1000 0.26 2.02 3.19 2015-03-06 13.17 14.48 14.28 13.13 1000 1.12 8.51 6.16
series的排序
stock_data[ "open" ] . sort_values( ascending= False ) . head( )
2015-06-15 34.99
2015-06-12 34.69
2015-06-10 34.10
2017-11-01 33.85
2015-06-11 33.17
Name: open, dtype: float64
stock_data[ "open" ] . sort_index( ascending= False ) . head( )
2018-02-27 23.53
2018-02-26 22.80
2018-02-23 22.88
2018-02-22 22.25
2018-02-14 21.49
Name: open, dtype: float64
stock_data[ "open" ] . sort_index( ) . head( )
2015-03-02 12.25
2015-03-03 12.52
2015-03-04 12.80
2015-03-05 12.88
2015-03-06 13.17
Name: open, dtype: float64
算术运算
a1= pd. Series( range ( 10 , 13 ) )
a2= pd. Series( range ( 20 , 25 ) )
a1+ a2
0 30.0
1 32.0
2 34.0
3 NaN
4 NaN
dtype: float64
a1. add( a2)
0 30.0
1 32.0
2 34.0
3 NaN
4 NaN
dtype: float64
a1. add( a2, fill_value= 0 )
0 30.0
1 32.0
2 34.0
3 23.0
4 24.0
dtype: float64
stock_data[ 'close' ] . add( 10 ) . head( )
2018-02-27 34.16
2018-02-26 33.53
2018-02-23 32.82
2018-02-22 32.28
2018-02-14 31.92
Name: close, dtype: float64
stock_data[ 'close' ] + 100
2018-02-27 124.16
2018-02-26 123.53
2018-02-23 122.82
2018-02-22 122.28
2018-02-14 121.92
...
2015-03-06 114.28
2015-03-05 113.16
2015-03-04 112.90
2015-03-03 112.70
2015-03-02 112.52
Name: close, Length: 643, dtype: float64
stock_data. head( )
open high close low volume price_change p_change turnover 2018-02-27 23.53 25.88 24.16 23.53 1000 0.63 2.68 2.39 2018-02-26 22.80 23.78 23.53 22.80 1000 0.69 3.02 1.53 2018-02-23 22.88 23.37 22.82 22.71 1000 0.54 2.42 1.32 2018-02-22 22.25 22.76 22.28 22.02 1000 0.36 1.64 0.90 2018-02-14 21.49 21.99 21.92 21.48 1000 0.44 2.05 0.58
close = stock_data[ 'close' ]
open1 = stock_data[ 'open' ]
stock_data[ 'm_price_change' ] = close. sub( open1)
stock_data. head( )
open high close low volume price_change p_change turnover m_price_change 2018-02-27 23.53 25.88 24.16 23.53 1000 0.63 2.68 2.39 0.63 2018-02-26 22.80 23.78 23.53 22.80 1000 0.69 3.02 1.53 0.73 2018-02-23 22.88 23.37 22.82 22.71 1000 0.54 2.42 1.32 -0.06 2018-02-22 22.25 22.76 22.28 22.02 1000 0.36 1.64 0.90 0.03 2018-02-14 21.49 21.99 21.92 21.48 1000 0.44 2.05 0.58 0.43
逻辑运算符<,>,|,&
stock_data[ stock_data[ 'open' ] > 20 ] . head( )
open high close low volume price_change p_change turnover m_price_change 2018-02-27 23.53 25.88 24.16 23.53 1000 0.63 2.68 2.39 0.63 2018-02-26 22.80 23.78 23.53 22.80 1000 0.69 3.02 1.53 0.73 2018-02-23 22.88 23.37 22.82 22.71 1000 0.54 2.42 1.32 -0.06 2018-02-22 22.25 22.76 22.28 22.02 1000 0.36 1.64 0.90 0.03 2018-02-14 21.49 21.99 21.92 21.48 1000 0.44 2.05 0.58 0.43
data= stock_data
data[ ( data[ "open" ] > 23 ) & ( data[ "open" ] < 24 ) ] . head( )
open high close low volume price_change p_change turnover m_price_change 2018-02-27 23.53 25.88 24.16 23.53 1000 0.63 2.68 2.39 0.63 2018-02-01 23.71 23.86 22.42 22.22 1000 -1.30 -5.48 1.66 -1.29 2018-01-31 23.85 23.98 23.72 23.31 1000 -0.11 -0.46 1.23 -0.13 2018-01-30 23.71 24.08 23.83 23.70 1000 0.05 0.21 0.81 0.12 2018-01-16 23.40 24.60 24.40 23.30 1000 0.96 4.10 2.54 1.00
逻辑运算函数
data.query() isin() 判断是否在这个范围内
Pandas学习笔记十——高性能的eval和query方法
data. query( "open<24 & open>23" ) . head( )
open high close low volume price_change p_change turnover m_price_change 2018-02-27 23.53 25.88 24.16 23.53 1000 0.63 2.68 2.39 0.63 2018-02-01 23.71 23.86 22.42 22.22 1000 -1.30 -5.48 1.66 -1.29 2018-01-31 23.85 23.98 23.72 23.31 1000 -0.11 -0.46 1.23 -0.13 2018-01-30 23.71 24.08 23.83 23.70 1000 0.05 0.21 0.81 0.12 2018-01-16 23.40 24.60 24.40 23.30 1000 0.96 4.10 2.54 1.00
data. query( "open>23 & open<24" ) . head( )
open high close low volume price_change p_change turnover m_price_change 2018-02-27 23.53 25.88 24.16 23.53 1000 0.63 2.68 2.39 0.63 2018-02-01 23.71 23.86 22.42 22.22 1000 -1.30 -5.48 1.66 -1.29 2018-01-31 23.85 23.98 23.72 23.31 1000 -0.11 -0.46 1.23 -0.13 2018-01-30 23.71 24.08 23.83 23.70 1000 0.05 0.21 0.81 0.12 2018-01-16 23.40 24.60 24.40 23.30 1000 0.96 4.10 2.54 1.00
data[ data[ 'turnover' ] . isin( [ 4.19 , 2.39 ] ) ]
open high close low volume price_change p_change turnover m_price_change 2018-02-27 23.53 25.88 24.16 23.53 1000 0.63 2.68 2.39 0.63 2017-07-25 23.07 24.20 23.70 22.64 1000 0.67 2.91 4.19 0.63 2016-09-28 19.88 20.98 20.86 19.71 1000 0.98 4.93 2.39 0.98 2015-04-07 16.54 17.98 17.54 16.50 1000 0.88 5.28 4.19 1.00
- data. describe( ) 统计结果count, mean, std( 标准差) , min , max
data. describe( )
open high close low volume price_change p_change turnover m_price_change count 643.000000 643.000000 643.000000 643.000000 643.0 643.000000 643.000000 643.000000 643.000000 mean 21.272706 21.900513 21.336267 20.771835 1000.0 0.018802 0.190280 2.936190 0.063561 std 3.930973 4.077578 3.942806 3.791968 0.0 0.898476 4.079698 2.079375 0.800565 min 12.250000 12.670000 12.360000 12.200000 1000.0 -3.520000 -10.030000 0.040000 -3.300000 25% 19.000000 19.500000 19.045000 18.525000 1000.0 -0.390000 -1.850000 1.360000 -0.300000 50% 21.440000 21.970000 21.450000 20.980000 1000.0 0.050000 0.260000 2.500000 0.080000 75% 23.400000 24.065000 23.415000 22.850000 1000.0 0.455000 2.305000 3.915000 0.450000 max 34.990000 36.350000 35.210000 34.010000 1000.0 3.030000 10.030000 12.560000 3.410000
统计函数
data. sum ( axis= 1 )
2018-02-27 1103.43
2018-02-26 1098.88
2018-02-23 1096.00
2018-02-22 1092.24
2018-02-14 1090.38
...
2015-03-06 1071.96
2015-03-05 1058.11
2015-03-04 1055.40
2015-03-03 1057.36
2015-03-02 1056.15
Length: 643, dtype: float64
data. sum ( axis= 0 )
open 13678.35
high 14082.03
close 13719.22
low 13356.29
volume 643000.00
price_change 12.09
p_change 122.35
turnover 1887.97
m_price_change 40.87
dtype: float64
df = pd. DataFrame( { 'col1' : [ 2 , 3 , 5 , 4 , 6 , 9 ] ,
'col2' : [ 1 , 2 , 2 , 3 , 6 , 7 ] } )
df
col1 col2 0 2 1 1 3 2 2 5 2 3 4 3 4 6 6 5 9 7
df. median( )
col1 4.5
col2 2.5
dtype: float64
df. idxmax( )
col1 5
col2 5
dtype: int64
df[ 'col1' ] . argmax( )
D:\Anaconda\an\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:
The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
"""Entry point for launching an IPython kernel.
5
累计统计函数求和
cumsum 计算前1/2/3/…/n个数的和 cummax 计算前1/2/3/…/n个数的最大值 cummin 计算前1/2/3/…/n个数的最小值 cumprod 计算前1/2/3/…/n个数的积
data = data. sort_index( )
data. head( )
open high close low volume price_change p_change turnover m_price_change 2015-03-02 12.25 12.67 12.52 12.20 1000 0.32 2.62 3.30 0.27 2015-03-03 12.52 13.06 12.70 12.52 1000 0.18 1.44 4.76 0.18 2015-03-04 12.80 12.92 12.90 12.61 1000 0.20 1.57 2.30 0.10 2015-03-05 12.88 13.45 13.16 12.87 1000 0.26 2.02 3.19 0.28 2015-03-06 13.17 14.48 14.28 13.13 1000 1.12 8.51 6.16 1.11
stick= data[ 'p_change' ]
stick. head( )
2015-03-02 2.62
2015-03-03 1.44
2015-03-04 1.57
2015-03-05 2.02
2015-03-06 8.51
Name: p_change, dtype: float64
stick. cumsum( )
2015-03-02 2.62
2015-03-03 4.06
2015-03-04 5.63
2015-03-05 7.65
2015-03-06 16.16
...
2018-02-14 112.59
2018-02-22 114.23
2018-02-23 116.65
2018-02-26 119.67
2018-02-27 122.35
Name: p_change, Length: 643, dtype: float64
stick1= stick. cumsum( )
stick. sum ( )
122.35000000000001
import matplotlib. pyplot as plt
stick1. plot( )
plt. show( )
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lfgAL6dB-1615267786010)(output_98_0.png)]
data. head( )
open high close low volume price_change p_change turnover m_price_change 2015-03-02 12.25 12.67 12.52 12.20 1000 0.32 2.62 3.30 0.27 2015-03-03 12.52 13.06 12.70 12.52 1000 0.18 1.44 4.76 0.18 2015-03-04 12.80 12.92 12.90 12.61 1000 0.20 1.57 2.30 0.10 2015-03-05 12.88 13.45 13.16 12.87 1000 0.26 2.02 3.19 0.28 2015-03-06 13.17 14.48 14.28 13.13 1000 1.12 8.51 6.16 1.11
自定义运算
apply(func,axis=0)
func:自定义 axis=0 : 默认是列,axis=1是行
pandas函数应用——apply、applymap、map方法
data[ [ 'open' , 'close' ] ] . apply ( lambda x: x. max ( ) - x. min ( ) , axis= 0 )
open 22.74
close 22.85
dtype: float64
data[ [ 'open' , 'close' ] ] . apply ( lambda x: x[ 0 ] - x[ 1 ] , axis= 0 )
open -0.27
close -0.18
dtype: float64
data[ [ 'open' , 'close' ] ] . apply ( lambda x: x. max ( ) - x. min ( ) , axis= 1 )
2015-03-02 0.27
2015-03-03 0.18
2015-03-04 0.10
2015-03-05 0.28
2015-03-06 1.11
...
2018-02-14 0.43
2018-02-22 0.03
2018-02-23 0.06
2018-02-26 0.73
2018-02-27 0.63
Length: 643, dtype: float64
map()
Pandas数据处理三板斧——map、apply、applymap详解
Pandas之超好用的Groupby用法详解
boolean= [ True , False ]
gender= [ "男" , "女" ]
color= [ "white" , "black" , "yellow" ]
data= pd. DataFrame( {
"height" : np. random. randint( 150 , 190 , 100 ) ,
"weight" : np. random. randint( 40 , 90 , 100 ) ,
"smoker" : [ boolean[ x] for x in np. random. randint( 0 , 2 , 100 ) ] ,
"gender" : [ gender[ x] for x in np. random. randint( 0 , 2 , 100 ) ] ,
"age" : np. random. randint( 15 , 90 , 100 ) ,
"color" : [ color[ x] for x in np. random. randint( 0 , len ( color) , 100 ) ]
}
)
data. head( )
height weight smoker gender age color 0 168 47 True 男 82 black 1 185 62 False 男 39 yellow 2 153 71 False 女 77 white 3 168 80 True 男 28 black 4 182 55 True 男 49 white
data[ 'gender' ] = data[ 'gender' ] . map ( { '男' : 1 , '女' : 0 } )
data. head( )
height weight smoker gender age color 0 168 47 True 1 82 black 1 185 62 False 1 39 yellow 2 153 71 False 0 77 white 3 168 80 True 1 28 black 4 182 55 True 1 49 white
def apply_age(x, bias):
    """Return *x* shifted by *bias* (helper for Series.apply with args)."""
    shifted = bias + x
    return shifted
data[ "age" ] = data[ "age" ] . apply ( apply_age, args= ( - 3 , ) )
data. head( )
height weight smoker gender age color 0 168 47 True 1 79 black 1 185 62 False 1 36 yellow 2 153 71 False 0 74 white 3 168 80 True 1 25 black 4 182 55 True 1 46 white
data[ [ "height" , "weight" , "age" ] ] . apply ( np. sum , axis= 0 )
height 17025
weight 6575
age 4924
dtype: int64
data[ [ "height" , "weight" , "age" ] ] . apply ( np. log, axis= 0 )
height weight age 0 5.123964 3.850148 4.369448 1 5.220356 4.127134 3.583519 2 5.030438 4.262680 4.304065 3 5.123964 4.382027 3.218876 4 5.204007 4.007333 3.828641 ... ... ... ... 95 5.117994 3.912023 4.158883 96 5.159055 4.465908 4.043051 97 5.181784 4.234107 4.219508 98 5.017280 4.060443 2.772589 99 5.198497 4.219508 3.970292
100 rows × 3 columns
将DataFrame中所有的值或单独一列保留两位小数显示
df = pd. DataFrame(
{
"A" : np. random. randn( 5 ) ,
"B" : np. random. randn( 5 ) ,
"C" : np. random. randn( 5 ) ,
"D" : np. random. randn( 5 ) ,
"E" : np. random. randn( 5 ) ,
}
)
df. head( )
A B C D E 0 0.396596 2.651814 1.072639 -1.852616 -0.295315 1 -1.165494 -0.538050 -0.975762 -0.146247 -0.927800 2 -1.159884 0.415855 1.870558 -1.328274 0.487950 3 -0.249029 1.578621 -1.034431 -1.828904 0.554523 4 -0.012742 0.541232 0.065608 1.643557 0.055912
df. applymap( lambda x: "%.2f" % x)
A B C D E 0 0.40 2.65 1.07 -1.85 -0.30 1 -1.17 -0.54 -0.98 -0.15 -0.93 2 -1.16 0.42 1.87 -1.33 0.49 3 -0.25 1.58 -1.03 -1.83 0.55 4 -0.01 0.54 0.07 1.64 0.06
df[ 'E' ] = df[ 'E' ] . map ( lambda x: "%.2f" % x)
df
A B C D E 0 0.396596 2.651814 1.072639 -1.852616 -0.30 1 -1.165494 -0.538050 -0.975762 -0.146247 -0.93 2 -1.159884 0.415855 1.870558 -1.328274 0.49 3 -0.249029 1.578621 -1.034431 -1.828904 0.55 4 -0.012742 0.541232 0.065608 1.643557 0.06
pandas 画图
DataFrame.plot(x=None, y=None, kind='line')
-kind:str - 'line': 默认是画折线图 - 'bar':条形图 - 'barh':横向条形图 - 'hist':直方图 - 'pie':饼图 - 'scatter':散点图
data[ 'open' ] . plot( kind= 'line' )
<matplotlib.axes._subplots.AxesSubplot at 0x22afcafa8c8>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-aYkQnFzS-1615267786013)(output_124_1.png)]
data[ 'open' ] . plot( kind= 'bar' )
<matplotlib.axes._subplots.AxesSubplot at 0x22afe0fc148>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-kE7J4zVO-1615267786015)(output_125_1.png)]
pd.read_csv()
import pandas as pd
data = pd. read_csv( "./课件/data/stock_day.csv" , usecols= [ "open" , "high" ] )
data. head( )
open high 2018-02-27 23.53 25.88 2018-02-26 22.80 23.78 2018-02-23 22.88 23.37 2018-02-22 22.25 22.76 2018-02-14 21.49 21.99
to_csv()
df.to_csv('askci.csv', encoding='utf_8_sig')
DataFrame.to_csv(path_or_buf=None, sep=',', columns=None, header=True, index=True, mode='w', encoding=None)
path_or_buf sep columns mode:'w'重写,'a'追加 index:是否进行索引 header:是否写入列索引值
缺失值的处理
pd.isnull() pd.notnull() dropna(axis='rows') 删除缺失值 替换缺失值 fillna(value,inplace=True)
电影数据文件处理
movie = pd. read_csv( "./课件/data/IMDB-Movie-Data.csv" )
movie. head( )
Rank Title Genre Description Director Actors Year Runtime (Minutes) Rating Votes Revenue (Millions) Metascore 0 1 Guardians of the Galaxy Action,Adventure,Sci-Fi A group of intergalactic criminals are forced ... James Gunn Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... 2014 121 8.1 757074 333.13 76.0 1 2 Prometheus Adventure,Mystery,Sci-Fi Following clues to the origin of mankind, a te... Ridley Scott Noomi Rapace, Logan Marshall-Green, Michael Fa... 2012 124 7.0 485820 126.46 65.0 2 3 Split Horror,Thriller Three girls are kidnapped by a man with a diag... M. Night Shyamalan James McAvoy, Anya Taylor-Joy, Haley Lu Richar... 2016 117 7.3 157606 138.12 62.0 3 4 Sing Animation,Comedy,Family In a city of humanoid animals, a hustling thea... Christophe Lourdelet Matthew McConaughey,Reese Witherspoon, Seth Ma... 2016 108 7.2 60545 270.32 59.0 4 5 Suicide Squad Action,Adventure,Fantasy A secret government agency recruits some of th... David Ayer Will Smith, Jared Leto, Margot Robbie, Viola D... 2016 123 6.2 393727 325.02 40.0
np. any ( pd. isnull( movie) )
True
np. all ( pd. isnull( movie) )
False
data = movie. dropna( )
np. any ( pd. isnull( data) )
False
movie[ 'Revenue (Millions)' ] . fillna( value= movie[ 'Revenue (Millions)' ] . mean( ) )
0 333.130000
1 126.460000
2 138.120000
3 270.320000
4 325.020000
...
995 82.956376
996 17.540000
997 58.010000
998 82.956376
999 19.640000
Name: Revenue (Millions), Length: 1000, dtype: float64
for i in movie. columns:
if np. all ( pd. notnull( movie[ i] ) ) == False :
print ( i)
movie[ i] . fillna( movie[ i] . mean( ) , inplace= True )
Revenue (Millions)
Metascore
wis = pd. read_csv( "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data" )
wis. head( )
1000025 5 1 1.1 1.2 2 1.3 3 1.4 1.5 2.1 0 1002945 5 4 4 5 7 10 3 2 1 2 1 1015425 3 1 1 1 2 2 3 1 1 2 2 1016277 6 8 8 1 3 4 3 7 1 2 3 1017023 4 1 1 3 2 1 3 1 1 2 4 1017122 8 10 10 8 7 10 9 7 1 4
replace()
wis = wis. replace( to_replace= "?" , value= np. NaN)
wis. head( )
1000025 5 1 1.1 1.2 2 1.3 3 1.4 1.5 2.1 0 1002945 5 4 4 5 7 10 3 2 1 2 1 1015425 3 1 1 1 2 2 3 1 1 2 2 1016277 6 8 8 1 3 4 3 7 1 2 3 1017023 4 1 1 3 2 1 3 1 1 2 4 1017122 8 10 10 8 7 10 9 7 1 4
np. any ( pd. isnull( wis) )
True
wis = wis. dropna( )
np. any ( pd. isnull( wis) )
False
数据离散化处理
编号 身高 情况 1 180 高 2 189 高 3 170.5 中 4 150 矮
data = pd. read_csv( "./课件/data/stock_day.csv" )
data_p = data[ 'p_change' ]
data_p. head( )
2018-02-27 2.68
2018-02-26 3.02
2018-02-23 2.42
2018-02-22 1.64
2018-02-14 2.05
Name: p_change, dtype: float64
pd.qcut(data,q)
对数据进行分组;分组结果一般会与value_counts搭配使用,统计每组的个数 series.value_counts()
qcut_r = pd. qcut( data_p, q= 10 )
qcut_r
2018-02-27 (1.738, 2.938]
2018-02-26 (2.938, 5.27]
2018-02-23 (1.738, 2.938]
2018-02-22 (0.94, 1.738]
2018-02-14 (1.738, 2.938]
...
2015-03-06 (5.27, 10.03]
2015-03-05 (1.738, 2.938]
2015-03-04 (0.94, 1.738]
2015-03-03 (0.94, 1.738]
2015-03-02 (1.738, 2.938]
Name: p_change, Length: 643, dtype: category
Categories (10, interval[float64]): [(-10.030999999999999, -4.836] < (-4.836, -2.444] < (-2.444, -1.352] < (-1.352, -0.462] ... (0.94, 1.738] < (1.738, 2.938] < (2.938, 5.27] < (5.27, 10.03]]
qcut_r. value_counts( )
(5.27, 10.03] 65
(0.26, 0.94] 65
(-0.462, 0.26] 65
(-10.030999999999999, -4.836] 65
(2.938, 5.27] 64
(1.738, 2.938] 64
(-1.352, -0.462] 64
(-2.444, -1.352] 64
(-4.836, -2.444] 64
(0.94, 1.738] 63
Name: p_change, dtype: int64
data. head( )
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58
自定义区间分组
import pandas as pd
import numpy as np
bins = [ - 100 , - 7 , - 5 , - 3 , 0 , 3 , 5 , 7 , 100 ]
p_counts = pd. cut( data_p, bins= bins)
p_counts
2018-02-27 (0, 3]
2018-02-26 (3, 5]
2018-02-23 (0, 3]
2018-02-22 (0, 3]
2018-02-14 (0, 3]
...
2015-03-06 (7, 100]
2015-03-05 (0, 3]
2015-03-04 (0, 3]
2015-03-03 (0, 3]
2015-03-02 (0, 3]
Name: p_change, Length: 643, dtype: category
Categories (8, interval[int64]): [(-100, -7] < (-7, -5] < (-5, -3] < (-3, 0] < (0, 3] < (3, 5] < (5, 7] < (7, 100]]
p_counts. value_counts( )
(0, 3] 215
(-3, 0] 188
(3, 5] 57
(-5, -3] 51
(7, 100] 35
(5, 7] 35
(-100, -7] 34
(-7, -5] 28
Name: p_change, dtype: int64
bins = [ - 100 , - 7 , - 5 , - 3 , 0 , 3 , 5 , 7 , 100 ]
cut_r = pd. cut( data_p, bins= bins)
cut_r. value_counts( )
(0, 3] 215
(-3, 0] 188
(3, 5] 57
(-5, -3] 51
(7, 100] 35
(5, 7] 35
(-100, -7] 34
(-7, -5] 28
Name: p_change, dtype: int64
cut_r. head( )
2018-02-27 (0, 3]
2018-02-26 (3, 5]
2018-02-23 (0, 3]
2018-02-22 (0, 3]
2018-02-14 (0, 3]
Name: p_change, dtype: category
Categories (8, interval[int64]): [(-100, -7] < (-7, -5] < (-5, -3] < (-3, 0] < (0, 3] < (3, 5] < (5, 7] < (7, 100]]
one-hot编码
pd. get_dummies( cut_r, prefix= "python38_best" )
python38_best_(-100, -7] python38_best_(-7, -5] python38_best_(-5, -3] python38_best_(-3, 0] python38_best_(0, 3] python38_best_(3, 5] python38_best_(5, 7] python38_best_(7, 100] 2018-02-27 0 0 0 0 1 0 0 0 2018-02-26 0 0 0 0 0 1 0 0 2018-02-23 0 0 0 0 1 0 0 0 2018-02-22 0 0 0 0 1 0 0 0 2018-02-14 0 0 0 0 1 0 0 0 ... ... ... ... ... ... ... ... ... 2015-03-06 0 0 0 0 0 0 0 1 2015-03-05 0 0 0 0 1 0 0 0 2015-03-04 0 0 0 0 1 0 0 0 2015-03-03 0 0 0 0 1 0 0 0 2015-03-02 0 0 0 0 1 0 0 0
643 rows × 8 columns
合并
pd.concat实现数据合并
data
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 2015-03-06 13.17 14.48 14.28 13.13 179831.72 1.12 8.51 13.112 13.112 13.112 115090.18 115090.18 115090.18 6.16 2015-03-05 12.88 13.45 13.16 12.87 93180.39 0.26 2.02 12.820 12.820 12.820 98904.79 98904.79 98904.79 3.19 2015-03-04 12.80 12.92 12.90 12.61 67075.44 0.20 1.57 12.707 12.707 12.707 100812.93 100812.93 100812.93 2.30 2015-03-03 12.52 13.06 12.70 12.52 139071.61 0.18 1.44 12.610 12.610 12.610 117681.67 117681.67 117681.67 4.76 2015-03-02 12.25 12.67 12.52 12.20 96291.73 0.32 2.62 12.520 12.520 12.520 96291.73 96291.73 96291.73 3.30
643 rows × 14 columns
data1= pd. get_dummies( cut_r, prefix= "python38_best" )
data1
python38_best_(-100, -7] python38_best_(-7, -5] python38_best_(-5, -3] python38_best_(-3, 0] python38_best_(0, 3] python38_best_(3, 5] python38_best_(5, 7] python38_best_(7, 100] 2018-02-27 0 0 0 0 1 0 0 0 2018-02-26 0 0 0 0 0 1 0 0 2018-02-23 0 0 0 0 1 0 0 0 2018-02-22 0 0 0 0 1 0 0 0 2018-02-14 0 0 0 0 1 0 0 0 ... ... ... ... ... ... ... ... ... 2015-03-06 0 0 0 0 0 0 0 1 2015-03-05 0 0 0 0 1 0 0 0 2015-03-04 0 0 0 0 1 0 0 0 2015-03-03 0 0 0 0 1 0 0 0 2015-03-02 0 0 0 0 1 0 0 0
643 rows × 8 columns
pd. concat( [ data, data1] , axis= 1 )
open high close low volume price_change p_change ma5 ma10 ma20 ... v_ma20 turnover python38_best_(-100, -7] python38_best_(-7, -5] python38_best_(-5, -3] python38_best_(-3, 0] python38_best_(0, 3] python38_best_(3, 5] python38_best_(5, 7] python38_best_(7, 100] 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 ... 55576.11 2.39 0 0 0 0 1 0 0 0 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 ... 56007.50 1.53 0 0 0 0 0 1 0 0 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 ... 56372.85 1.32 0 0 0 0 1 0 0 0 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 ... 60149.60 0.90 0 0 0 0 1 0 0 0 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 ... 61716.11 0.58 0 0 0 0 1 0 0 0 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 2015-03-06 13.17 14.48 14.28 13.13 179831.72 1.12 8.51 13.112 13.112 13.112 ... 115090.18 6.16 0 0 0 0 0 0 0 1 2015-03-05 12.88 13.45 13.16 12.87 93180.39 0.26 2.02 12.820 12.820 12.820 ... 98904.79 3.19 0 0 0 0 1 0 0 0 2015-03-04 12.80 12.92 12.90 12.61 67075.44 0.20 1.57 12.707 12.707 12.707 ... 100812.93 2.30 0 0 0 0 1 0 0 0 2015-03-03 12.52 13.06 12.70 12.52 139071.61 0.18 1.44 12.610 12.610 12.610 ... 117681.67 4.76 0 0 0 0 1 0 0 0 2015-03-02 12.25 12.67 12.52 12.20 96291.73 0.32 2.62 12.520 12.520 12.520 ... 96291.73 3.30 0 0 0 0 1 0 0 0
643 rows × 22 columns
pd.merge(left, right, on=['key1', 'key2'])
# Two sample frames sharing the composite key (key1, key2), used to demo pd.merge.
left = pd.DataFrame({
    'key1': ['k0', 'k0', 'k1', 'k2'],
    'key2': ['k0', 'k1', 'k0', 'k1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['k0', 'k1', 'k1', 'k2'],
    'key2': ['k0', 'k0', 'k0', 'k0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
left
key1 key2 A B 0 k0 k0 A0 B0 1 k0 k1 A1 B1 2 k1 k0 A2 B2 3 k2 k1 A3 B3
right
key1 key2 C D 0 k0 k0 C0 D0 1 k1 k0 C1 D1 2 k1 k0 C2 D2 3 k2 k0 C3 D3
pd. merge( left, right, on= [ 'key1' , 'key2' ] )
key1 key2 A B C D 0 k0 k0 A0 B0 C0 D0 1 k1 k0 A2 B2 C1 D1 2 k1 k0 A2 B2 C2 D2
其中按照 key1 和 key2 两个键进行合并时,只有两个表中 key1 和 key2 的值都相同的行才会合并;
只要有一个键不相同,该行就不会出现在结果中(默认 how='inner')
pd. merge( left, right, on= [ 'key1' , 'key2' ] , how= "inner" )
key1 key2 A B C D 0 k0 k0 A0 B0 C0 D0 1 k1 k0 A2 B2 C1 D1 2 k1 k0 A2 B2 C2 D2
pd. merge( left, right, on= [ 'key1' , 'key2' ] , how= 'outer' )
key1 key2 A B C D 0 k0 k0 A0 B0 C0 D0 1 k0 k1 A1 B1 NaN NaN 2 k1 k0 A2 B2 C1 D1 3 k1 k0 A2 B2 C2 D2 4 k2 k1 A3 B3 NaN NaN 5 k2 k0 NaN NaN C3 D3
pd. merge( left, right, on= [ 'key1' , 'key2' ] , how= 'left' )
key1 key2 A B C D 0 k0 k0 A0 B0 C0 D0 1 k0 k1 A1 B1 NaN NaN 2 k1 k0 A2 B2 C1 D1 3 k1 k0 A2 B2 C2 D2 4 k2 k1 A3 B3 NaN NaN
交叉表与透视表
探索两列数据之间的关系
pd.crosstab() 返回两列组合出现的具体数量;DataFrame.pivot_table() 默认对值求均值,对 0/1 列求均值即得到占比情况
data. index
Index(['2018-02-27', '2018-02-26', '2018-02-23', '2018-02-22', '2018-02-14',
'2018-02-13', '2018-02-12', '2018-02-09', '2018-02-08', '2018-02-07',
...
'2015-03-13', '2015-03-12', '2015-03-11', '2015-03-10', '2015-03-09',
'2015-03-06', '2015-03-05', '2015-03-04', '2015-03-03', '2015-03-02'],
dtype='object', length=643)
time = pd. to_datetime( data. index)
time. day
Int64Index([27, 26, 23, 22, 14, 13, 12, 9, 8, 7,
...
13, 12, 11, 10, 9, 6, 5, 4, 3, 2],
dtype='int64', length=643)
time. week
Int64Index([ 9, 9, 8, 8, 7, 7, 7, 6, 6, 6,
...
11, 11, 11, 11, 11, 10, 10, 10, 10, 10],
dtype='int64', length=643)
time. weekday
Int64Index([1, 0, 4, 3, 2, 1, 0, 4, 3, 2,
...
4, 3, 2, 1, 0, 4, 3, 2, 1, 0],
dtype='int64', length=643)
data[ 'week' ] = time. weekday
data. head( )
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover week 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 1 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 0 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 4 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 3 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58 2
data[ 'p_n' ] = np. where( data[ 'p_change' ] > 0 , 1 , 0 )
data. head( )
open high close low volume price_change p_change ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover week p_n 2018-02-27 23.53 25.88 24.16 23.53 95578.03 0.63 2.68 22.942 22.142 22.875 53782.64 46738.65 55576.11 2.39 1 1 2018-02-26 22.80 23.78 23.53 22.80 60985.11 0.69 3.02 22.406 21.955 22.942 40827.52 42736.34 56007.50 1.53 0 1 2018-02-23 22.88 23.37 22.82 22.71 52914.01 0.54 2.42 21.938 21.929 23.022 35119.58 41871.97 56372.85 1.32 4 1 2018-02-22 22.25 22.76 22.28 22.02 36105.01 0.36 1.64 21.446 21.909 23.137 35397.58 39904.78 60149.60 0.90 3 1 2018-02-14 21.49 21.99 21.92 21.48 23331.04 0.44 2.05 21.366 21.923 23.253 33590.21 42935.74 61716.11 0.58 2 1
pd. crosstab( data[ 'week' ] , data[ 'p_n' ] )
p_n 0 1 week 0 63 62 1 55 76 2 61 71 3 63 65 4 59 68
count = pd. crosstab( data[ 'week' ] , data[ 'p_n' ] )
count
p_n 0 1 week 0 63 62 1 55 76 2 61 71 3 63 65 4 59 68
sum = count. sum ( axis= 1 )
sum
week
0 125
1 131
2 132
3 128
4 127
dtype: int64
per = count. div( sum , axis= 0 )
per
p_n 0 1 week 0 0.504000 0.496000 1 0.419847 0.580153 2 0.462121 0.537879 3 0.492188 0.507812 4 0.464567 0.535433
per. plot( kind= 'bar' )
<matplotlib.axes._subplots.AxesSubplot at 0x165c14dd548>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-HhRGebEy-1615267786020)(output_205_1.png)]
per. plot( kind= 'bar' , stacked= True )
<matplotlib.axes._subplots.AxesSubplot at 0x165c157c508>
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-m4vSGs0g-1615267786022)(output_206_1.png)]
透视表 pivot_table()
data. pivot_table( [ 'p_n' ] , index= 'week' )
p_n week 0 0.496000 1 0.580153 2 0.537879 3 0.507812 4 0.535433
分组和聚合
# Sample frame for the groupby/aggregation demos: item color, name and two prices.
col = pd.DataFrame({
    'color': ['white', 'red', 'green', 'red', 'green'],
    'object': ['pen', 'pencil', 'pencil', 'ashtray', 'pen'],
    'price1': [5.56, 4.20, 1.30, 0.56, 2.75],
    'price2': [4.75, 4.12, 1.60, 0.75, 3.15],
})
col
color object price1 price2 0 white pen 5.56 4.75 1 red pencil 4.20 4.12 2 green pencil 1.30 1.60 3 red ashtray 0.56 0.75 4 green pen 2.75 3.15
col. groupby( [ 'color' ] ) [ 'price1' ] . mean( )
color
green 2.025
red 2.380
white 5.560
Name: price1, dtype: float64
col[ 'price1' ] . groupby( col[ 'color' ] ) . mean( )
color
green 2.025
red 2.380
white 5.560
Name: price1, dtype: float64
col. groupby( [ 'color' ] , as_index= False ) [ 'price1' ] . mean( )
color price1 0 green 2.025 1 red 2.380 2 white 5.560
分组中的agg()
Pandas之超好用的Groupby用法详解
# Random demo frame: 10 employees, each assigned to one of three companies,
# with a random salary in [5, 50) and age in [15, 50).
company = ["A", "B", "C"]
data = pd.DataFrame({
    "company": [company[i] for i in np.random.randint(0, len(company), 10)],
    "salary": np.random.randint(5, 50, 10),
    "age": np.random.randint(15, 50, 10),
})
data. head( )
company salary age 0 A 8 33 1 C 28 46 2 C 31 43 3 C 8 26 4 A 25 20
data. groupby( "company" ) . agg( 'mean' )
salary age company A 14.666667 23.0 B 38.000000 24.0 C 19.166667 36.0
data. groupby( 'company' ) . agg( { 'salary' : 'median' , 'age' : 'mean' } )
salary age company A 11.0 23 B 38.0 24 C 18.5 36
transform
avg_salary_dict = data. groupby( 'company' ) [ 'salary' ] . mean( ) . to_dict( )
avg_salary_dict
{'A': 39.0, 'B': 40.75, 'C': 38.0}
data[ 'company' ] . map ( avg_salary_dict)
0 40.75
1 39.00
2 40.75
3 38.00
4 38.00
5 39.00
6 40.75
7 38.00
8 40.75
9 39.00
Name: company, dtype: float64
data[ 'avg_salary' ] = data[ 'company' ] . map ( avg_salary_dict)
data
company salary age avg_salary 0 B 25 31 40.75 1 A 47 21 39.00 2 B 42 36 40.75 3 C 49 33 38.00 4 C 30 48 38.00 5 A 37 31 39.00 6 B 47 27 40.75 7 C 35 49 38.00 8 B 49 36 40.75 9 A 33 40 39.00
# Regenerate the random employee frame for the transform demo (same shape as above).
company = ["A", "B", "C"]
picks = np.random.randint(0, len(company), 10)
data = pd.DataFrame({
    "company": [company[k] for k in picks],
    "salary": np.random.randint(5, 50, 10),
    "age": np.random.randint(15, 50, 10),
})
data. head( )
company salary age 0 B 25 31 1 A 47 21 2 B 42 36 3 C 49 33 4 C 30 48
data[ 'avg_salary' ] = data. groupby( 'company' ) [ 'salary' ] . transform( 'mean' )
data
company salary age avg_salary 0 B 25 31 40.75 1 A 47 21 39.00 2 B 42 36 40.75 3 C 49 33 38.00 4 C 30 48 38.00 5 A 37 31 39.00 6 B 47 27 40.75 7 C 35 49 38.00 8 B 49 36 40.75 9 A 33 40 39.00
data. groupby( 'company' ) . max ( )
salary age avg_salary company A 47 40 39.00 B 49 36 40.75 C 49 49 38.00
对于groupby后的apply
def get_oldest_staff(x):
    """Return the row (as a Series) of the oldest employee in group *x*.

    Uses a stable ascending sort on 'age' and takes the last row, so on ties
    the row that appeared last in the group wins.
    """
    by_age = x.sort_values(by='age')
    return by_age.iloc[-1]
oldest_staff = data. groupby( 'company' , as_index= False ) . apply ( get_oldest_staff)
oldest_staff
company salary age avg_salary 0 A 33 40 39.00 1 B 49 36 40.75 2 C 35 49 38.00
时间序列
Pandas数据处理——玩转时间序列数据
pd.to_datetime()
pd.set_index()
pd.resample()
.dt