pandas

#pandas
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
#Series
obj = pd.Series([4,7,-5,3])
obj
0    4
1    7
2   -5
3    3
dtype: int64
obj.values
array([ 4,  7, -5,  3], dtype=int64)
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2 = pd.Series([4,7,-5,3],index = ['d','b','a','c'])
obj2
d    4
b    7
a   -5
c    3
dtype: int64
obj2.index
Index(['d', 'b', 'a', 'c'], dtype='object')
#索引取值
obj2['a']
-5
obj2['d'] = 6
obj2[['c','a','d']]
c    3
a   -5
d    6
dtype: int64
obj2[obj2>0]
d    6
b    7
c    3
dtype: int64
obj2*2
d    12
b    14
a   -10
c     6
dtype: int64
np.exp(obj2)
d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64
'b'in obj2
True
'r' in obj2
False
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata,index=states)
obj4
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
#缺失值
pd.isnull(obj4)
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
pd.notnull(obj4)
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
obj4.isnull()
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
#根据运算的索引标签自动对齐数据:
obj3 
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
obj4
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
obj3 + obj4
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
obj4.name = 'population'
obj4.index.name = 'state'
obj4
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64
obj
0    4
1    7
2   -5
3    3
dtype: int64
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64
#DataFrame 表格型数据结构
# Column-oriented sample data: each key becomes a DataFrame column and each
# list holds that column's values (six rows in total).
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame
    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2
frame.head()  #前五行
    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
pd.DataFrame(data,columns=['year','state','pop'])
   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
5  2003  Nevada  3.2
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])
frame2

       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
frame2['state'] #返回Series
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object
frame2.year
one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64
frame2.loc['three']    #loc返回行数据
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
frame2['debt'] = 16.5
frame2
       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5
six    2003  Nevada  3.2  16.5
frame2['debt'] = np.arange(6.)
frame2
       year   state  pop  debt
one    2000    Ohio  1.5   0.0
two    2001    Ohio  1.7   1.0
three  2002    Ohio  3.6   2.0
four   2001  Nevada  2.4   3.0
five   2002  Nevada  2.9   4.0
six    2003  Nevada  3.2   5.0
val = pd.Series([-1.2,-1.5,-1.7],index = ['two','four','five'])
frame2['debt'] = val
frame2
       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7
six    2003  Nevada  3.2   NaN
frame2['eastern'] = frame2.state == 'Ohio'
frame2
       year   state  pop  debt  eastern
one    2000    Ohio  1.5   NaN     True
two    2001    Ohio  1.7  -1.2     True
three  2002    Ohio  3.6   NaN     True
four   2001  Nevada  2.4  -1.5    False
five   2002  Nevada  2.9  -1.7    False
six    2003  Nevada  3.2   NaN    False
del frame2['eastern']
frame2
       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7
six    2003  Nevada  3.2   NaN
#嵌套字典
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
#外层字典的键作为列,内层键则作为行索引
frame3 = pd.DataFrame(pop)
frame3
      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6
#转置
frame3.T
        2000  2001  2002
Nevada   NaN   2.4   2.9
Ohio     1.5   1.7   3.6
frame3.index.name = 'year';frame3.columns.name = 'state'
frame3
state  Nevada  Ohio
year
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6
frame3.values
array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])
frame2.values
array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

在这里插入图片描述

#索引对象 Index
obj = pd.Series(range(3),index=['a','b','c'])
index = obj.index
index
Index(['a', 'b', 'c'], dtype='object')
index[1:]
Index(['b', 'c'], dtype='object')
labels = pd.Index(np.arange(3))
labels
Int64Index([0, 1, 2], dtype='int64')
obj2 = pd.Series([1.5,-2.5,0],index=labels)
obj2
0    1.5
1   -2.5
2    0.0
dtype: float64
obj2.index is labels
True
frame3
      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6
frame3.columns
Index(['Nevada', 'Ohio'], dtype='object')
'Ohio'in frame3.columns
True
dup_labels = pd.Index(['foo','foo', 'bar', 'bar'])
dup_labels
Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

在这里插入图片描述

#基本功能
#重新索引
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
obj3 = pd.Series(['blue','purple','yellow'],index=[0,2,4])
obj3
0      blue
2    purple
4    yellow
dtype: object
obj3.reindex(range(6),method='ffill')#插值处理
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                         index=['a', 'c', 'd'],
                         columns=['Ohio', 'Texas', 'California'])
frame
   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
frame2 = frame.reindex(['a','b','c','d'])
frame2
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0
states = ['Texas','Utah','California']
frame.reindex(columns=states)
   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8

在这里插入图片描述

#丢弃指定轴上的项 drop
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
new_obj = obj.drop('c')
new_obj
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
obj.drop(['d','c'])
a    0.0
b    1.0
e    4.0
dtype: float64
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                        index=['Ohio', 'Colorado', 'Utah', 'New York'],
                        columns=['one', 'two', 'three', 'four'])

data
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
data.drop(['Colorado','Ohio'])
          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
data.drop('two',axis=1) #axis指定行或列
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15
data.drop(['two','four'],axis = 'columns')
          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14
obj
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
obj.drop('c',inplace=True) #inplace销毁被删除数据
obj
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
obj
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
#索引,选取和过滤
obj = pd.Series(np.arange(4),index=['a', 'b', 'c', 'd'])
obj
a    0
b    1
c    2
d    3
dtype: int32
obj['b']
1
obj[1]
1
obj[2:4]
c    2
d    3
dtype: int32
obj[['b','a','d']]
b    1
a    0
d    3
dtype: int32
obj[[1,3]]
b    1
d    3
dtype: int32
obj[obj<2]
a    0
b    1
dtype: int32
#利用标签的切片运算与普通的Python切片运算不同,其末端是包含的
obj['b':'c']
b    1
c    2
dtype: int32
obj['b':'c'] = 5
obj
a    0
b    5
c    5
d    3
dtype: int32
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                      index=['Ohio', 'Colorado', 'Utah', 'New York'],
                      columns=['one', 'two', 'three', 'four'])

data
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
data['two']
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
data[['three','one']]
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12
data[:2]
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
data[data['three']>5]
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
data<5
            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False
data[data<5] = 0
data
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
#利用loc和iloc进行选取;标签运算符
data.loc['Colorado',['two','three']]

two      5
three    6
Name: Colorado, dtype: int32
data.iloc[2,[3,0,1]]
four    11
one      8
two      9
Name: Utah, dtype: int32
data.iloc[2]
one       8
two       9
three    10
four     11
Name: Utah, dtype: int32
data.iloc[[1,2],[3,0,1]]
          four  one  two
Colorado     7    0    5
Utah        11    8    9
data.loc[:'Utah','two']
Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32
data.iloc[:,:3][data.three>5]
          one  two  three
Colorado    0    5      6
Utah        8    9     10
New York   12   13     14

在这里插入图片描述

#整数索引
ser = pd.Series(np.arange(3))
ser
0    0
1    1
2    2
dtype: int32
ser[-1]
---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

<ipython-input-73-44969a759c20> in <module>()
----> 1 ser[-1]


C:\Anaconda\lib\site-packages\pandas\core\series.py in __getitem__(self, key)
    765         key = com._apply_if_callable(key, self)
    766         try:
--> 767             result = self.index.get_value(self, key)
    768 
    769             if not is_scalar(result):


C:\Anaconda\lib\site-packages\pandas\core\indexes\base.py in get_value(self, series, key)
   3116         try:
   3117             return self._engine.get_value(s, k,
-> 3118                                           tz=getattr(series.dtype, 'tz', None))
   3119         except KeyError as e1:
   3120             if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_value()


pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()


pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()


pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item()


KeyError: -1
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c']) #非整数索引
ser2[-1]
2.0
ser[:1]
0    0
dtype: int32
ser.loc[:1] #注意区别
0    0
1    1
dtype: int32
#算术运算和数据对齐
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
s1
a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64
s2
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64
s1 + s2  #对齐操作
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                    index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                       index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1
            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
df2
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
df1 + df2 #DataFrame对象相加时,只出现在其中一个对象里的行或列标签,在结果中对应的值都是NaN
            b    c     d    e
Colorado  NaN  NaN   NaN  NaN
Ohio      3.0  NaN   6.0  NaN
Oregon    NaN  NaN   NaN  NaN
Texas     9.0  NaN  12.0  NaN
Utah      NaN  NaN   NaN  NaN
#在算术方法中填充值
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                    columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                    columns=list('abcde'))  
df2.loc[1,'b'] = np.nan

df1
     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
df2
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
df1 + df2
      a     b     c     d    e
0   0.0   2.0   4.0   6.0  NaN
1   9.0   NaN  13.0  15.0  NaN
2  18.0  20.0  22.0  24.0  NaN
3   NaN   NaN   NaN   NaN  NaN
df1.add(df2,fill_value=0)   #指定填充值
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   5.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0

在这里插入图片描述

#DataFrame和Series之间的运算
arr = np.arange(12).reshape(3,4)
arr
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])
arr[0]
array([0, 1, 2, 3])
arr - arr[0]    #广播
array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                      columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series =frame.iloc[0]
frame
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
series
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
frame - series
          b    d    e
Utah    0.0  0.0  0.0
Ohio    3.0  3.0  3.0
Texas   6.0  6.0  6.0
Oregon  9.0  9.0  9.0
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2
          b    d     e    f
Utah    0.0  NaN   3.0  NaN
Ohio    3.0  NaN   6.0  NaN
Texas   6.0  NaN   9.0  NaN
Oregon  9.0  NaN  12.0  NaN
series3 = frame['d']
frame
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
series3
Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64
frame.sub(series3,axis='index')
          b    d    e
Utah   -1.0  0.0  1.0
Ohio   -1.0  0.0  1.0
Texas  -1.0  0.0  1.0
Oregon -1.0  0.0  1.0
#函数应用和映射 ufuncs(元素级数组方法)
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                      index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
               b         d         e
Utah   -0.951265 -0.498273 -0.388690
Ohio    1.988546  0.370789 -0.488038
Texas   0.692938 -0.160944  0.654771
Oregon -1.314237  1.163286 -1.687210
np.abs(frame)
               b         d         e
Utah    0.951265  0.498273  0.388690
Ohio    1.988546  0.370789  0.488038
Texas   0.692938  0.160944  0.654771
Oregon  1.314237  1.163286  1.687210
f = lambda x:x.max()-x.min()
frame.apply(f)  #默认列执行f
b    3.302783
d    1.661559
e    2.341980
dtype: float64
frame.apply(f,axis='columns')#行执行f
Utah      0.562574
Ohio      2.476585
Texas     0.853882
Oregon    2.850495
dtype: float64
def f(x):
    """Return the minimum and maximum of *x* as a two-element Series.

    *x* must provide ``min()`` and ``max()`` methods; the result is indexed
    by the labels ``'min'`` and ``'max'``, in that order.
    """
    labels = ['min', 'max']
    extremes = [x.min(), x.max()]
    return pd.Series(extremes, index=labels)
frame.apply(f)     #接受多值的series函数
            b         d         e
min -1.314237 -0.498273 -1.687210
max  1.988546  1.163286  0.654771
format = lambda x: '%.2f' % x
frame.applymap(format)      #元素级函数
            b      d      e
Utah    -0.95  -0.50  -0.39
Ohio     1.99   0.37  -0.49
Texas    0.69  -0.16   0.65
Oregon  -1.31   1.16  -1.69
frame['e'].map(format)     #区分map与applymap
Utah      -0.39
Ohio      -0.49
Texas      0.65
Oregon    -1.69
Name: e, dtype: object
# 排序和排名
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()
a    1
b    2
c    3
d    0
dtype: int64
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                       index=['three', 'one'],
                       columns=['d', 'a', 'b', 'c'])
frame.sort_index()


       d  a  b  c
one    4  5  6  7
three  0  1  2  3
frame.sort_index(axis=1,ascending=False)
       d  c  b  a
three  0  3  2  1
one    4  7  6  5
obj = pd.Series([4,7,-3,2])
obj.sort_values()
2   -3
3    2
0    4
1    7
dtype: int64
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame
   b  a
0  4  0
1  7  1
2 -3  0
3  2  1
frame.sort_values(by='b')
   b  a
2 -3  0
3  2  1
0  4  0
1  7  1
frame.sort_values(by=['a','b'])
   b  a
2 -3  0
0  4  0
3  2  1
1  7  1
obj=pd.Series([7, -5, 7, 4, 2, 0, 4])#rank是通过“为各组分配一个平均排名”的方式破坏平级关系的
obj.rank()
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
obj.rank(method='first')
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
obj.rank(ascending=False, method='max')
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame
     b  a    c
0  4.3  0 -2.0
1  7.0  1  5.0
2 -3.0  0  8.0
3  2.0  1 -2.5
frame.rank(axis='columns')
     b    a    c
0  3.0  2.0  1.0
1  3.0  1.0  2.0
2  1.0  2.0  3.0
3  3.0  2.0  1.0

在这里插入图片描述

#带有重复标签的轴索引
obj = pd.Series(range(5),index=['a','a','b','b','c'])
obj
a    0
a    1
b    2
b    3
c    4
dtype: int64
obj.index.is_unique
False
obj['a']
a    0
a    1
dtype: int64
obj['b']
b    2
b    3
dtype: int64
df = pd.DataFrame(np.random.randn(4,3),index= ['a','a','b','b'])
df
          0         1         2
a  1.265240  0.407293 -0.652129
a  0.268019 -1.423912  1.297783
b  0.797760 -0.353663  1.323543
b  0.961888  0.227132  1.843558
df.loc['b']
          0         1         2
b  0.797760 -0.353663  1.323543
b  0.961888  0.227132  1.843558
#汇总和计算描述统计
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                   index=['a', 'b', 'c', 'd'],
                   columns=['one', 'two'])
df

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
df.sum()   #列和
one    9.25
two   -5.80
dtype: float64
df.sum(axis=1)
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
df.mean(axis='columns',skipna=False)   #不忽略Nan
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64
df.idxmax()  #达到最大的索引
one    b
two    d
dtype: object
df.cumsum()
    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8
df.describe()
            one       two
count  3.000000  2.000000
mean   3.083333 -2.900000
std    3.493685  2.262742
min    0.750000 -4.500000
25%    1.075000 -3.700000
50%    1.400000 -2.900000
75%    4.250000 -2.100000
max    7.100000 -1.300000
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)#非数值型
obj.describe()
count     16
unique     3
top        a
freq       8
dtype: object

在这里插入图片描述
在这里插入图片描述

#相关系数与协方差
import pandas_datareader.data as web
all_data = {ticker:web.get_data_yahoo(ticker)
           for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                      for ticker, data in all_data.items()})
returns = price.pct_change()
returns.tail()
                          
                AAPL       IBM      MSFT      GOOG
Date
2018-10-18 -0.023374 -0.026110 -0.019962 -0.024846
2018-10-19  0.015230 -0.011107  0.001475  0.007804
2018-10-22  0.006110  0.007126  0.008927  0.004287
2018-10-23  0.009427  0.009152 -0.013956  0.002297
2018-10-24 -0.034302 -0.030486 -0.053469 -0.048003
returns['MSFT'].corr(returns['IBM']) #相关系数

0.4746674318628231
returns["MSFT"].cov(returns["IBM"])#协方差
8.150193655338736e-05
returns.MSFT.corr(returns.IBM)
0.4746674318628231
returns.corr()   #相关系数矩阵
          AAPL       IBM      MSFT      GOOG
AAPL  1.000000  0.364434  0.421984  0.438015
IBM   0.364434  1.000000  0.474667  0.398449
MSFT  0.421984  0.474667  1.000000  0.516364
GOOG  0.438015  0.398449  0.516364  1.000000
returns.cov()  #协方差矩阵
          AAPL       IBM      MSFT      GOOG
AAPL  0.000252  0.000070  0.000095  0.000106
IBM   0.000070  0.000146  0.000082  0.000073
MSFT  0.000095  0.000082  0.000202  0.000112
GOOG  0.000106  0.000073  0.000112  0.000232
returns.corrwith(returns.IBM)#与某一列或行的相关系数
AAPL    0.364434
IBM     1.000000
MSFT    0.474667
GOOG    0.398449
dtype: float64
returns.corrwith(volume)#传入一个DataFrame则会计算按列名配对的相关系数
AAPL   -0.065065
IBM    -0.173822
MSFT   -0.088563
GOOG   -0.016396
dtype: float64
#唯一值、值计数以及成员资格
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques
array(['c', 'a', 'd', 'b'], dtype=object)
obj.value_counts()#计算出现频率
c    3
a    3
b    2
d    1
dtype: int64
obj
0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object
mask = obj.isin(['b','c'])#用于判断矢量化集合的成员资格
mask
0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool
obj[mask]
0    c
5    b
6    b
7    c
8    c
dtype: object
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_values = pd.Series(['c','b','a'])
pd.Index(unique_values).get_indexer(to_match)
#与isin类似的是Index.get_indexer方法,它可以给你一个索引数组,从可能包含重复值的数组到另一个不同值的数组
array([0, 2, 1, 1, 0, 2], dtype=int64)

在这里插入图片描述

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值