task3

最新推荐文章于 2023-03-27 08:00:00 发布

Rachelwen

最新推荐文章于 2023-03-27 08:00:00 发布

阅读量232

点赞数

文章标签： pandas

本文链接：https://blog.csdn.net/qq_39303320/article/details/99779969

版权

from pandas import Series,DataFrame
import pandas as pd
import numpy as np
obj=pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

#Series是一种类似于一维数组的对象，它由一组数据（各种numpy数据类型）以及一组与之相关的数据标签（即索引）组成
#因此可以用values和index属性分别获取其数据的数组表示形式和索引对象

print (type(obj.values))
print (type(obj.index))

<class 'numpy.ndarray'>
<class 'pandas.core.indexes.range.RangeIndex'>

obj2=pd.Series([4,7,-5,3],index=["b","c","v","f"])
obj2

b    4
c    7
v   -5
f    3
dtype: int64

obj2.index

Index(['b', 'c', 'v', 'f'], dtype='object')

print (obj2["v"])
print (obj2[["c","v","f"]])#索引值列表，即使包含的是字符串而不是整数

-5
c    7
v   -5
f    3
dtype: int64

obj2.index[3]

'f'

obj2*2

b     8
c    14
v   -10
f     6
dtype: int64

np.exp(obj2)#exp() 方法返回x的指数,e的x次方

b      54.598150
c    1096.633158
v       0.006738
f      20.085537
dtype: float64

"b"in obj2

True

"r"in obj2

False

sdata={"ohio":35000,"texas":71000,"oregon":16000,"utah":5000}
obj3=pd.Series(sdata)
obj3

ohio      35000
texas     71000
oregon    16000
utah       5000
dtype: int64

#NA：表示缺失值(Missing value)，是“Not Available”的缩写，NaN：表示非数值，是“Not a Number”的缩写
#NULL：表示空值，即没有内容
states=["california","ohio","michegan","utah"]
obj4=pd.Series(sdata,index=states)
obj4#如果在sdata中找不到states里对应的键，其结果就是“NaN”（not a number），在pandas中用于表示缺失值或NA值

california        NaN
ohio          35000.0
michegan          NaN
utah           5000.0
dtype: float64

#pandas 的 isnull 和 notnull 函数可用于检测缺失数据

pd.isnull(obj4)

california     True
ohio          False
michegan       True
utah          False
dtype: bool

pd.notnull(obj4)

california    False
ohio           True
michegan      False
utah           True
dtype: bool

#Series 也有类似的实例方法
obj4.isnull()

california     True
ohio          False
michegan       True
utah          False
dtype: bool

obj3

ohio      35000
texas     71000
oregon    16000
utah       5000
dtype: int64

obj4

california        NaN
ohio          35000.0
michegan          NaN
utah           5000.0
dtype: float64

obj3+obj4

california        NaN
michegan          NaN
ohio          70000.0
oregon            NaN
texas             NaN
utah          10000.0
dtype: float64

obj4.name="population"
obj4.index.name="state"
obj4

state
california        NaN
ohio          35000.0
michegan          NaN
utah           5000.0
Name: population, dtype: float64

#Series 的索引可以通过赋值的方式就地修改
obj.index=["bob","steve","jeff",'ryan']
obj

bob      4
steve    7
jeff    -5
ryan     3
dtype: int64

#DataFrame 是一个表格型的数据结构，它含有一组有序的列，每列可以是不同的值类型(数值、字符串、布尔值等)。
#DataFrame 既有行索引也有列索引，它可以被看做由 Series 组成的字典(共用同一个索引)。
#DataFrame 中的数据是以一个或多个二维块存放的(而不是列表、字典或别的一维数据结构)

data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada',
'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

	state	year	pop
0	Ohio	2000	1.5
1	Ohio	2001	1.7
2	Ohio	2002	3.6
3	Nevada	2001	2.4
4	Nevada	2002	2.9
5	Nevada	2003	3.2

print (frame.head())

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9

pd.DataFrame(data,columns=["year","state","pop"])

	year	state	pop
0	2000	Ohio	1.5
1	2001	Ohio	1.7
2	2002	Ohio	3.6
3	2001	Nevada	2.4
4	2002	Nevada	2.9
5	2003	Nevada	3.2

frame2=pd.DataFrame(data,columns=["year",'state','pop','debt'],index=['one','two','three','four','five','six'])
print (frame2)

       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN

frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

frame2['state']#类似字典的方式

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

frame2.year#属性的方式

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

frame2['debt']=16.5
frame2

	year	state	pop	debt
one	2000	Ohio	1.5	16.5
two	2001	Ohio	1.7	16.5
three	2002	Ohio	3.6	16.5
four	2001	Nevada	2.4	16.5
five	2002	Nevada	2.9	16.5
six	2003	Nevada	3.2	16.5

frame2['debt']=np.arange(6.)
frame2

	year	state	pop	debt
one	2000	Ohio	1.5	0.0
two	2001	Ohio	1.7	1.0
three	2002	Ohio	3.6	2.0
four	2001	Nevada	2.4	3.0
five	2002	Nevada	2.9	4.0
six	2003	Nevada	3.2	5.0

val=pd.Series([-1.2,-1.5,3],index=['one','two','five'])
frame2['debt']=val
frame2

	year	state	pop	debt
one	2000	Ohio	1.5	-1.2
two	2001	Ohio	1.7	-1.5
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	NaN
five	2002	Nevada	2.9	3.0
six	2003	Nevada	3.2	NaN

frame2['eastern']=frame2.state=='Ohio'
frame2

	year	state	pop	debt	eastern
one	2000	Ohio	1.5	-1.2	True
two	2001	Ohio	1.7	-1.5	True
three	2002	Ohio	3.6	NaN	True
four	2001	Nevada	2.4	NaN	False
five	2002	Nevada	2.9	3.0	False
six	2003	Nevada	3.2	NaN	False

del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

pop={'Neveda':{2001:2.4,2002:2.9},
    'Ohio':{2000:1.5,2001:1.7,2002:3.6}}
#将嵌套字典传给 DataFrame 时，pandas 会将其解释为：外层字典的键作为列，内层字典的键作为行索引

frame3=pd.DataFrame(pop)
frame3

	Neveda	Ohio
2000	NaN	1.5
2001	2.4	1.7
2002	2.9	3.6

frame3.T#转置

	2000	2001	2002
Neveda	NaN	2.4	2.9
Ohio	1.5	1.7	3.6

pd.DataFrame(pop,index=[2001,2002,2003])

	Neveda	Ohio
2001	2.4	1.7
2002	2.9	3.6
2003	NaN	NaN

pdata={'Ohio':frame3['Ohio'][:-1],
      'Neveda':frame3['Neveda'][:3]}
pd.DataFrame(pdata)

	Ohio	Neveda
2000	1.5	NaN
2001	1.7	2.4
2002	NaN	2.9

frame3.index.name='year';frame3.columns.name='state'
frame3

state	Neveda	Ohio
year
2000	NaN	1.5
2001	2.4	1.7
2002	2.9	3.6

frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

frame2.values

array([[2000, 'Ohio', 1.5, -1.2],
       [2001, 'Ohio', 1.7, -1.5],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, 3.0],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

#索引对象
#pandas 的索引对象负责管理轴标签和其他元数据(比如轴名称等)。
#构建 Series 或 DataFrame 时，所用到的任何数组或其他序列的标签都会被转换成一个 Index:

obj=pd.Series(range(3),index=['a','b','c'])
index=obj.index
values=obj.values
obj

a    0
b    1
c    2
dtype: int64

index[1:]

Index(['b', 'c'], dtype='object')

index[1]="w"

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-116-287780bd69fa> in <module>
----> 1 index[1]="w"


/anaconda3/lib/python3.7/site-packages/pandas/core/indexes/base.py in __setitem__(self, key, value)
   3936 
   3937     def __setitem__(self, key, value):
-> 3938         raise TypeError("Index does not support mutable operations")
   3939 
   3940     def __getitem__(self, key):


TypeError: Index does not support mutable operations

labels=pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

obj2=pd.Series([1.5,-2.5,0],index=labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

obj2.index is labels

True

frame3

state	Neveda	Ohio
year
2000	NaN	1.5
2001	2.4	1.7
2002	2.9	3.6

frame3.columns

Index(['Neveda', 'Ohio'], dtype='object', name='state')

'Ohio' in frame3.columns

True

2003 in frame3.index

False

dup_labels=pd.Index(['foo','foo','bar','bar'])
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

基本功能
重新索引

obj=pd.Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

obj2=obj.reindex(['a','b','c','d','e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

obj3=pd.Series(['blue','purple','yellow'],index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

obj3.reindex(range(6),method='ffill')#ffill向前填充

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

frame=pd.DataFrame(np.arange(9).reshape((3,3)),
                  index=['a','c','d'],
                  columns=['Ohio','Texas','California'])
frame

	Ohio	Texas	California
a	0	1	2
c	3	4	5
d	6	7	8

frame2=frame.reindex(['a','b','c','d'])
frame2

	Ohio	Texas	California
a	0.0	1.0	2.0
b	NaN	NaN	NaN
c	3.0	4.0	5.0
d	6.0	7.0	8.0

states=['Texas','Utah','California']
frame.reindex(columns=states)

	Texas	Utah	California
a	1	NaN	2
c	4	NaN	5
d	7	NaN	8

obj=pd.Series(np.arange(5.),index=['a','b','c','d','e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

new_obj=obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

obj.drop(['d','c'])

a    0.0
b    1.0
e    4.0
dtype: float64

data=pd.DataFrame(np.arange(16).reshape(4,4),
                 index=['ohio','Colorado','Utah','Newyork'],
                 columns=['one','two','three','four'])
data

	one	two	three	four
ohio	0	1	2	3
Colorado	4	5	6	7
Utah	8	9	10	11
Newyork	12	13	14	15

data.drop(['Colorado','ohio'])

	one	two	three	four
Utah	8	9	10	11
Newyork	12	13	14	15

data.drop('three',axis=1)

	one	two	four
ohio	0	1	3
Colorado	4	5	7
Utah	8	9	11
Newyork	12	13	15

data.drop(['two','four'],axis=1)

	one	three
ohio	0	2
Colorado	4	6
Utah	8	10
Newyork	12	14

obj.drop('c',inplace=True)#许多函数（如 drop）会修改 Series 或 DataFrame 的大小或形状；传入 inplace=True 可以就地修改对象，而不会返回新的对象
#小心使用 inplace，它会销毁所有被删除的数据。
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

obj=pd.Series(np.arange(4.),index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

obj['b']

1.0

obj[1]

1.0

obj[2:4]

c    2.0
d    3.0
dtype: float64

obj[[1,3]]

b    1.0
d    3.0
dtype: float64

obj[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

obj[obj<2]

a    0.0
b    1.0
dtype: float64

obj['b':'c']

b    1.0
c    2.0
dtype: float64

obj["b":'c']=5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

data=pd.DataFrame(np.arange(16).reshape(4,4),
                 index=['Ohio','Colorado','Utah','California'],
                 columns=['one','two','three','four'])
data

	one	two	three	four
Ohio	0	1	2	3
Colorado	4	5	6	7
Utah	8	9	10	11
California	12	13	14	15

data['two']

Ohio           1
Colorado       5
Utah           9
California    13
Name: two, dtype: int64

data[['three','one']]

	three	one
Ohio	2	0
Colorado	6	4
Utah	10	8
California	14	12

data['two']

Ohio           1
Colorado       5
Utah           9
California    13
Name: two, dtype: int64

data[data['three']>5]

	one	two	three	four
Colorado	4	5	6	7
Utah	8	9	10	11
California	12	13	14	15

data<5

	one	two	three	four
Ohio	True	True	True	True
Colorado	True	False	False	False
Utah	False	False	False	False
California	False	False	False	False

data[data<5]=0
data

	one	two	three	four
Ohio	0	0	0	0
Colorado	0	5	6	7
Utah	8	9	10	11
California	12	13	14	15

data.loc['Colorado',['one','two']]

one    0
two    5
Name: Colorado, dtype: int64

data.iloc[2,[3,0,1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

data.iloc[[1,2],[3,0,1]]

	four	one	two
Colorado	7	0	5
Utah	11	8	9

data.loc[:'Utah','two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

data.iloc[:,:3][data.three>5]

	one	two	three
Colorado	0	5	6
Utah	8	9	10
California	12	13	14

ser=pd.Series(np.arange(3.))
ser

0    0.0
1    1.0
2    2.0
dtype: float64

ser2=pd.Series(np.arange(3.),index=['a','b','c'])
ser2[-1]

2.0

ser[:1]

0    0.0
dtype: float64

ser.loc[:1]

0    0.0
1    1.0
dtype: float64

ser.iloc[:1]

0    0.0
dtype: float64

s1=pd.Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
s2=pd.Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

s1+s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

df1=pd.DataFrame(np.arange(9.).reshape(3,3),
                columns=list('bcd'),
                index=['Ohio','Texas','Colorato'])
df2=pd.DataFrame(np.arange(12.).reshape(4,3),
                columns=list('bde'),
                index=['Utah','Ohio','Texas','Oregon'])
df1

	b	c	d
Ohio	0.0	1.0	2.0
Texas	3.0	4.0	5.0
Colorato	6.0	7.0	8.0

df2

	b	d	e
Utah	0.0	1.0	2.0
Ohio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

df1+df2

	b	c	d	e
Colorato	NaN	NaN	NaN	NaN
Ohio	3.0	NaN	6.0	NaN
Oregon	NaN	NaN	NaN	NaN
Texas	9.0	NaN	12.0	NaN
Utah	NaN	NaN	NaN	NaN

df1=pd.DataFrame({'A':[1,2]})
df2=pd.DataFrame({'B':[3,4]})
df1

	A
0	1
1	2

df2

	B
0	3
1	4

df1-df2

	A	B
0	NaN	NaN
1	NaN	NaN

df1=pd.DataFrame(np.arange(12.).reshape(3,4),
                columns=list('abcd'))
df2=pd.DataFrame(np.arange(20.).reshape(4,5),
                columns=list('abcde'))
df2.loc[3,'e']=np.nan
df1

	a	b	c	d
0	0.0	1.0	2.0	3.0
1	4.0	5.0	6.0	7.0
2	8.0	9.0	10.0	11.0

df2

	a	b	c	d	e
0	0.0	1.0	2.0	3.0	4.0
1	5.0	6.0	7.0	8.0	9.0
2	10.0	11.0	12.0	13.0	14.0
3	15.0	16.0	17.0	18.0	NaN

df1+df2

	a	b	c	d	e
0	0.0	2.0	4.0	6.0	NaN
1	9.0	11.0	13.0	15.0	NaN
2	18.0	20.0	22.0	24.0	NaN
3	NaN	NaN	NaN	NaN	NaN

df1.add(df2,fill_value=0)

	a	b	c	d	e
0	0.0	2.0	4.0	6.0	4.0
1	9.0	11.0	13.0	15.0	9.0
2	18.0	20.0	22.0	24.0	14.0
3	15.0	16.0	17.0	18.0	NaN

1/df1

	a	b	c	d
0	inf	1.000000	0.500000	0.333333
1	0.250	0.200000	0.166667	0.142857
2	0.125	0.111111	0.100000	0.090909

df1.rdiv(1)

	a	b	c	d
0	inf	1.000000	0.500000	0.333333
1	0.250	0.200000	0.166667	0.142857
2	0.125	0.111111	0.100000	0.090909

df1.reindex(columns=df2.columns,fill_value=0)

	a	b	c	d	e
0	0.0	1.0	2.0	3.0	0
1	4.0	5.0	6.0	7.0	0
2	8.0	9.0	10.0	11.0	0

arr=np.arange(12.).reshape(3,4)
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

arr[0]

array([0., 1., 2., 3.])

arr-arr[0]#广播

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

frame=pd.DataFrame(np.arange(12.).reshape(4,3),
                  columns=list('bde'),
                   index=['Utah','Ohio','Texas','Oregon'])
series=frame.iloc[0]

frame

	b	d	e
Utah	0.0	1.0	2.0
Ohio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

frame-series

	b	d	e
Utah	0.0	0.0	0.0
Ohio	3.0	3.0	3.0
Texas	6.0	6.0	6.0
Oregon	9.0	9.0	9.0

series2=pd.Series(range(3),index=['b','e','f'])
frame+series2

	b	d	e	f
Utah	0.0	NaN	3.0	NaN
Ohio	3.0	NaN	6.0	NaN
Texas	6.0	NaN	9.0	NaN
Oregon	9.0	NaN	12.0	NaN

series3=frame['d']
frame

	b	d	e
Utah	0.0	1.0	2.0
Ohio	3.0	4.0	5.0
Texas	6.0	7.0	8.0
Oregon	9.0	10.0	11.0

series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

frame.sub(series3)

	Ohio	Oregon	Texas	Utah	b	d	e
Utah	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Ohio	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Texas	NaN	NaN	NaN	NaN	NaN	NaN	NaN
Oregon	NaN	NaN	NaN	NaN	NaN	NaN	NaN

frame=pd.DataFrame(np.random.randn(4,3),
                  columns=list('bde'),
                  index=['Utah','Ohio','Texas','Oregon'])
frame

	b	d	e
Utah	-0.674038	1.405883	1.123189
Ohio	-0.901923	1.277413	-0.979557
Texas	0.063148	-2.040145	0.155796
Oregon	-1.667468	-0.353109	-0.616387

np.abs(frame)

	b	d	e
Utah	0.674038	1.405883	1.123189
Ohio	0.901923	1.277413	0.979557
Texas	0.063148	2.040145	0.155796
Oregon	1.667468	0.353109	0.616387

def f(x):
    """Return the spread of *x*: its maximum minus its minimum."""
    return x.max()-x.min()
frame.apply(f)

b    1.730616
d    3.446028
e    2.102747
dtype: float64

frame.apply(f,axis='columns')

Utah      2.079921
Ohio      2.256970
Texas     2.195941
Oregon    1.314358
dtype: float64

def f(x):
    """Return the minimum and maximum of *x* as a Series.

    The result has two entries, labelled "min" and "max" in that order.
    """
    labels = ["min", "max"]
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=labels)
frame.apply(f)

	b	d	e
min	-1.667468	-2.040145	-0.979557
max	0.063148	1.405883	1.123189

def format(x):
    """Render the number *x* as a string with exactly two decimal places.

    NOTE: this name shadows the builtin ``format``. It is kept unchanged
    because the cells that follow pass the function to ``applymap`` and
    ``map`` under this name.
    """
    return '%.2f'%x

frame.applymap(format)

	b	d	e
Utah	-0.67	1.41	1.12
Ohio	-0.90	1.28	-0.98
Texas	0.06	-2.04	0.16
Oregon	-1.67	-0.35	-0.62

frame['e'].map(format)

Utah       1.12
Ohio      -0.98
Texas      0.16
Oregon    -0.62
Name: e, dtype: object

obj=pd.Series(range(4),index=['d','a','b','c'])

obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

frame=pd.DataFrame(np.arange(8).reshape(2,4),
                  index=['three','one'],
                  columns=['d','a','b','c'])

frame.sort_index()

	d	a	b	c
one	4	5	6	7
three	0	1	2	3

frame.sort_index(axis=1)

	a	b	c	d
three	1	2	3	0
one	5	6	7	4

frame.sort_index(axis=1,ascending=False)

	d	c	b	a
three	0	3	2	1
one	4	7	6	5

obj=pd.Series([4,7,-3,2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

obj=pd.Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

frame=pd.DataFrame({'a':[4,7,-3,2],'b':[0,1,0,1]})
frame

frame.sort_values(by='b')

frame.sort_values(by=['a','b'])

obj=pd.Series([7,-5,7,4,2,0,4])
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

obj.rank(ascending=False,method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0,1],'c': [-2, 5, 8, -2.5]})
frame

	b	a	c
0	4.3	0	-2.0
1	7.0	1	5.0
2	-3.0	0	8.0
3	2.0	1	-2.5

frame.rank(axis='columns')

	b	a	c
0	3.0	2.0	1.0
1	3.0	1.0	2.0
2	1.0	2.0	3.0
3	3.0	2.0	1.0

obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])

obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

obj.index.is_unique

False

obj['a']

a    0
a    1
dtype: int64

obj['c']

df=pd.DataFrame(np.random.randn(4,3),index=['a','a','b','b'])

df

	0	1	2
a	1.095240	0.137070	0.533132
a	0.470992	-0.038642	-0.118522
b	0.509320	-0.095165	1.565080
b	1.551403	-0.028062	0.090268

df.loc['b']

	0	1	2
b	0.509320	-0.095165	1.565080
b	1.551403	-0.028062	0.090268

df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]],
index=['a', 'b', 'c', 'd'],
columns=['one', 'two'])
df

	one	two
a	1.40	NaN
b	7.10	-4.5
c	NaN	NaN
d	0.75	-1.3

df.sum()

one    9.25
two   -5.80
dtype: float64

df.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

df.mean(axis='columns',skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

df.idxmin()

one    d
two    b
dtype: object

df.cumsum()

	one	two
a	1.40	NaN
b	8.50	-4.5
c	NaN	NaN
d	9.25	-5.8

obj=pd.Series(['a','a','b','c']*4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

import pandas_datareader.data as web
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}
price = pd.DataFrame({ticker: data['Adj Close']
                      for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                       for ticker, data in all_data.items()})
returns = price.pct_change()
returns.tail()

	AAPL	IBM	MSFT	GOOG
Date
2019-08-13	0.042348	0.012452	0.020694	0.019205
2019-08-14	-0.029765	-0.033434	-0.030114	-0.027546
2019-08-15	-0.004981	0.005105	-0.002239	0.002551
2019-08-16	0.023595	0.013948	0.018327	0.008858
2019-08-19	0.026344	0.016709	0.012047	0.022393

returns['MSFT'].corr(returns['IBM'])

0.4905235623531012

 returns['MSFT'].cov(returns['IBM'])

8.766298066095883e-05

returns.MSFT.corr(returns.IBM)

0.4905235623531012

returns.corr()

	AAPL	IBM	MSFT	GOOG
AAPL	1.000000	0.384193	0.455895	0.461466
IBM	0.384193	1.000000	0.490524	0.404765
MSFT	0.455895	0.490524	1.000000	0.537158
GOOG	0.461466	0.404765	0.537158	1.000000

returns.cov()

	AAPL	IBM	MSFT	GOOG
AAPL	0.000267	0.000078	0.000108	0.000117
IBM	0.000078	0.000153	0.000088	0.000078
MSFT	0.000108	0.000088	0.000209	0.000121
GOOG	0.000117	0.000078	0.000121	0.000242

returns.corrwith(returns.IBM)

AAPL    0.384193
IBM     1.000000
MSFT    0.490524
GOOG    0.404765
dtype: float64

returns.corrwith(volume)

AAPL   -0.062747
IBM    -0.152642
MSFT   -0.090553
GOOG   -0.019246
dtype: float64

obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c',
'c'])
uniques=obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

pd.value_counts(obj.values,sort=False)

c    3
d    1
b    2
a    3
dtype: int64

obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

mask=obj.isin(['b','c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

to_match = pd.Series(['c', 'r', 'b', 'b', 'd', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

array([ 0, -1,  1,  1, -1,  2])

data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

	Qu1	Qu2	Qu3
0	1	2	1
1	3	3	5
2	4	1	2
3	3	2	4
4	4	3	4

result=data.apply(pd.value_counts).fillna(0)
result

	Qu1	Qu2	Qu3
1	1.0	1.0	1.0
2	0.0	2.0	1.0
3	2.0	2.0	0.0
4	2.0	0.0	2.0
5	0.0	0.0	1.0

Rachelwen

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫