Datawhale_数据分析组队学习task2

pandas的数据结构介绍

Series

# 创建Series
import pandas as pd
obj = pd.Series([4,7,-5,3])
obj
Out[3]:
0    4
1    7
2   -5
3    3
dtype: int64

# 获取values和index
obj.values
Out[5]:
array([ 4,  7, -5,  3])

obj.index
Out[6]:
RangeIndex(start=0, stop=4, step=1)

# 指定索引
obj2 = pd.Series([4,7,-5,3],index = ['a','b','c','d'])
obj2
Out[8]:
a    4
b    7
c   -5
d    3
dtype: int64

# 通过索引选取值
obj2['b']
Out[9]:
7

obj2[['c','a','b']]
Out[10]:
c   -5
a    4
b    7
dtype: int64

obj2[obj2 >0]
Out[11]:
a    4
b    7
d    3
dtype: int64

obj2*2
Out[12]:
a     8
b    14
c   -10
d     6
dtype: int64

'b' in obj2
Out[14]:
True
# 通过字典创建Series
sdata = {'ohio':3500,'texas':71000,'oregon':16000,'utah':5000}
obj3 = pd.Series(sdata)
obj3
Out[15]:
ohio       3500
texas     71000
oregon    16000
utah       5000
dtype: int64

# 传入排好序的字典的键以改变顺序;字典中不存在的键对应的值为NaN
states = ['californis','ohio','oregon','texas']
obj4 = pd.Series(sdata,index = states)
obj4
Out[16]:
californis        NaN
ohio           3500.0
oregon        16000.0
texas         71000.0
dtype: float64

# 检测缺失值
pd.isnull(obj4)
Out[17]:
californis     True
ohio          False
oregon        False
texas         False
dtype: bool

obj4.isnull()
Out[18]:
californis     True
ohio          False
oregon        False
texas         False
dtype: bool

# Series 的name属性
obj4.name = 'population'
obj4.index.name = 'stats'
obj4
Out[19]:
stats
californis        NaN
ohio           3500.0
oregon        16000.0
texas         71000.0
Name: population, dtype: float64

# 直接修改索引
obj4.index = [1,2,3,4]
obj4
Out[20]:
1        NaN
2     3500.0
3    16000.0
4    71000.0
Name: population, dtype: float64

DataFrame

# 创建DataFrame
data = {'state':['ohio','ohio','ohio','nevada','nevada','nevada'],
       'year':[2000,2001,2002,2001,2002,2003],
       'pop':[1.5,1.7,3.6,2.4,2.9,3.2]}
frame = pd.DataFrame(data)
frame
Out[21]:

state	year	pop
0	ohio	2000	1.5
1	ohio	2001	1.7
2	ohio	2002	3.6
3	nevada	2001	2.4
4	nevada	2002	2.9
5	nevada	2003	3.2

# 选取前五行
frame.head()
Out[22]:
state	year	pop
0	ohio	2000	1.5
1	ohio	2001	1.7
2	ohio	2002	3.6
3	nevada	2001	2.4
4	nevada	2002	2.9

# 按指定顺序进行列排序
pd.DataFrame(data,columns = ['year','state','pop'])
Out[23]:
year	state	pop
0	2000	ohio	1.5
1	2001	ohio	1.7
2	2002	ohio	3.6
3	2001	nevada	2.4
4	2002	nevada	2.9
5	2003	nevada	3.2

# 将DataFrame的列获取为一个Series
frame['state']
Out[24]:
0      ohio
1      ohio
2      ohio
3    nevada
4    nevada
5    nevada
Name: state, dtype: object

frame.year
Out[25]:
0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

frame.state
Out[26]:
0      ohio
1      ohio
2      ohio
3    nevada
4    nevada
5    nevada
Name: state, dtype: object
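
# 指定行索引创建DataFrame(原文缺少frame2的创建代码,以下根据输出结果补全)
frame2 = pd.DataFrame(data,index = [0,'一','二','三','四','五'])
frame2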
>>>
state	year	pop
0	ohio	2000	1.5
一	ohio	2001	1.7
二	ohio	2002	3.6
三	nevada	2001	2.4
四	nevada	2002	2.9
五	nevada	2003	3.2

# 获取行
frame2.loc['三']
Out[40]:
state    nevada
year       2001
pop         2.4
Name: 三, dtype: object

# 直接赋值修改列
import numpy as np
frame2['pop'] = np.arange(6)
frame2
Out[46]:
state	year	pop
0	ohio	2000	0
一	ohio	2001	1
二	ohio	2002	2
三	nevada	2001	3
四	nevada	2002	4
五	nevada	2003	5

# 删除列
del frame2['pop']
frame2
Out[47]:
state	year
0	ohio	2000
一	ohio	2001
二	ohio	2002
三	nevada	2001
四	nevada	2002
五	nevada	2003

基本功能

重新索引

obj = pd.Series([4.5,7.2,-5.3,3.6],index = ['d','b','a','c'])
obj
>>>
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

# reindex是创建一个新对象,会根据新索引进行重排,当索引值不存在就引入缺失值
obj2 = obj.reindex(['a','b','c','d','e'])
obj2
Out[49]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

obj3 = pd.Series(['blue','purple','yellow'],index = [0,2,4])
obj3
Out[50]:
0      blue
2    purple
4    yellow
dtype: object

# method插值处理
obj3.reindex(range(6),method = 'ffill') # ffill:前向填充,用前一个有效值填充新增索引处的缺失值
Out[55]:
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

# 列重新索引
frame
Out[59]:
state	year	pop
0	ohio	2000	1.5
1	ohio	2001	1.7
2	ohio	2002	3.6
3	nevada	2001	2.4
4	nevada	2002	2.9
5	nevada	2003	3.2

frame.reindex(columns = ['state_a','year','pop'])
Out[61]:
state_a	year	pop
0	NaN	2000	1.5
1	NaN	2001	1.7
2	NaN	2002	3.6
3	NaN	2001	2.4
4	NaN	2002	2.9
5	NaN	2003	3.2

reindex函数的参数(原文此处为图片,未能保留):index、method、fill_value、limit、tolerance、level、copy

丢弃指定轴上的项

drop方法返回的是一个在指定轴上删除了指定值的新对象

obj = pd.Series(np.arange(5.),index = ['a','b','c','d','e'])
obj
>>>
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

new_obj = obj.drop('c')
new_obj
Out[65]:
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

obj.drop(['d','c'])
Out[67]:
a    0.0
b    1.0
e    4.0
dtype: float64

# 对于DataFrame可以删除任意轴上的索引值
data = pd.DataFrame(np.arange(16).reshape(4,4),
                   index = ['ohio','colorado','utah','new york'],
                   columns = ['one','two','three','four'])
data
Out[71]:
one	two	three	four
ohio	0	1	2	3
colorado	4	5	6	7
utah	8	9	10	11
new york	12	13	14	15

# 用标签序列调用drop删除值
data.drop(['colorado','ohio'])
Out[72]:
one	two	three	four
utah	8	9	10	11
new york	12	13	14	15

# 通过传递axis = 1或axis = 'columns'来删除列
data.drop('two',axis = 1)
Out[73]:
one	three	four
ohio	0	2	3
colorado	4	6	7
utah	8	10	11
new york	12	14	15

data.drop(['two','four'],axis = 'columns')
Out[74]:
one	three
ohio	0	2
colorado	4	6
utah	8	10
new york	12	14

索引、选取和过滤

obj = pd.Series(np.arange(4.),index = ['a','b','c','d'])
obj
>>>
a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

obj[2:4]
Out[76]:
c    2.0
d    3.0
dtype: float64

obj[['b','d']]
Out[77]:
b    1.0
d    3.0
dtype: float64

obj[obj < 2]
Out[78]:
a    0.0
b    1.0
dtype: float64

obj['b':'c']  # 利用标签的切片其末端是包含的
Out[80]:
b    1.0
c    2.0
dtype: float64

data = pd.DataFrame(np.arange(16).reshape(4,4),
                   index = ['ohio','colorado','utah','new york'],
                   columns = ['one','two','three','four'])
data
Out[82]:
one	two	three	four
ohio	0	1	2	3
colorado	4	5	6	7
utah	8	9	10	11
new york	12	13	14	15

data[['three','one']]
Out[84]:
three	one
ohio	2	0
colorado	6	4
utah	10	8
new york	14	12

data[:2]
Out[85]:
one	two	three	four
ohio	0	1	2	3
colorado	4	5	6	7

data[data['three'] > 5]
Out[87]:
one	two	three	four
colorado	4	5	6	7
utah	8	9	10	11
new york	12	13	14	15

data < 5
Out[88]:
one	two	three	four
ohio	True	True	True	True
colorado	True	False	False	False
utah	False	False	False	False
new york	False	False	False	False

data[data <5 ] = 0
data
Out[90]:
one	two	three	four
ohio	0	0	0	0
colorado	0	5	6	7
utah	8	9	10	11
new york	12	13	14	15

用loc和iloc进行选取

data
>>>
one	two	three	four
ohio	0	0	0	0
colorado	0	5	6	7
utah	8	9	10	11
new york	12	13	14	15

# loc使用轴标签索引
data.loc['colorado',['two','three']]
Out[93]:
two      5
three    6
Name: colorado, dtype: int64

# iloc使用整数索引
data.iloc[2,[3,0,1]]
Out[94]:
four    11
one      8
two      9
Name: utah, dtype: int64

data.loc[:'utah','two']
Out[96]:
ohio        0
colorado    5
utah        9
Name: two, dtype: int64

data.iloc[:,:3][data.three > 5]
Out[97]:
one	two	three
colorado	0	5	6
utah	8	9	10
new york	12	13	14

算术运算和数据对齐

s1 = pd.Series([7.3,-2.5,3.4,1.5],index = ['a','c','d','e'])
s2 = pd.Series([-2.1,3.6,-1.5,4,3.1],index = ['a','c','e','f','g'])
s1
>>>
a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

s2
Out[99]:
a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

s1 + s2 # 在不重叠的索引处引入了缺失值,缺失值在算术运算过程中传播
Out[100]:
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

df1 = pd.DataFrame(np.arange(9.).reshape(3,3),columns = list('bcd'),index = ['ohio','texas','colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape(4,3),columns = list('bde'),index = ['utah','ohio','texas','oregon'])
df1
Out[103]:
b	c	d
ohio	0.0	1.0	2.0
texas	3.0	4.0	5.0
colorado	6.0	7.0	8.0

df2
Out[104]:
b	d	e
utah	0.0	1.0	2.0
ohio	3.0	4.0	5.0
texas	6.0	7.0	8.0
oregon	9.0	10.0	11.0

# 相加后会返回一个新的DataFrame,其索引和列为原来两个的并集
df1 + df2
Out[105]:
b	c	d	e
colorado	NaN	NaN	NaN	NaN
ohio	3.0	NaN	6.0	NaN
oregon	NaN	NaN	NaN	NaN
texas	9.0	NaN	12.0	NaN
utah	NaN	NaN	NaN	NaN

df1 -  df2
Out[106]:
b	c	d	e
colorado	NaN	NaN	NaN	NaN
ohio	-3.0	NaN	-2.0	NaN
oregon	NaN	NaN	NaN	NaN
texas	-3.0	NaN	-2.0	NaN
utah	NaN	NaN	NaN	NaN

在算术方法中填充值

df1 = pd.DataFrame(np.arange(12.).reshape((3,4)),columns = list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)),columns = list('abcde'))
df2.loc[1,'b'] = np.nan
df1
>>>
a	b	c	d
0	0.0	1.0	2.0	3.0
1	4.0	5.0	6.0	7.0
2	8.0	9.0	10.0	11.0

df2
Out[108]:
a	b	c	d	e
0	0.0	1.0	2.0	3.0	4.0
1	5.0	NaN	7.0	8.0	9.0
2	10.0	11.0	12.0	13.0	14.0
3	15.0	16.0	17.0	18.0	19.0

df1 + df2 # 没有重叠的位置就会产生NA值
Out[109]:
a	b	c	d	e
0	0.0	2.0	4.0	6.0	NaN
1	9.0	NaN	13.0	15.0	NaN
2	18.0	20.0	22.0	24.0	NaN
3	NaN	NaN	NaN	NaN	NaN

# 使用df1的add方法,传入df2以及fill_value参数
df1.add(df2,fill_value = 0)
Out[110]:
a	b	c	d	e
0	0.0	2.0	4.0	6.0	4.0
1	9.0	5.0	13.0	15.0	9.0
2	18.0	20.0	22.0	24.0	14.0
3	15.0	16.0	17.0	18.0	19.0

1/df1
Out[111]:
a	b	c	d
0	inf	1.000000	0.500000	0.333333
1	0.250	0.200000	0.166667	0.142857
2	0.125	0.111111	0.100000	0.090909

df1.rdiv(1)  # 等价于1/df1
Out[112]:
a	b	c	d
0	inf	1.000000	0.500000	0.333333
1	0.250	0.200000	0.166667	0.142857
2	0.125	0.111111	0.100000	0.090909

(原文此处为图片,未能保留;推测为灵活算术方法一览:add/radd、sub/rsub、div/rdiv、floordiv/rfloordiv、mul/rmul、pow/rpow)

DataFrame和Series之间的运算

frame = pd.DataFrame(np.arange(12.).reshape((4,3)),columns =list('bde'),index = ['utah','ohio','texas','orgon'])
series = frame.iloc[0]
frame
>>>
b	d	e
utah	0.0	1.0	2.0
ohio	3.0	4.0	5.0
texas	6.0	7.0	8.0
orgon	9.0	10.0	11.0

series
Out[114]:
b    0.0
d    1.0
e    2.0
Name: utah, dtype: float64

# DataFrame和Series之间的算术运算默认会将Series的索引匹配到DataFrame的列,然后沿着行向下广播(逐行进行运算)
frame - series
Out[115]:
b	d	e
utah	0.0	0.0	0.0
ohio	3.0	3.0	3.0
texas	6.0	6.0	6.0
orgon	9.0	9.0	9.0

# 如果索引值在DataFrame的列或Series的索引中找不到,则参与运算的两个对象会被重新索引形成并集
series2 = pd.Series(range(3),index = list('bef'))
series2
Out[116]:
b    0
e    1
f    2
dtype: int64

frame + series2
Out[117]:
b	d	e	f
utah	0.0	NaN	3.0	NaN
ohio	3.0	NaN	6.0	NaN
texas	6.0	NaN	9.0	NaN
orgon	9.0	NaN	12.0	NaN

# 匹配行索引(axis = 'index'或 axis = 0)
series3 = frame['d']
frame
Out[119]:
b	d	e
utah	0.0	1.0	2.0
ohio	3.0	4.0	5.0
texas	6.0	7.0	8.0
orgon	9.0	10.0	11.0

series3
Out[120]:
utah      1.0
ohio      4.0
texas     7.0
orgon    10.0
Name: d, dtype: float64

frame.sub(series3,axis = 'index')
Out[121]:
b	d	e
utah	-1.0	0.0	1.0
ohio	-1.0	0.0	1.0
texas	-1.0	0.0	1.0
orgon	-1.0	0.0	1.0

函数应用和映射

frame = pd.DataFrame(np.random.randn(4,3),columns = list('bde'),index = ['utah','ohio','texas','oregon'])
frame
>>>
b	d	e
utah	-1.631864	-2.479669	0.125731
ohio	-1.724878	0.811747	-1.425411
texas	-0.372374	-0.851630	-1.613718
oregon	-1.371739	-0.793874	1.523269

np.abs(frame)
Out[123]:
b	d	e
utah	1.631864	2.479669	0.125731
ohio	1.724878	0.811747	1.425411
texas	0.372374	0.851630	1.613718
oregon	1.371739	0.793874	1.523269

# 将函数应用到各列或行所形成的一维数组上
f = lambda x : x.max() - x.min()
frame.apply(f)
Out[124]:
b    1.352503
d    3.291416
e    3.136987
dtype: float64

frame.apply(f,axis = 'columns')
Out[126]:
utah      2.605400
ohio      2.536625
texas     1.241344
oregon    2.895008
dtype: float64

def f(x):
    """Return the smallest and largest value of ``x`` as a two-element Series.

    The result is labelled 'min' and 'max', so passing this function to
    ``DataFrame.apply`` yields one 'min' row and one 'max' row per column.
    """
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
frame.apply(f)
Out[127]:
b	d	e
min	-1.724878	-2.479669	-1.613718
max	-0.372374	0.811747	1.523269

format = lambda x: '%.2f' % x
frame.applymap(format)
Out[128]:
b	d	e
utah	-1.63	-2.48	0.13
ohio	-1.72	0.81	-1.43
texas	-0.37	-0.85	-1.61
oregon	-1.37	-0.79	1.52

frame['e'].map(format)
Out[129]:
utah       0.13
ohio      -1.43
texas     -1.61
oregon     1.52
Name: e, dtype: object

排序和排名

sort_index对行或列索引进行排序(按字典顺序)

obj = pd.Series(range(4),index = ['d','a','b','c'])
obj.sort_index()
>>>
a    1
b    2
c    3
d    0
dtype: int64

frame = pd.DataFrame(np.arange(8).reshape(2,4),index = ['three','two'],columns = ['d','a','b','c'])
frame.sort_index()
Out[131]:
d	a	b	c
three	0	1	2	3
two	4	5	6	7

frame.sort_index(axis = 1) # 按列索引(列标签)排序
Out[132]:
a	b	c	d
three	1	2	3	0
two	5	6	7	4

frame.sort_index(axis = 1,ascending = False) #降序
Out[133]:
d	c	b	a
three	0	3	2	1
two	4	7	6	5

sort_values按值排序

obj = pd.Series([4,7,-3,2])
obj.sort_values()
>>>
2   -3
3    2
0    4
1    7
dtype: int64

# 排序时,任何缺失值默认放在末尾
obj = pd.Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values()
Out[136]:
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

obj = pd.Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values(ascending = False)
Out[138]:
2    7.0
0    4.0
5    2.0
4   -3.0
1    NaN
3    NaN
dtype: float64

# 根据一个或多列中的值进行排序
frame = pd.DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
frame
Out[139]:
b	a
0	4	0
1	7	1
2	-3	0
3	2	1

frame.sort_values(by = ['a','b'])
Out[140]:
b	a
2	-3	0
0	4	0
3	2	1
1	7	1

rank方法

obj = pd.Series([7,-5,7,4,2,0,4])
obj.rank()
>>>
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

# 根据值在原数据中出现的顺序给出排名
obj.rank(method = 'first')
Out[142]:
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

obj.rank(method = 'first',ascending = False)
Out[143]:
0    1.0
1    7.0
2    2.0
3    3.0
4    5.0
5    6.0
6    4.0
dtype: float64

frame = pd.DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.5]})
frame
Out[144]:
b	a	c
0	4.3	0	-2.0
1	7.0	1	5.0
2	-3.0	0	8.0
3	2.0	1	-2.5

frame.rank(axis = 'columns')
Out[145]:
b	a	c
0	3.0	2.0	1.0
1	3.0	1.0	2.0
2	1.0	2.0	3.0
3	3.0	2.0	1.0

(原文此处为图片,未能保留;推测为rank方法的method选项:'average'(默认)、'min'、'max'、'first'、'dense')

带有重复标签的轴索引

obj = pd.Series(range(5),index = ['a','a','b','b','c'])
obj
>>>
a    0
a    1
b    2
b    3
c    4
dtype: int64

# 确认唯一性
obj.index.is_unique
Out[148]:
False

obj['a']
Out[149]:
a    0
a    1
dtype: int64

obj['c']
Out[150]:
4

df = pd.DataFrame(np.random.randn(4,3),index = ['a','a','b','b'])
df
Out[151]:
0	1	2
a	-0.122680	-0.271023	0.792136
a	0.774621	0.111136	-0.964323
b	-0.941466	1.843027	0.581256
b	0.358818	-1.205702	0.537188

df.loc['b']
Out[152]:
0	1	2
b	-0.941466	1.843027	0.581256
b	0.358818	-1.205702	0.537188

汇总和计算描述统计

df = pd.DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],
                 index = list('abcd'),
                 columns = ['one','two'])
df
Out[154]:
one	two
a	1.40	NaN
b	7.10	-4.5
c	NaN	NaN
d	0.75	-1.3

df.sum()
Out[155]:
one    9.25
two   -5.80
dtype: float64

# 按行求和运算
df.sum(axis = 1) 
Out[156]:
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

df.mean(axis = 'columns',skipna = False)
Out[157]:
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

(原文此处为两张图片,未能保留;推测分别为约简方法的常用选项(axis、skipna、level)以及描述统计和汇总统计方法一览表)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值