python-pandas基础

最新推荐文章于 2021-08-13 10:30:45 发布

嗯_雅娴

最新推荐文章于 2021-08-13 10:30:45 发布

阅读量178

点赞数

分类专栏： Python 数据分析

本文链接：https://blog.csdn.net/silvia__y/article/details/103284802

版权

from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = "all" #pandas基础 import pandas as pd from pandas import Series, DataFrame import numpy as np #S...

摘要由CSDN通过智能技术生成

# Notebook display setup: with ast_node_interactivity set to "all", IPython
# echoes the value of every bare expression statement in a cell instead of
# only the last one, which is why each expression below has its own output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#pandas基础

import pandas as pd
from pandas import Series, DataFrame

import numpy as np

# Creating a Series and working with its index

# With no index supplied, the labels default to integers starting at 0.
obj = pd.Series(data=[4, 7, -5, 3])
obj
obj.index

# The same values, this time with explicit string labels.
obj2 = pd.Series(data=[4, 7, -5, 3], index=['d', 'b', 'c', 'a'])
obj2
obj2.index

# Select several values by passing a list of index labels.
obj2[['b', 'a', 'c']]

# Assigning to .index relabels the existing Series.
obj.index = ['BOB', 'STEVE', 'JEFF', 'RYAN']
obj

0    4
1    7
2   -5
3    3
dtype: int64

RangeIndex(start=0, stop=4, step=1)

d    4
b    7
c   -5
a    3
dtype: int64

Index(['d', 'b', 'c', 'a'], dtype='object')

b    7
a    3
c   -5
dtype: int64

BOB      4
STEVE    7
JEFF    -5
RYAN     3
dtype: int64

# Boolean filtering, multiplying by a scalar and applying math functions all keep the index-to-value link

obj2[obj2 > 0]    # keep only the entries whose value is positive
obj2 * 2          # multiply every value by 2
np.exp(obj2)      # element-wise exponential; the index labels are preserved

d    4
b    7
a    3
dtype: int64

d     8
b    14
c   -10
a     6
dtype: int64

d      54.598150
b    1096.633158
c       0.006738
a      20.085537
dtype: float64

# Series and dicts: a Series can be thought of as a fixed-length, ordered dict

'b' in obj2    # membership is checked against the index labels ('b' is one of them)
'e' in obj2    # 'e' is not an index label of obj2

# Converting a dict into a Series

sdata = {'A': 3500, 'B': 4500, 'C': 2300, 'D': 1300}
obj3 = pd.Series(data=sdata)
obj3

# A custom index can be supplied when building a Series from a dict;
# a label that is not a key of the dict (here 'E') gets the value NaN.
states = ['C', 'D', 'E', 'A']
obj4 = pd.Series(data=sdata, index=states)
obj4

True

False

A    3500
B    4500
C    2300
D    1300
dtype: int64

C    2300.0
D    1300.0
E       NaN
A    3500.0
dtype: float64

# Detecting missing data with isnull() / notnull()

pd.isnull(obj4)    # True where the value is missing
pd.notnull(obj4)   # True where the value is present
obj4.isnull()      # method form; its output matches pd.isnull(obj4)

pd.isnull(obj4).sum()  # number of missing values

C    False
D    False
E     True
A    False
dtype: bool

C     True
D     True
E    False
A     True
dtype: bool

C    False
D    False
E     True
A    False
dtype: bool

1

# Arithmetic aligns the operands automatically on their index labels

obj3
obj4
obj3 + obj4  # values with matching labels are added; labels that cannot be matched yield NaN

A    3500
B    4500
C    2300
D    1300
dtype: int64

C    2300.0
D    1300.0
E       NaN
A    3500.0
dtype: float64

A    7000.0
B       NaN
C    4600.0
D    2600.0
E       NaN
dtype: float64

# Naming a Series and naming its index (the `name` attribute)

obj4.name = 'population'     # name of the Series itself
obj4.index.name = 'state'    # name of the index (original author's note: the values cannot be named)
obj4

state
C    2300.0
D    1300.0
E       NaN
A    3500.0
Name: population, dtype: float64

# Creating a DataFrame

# A dict of equal-length lists (or NumPy arrays) can be turned into a
# DataFrame: each key becomes a column.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data=data)
frame

# For large tables, head() shows only the first five rows.
frame.head()

	state	year	pop
0	Ohio	2000	1.5
1	Ohio	2001	1.7
2	Ohio	2002	3.6
3	Nevada	2001	2.4
4	Nevada	2002	2.9
5	Nevada	2003	3.2

	state	year	pop
0	Ohio	2000	1.5
1	Ohio	2001	1.7
2	Ohio	2002	3.6
3	Nevada	2001	2.4
4	Nevada	2002	2.9

# Indexing a DataFrame

# The column order and the row labels can be chosen explicitly; a requested
# column that does not exist in the data ('debt') is filled with NaN.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
frame2

# Selecting a column: attribute access and key access are equivalent here.
frame2.year
frame2['year']

# Selecting a row by its label.
frame2.loc['three']

	year	state	pop	debt
one	2000	Ohio	1.5	NaN
two	2001	Ohio	1.7	NaN
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	NaN
five	2002	Nevada	2.9	NaN
six	2003	Nevada	3.2	NaN

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

# Changing the values of a column

frame2['debt'] = 16.5   # a scalar is written to every row of the column
frame2

frame2['debt'] = np.arange(6)   # an array of six values, one per row
frame2

val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val           # when a Series is assigned to a column, it is realigned to the DataFrame's index and the gaps are filled with missing values
frame2

	year	state	pop	debt
one	2000	Ohio	1.5	16.5
two	2001	Ohio	1.7	16.5
three	2002	Ohio	3.6	16.5
four	2001	Nevada	2.4	16.5
five	2002	Nevada	2.9	16.5
six	2003	Nevada	3.2	16.5

	year	state	pop	debt
one	2000	Ohio	1.5	0
two	2001	Ohio	1.7	1
three	2002	Ohio	3.6	2
four	2001	Nevada	2.4	3
five	2002	Nevada	2.9	4
six	2003	Nevada	3.2	5

	year	state	pop	debt
one	2000	Ohio	1.5	NaN
two	2001	Ohio	1.7	-1.2
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	-1.5
five	2002	Nevada	2.9	-1.7
six	2003	Nevada	3.2	NaN

# Deleting a column

frame2['eastern'] = (frame2.state == 'Ohio')   # create a new column holding boolean values
frame2

del frame2['eastern']   # remove the column that was just added
frame2

	year	state	pop	debt	eastern
one	2000	Ohio	1.5	NaN	True
two	2001	Ohio	1.7	-1.2	True
three	2002	Ohio	3.6	NaN	True
four	2001	Nevada	2.4	-1.5	False
five	2002	Nevada	2.9	-1.7	False
six	2003	Nevada	3.2	NaN	False

	year	state	pop	debt
one	2000	Ohio	1.5	NaN
two	2001	Ohio	1.7	-1.2
three	2002	Ohio	3.6	NaN
four	2001	Nevada	2.4	-1.5
five	2002	Nevada	2.9	-1.7
six	2003	Nevada	3.2	NaN

#DataFrame的创建2

pop = {
   'Nevada': {
   2001

最低0.47元/天解锁文章

嗯_雅娴

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python-pandas基础

from IPython.core.interactiveshell import InteractiveShell InteractiveShell.ast_node_interactivity = "all" #pandas基础 import pandas as pd from pandas import Series, DataFrame import numpy as np #S...
复制链接

扫一扫

专栏目录