python-pandas基础

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 
#pandas基础

import pandas as pd
from pandas import Series, DataFrame

import numpy as np
# Creating a Series and indexing it by label

# With no index supplied, the labels default to integers starting at 0.
obj = Series(data=[4, 7, -5, 3])
obj
obj.index
# The same values, this time with explicit string labels.
obj2 = Series(data=[4, 7, -5, 3], index=['d', 'b', 'c', 'a'])
obj2
obj2.index

# Passing a list of labels selects those entries in the requested order.
obj2.loc[['b', 'a', 'c']]

# Assigning a new list to .index relabels the existing Series.
obj.index = ['BOB', 'STEVE', 'JEFF', 'RYAN']
obj
0    4
1    7
2   -5
3    3
dtype: int64

RangeIndex(start=0, stop=4, step=1)

d    4
b    7
c   -5
a    3
dtype: int64

Index(['d', 'b', 'c', 'a'], dtype='object')

b    7
a    3
c   -5
dtype: int64

BOB      4
STEVE    7
JEFF    -5
RYAN     3
dtype: int64
# Boolean filtering, multiplying by a scalar and applying NumPy math functions
# all keep each value attached to its index label.

obj2[obj2 > 0]    # keep only the entries whose value is positive
obj2 * 2          # element-wise multiplication; labels are unchanged
np.exp(obj2)      # NumPy ufunc applied element-wise; result is float64
d    4
b    7
a    3
dtype: int64

d     8
b    14
c   -10
a     6
dtype: int64

d      54.598150
b    1096.633158
c       0.006738
a      20.085537
dtype: float64
# Series and dicts (a Series can be thought of as a fixed-length, ordered dict)

# `in` tests membership against the index labels.
'b' in obj2
'e' in obj2

# Converting a dict to a Series

sdata = {
   'A':3500, 'B':4500, 'C':2300, 'D':1300}
obj3 = pd.Series(sdata)
obj3

states = ['C', 'D', 'E', 'A']
obj4 = pd.Series(sdata, index=states)       # a custom index picks and orders the dict keys; labels missing from the dict ('E') get NaN
obj4
True

False

A    3500
B    4500
C    2300
D    1300
dtype: int64

C    2300.0
D    1300.0
E       NaN
A    3500.0
dtype: float64
# isnull() / notnull() detect missing data

pd.isnull(obj4)    # True where the value is missing
pd.notnull(obj4)   # True where the value is present
obj4.isnull()      # method form; gives the same result as pd.isnull(obj4)

pd.isnull(obj4).sum()  # number of missing values (each True counts as 1)
C    False
D    False
E     True
A    False
dtype: bool

C     True
D     True
E    False
A     True
dtype: bool

C    False
D    False
E     True
A    False
dtype: bool

1
# Arithmetic aligns automatically on index labels

obj3
obj4
obj3 + obj4  # values with the same label are added; a label missing (or NaN) on either side yields NaN
A    3500
B    4500
C    2300
D    1300
dtype: int64

C    2300.0
D    1300.0
E       NaN
A    3500.0
dtype: float64

A    7000.0
B       NaN
C    4600.0
D    2600.0
E       NaN
dtype: float64
# Naming a Series and naming its index (the `name` attribute)

obj4.name = 'population'     # names the Series; displayed as "Name: population"
obj4.index.name = 'state'    # names the index; displayed as a "state" header above the labels
# NOTE(review): the original note here said "the values cannot be named" —
# presumably meaning only the Series and its index carry a name; confirm intent.
obj4
state
C    2300.0
D    1300.0
E       NaN
A    3500.0
Name: population, dtype: float64
# Creating a DataFrame

# A dict of equal-length lists (or NumPy arrays) becomes a DataFrame:
# each key is a column name and each list holds that column's values.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = DataFrame(data)
frame

# For large tables, head() shows just the first five rows.
frame.head(n=5)
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
# Indexing a DataFrame

frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'], index = ['one', 'two', 'three', 'four', 'five', 'six'])
frame2       # column order and row labels can be specified; a requested column that is not in the data ('debt') is filled with NaN

frame2.year
frame2['year']    # select a column; both forms give the same Series here

frame2.loc['three']   # select a row by its label
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 NaN
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 NaN
five 2002 Nevada 2.9 NaN
six 2003 Nevada 3.2 NaN
one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
# Modifying column values

frame2['debt'] = 16.5   # a scalar is broadcast to every row
frame2

frame2['debt'] = np.arange(6)   # an array must match the number of rows (6 here)
frame2

val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val           # assigning a Series aligns it to the DataFrame's index; rows without a matching label become NaN
frame2
year state pop debt
one 2000 Ohio 1.5 16.5
two 2001 Ohio 1.7 16.5
three 2002 Ohio 3.6 16.5
four 2001 Nevada 2.4 16.5
five 2002 Nevada 2.9 16.5
six 2003 Nevada 3.2 16.5
year state pop debt
one 2000 Ohio 1.5 0
two 2001 Ohio 1.7 1
three 2002 Ohio 3.6 2
four 2001 Nevada 2.4 3
five 2002 Nevada 2.9 4
six 2003 Nevada 3.2 5
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
six 2003 Nevada 3.2 NaN
# Adding and deleting a column

frame2['eastern'] = (frame2.state == 'Ohio')   # assigning to a new key creates the column; values are booleans
frame2

del frame2['eastern']   # remove the column again
frame2
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False
six 2003 Nevada 3.2 NaN False
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
six 2003 Nevada 3.2 NaN
#DataFrame的创建2

pop = {
   'Nevada': {
   2001
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值