利用python进行数据分析第二章pandas

5.1pandas 数据结构介绍

5.1.1Series

代码

import pandas as pd
from pandas import Series,DataFrame
import numpy as np
#Series一维数组型对象,包含数据标签,称为索引
def sseries():
    """Print a tour of basic ``pandas.Series`` features.

    The function takes no arguments and returns ``None``; every step
    writes its result to standard output.  It covers, in order: values and
    index of a Series, custom labels, label lookup, boolean filtering,
    arithmetic, membership tests, construction from a dict, missing-value
    detection, index-aligned addition, the ``name`` attributes, and
    in-place replacement of the index.
    """
    # A Series built without an index: values first, then the default index.
    numbers = pd.Series([4, 7, -5, 3])
    print(numbers.values)
    print(numbers.index)

    # The same values, each data point identified by an explicit label.
    labelled = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
    print(labelled.index)

    # Look up single values by label.
    for label in ('a', 'b'):
        print(labelled[label])

    # Boolean filtering and arithmetic keep the labels attached.
    print(labelled[labelled > 2])
    print(labelled * 2)
    print(np.exp(labelled))

    # Membership tests are made against the index labels.
    for label in ('b', 'e'):
        print(label in labelled)

    # A dict becomes a Series whose index is the dict's keys.
    sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
    by_state = pd.Series(sdata)
    print(by_state)

    # With an explicit index, labels missing from the dict appear as NaN
    # and dict keys not listed in the index are left out.
    states = ['California', 'Ohio', 'Oregon', 'Texas']
    selected = pd.Series(sdata, index=states)
    print(selected)

    # isnull / notnull detect missing ("NA") data.
    print(pd.isnull(selected))
    print(pd.notnull(selected))
    print(selected.isnull())

    # Addition aligns the two operands on their index labels.
    print(by_state)
    print(selected)
    print(by_state + selected)

    # Both the Series and its index carry a ``name`` attribute.
    selected.name = 'population'
    selected.index.name = 'state'
    print(selected)

    # The index can be replaced in place by assignment.
    print(numbers)
    numbers.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
    print(numbers)


if __name__ == '__main__':
    sseries()

结果

[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)
Index(['d', 'b', 'a', 'c'], dtype='object')
Index(['d', 'b', 'a', 'c'], dtype='object')
-5
7
d    4
b    7
c    3
dtype: int64
d     8
b    14
a   -10
c     6
dtype: int64
d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64
True
False
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64
0    4
1    7
2   -5
3    3
dtype: int64
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

C:\Users\37596>G:\数据分析\利用python进行数据分析\pd0531.py
[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)
Index(['d', 'b', 'a', 'c'], dtype='object')
-5
7
d    4
b    7
c    3
dtype: int64
d     8
b    14
a   -10
c     6
dtype: int64
d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64
True
False
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64
0    4
1    7
2   -5
3    3
dtype: int64
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

5.1.2DataFrame

代码

import pymysql as MySQLdb
import pandas as pd
import numpy as np
#等长度的列表
# Build a DataFrame from a dict of equal-length lists.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
print(frame)
# NOTE: the return value of head() is discarded here, so when this runs as
# a script nothing is printed for this line.
frame.head()

# ``columns`` fixes the column order.
print(pd.DataFrame(data, columns=['year', 'state', 'pop']))

# 'debt' is not a key of ``data``, so that column starts out as all NaN.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
print(frame2)
print(frame2.columns)

# A column can be retrieved by key or by attribute access.
print(frame2['state'])
print(frame2.year)

# Rows are selected by label through the ``loc`` attribute.
print(frame2.loc['three'])

# Columns can be overwritten by assignment: a scalar is broadcast to every
# row, an array must match the frame's length.
frame2['debt'] = 16.5
print(frame2)
frame2['debt'] = np.arange(6.)
print(frame2)

# Assigning a Series aligns on index labels; rows whose label is absent
# from ``val`` become NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)

# Assigning to a column that does not exist creates it.
frame2['eastern'] = frame2.state == 'Ohio'
print(frame2)

# ``del`` removes a column.
del frame2['eastern']
print(frame2.columns)

# A dict of dicts: outer keys become columns, inner keys become the index.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

frame3 = pd.DataFrame(pop)
print(frame3)
print(frame3.T)
# The author left ``pd.DataFrame(pop, index=[2001, 2002, 2003])`` disabled
# because it raised "'list' object has no attribute 'astype'" in their
# environment; the cause was not identified.

# A dict of Series is handled the same way as a dict of dicts.
pdate = {
    'Ohio': frame3['Ohio'][:-1],
    'Nevada': frame3['Nevada'][:2],
}
print(pd.DataFrame(pdate))

# For the full list of valid DataFrame constructor inputs see p. 133 of the book.

# The index and the columns each have their own ``name`` attribute.
frame3.index.name = 'year'
frame3.columns.name = 'state'
print(frame3)

# ``values`` returns the data as a two-dimensional ndarray.
print(frame3.values)
print(frame2.values)

结果

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2
   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
5  2003  Nevada  3.2
       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN
Index(['year', 'state', 'pop', 'debt'], dtype='object')
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object
one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5
six    2003  Nevada  3.2  16.5
       year   state  pop  debt
one    2000    Ohio  1.5   0.0
two    2001    Ohio  1.7   1.0
three  2002    Ohio  3.6   2.0
four   2001  Nevada  2.4   3.0
five   2002  Nevada  2.9   4.0
six    2003  Nevada  3.2   5.0
       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7
six    2003  Nevada  3.2   NaN
       year   state  pop  debt  eastern
one    2000    Ohio  1.5   NaN     True
two    2001    Ohio  1.7  -1.2     True
three  2002    Ohio  3.6   NaN     True
four   2001  Nevada  2.4  -1.5    False
five   2002  Nevada  2.9  -1.7    False
six    2003  Nevada  3.2   NaN    False
Index(['year', 'state', 'pop', 'debt'], dtype='object')
      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6
        2000  2001  2002
Nevada   NaN   2.4   2.9
Ohio     1.5   1.7   3.6
      Ohio  Nevada
2000   1.5     NaN
2001   1.7     2.4
state  Nevada  Ohio
year
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6
[[nan 1.5]
 [2.4 1.7]
 [2.9 3.6]]
[[2000 'Ohio' 1.5 nan]
 [2001 'Ohio' 1.7 -1.2]
 [2002 'Ohio' 3.6 nan]
 [2001 'Nevada' 2.4 -1.5]
 [2002 'Nevada' 2.9 -1.7]
 [2003 'Nevada' 3.2 nan]]

5.1.3索引对象

代码

# Nested-dict frame; the rest of this snippet does not reference it.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(pop)

# The labels passed when building a Series are stored as an Index object.
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
print(index)
print(index[1:])

# An Index can also be built directly and then handed to a Series.
labels = pd.Index(np.arange(3))
print(labels)

obj2 = pd.Series([1.5, -2.5, 0], index=labels)
print(obj2)
# Identity check: is the Series' index the very same object as ``labels``?
print(obj2.index is labels)



结果

Index(['a', 'b', 'c'], dtype='object')
Index(['b', 'c'], dtype='object')
Int64Index([0, 1, 2], dtype='int64')
0    1.5
1   -2.5
2    0.0
dtype: float64
True

表5-2 一些索引对象的方法和属性 p135

方法 描述
append 将额外的索引对象粘贴到原索引后,产生一个新的索引
difference 计算两个索引的差集
intersection 计算两个索引的交集
union 计算两个索引的并集
isin 计算表示每一个值是否在传入容器中的布尔数组
delete 将位置i的元素删除,并产生新的索引
drop 根据传参删除指定索引值,并产生新的索引
insert 在位置i插入元素,并产生新的索引
is_monotonic 如果索引序列递增则返回True
is_unique 如果索引序列唯一则返回True
unique 计算索引的唯一值序列

5.2基本功能

5.2.1重建索引

代码

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)

# reindex returns a new object arranged according to the new index; labels
# that were not present before get NaN.
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2)

# For ordered data such as time series, method='ffill' forward-fills the
# gaps introduced by the new index.
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
print(obj3.reindex(range(6), method='ffill'))

# On a DataFrame, reindex can change the row index, the columns, or both.
frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'b', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
print(frame)

# Passing only a sequence reindexes the rows; the new row 'c' is all NaN.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)

states = ['Texas', 'Utah', 'California']
# NOTE: reindex does not modify ``frame`` and the result is discarded here,
# so this line prints nothing when run as a script.
frame.reindex(columns=states)

# Reindex rows and columns in one call.  The original used
# frame.loc[['a', 'b', 'c', 'd'], states], but 'c' and 'Utah' do not exist
# in ``frame``: .loc with missing labels was deprecated and now raises
# KeyError.  reindex is the supported way to introduce new labels and
# produces the same table, with NaN for the missing row and column.
reindexed = frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)
print(reindexed)

#表5-3reindex方法的参数
结果
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
0      blue
2    purple
4    yellow
dtype: object
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
   Ohio  Texas  California
a     0      1           2
b     3      4           5
d     6      7           8
   Ohio  Texas  California
a   0.0    1.0         2.0
b   3.0    4.0         5.0
c   NaN    NaN         NaN
d   6.0    7.0         8.0
G:\数据分析\0531.py:26: FutureWarning:
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  print(frame.loc[['a','b','c','d'],states])
   Texas  Utah  California
a    1.0   NaN         2.0
b    4.0   NaN         5.0
c    NaN   NaN         NaN
d    7.0   NaN         8.0
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值