利用Python进行数据分析的学习笔记——chap5

pandas的数据结构介绍

from pandas import Series,DataFrame
import pandas as pd
import numpy as np

Series
(索引在左边,值在右边。可看作是一个定长的有序字典)

obj = Series([4,7,-5,3])
obj
0    4
1    7
2   -5
3    3
dtype: int64
#通过Series的values和index属性获取其数组表示形式和索引对象。
obj.values
obj.index
RangeIndex(start=0, stop=4, step=1)
obj2 = Series([4,7,-5,3],index=['d','b','a','c'])
obj2.index
Index(['d', 'b', 'a', 'c'], dtype='object')
#一些基本操作
obj2[obj2 > 0]
obj2 * 2
np.exp(obj2)
'b' in obj2
'e' in obj2
False
sdata = {'Ohio':35000,'Texas':71000,'Oregon':16000,'Utah':5000}
#用字典来创建Series
obj3 = Series(sdata)
states = ['California','Ohio','Oregon','Texas']
obj4 = Series(sdata,index=states)
obj4
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
#检测缺失值
pd.isnull(obj4)
pd.notnull(obj4)
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
#Series在算术运算中会自动对齐不同索引的数据
obj3 + obj4
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
#Series的name属性
obj4.name = 'population'
obj4.index.name = 'state'
obj4
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64
#Series的索引可以通过赋值的方式就地修改
obj.index = ['Bob','Steve','Jeff','Ryan']
obj
Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

DataFrame

#构建DataFrame
data = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
       'year':[2000,2001,2002,2001,2002],
       'pop':[1.5,1.7,3.6,2.4,2.9]}
#DataFrame会自动加上整数索引;在较新的pandas中,列按字典键的插入顺序排列(见下方输出:state、year、pop)
frame = DataFrame(data)
frame
    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
#指定列序列,按照该顺序进行排列
DataFrame(data,columns=['year','state','pop'])
yearstatepop
02000Ohio1.5
12001Ohio1.7
22002Ohio3.6
32001Nevada2.4
42002Nevada2.9
#如果传入的列在数据中找不到,就会产生NA值
frame2 = DataFrame(data,columns=['year','state','pop','debt'],index=['one','two','three','four','five'])
frame2
       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
#将DataFrame的列获取为一个Series
frame2['state']
frame2.year
#通过位置或名称的方式获取行
frame2.loc['three']
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object
#列可以通过赋值的方式进行修改。标量或一组值
frame2['debt'] = 16.5
frame2['debt'] = np.arange(5.)
frame2
yearstatepopdebt
one2000Ohio1.50.0
two2001Ohio1.71.0
three2002Ohio3.62.0
four2001Nevada2.43.0
five2002Nevada2.94.0
#如果赋值的是一个Series,会精确匹配DataFrame的索引,所有的空位都将被填上缺失值
val = Series([-1.2,-1.5,-1.7],index=['two','four','five'])
frame2['debt'] = val
frame2
yearstatepopdebt
one2000Ohio1.5NaN
two2001Ohio1.7-1.2
three2002Ohio3.6NaN
four2001Nevada2.4-1.5
five2002Nevada2.9-1.7
#为不存在的列赋值会创建出一个新列
frame2['eastern'] = frame2.state == 'Ohio'
frame2
yearstatepopdebteastern
one2000Ohio1.5NaNTrue
two2001Ohio1.7-1.2True
three2002Ohio3.6NaNTrue
four2001Nevada2.4-1.5False
five2002Nevada2.9-1.7False
#删除列
del frame2['eastern']
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
#嵌套字典
pop = {'Nevada':{2001:2.4,2002:2.9},'Ohio':{2000:1.5,2001:1.7,2002:3.6}}
#创建DataFrame。外层字典的键作为列,内层键作为行索引
frame3 = DataFrame(pop)
#转置
frame3.T
        2001  2002  2000
Nevada   2.4   2.9   NaN
Ohio     1.7   3.6   1.5

#有个知识点
在这里插入图片描述

#设置DataFrame的index和columns的name属性
frame3.index.name = 'year';frame3.columns.name = 'state'
frame3.values
#各列数据类型不同,则值数组的数据类型会选用能兼容所有列的数据类型
frame2.values
array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

索引对象

Index对象是不可修改的

obj =Series(range(3),index=['a','b','c'])
index = obj.index
index[1:]
Index(['b', 'c'], dtype='object')
index = pd.Index(np.arange(3))
obj2 = Series([1.5,-2.5,0],index=index)
obj2.index is index
True

#又有个知识点
在这里插入图片描述

'Ohio' in frame3.columns
2003 in frame3.index
False

#有个知识点
在这里插入图片描述

重新索引

obj = Series([4.5,7.2,-5.3,3.6],index=['d','b','a','c'])
obj2 = obj.reindex(['a','b','c','d','e'])
obj2
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
obj.reindex(['a','b','c','d','e'],fill_value=0)
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64
obj3 = Series(['blue','purple','yellow'],index=[0,2,4])
#ffill实现前向值填充
obj3.reindex(range(6),method='ffill')
#ffill或pad   前向填充(或搬运)值
#bfill或backfill  后向填充(或搬运)值
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
#reindex对DataFrame的修改
frame = DataFrame(np.arange(9).reshape((3,3)),index=['a','c','d'],columns=['Ohio','Texas','California'])
frame2 = frame.reindex(['a','b','c','d'])
frame2
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0
#使用columns关键字可重新索引列
states = ['Texas','Utah','California']
frame.reindex(columns=states)
TexasUtahCalifornia
a1NaN2
c4NaN5
d7NaN8
#同时对行和列进行重新索引,而插值只能按行应用(即轴0)
#frame.reindex(index=['a','b','c','d'],method='ffill',columns=states)会报错
frame.reindex(index=['a','b','c','d'],columns=states).ffill()
TexasUtahCalifornia
a1.0NaN2.0
b1.0NaN2.0
c4.0NaN5.0
d7.0NaN8.0
#利用ix(被淘汰了)换成loc,继续重新索引
#frame.loc[['a','b','c','d'],states]
#KeyError: "['b'] not in index"
frame = frame.reindex(['a','b','c','d'])
frame = frame.reindex(columns=states)
frame.loc[['a','b','c','d'],states]
#注意:reindex不会就地修改原来的frame,而是返回一个新对象,所以上面需要把结果重新赋值给frame。
#reindex的copy参数默认为True(即使新旧索引相同也返回新对象);设为False时,若新旧索引相同则不复制。
TexasUtahCalifornia
a1.0NaN2.0
bNaNNaNNaN
c4.0NaN5.0
d7.0NaN8.0

#又又又有个知识点
在这里插入图片描述

丢弃指定轴上的项

obj = Series(np.arange(5.),index=['a','b','c','d','e'])
#删除指定值的新对象
new_obj = obj.drop('c')
new_obj
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
#对于DataFrame
data =DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data.drop(['Colorado','Ohio'])
          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
#删除指定列
data.drop('two',axis=1)
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15

索引、选取和过滤

obj = Series(np.arange(4.),index=['a','b','c','d'])
obj['b']
#等价于
obj[1]
1.0
obj[obj<2]
a    0.0
b    1.0
dtype: float64
#利用标签的切片运算与普通的python切片运算不同,其末端是包含的
obj['b':'c']
b    1.0
c    2.0
dtype: float64
data = DataFrame(np.arange(16).reshape((4,4)),index=['Ohio','Colorado','Utah','New York'],columns=['one','two','three','four'])
data[['three','one']]
threeone
Ohio20
Colorado64
Utah108
New York1412
data[:2]
onetwothreefour
Ohio0123
Colorado4567
#选取three大于5的所有行所有列
data[data['three']>5]
onetwothreefour
Colorado4567
Utah891011
New York12131415
data[data<5] = 0
data
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
data.loc['Colorado',['two','three']]
two      5
three    6
Name: Colorado, dtype: int32
#选取three中>5的前三列
#data.loc[data.three>5,:3]
#TypeError: cannot do slice indexing on Index with these indexers [3] of type int
data.loc[data.three>5,:'three']
onetwothree
Colorado056
Utah8910
New York121314

#又又又又有个知识点
在这里插入图片描述

算术运算和数据对齐

s1 = Series([7.3,-2.5,3.4,1.5],index=['a','c','d','e'])
s2 = Series([-2.1,3.6,-1.5,4,3.1],index=['a','c','e','f','g'])
s1+s2
a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64
df1 = DataFrame(np.arange(9.).reshape((3,3)),columns=list('bcd'),index=['Ohio','Texas','Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
df1+df2
            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN

在算术方法中填充值

add 加法
sub 减法
div 除法
mul 乘法

df1 = DataFrame(np.arange(12.).reshape((3,4)),columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4,5)),columns=list('abcde'))
#当一个对象中某个轴标签在另一个对象中找不到时填充一个特殊值(比如0)
df1.add(df2,fill_value=0)
#类似地,对重新索引也可以指定一个填充值
df1.reindex(columns=df2.columns,fill_value=0)
     a    b     c     d  e
0  0.0  1.0   2.0   3.0  0
1  4.0  5.0   6.0   7.0  0
2  8.0  9.0  10.0  11.0  0

DataFrame和Series之间的运算

#计算一个二维数组与其某行之间的差
arr = np.arange(12.).reshape((3,4))
#广播
arr-arr[0]
array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])
frame = DataFrame(np.arange(12.).reshape((4,3)),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
series = frame.loc['Utah']#series = frame.ix[0]
frame-series
bde
Utah0.00.00.0
Ohio3.03.03.0
Texas6.06.06.0
Oregon9.09.09.0
#若某个索引值找不到,则参与运算的两个对象就会被重新索引形成并集
series2 = Series(range(3),index=['b','e','f'])
frame+series2
          b   d     e   f
Utah    0.0 NaN   3.0 NaN
Ohio    3.0 NaN   6.0 NaN
Texas   6.0 NaN   9.0 NaN
Oregon  9.0 NaN  12.0 NaN
#匹配行在列上广播
series3 = frame['d']
frame.sub(series3,axis=0)
bde
Utah-1.00.01.0
Ohio-1.00.01.0
Texas-1.00.01.0
Oregon-1.00.01.0

函数应用和映射

frame = DataFrame(np.random.randn(4,3),columns=list('bde'),index=['Utah','Ohio','Texas','Oregon'])
np.abs(frame)
f = lambda x:x.max()-x.min()
frame.apply(f)
frame.apply(f,axis=1)
Utah      2.341364
Ohio      1.889609
Texas     1.479165
Oregon    1.122892
dtype: float64
def f(x):
    """Summarise x by its extremes.

    Returns a Series holding x.min() and x.max(), labelled 'min' and 'max'.
    """
    lowest = x.min()
    highest = x.max()
    return Series([lowest, highest], index=['min', 'max'])
frame.apply(f)
            b         d         e
min -2.121313 -0.231704 -0.309807
max  1.255592  0.670467  1.786023
#得到各个浮点值的格式化字符串
format = lambda x: '%.2f' % x
frame.applymap(format)
frame['e'].map(format)
Utah       1.79
Ohio      -0.31
Texas      1.42
Oregon     0.13
Name: e, dtype: object

排序和排名

obj = Series(range(4),index=['d','a','b','c'])
obj.sort_index()
a    1
b    2
c    3
d    0
dtype: int64
#升序排序
frame = DataFrame(np.arange(8).reshape((2,4)),index=['three','one'],columns=['d','a','b','c'])
frame.sort_index()
dabc
one4567
three0123
frame.sort_index(axis=1)
abcd
three1230
one5674
#降序排序
frame.sort_index(axis=1,ascending=False)
dcba
three0321
one4765
#按值对Series进行排序:旧版pandas用order方法
#Series.order()自pandas 0.17起被弃用、0.20起被移除(与Python版本无关),改用sort_values()方法即可。
obj = Series([4,7,-3,2])
obj.sort_values()
2   -3
3    2
0    4
1    7
dtype: int64
#排序时,任何缺失值默认都会被放到Series的末尾
obj = Series([4,np.nan,7,np.nan,-3,2])
obj.sort_values()
4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64
#在DataFrame上,要根据一个或多个列中的值进行排序
frame = DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})
#frame.sort_index(by='b')
frame.sort_values(by='b')
ba
2-30
321
040
171
frame.sort_values(by=['a','b'])
ba
2-30
040
321
171
obj = Series([7,-5,7,4,2,0,4])
obj.rank()
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
#根据值在原数据中出现的顺序给出排名
obj.rank(method='first')
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
#按降序排名
obj.rank(ascending=False,method='max')
0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

#又又又又又有个知识点
在这里插入图片描述

带有重复值的轴索引

obj = Series(range(5),index=['a','a','b','b','c'])
#判断索引值是否唯一
obj.index.is_unique
False
df = DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df.loc['b']
012
b-0.9949410.3047690.930754
b0.9182180.5773932.664499

汇总和计算描述统计

df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
#按列求和
#默认skipna=True,即NA值会自动被排除
df.sum()
#按行求和
df.sum(axis=1)
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
df.mean(axis=1,skipna=False)
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

#又又又又又又有个知识点
在这里插入图片描述

#返回达到最大值的索引
df.idxmax()
one    b
two    d
dtype: object
#累计型
df.cumsum()
    one  two
a  1.40  NaN
b  8.50 -4.5
c   NaN  NaN
d  9.25 -5.8
#一次性产生多个汇总统计
df.describe()
            one       two
count  3.000000  2.000000
mean   3.083333 -2.900000
std    3.493685  2.262742
min    0.750000 -4.500000
25%    1.075000 -3.700000
50%    1.400000 -2.900000
75%    4.250000 -2.100000
max    7.100000 -1.300000
#对于非数值型数据
obj = Series(['a','a','b','c']*4)
obj.describe()
count     16
unique     3
top        a
freq       8
dtype: object

#又又又又又又又有个知识点
在这里插入图片描述
在这里插入图片描述

相关系数与协方差

#import pandas.io.data as web
import pandas_datareader.data as web
#px=web.DataReader('F-F_Research_Data_factors','famafrench')
# Download daily quotes for each ticker. This needs network access and the
# third-party pandas_datareader package imported above as `web`.
all_data = {}
for ticker in ['AAPL','IBM','MSFT','GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker,'1/1/2000','1/1/2010')
# dict.iteritems() only exists in Python 2; Python 3 dicts expose items().
# One column per ticker, taken from each downloaded frame.
price = DataFrame({tic:data['Adj Close'] for tic,data in all_data.items()})
volume = DataFrame({tic:data['Volume'] for tic,data in all_data.items()})

报错。

price = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_price.pkl')
volume = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_volume.pkl')
price.head()
                 AAPL        GOOG         IBM       MSFT
Date
2010-01-04  27.990226  313.062468  113.304536  25.884104
2010-01-05  28.038618  311.683844  111.935822  25.892466
2010-01-06  27.592626  303.826685  111.208683  25.733566
2010-01-07  27.541619  296.753749  110.823732  25.465944
2010-01-08  27.724725  300.709808  111.935822  25.641571
volume.head()
                 AAPL      GOOG      IBM      MSFT
Date
2010-01-04  123432400   3927000  6155300  38409100
2010-01-05  150476200   6031900  6841400  49749600
2010-01-06  138040000   7987100  5605300  58182400
2010-01-07  119282800  12876600  5840600  50559700
2010-01-08  111902700   9483900  4197200  51197400
#计算价格的百分比变化
returns = price.pct_change()
returns.tail()
                AAPL      GOOG       IBM      MSFT
Date
2016-10-17 -0.000680  0.001837  0.002072 -0.003483
2016-10-18 -0.000681  0.019616 -0.026168  0.007690
2016-10-19 -0.002979  0.007846  0.003583 -0.002255
2016-10-20 -0.000512 -0.005652  0.001719 -0.004867
2016-10-21 -0.003930  0.003011 -0.012474  0.042096
#Series的corr方法用于计算两个Series中重叠的、非NA的、按索引对齐的值的相关系数。cov用于计算协方差
returns.MSFT.corr(returns.IBM)
0.49976361144151155
returns.MSFT.cov(returns.IBM)
8.870655479703546e-05
#对于DataFrame数据
returns.corr()
          AAPL      GOOG       IBM      MSFT
AAPL  1.000000  0.407919  0.386817  0.389695
GOOG  0.407919  1.000000  0.405099  0.465919
IBM   0.386817  0.405099  1.000000  0.499764
MSFT  0.389695  0.465919  0.499764  1.000000
returns.cov()
AAPLGOOGIBMMSFT
AAPL0.0002770.0001070.0000780.000095
GOOG0.0001070.0002510.0000780.000108
IBM0.0000780.0000780.0001460.000089
MSFT0.0000950.0001080.0000890.000215
#DataFrame的列或行与Series数据或DataFrame之间的相关系数
returns.corrwith(returns.IBM)
AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64
#传入一个DataFrame则会计算按列名配对的相关系数。若axis=1则是按行。
returns.corrwith(volume)
AAPL   -0.075565
GOOG   -0.007067
IBM    -0.204849
MSFT   -0.092950
dtype: float64

唯一值、值计数以及成员资格

isin 计算一个表示“Series各值是否包含于传入的值序列中”的布尔型数组;
unique 计算Series中的唯一值数组,按发现的顺序返回;
value_counts 返回一个Series,其索引为唯一值,其值为频率,按计数值降序排列

obj = Series(['c','a','d','a','a','b','b','c','c'])
#得到Series中唯一值数组
uniques = obj.unique()
uniques
array(['c', 'a', 'd', 'b'], dtype=object)
#各值出现的频率
obj.value_counts()
c    3
a    3
b    2
d    1
dtype: int64
#value_counts可用于任何数组或序列
pd.value_counts(obj.values,sort=False)
c    3
a    3
d    1
b    2
dtype: int64
#判断矢量化集合的成员资格
mask = obj.isin(['b','c'])
obj[mask]
0    c
5    b
6    b
7    c
8    c
dtype: object
data = DataFrame({'Qu1':[1,3,4,3,4],'Qu2':[2,3,1,2,3],'Qu3':[1,5,2,4,4]})
result = data.apply(pd.value_counts).fillna(0)
result
   Qu1  Qu2  Qu3
1  1.0  1.0  1.0
2  0.0  2.0  1.0
3  2.0  2.0  0.0
4  2.0  0.0  2.0
5  0.0  0.0  1.0

处理缺失数据

pandas对象上的所有描述统计都排除了缺失数据

string_data = Series(['aardvark','artichoke',np.nan,'avocado'])
string_data.isnull()
0    False
1    False
2     True
3    False
dtype: bool
#python内置的None值也会被当作NA处理
string_data[0] = None
string_data.isnull()
0     True
1    False
2     True
3    False
dtype: bool

#又又又又又又又又有个知识点
在这里插入图片描述

滤除缺失值

from numpy import nan as NA
data = Series([1,NA,3.5,NA,7])
data.dropna()
0    1.0
2    3.5
4    7.0
dtype: float64
data[data.notnull()]
0    1.0
2    3.5
4    7.0
dtype: float64
#dropna默认丢弃任何含有缺失值的行
data = DataFrame([[1.,6.5,3.],[1.,NA,NA],[NA,NA,NA],[NA,6.5,3.]])
cleaned = data.dropna()
cleaned
012
01.06.53.0
#只丢弃全为NA的行
data.dropna(how='all')
012
01.06.53.0
11.0NaNNaN
3NaN6.53.0
#丢弃列的操作
data[4] = NA
data.dropna(axis=1,how='all')
012
01.06.53.0
11.0NaNNaN
2NaNNaNNaN
3NaN6.53.0
#只留下一部分观测数据的操作
df = DataFrame(np.random.randn(7,3))
df.loc[:4,1] = NA;df.loc[:2,2] = NA
#thresh:非空元素最低数量。int型,默认为None。如果该行/列中,非空元素数量小于这个值,就删除该行/列。
df.dropna(thresh=3)
012
5-1.991372-1.6445750.675400
61.7184510.312742-1.484959

填充缺失数据

df.fillna(0)
012
0-0.2922300.0000000.000000
10.1298260.0000000.000000
2-0.0753070.0000000.000000
30.2804760.000000-1.259970
4-1.1717380.0000000.206481
5-1.991372-1.6445750.675400
61.7184510.312742-1.484959
#使用字典,实现对不同的列填充不同的值
df.fillna({1:0.5,3:-1})
012
0-0.2922300.500000NaN
10.1298260.500000NaN
2-0.0753070.500000NaN
30.2804760.500000-1.259970
4-1.1717380.5000000.206481
5-1.991372-1.6445750.675400
61.7184510.312742-1.484959
#fillna默认返回新对象,但也可以对现有对象进行就地修改
#注意:在新版pandas中,inplace=True时fillna返回None,df本身被就地修改
_ = df.fillna(0,inplace=True)
df
012
0-0.2922300.0000000.000000
10.1298260.0000000.000000
2-0.0753070.0000000.000000
30.2804760.000000-1.259970
4-1.1717380.0000000.206481
5-1.991372-1.6445750.675400
61.7184510.312742-1.484959

# Build a 6x3 random frame, then blank out the tail of columns 1 and 2
# so there are runs of missing values to fill.
df = DataFrame(np.random.randn(6,3))
df.loc[2:,1] = NA;df.loc[4:,2] = NA
# Forward-fill at most 2 consecutive missing values per column.
# fillna(method='ffill') is deprecated in recent pandas; ffill() is the
# equivalent call and matches the .ffill() usage earlier in these notes.
df.ffill(limit=2)
012
0-0.814015-1.672914-0.437364
10.2942090.038563-0.141332
2-0.3370910.038563-0.041438
30.6984580.038563-0.750640
4-0.369432NaN-0.750640
5-0.437763NaN-0.750640
data = Series([1.,NA,3.5,NA,7])
data.fillna(data.mean())
0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

#又又又又又又又又又有个知识点
在这里插入图片描述

层次化索引

能使一个轴上拥有多个索引级别,能以低维度形式处理高维度数据

data = Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,2,3]])
data
a  1    0.861096
   2    0.613551
   3    1.130427
b  1   -0.210724
   2    0.962846
   3    0.393051
c  1   -0.774183
   2    0.456655
d  2   -0.824490
   3    0.908530
dtype: float64
data.index
MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 2),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )
#选取子集
data['b']
1   -0.210724
2    0.962846
3    0.393051
dtype: float64
data.loc[['b','d']]#不是data.loc['b','d']
b  1   -0.210724
   2    0.962846
   3    0.393051
d  2   -0.824490
   3    0.908530
dtype: float64
#在内层中进行选取
data[:,2]
a    0.613551
b    0.962846
c    0.456655
d   -0.824490
dtype: float64
#通过unstack将Series数据变为DataFrame数据
data.unstack()
          1         2         3
a  0.861096  0.613551  1.130427
b -0.210724  0.962846  0.393051
c -0.774183  0.456655       NaN
d       NaN -0.824490  0.908530
data.unstack().stack()
a  1    0.861096
   2    0.613551
   3    1.130427
b  1   -0.210724
   2    0.962846
   3    0.393051
c  1   -0.774183
   2    0.456655
d  2   -0.824490
   3    0.908530
dtype: float64
#对于DataFrame数据,每条轴都可以有分层索引。索引名称和轴标签不一样。
frame = DataFrame(np.arange(12).reshape((4,3)),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frame.index.names = ['key1','key2']
frame.columns.names = ['state','color']
frame
state      Ohio     Colorado
color     Green Red    Green
key1 key2
a    1        0   1        2
     2        3   4        5
b    1        6   7        8
     2        9  10       11
frame['Ohio']
colorGreenRed
key1key2
a101
234
b167
2910
#另一种创建方法
# The imports at the top of these notes bring in only Series and DataFrame by
# name, so MultiIndex has to be reached through the pd namespace.
pd.MultiIndex.from_arrays([['Ohio','Ohio','Colorado'],['Green','Red','Green']],names=['state','color'])

大概这个意思

重排分级顺序

#swaplevel接受两个级别编号或名称,并返回一个互换了级别的新对象
frame.swaplevel('key1','key2')
stateOhioColorado
colorGreenRedGreen
key2key1
1a012
2a345
1b678
2b91011
#根据单个级别中的值对数据进行排序
#frame.sortlevel(1)
# 'DataFrame' object has no attribute 'sortlevel'无该函数了
frame.sort_values(axis=0,by='key2')
stateOhioColorado
colorGreenRedGreen
key1key2
a1012
b1678
a2345
b291011
frame.swaplevel(0,1).sort_index()
stateOhioColorado
colorGreenRedGreen
key2key1
1a012
b678
2a345
b91011

在层次化索引的对象上,如果索引是按字典方式从外到内排序(即调用sort_index),数据选取操作的性能要好很多。

根据级别汇总统计

#frame.sum(level='key2')这个未来会弃用
frame.groupby(level='key2').sum()
state  Ohio     Colorado
color Green Red    Green
key2
1         6   8       10
2        12  14       16
frame.groupby(level='color',axis=1).sum()
color      Green  Red
key1 key2
a    1         2    1
     2         8    4
b    1        14    7
     2        20   10

使用DataFrame的列

frame = DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':[0,1,2,0,1,2,3]})
#set_index可将一个或多个列转换为行索引,并创建一个新的DataFrame
frame2 = frame.set_index(['c','d'])
frame2
       a  b
c   d
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
# 也可以将那些列保留下来
frame.set_index(['c','d'],drop=False)
#将层次化索引的级别转移到列里面
frame2.reset_index()
cdab
0one007
1one116
2one225
3two034
4two143
5two252
6two361

其他有关pandas的话题

整数索引

#整数索引
ser = Series(np.arange(3.))
#ser[-1]会报错
#非整数索引
ser2 = Series(np.arange(3.),index=['a','b','c'])
ser2[-1]
2.0
#面向轴标签的索引
# df.loc的第一个参数是行标签,第二个参数为列标签(可选参数,默认为所有列标签),
# 两个参数既可以是列表也可以是单个字符,
# 如果两个参数都为列表则返回的是DataFrame,否则,则为Series。
ser.loc[:1]
0    0.0
1    1.0
dtype: float64
#可靠的、不考虑索引类型的、基于位置的索引
ser3 = Series(range(3),index=[-5,1,3])
#ser3.iget_value(2)被取代了
ser3.iat[2]
2
frame = DataFrame(np.arange(6).reshape(3,2),index=[2,0,1])
#frame.irow(0)被舍弃
frame.iloc[0]
#等价
frame.iloc[0,:]
0    0
1    1
Name: 2, dtype: int32

面板数据

可以用一个由DataFrame对象组成的字典或一个三维ndarray来创建Panel对象

price = pd.read_pickle('E:/python_study_files/python/pydata-notebook-master/examples/yahoo_price.pkl')
type(price)
pandas.core.frame.DataFrame
price.loc['6/1/2012']
AAPL     73.371509
GOOG    285.205295
IBM     168.989059
MSFT     25.262972
Name: 2012-06-01 00:00:00, dtype: float64
#用堆积式的DataFrame方法呈现面板数据
stacked=price.loc['6/1/2012'].to_frame()
stacked
2012-06-01
AAPL73.371509
GOOG285.205295
IBM168.989059
MSFT25.262972
pd.Panel(stacked)
#Panel被移除了。
AttributeError: module 'pandas' has no attribute 'Panel'
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值