1. 作用
pandas 可以处理关系型或带标签的数据,方便对数据进行处理、抽取和操作。
2. 数据结构
一维:Series
二维(表格型):DataFrame
索引:Index
3. Series使用
3.1 数据定义
# 1.1 Build a Series from a plain list with explicit string labels.
labels = list('abcd')
s1 = pd.Series(data=[1, 2, 3, 4], index=labels)
print(s1)

# 1.2 Build a Series from a NumPy array; no index is given, so the default one is used.
arr1 = np.arange(5)
s2 = pd.Series(data=arr1)
print(s2)

# 1.3 Build a Series from a dict; the keys become the index labels.
dict1 = dict(red=200, blue=1000, yellow=500)
s3 = pd.Series(data=dict1)
print(s3)
'''
a 1
b 2
c 3
d 4
dtype: int64
0 0
1 1
2 2
3 3
4 4
dtype: int32
blue 1000
red 200
yellow 500
dtype: int64
'''
3.2 索引方式(筛选)
# Slice s1 with 0:2 to take its first two entries.
head_two = s1[0:2]
print(head_two)
# Look up a single value by its label.
print(s3['blue'])
# Boolean mask: keep only the entries whose value is greater than 8.
greater_than_eight = s3 > 8
print(s3[greater_than_eight])
'''
a 1
b 2
dtype: int64
1000
blue 1000
red 200
yellow 500
dtype: int64
'''
3.3 赋值(修改)
# Overwrite the value stored under the label 'blue', then show the whole Series.
s3.loc['blue'] = 1
print(s3)
'''
blue 1
red 200
yellow 500
dtype: int64
'''
3.4 常用函数
# Commonly used Series helpers, demonstrated on a Series with repeated values.
s4 = pd.Series(data=[1, 2, 3, 1, 4, 2])
# Printed in order: the distinct values, the count of each value,
# a mask of entries equal to 0 or 3, and a mask of missing entries.
for summary in (
    s4.unique(),
    s4.value_counts(),
    s4.isin([0, 3]),
    s4.isnull(),
):
    print(summary)
'''
[1 2 3 4]
2 2
1 2
4 1
3 1
dtype: int64
0 False
1 False
2 True
3 False
4 False
5 False
dtype: bool
0 False
1 False
2 False
3 False
4 False
5 False
dtype: bool
'''
4. DataFrame使用
4.1 DataFrame定义
# 1.1 Build a DataFrame from a dict of equal-length lists; each key becomes a column.
data1 = pd.DataFrame(
    {
        'color': ['blue', 'red'],
        'object': ['ball', 'pen'],
        'price': [1, 2],
    }
)
print(data1)

# 1.2 Build a DataFrame from a dict of dicts: outer keys become the columns,
# inner keys become the row labels, and missing combinations are filled with NaN.
yearly_counts = {
    'red': {2012: 22, 2013: 33},
    'white': {2011: 13, 2012: 22},
}
data2 = pd.DataFrame(yearly_counts)
print(data2)
'''
color object price
0 blue ball 1
1 red pen 2
red white
2011 NaN 13.0
2012 22.0 22.0
2013 33.0 NaN
'''
4.2 数据索引
print(data1['color'])  # select a single column by its name
# DataFrame.ix was removed in pandas 1.0 and raises AttributeError on current versions.
# .iloc[1:, 1:] selects the same cells for this frame: rows and columns from position 1 onward.
print(data1.iloc[1:, 1:])
print("--------")
print(data1.loc[:, 'color'])  # loc: select by row/column labels
print('---------')
print(data1.iloc[:, 1])  # iloc: select by integer row/column positions
'''
0 blue
1 red
Name: color, dtype: object
object price
1 pen 2
--------
0 blue
1 red
Name: color, dtype: object
---------
0 ball
1 pen
Name: object, dtype: object
'''
5. Index使用
# coding=utf-8
import pandas as pd
ser = pd.Series([1,5,3,4],index=['a','e','c','d'])
print(ser)
print(ser.index)  # the index labels
print(ser.idxmin(), ser.idxmax())  # labels of the smallest and the largest value
# reindex() returns a new Series and leaves `ser` untouched, so the result must be kept.
# Labels that are absent from the original index ('1'..'4' here) get NaN values.
ser_reindexed = ser.reindex(['1','2','3','4'])
print(ser_reindexed.index)  # the new index
# sort_values has to be called; printing the bare attribute only shows the bound method.
print(ser.sort_values())
print(ser.drop(['a']))  # a new Series without the label 'a'
'''
a 1
e 5
c 3
d 4
dtype: int64
Index(['a', 'e', 'c', 'd'], dtype='object')
a e
Index(['1', '2', '3', '4'], dtype='object')
a 1
c 3
d 4
e 5
dtype: int64
e 5
c 3
d 4
dtype: int64
'''
6. 函数映射
import pandas as pd
# Map a function over a DataFrame with apply().
def f(x):
    """Return the spread (max minus min) of the values in *x*."""
    return x.max() - x.min()


data3 = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]])
# axis=1 hands each row to f; axis=0 (the default) hands each column to f.
data4 = data3.apply(f, axis=1)
print(data4)
data5 = data3.apply(f, axis=0)
print(data5)
'''
0 3
1 3
dtype: int64
0 4
1 4
2 4
3 4
dtype: int64
'''
7. 排序
import pandas as pd
import numpy as np
# 0. Sample data: one column 'A' holding 0..4, with row labels that are not in sorted order.
arr1 = np.arange(5)
row_labels = ['a', 'c', 'b', 'd', 'e']
data1 = pd.DataFrame(data=arr1, index=row_labels, columns=['A'])
print(data1)

# 1. Sort by row label; ascending=False gives descending order.
data1_index = data1.sort_index(ascending=False)
print(data1_index)

# 2. Sort by the values in column 'A'.
data1_values = data1.sort_values('A')
print(data1_values)

# 3. Rank the values within each column.
data1_ranking = data1.rank()
print(data1_ranking)
'''
A
a 0
c 1
b 2
d 3
e 4
A
e 4
d 3
c 1
b 2
a 0
A
a 0
c 1
b 2
d 3
e 4
A
a 1.0
c 2.0
b 3.0
d 4.0
e 5.0
'''
8. 相关性和协方差
#相关性和协方差
import pandas as pd
import numpy as np
# Two series sharing the year labels 2003-2005 (their last labels differ), plus a shorter one.
values = [1, 2, 3, 4]
seq1 = pd.Series(values, index=[2003, 2004, 2005, 2006])
seq2 = pd.Series(values, index=[2003, 2004, 2005, 2007])
seq3 = pd.Series([1, 2], index=[2003, 2004])
# 2x2 frame holding [[1, 2], [3, 4]].
arr1 = np.arange(1, 5).reshape(2, 2)
data1 = pd.DataFrame(arr1)

correlation = seq1.corr(seq2)  # correlation between the two series
covariance = seq1.cov(seq2)  # covariance between the two series
print(correlation)
print(covariance)
print(data1.corr())  # pairwise correlation of the columns
print(data1.cov())  # pairwise covariance of the columns
print(data1.corrwith(seq3))  # correlation of each column with seq3
'''
1.0
1.0
0 1
0 1.0 1.0
1 1.0 1.0
0 1
0 2.0 2.0
1 2.0 2.0
0 NaN
1 NaN
dtype: float64
'''
9. NaN(缺失值)数据处理
# coding=utf-8
import pandas as pd
import numpy as np
# Working with NaN (missing) values.
row = [6, np.nan, 10]
data1 = pd.DataFrame([row], columns=['ball', 'mug', 'pen'])
print(pd.DataFrame([row]).shape)  # (1, 3): a nested list is one row with three columns
print(pd.DataFrame(row).shape)  # (3, 1): a flat list is one column with three rows

# 1.1 Overwriting NaN by plain assignment (e.g. data1['mug'] = 0) is left disabled here.

# 1.2 Filter out NaN: notnull() on a Series gives a boolean mask that can index the Series.
ser1 = pd.Series(row, index=['ball', 'mug', 'pen'])
present = ser1.notnull()
print(present)
data2_tep = ser1[present]
print(data2_tep)
print('-------')
# With axis=1, how='any' drops every column that contains at least one NaN;
# how='all' would drop a column only when all of its values are NaN.
print(data1.dropna(how='any', axis=1))

# 1.3 Replace every NaN with 0.
data2 = data1.fillna(0)
print(data2)
'''
(1, 3)
(3, 1)
ball True
mug False
pen True
dtype: bool
ball 6.0
pen 10.0
dtype: float64
-------
ball pen
0 6 10
ball mug pen
0 6 0.0 10
'''
10. 等级索引
在单个轴上创建多级(层次化)索引,从而以二维的形式表示和操作更高维度的数据。
# 4. Hierarchical (multi-level) index
print('-------------')
outer_labels = ['white'] * 3 + ['blue'] * 2 + ['red'] * 3
inner_labels = ['up', 'down', 'right', 'up', 'down', 'up', 'down', 'left']
# Passing a list of two label lists builds a two-level index on a single axis.
ser1 = pd.Series(np.random.rand(8), index=[outer_labels, inner_labels])
# Selecting an outer label returns the entries under it, keyed by their inner labels.
print(ser1['white'])
'''
-------------
up 0.821285
down 0.969028
right 0.376834
dtype: float64
'''