Series(一维容器)
Series特点
- 一种key-value型数据类型
- 显式index(用户定义) 和 隐式index(系统分配,类似于列表的下标)
pandas下载
pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple
如果出错可能是numpy版本过低
可以更新一下numpy
pip install --upgrade numpy -i https://pypi.tuna.tsinghua.edu.cn/simple
Series定义:pd.Series(data = ,index = )
data与index等长。
import pandas as pd
mySeries = pd.Series(data=[11, 12, 13, 14, 15, 16], index=['a', 'b', 'c', 'd', 'e', 'f'])
print(mySeries)
#a 11
#b 12
#c 13
#d 14
#e 15
#f 16
#dtype: int64
当value只有一个(标量)的时候,pandas会自动把这个值广播(补齐)到每一个index上
import pandas as pd
mySeries = pd.Series(11, index=['a', 'b', 'c', 'd', 'e', 'f'])
print(mySeries)
'''
a 11
b 11
c 11
d 11
e 11
f 11
dtype: int64
'''
series的操作方法
import pandas as pd
mySeries = pd.Series([i for i in range(1, 7)], index=['a', 'b', 'c', 'd', 'e', 'f'])
print(mySeries)
'''
a 1
b 2
c 3
d 4
e 5
f 6
dtype: int64
'''
查看显式index部分
print(mySeries.index)
'''
Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')
'''
查看values部分
print(mySeries.values)
'''
[1 2 3 4 5 6]
'''
对显式index进行索引和切片操作
import pandas as pd
mySeries = pd.Series([i for i in range(1, 7)], index=['a', 'b', 'c', 'd', 'e', 'f'])
print(mySeries[['b', 'c']])
'''
b 2
c 3
dtype: int64
'''
print(mySeries['b':'d']) # 有start和end,没有step
'''
b 2
c 3
d 4
dtype: int64
'''
对隐式index进行索引操作
import pandas as pd
mySeries = pd.Series([i for i in range(1, 7)], index=['a', 'b', 'c', 'd', 'e', 'f'])
print(mySeries[1:4:2]) # start,end,step
'''
b 2
d 4
dtype: int64
'''
支持显式index的in操作
import pandas as pd
mySeries = pd.Series([i for i in range(1, 7)], index=['a', 'b', 'c', 'd', 'e', 'f'])
print('a' in mySeries)
# True
print('g' in mySeries)
# False
这里的reindex是按照给定的index重新排列/选取原有数据,而不是给数据重新命名一个全新的index;如果给定的标签在原来的index中没有,系统会自动为该标签填充NaN(结果的dtype也会因此变为float64)
import pandas as pd
mySeries = pd.Series([i for i in range(1, 7)], index=['a', 'b', 'c', 'd', 'e', 'f'])
mySeries1 = mySeries.reindex(['a', 'b', 'j', 'f', 'l', 'm'])
print(mySeries)
'''
a 1
b 2
c 3
d 4
e 5
f 6
dtype: int64
'''
print(mySeries1)
'''
a 1.0
b 2.0
j NaN
f 6.0
l NaN
m NaN
dtype: float64
'''
mySeries1 = mySeries.reindex(['h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q'])
print(mySeries1)
'''
h NaN
i NaN
j NaN
k NaN
l NaN
m NaN
o NaN
p NaN
q NaN
dtype: float64
'''
DataFrame(二维容器)
定义
- 直接定义(很少用)
- 导入定义(很常用)
直接定义
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.arange(10).reshape(2,5))
print(df1)
'''
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
'''
导入定义
当pandas包导入一个外部文件时,自动转化为DataFrame对象
df2 = pd.read_csv('src') #src为csv格式的文件路径
import pandas as pd
import numpy as np
df2 = pd.read_csv('src') # src为csv格式的文件路径
df2 = df2[['id', 'diagnosis']] # 投影列index为'id'、'diagnosis'的内容
head()、tail()用于显示数据框的前几条和后几条
查看行和列
# Show the first few rows.
print(df1.head())
# Show the last few rows.
print(df1.tail())
# Row labels, i.e. the explicit row index.
# NOTE: the expected outputs below assume df1 is the 2x5 frame
# pd.DataFrame(np.arange(10).reshape(2,5)) defined earlier in these notes.
print(df1.index) # RangeIndex(start=0, stop=2, step=1)
# Number of rows.
print(df1.index.size) # 2
print(df1.shape[0]) # 2
# Column labels.
print(df1.columns) # RangeIndex(start=0, stop=5, step=1)
# Number of columns.
print(df1.columns.size) # 5
print(df1.shape[1]) # 5
# Row count and column count together.
print(df1.shape) # (2, 5)
引用行或列
按照列名读取
import numpy as np
import pandas as pd
dt = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
one two
a 1.0 9
b 2.0 8
c 3.0 7
d NaN 6
'''
print(pd.DataFrame(dt,index=['b','c','d'],columns=['two','three']))
'''
two three
b 8 NaN
c 7 NaN
d 6 NaN
'''
'方法一,列名出现在下标中'
print(d['two'])
'方法二,列名当做一个属性来用'
print(d.two)
'''
a 9
b 8
c 7
d 6
Name: two, dtype: int64
'''
'方法三,列名和行号一起'  # select the column first, then the row by position
# Use .iloc for positional access: d['two'] is indexed by the labels
# 'a'..'d', and plain [2] on a label-indexed Series relies on a deprecated
# integer-position fallback.
print(d['two'].iloc[2]) # 7
'方法四,属性名和行号'
print(d.two.iloc[2]) # 7
'方法五,切片'
print(d.two.iloc[1:3]) # positional slice: rows 'b' and 'c' (end position excluded)
'''
b 8
c 7
Name: two, dtype: int64
'''
按照index读取
import pandas as pd
dt = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d.loc['b', 'one']) # 2.0
print(d.loc['b'])
'''
one 2.0
two 8.0
Name: b, dtype: float64
'''
删除或过滤行/列
删除行
drop()删除显式索引,不改变对象本身
import pandas as pd
dt = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
one two
a 1.0 9
b 2.0 8
c 3.0 7
d NaN 6
'''
print(d.drop(['a']))#删除index为'a'的数据,可为列表或值
'''
one two
b 2.0 8
c 3.0 7
d NaN 6
'''
axis = 0含义
- 计算前后的列数不变
- 以列为单位计算
- 逐列计算
inplace
import pandas as pd
df2 = pd.read_csv("bc_data.csv")
df2 = df2[['id','diagnosis','area_mean']]
df2.drop([3,4],axis=0,inplace=True)
删除列
del
import pandas as pd
dt = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
one two
a 1.0 9
b 2.0 8
c 3.0 7
d NaN 6
'''
# Remove the column labelled 'one' from d in place.
del d['one']
print(d)# the column named 'one' has been deleted ('a' is a row label, not a column)
'''
two
a 9
b 8
c 7
d 6
'''
import pandas as pd
df2 = pd.read_csv("bc_data.csv")
df2 = df2[['id','diagnosis','area_mean']]
df2.drop(['id','diagnosis'],axis=1,inplace=True)
按条件过滤
import pandas as pd
df2 = pd.read_csv("bc_data.csv")
df2 = df2[['id','diagnosis','area_mean']]
df2[df2.area_mean>100]
df2[df2.area_mean>100][['id','diagnosis']]#过滤加切片
算术运算
规则
- 先补齐显式index(新增索引对应值为NaN),变成相同结构再运算
import pandas as pd
import numpy as np
df4 = pd.DataFrame(np.arange(6).reshape(2,3))
print(df4)
'''
0 1 2
0 0 1 2
1 3 4 5
'''
df5 = pd.DataFrame(np.arange(10).reshape(2,5))
print(df5)
'''
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
'''
print(df4+df5)
'''
0 1 2 3 4
0 0 2 4 NaN NaN
1 8 10 12 NaN NaN
'''
- 为了不产生NAN,可以不使用算术运算符,而改用如add()、sub()、mul()、div()等成员方法
import pandas as pd
import numpy as np
df4 = pd.DataFrame(np.arange(6).reshape(2,3))
print(df4)
'''
0 1 2
0 0 1 2
1 3 4 5
'''
df5 = pd.DataFrame(np.arange(10).reshape(2,5))
print(df5)
'''
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
'''
print(df4.add(df5,fill_value=10))
'''
0 1 2 3 4
0 0 2 4 13.0 14.0
1 8 10 12 18.0 19.0
'''
- 数据框与Series按行广播,先将行改为等长,不补齐,不进行跨行广播
import pandas as pd
import numpy as np
df4 = pd.DataFrame(np.arange(6).reshape(2,3))
print(df4)
'''
0 1 2
0 0 1 2
1 3 4 5
'''
df5 = pd.DataFrame(np.arange(10).reshape(2,5))
print(df5)
'''
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
'''
df6 = df4.add(df5,fill_value=10)
s1 = pd.Series(np.arange(3))
print(df6 - s1)
'''
0 1 2 3 4
0 0.0 1.0 2.0 NaN NaN
1 8.0 9.0 10.0 NaN NaN
'''
等价于
import pandas as pd
import numpy as np
df5 = pd.DataFrame(np.arange(10).reshape(2, 5))
s1 = pd.Series(np.arange(3))
print(df5.sub(s1, axis=1))
'''
0 1 2 3 4
0 0.0 0.0 0.0 NaN NaN
1 5.0 5.0 5.0 NaN NaN
'''
按列计算
import pandas as pd
import numpy as np
df5 = pd.DataFrame(np.arange(10).reshape(2, 5))
s1 = pd.Series(np.arange(3))
print(df5.sub(s1, axis=0))
'''
0 1 2 3 4
0 0.0 1.0 2.0 3.0 4.0
1 4.0 5.0 6.0 7.0 8.0
2 NaN NaN NaN NaN NaN
'''
import pandas as pd
import numpy as np
df7 = pd.DataFrame(np.arange(20).reshape(4,5))
print(df7)
'''
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
'''
print(df7+2)
'''
0 1 2 3 4
0 2 3 4 5 6
1 7 8 9 10 11
2 12 13 14 15 16
3 17 18 19 20 21
'''
一些函数
import pandas as pd
import numpy as np
df7 = pd.DataFrame(np.arange(20).reshape(4,5))
print(df7)
'''
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
'''
print(df7.cumsum())#对每一列求前缀和
'''
0 1 2 3 4
0 0 1 2 3 4
1 5 7 9 11 13
2 15 18 21 24 27
3 30 34 38 42 46
'''
print(df7.rolling(2).sum())#rolling(x).sum()对每一列计算计算相邻x个数的值之和,
#默认按列,即axis = 0
'''
0 1 2 3 4
0 NaN NaN NaN NaN NaN
1 5.0 7.0 9.0 11.0 13.0
2 15.0 17.0 19.0 21.0 23.0
3 25.0 27.0 29.0 31.0 33.0
'''
# Row-wise rolling sum over 2 adjacent values. The axis argument of
# rolling() is deprecated (pandas 2.1+), so transpose, roll down the
# transposed columns, and transpose back; the printed result is the same.
print(df7.T.rolling(2).sum().T)
'''
0 1 2 3 4
0 NaN 1.0 3.0 5.0 7.0
1 NaN 11.0 13.0 15.0 17.0
2 NaN 21.0 23.0 25.0 27.0
3 NaN 31.0 33.0 35.0 37.0
'''
print(df7.cov())#协方差矩阵
'''
0 1 2 3 4
0 41.666667 41.666667 41.666667 41.666667 41.666667
1 41.666667 41.666667 41.666667 41.666667 41.666667
2 41.666667 41.666667 41.666667 41.666667 41.666667
3 41.666667 41.666667 41.666667 41.666667 41.666667
4 41.666667 41.666667 41.666667 41.666667 41.666667
'''
print(df7.corr())#相关系数矩阵
'''
0 1 2 3 4
0 1.0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0 1.0
'''
print(df7.T)#数据框的转置
'''
0 1 2 3
0 0 5 10 15
1 1 6 11 16
2 2 7 12 17
3 3 8 13 18
4 4 9 14 19
'''
大小比较运算
import pandas as pd
import numpy as np
df4 = pd.DataFrame(np.arange(6).reshape(2,3))
df5 = pd.DataFrame(np.arange(10).reshape(2,5))
df6 = df4.add(df5,fill_value=10)
s1 = pd.Series(np.arange(3))
print(df6)
'''
0 1 2 3 4
0 0 2 4 13.0 14.0
1 8 10 12 18.0 19.0
'''
print(df6>5)#逐个比较
'''
0 1 2 3 4
0 False False False True True
1 True True True True True
'''
s1 = pd.Series(np.arange(3))
print(s1)
'''
0 0
1 1
2 2
dtype: int32
'''
print(df6>s1)#只比较每一行对应的数,其余均为False
'''
0 1 2 3 4
0 False True True False False
1 True True True False False
'''
统计信息
describe()是数据分析常用的描述性统计方法
import pandas as pd
import numpy as np
df4 = pd.DataFrame(np.arange(6).reshape(2,3))
df5 = pd.DataFrame(np.arange(10).reshape(2,5))
df6 = df4.add(df5,fill_value=10)
s1 = pd.Series(np.arange(3))
print(df6)
'''
0 1 2 3 4
0 0 2 4 13.0 14.0
1 8 10 12 18.0 19.0
'''
print(df6.describe())
'''
0 1 2 3 4
count 2.000000 2.000000 2.000000 2.000000 2.000000
mean 4.000000 6.000000 8.000000 15.500000 16.500000
std 5.656854 5.656854 5.656854 3.535534 3.535534
min 0.000000 2.000000 4.000000 13.000000 14.000000
25% 2.000000 4.000000 6.000000 14.250000 15.250000
50% 4.000000 6.000000 8.000000 15.500000 16.500000
75% 6.000000 8.000000 10.000000 16.750000 17.750000
max 8.000000 10.000000 12.000000 18.000000 19.000000
'''
import pandas as pd
import numpy as np
dt = {'one': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
one two
a 1 9
b 2 8
c 3 7
d 4 6
'''
print(d[d.one == 1])#筛选
'''
one two
a 1 9
'''
print(d[d.one == 1].count())#统计频数
'''
one 1
two 1
dtype: int64
'''
print(d['one'])#切片
'''
a 1
b 2
c 3
d 4
Name: one, dtype: int64
'''
排序
按值排序
sort_values()
by= 表示按哪个列名进行排序
axis=0表示按列进行排序,可不写
ascending表示升序还是降序:True为升序(默认),False为降序
import pandas as pd
import numpy as np
dt = {'one': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
one two
a 1 9
b 2 8
c 3 7
d 4 6
'''
print(d.sort_values(by='one',axis=0,ascending=False))
'''
one two
d 4 6
c 3 7
b 2 8
a 1 9
'''
按显式index排序
sort_index(),不写axis默认为0
axis=0,按行标签(index)对行进行排序
axis=1,按列名(columns)对列进行排序
import pandas as pd
import numpy as np
dt = {'one': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
one two
a 1 9
b 2 8
c 3 7
d 4 6
'''
print(d.sort_index(axis=0,ascending=False))
'''
one two
d 4 6
c 3 7
b 2 8
a 1 9
'''
print(d.sort_index(axis=1,ascending=False))
'''
two one
a 9 1
b 8 2
c 7 3
d 6 4
'''
print(d.sort_index(ascending=False))
'''
one two
d 4 6
c 3 7
b 2 8
a 1 9
'''
导入/导出
查看当前工作目录
import os
print(os.getcwd())
读入文件
pd.read_***(),例如:
pd.read_csv()
(导出则使用DataFrame对象的成员方法 to_***(),例如 d.to_csv()、d.to_excel(),见下面的“导出文件”)
导出文件
import pandas as pd
dt = {'one': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
# Export to .xlsx: pandas 2.0 removed the xlwt engine, so writing legacy
# ".xls" files is no longer supported. (.xlsx requires the openpyxl package.)
d.to_excel("d1.xlsx")
# Read the file back. index_col=0 restores the saved row labels as the index
# instead of loading them as an extra unnamed data column.
d2 = pd.read_excel("d1.xlsx", index_col=0)
缺失数据处理
判断是否为空数据框
import pandas as pd
dt = {'one': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d.empty) # False
pandas中None和NaN都可以参与计算,None会自动转化为np.nan;NaN参与算术运算的结果仍是NaN,例如 np.nan + 1 的结果为 nan
import pandas as pd
import numpy as np
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
a b
S 10 10
W 20 20
'''
B = pd.DataFrame(np.array([1,1,1,2,2,2,3,3,3]).reshape(3,3),columns=list("abc"),index=list("SWT"))
print(B)
'''
a b c
S 1 1 1
W 2 2 2
T 3 3 3
'''
print(A+B)#先补列索引,并补NaN,再按索引计算
'''
a b c
S 11.0 11.0 NaN
T NaN NaN NaN
W 22.0 22.0 NaN
'''
import pandas as pd
import numpy as np
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
a b
S 10 10
W 20 20
'''
B = pd.DataFrame(np.array([1,1,1,2,2,2,3,3,3]).reshape(3,3),columns=list("abc"),index=list("SWT"))
print(B)
'''
a b c
S 1 1 1
W 2 2 2
T 3 3 3
'''
print(A.add(B,fill_value = 0))#缺省值补为0
'''
a b c
S 11.0 11.0 1.0
T 3.0 3.0 3.0
W 22.0 22.0 2.0
'''
print(A.add(B,fill_value = A.stack().mean()))#缺省值补为A的均值15
'''
a b c
S 11.0 11.0 16.0
T 18.0 18.0 18.0
W 22.0 22.0 17.0
'''
A.mean()为按列计算,要想计算数据框内所有均值要加stack()
stack()为建立多级索引
print(A.mean())
'''
a 15.0
b 15.0
dtype: float64
'''
print(A.stack())
'''
S a 10
b 10
W a 20
b 20
dtype: int32
'''
print(A.stack().mean())#15.0
缺失值处理的重要函数
import pandas as pd
import numpy as np
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
a b
S 10 10
W 20 20
'''
B = pd.DataFrame(np.array([1,1,1,2,2,2,3,3,3]).reshape(3,3),columns=list("abc"),index=list("SWT"))
print(B)
'''
a b c
S 1 1 1
W 2 2 2
T 3 3 3
'''
C = A + B
print(C)
'''
a b c
S 11.0 11.0 NaN
T NaN NaN NaN
W 22.0 22.0 NaN
'''
print(C.isnull())#判断每一个元素是否为空
'''
a b c
S False False True
T True True True
W False False True
'''
print(C.notnull())#判断每一个元素是否为非空
'''
a b c
S True True False
T False False False
W True True False
'''
print(C.dropna(axis='index'))#直接删除缺失值,有缺失值的行被删除
'''
Empty DataFrame
Columns: [a, b, c]
Index: []
'''
print(C.dropna(axis='columns'))#直接删除缺失值,有缺失值的列被删除
'''
Empty DataFrame
Columns: []
Index: [S, T, W]
'''
print(C.fillna(0))#用0来填补缺失值
'''
a b c
S 11.0 11.0 0.0
T 0.0 0.0 0.0
W 22.0 22.0 0.0
'''
# Forward fill: each NaN takes the previous valid value in its column
# (axis=0 is the default). DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
print(C.ffill())
'''
a b c
S 11.0 11.0 NaN
T 11.0 11.0 NaN
W 22.0 22.0 NaN
'''
# Backward fill along each row (axis=1): each NaN takes the next valid value
# to its right. DataFrame.bfill() replaces the deprecated
# fillna(method='bfill') spelling.
print(C.bfill(axis=1))
'''
a b c
S 11.0 11.0 NaN
T NaN NaN NaN
W 22.0 22.0 NaN
'''
分组统计
import pandas as pd
import numpy as np
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
a b
S 10 10
W 20 20
'''
print(A.groupby('a')['b'].mean())#按a分组后,按b进行统计均值
'''
a
10 10.0
20 20.0
Name: b, dtype: float64
'''
同时计算多个函数值,用aggregate以列表形式枚举
import pandas as pd
import numpy as np
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
a b
S 10 10
W 20 20
'''
print(A.groupby('a')['b'].aggregate(['mean','max','min','sum']))
'''
mean max min sum
a
10 10.0 10 10 10
20 20.0 20 20 20
'''
print(A.groupby('a')['b'].aggregate(['mean','max','min','sum']).unstack())
#unstack()将关系表转换为二级索引
'''
a
mean 10 10.0
20 20.0
max 10 10.0
20 20.0
min 10 10.0
20 20.0
sum 10 10.0
20 20.0
dtype: float64
'''
可用apply()把函数换成自定义函数
import pandas as pd
import numpy as np
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
a b
S 10 10
W 20 20
'''
def myfunc(x):
    """Return a copy of group ``x`` with column 'b' scaled to sum to 1.

    Intended for use with ``DataFrame.groupby(...).apply(myfunc)``: each
    group's 'b' values are divided by that group's 'b' total.

    Args:
        x: DataFrame (one group) containing a numeric column 'b'.

    Returns:
        A new DataFrame with the same rows and columns as ``x`` in which
        'b' holds each value's share of the group total.
    """
    # assign() builds a new frame, so the group object handed in by
    # groupby.apply is not modified in place.
    return x.assign(b=x['b'] / x['b'].sum())
print(A.groupby('a').apply(myfunc))