Python学习之pandas

最新推荐文章于 2024-07-03 14:12:08 发布

我为什么可以这么菜

最新推荐文章于 2024-07-03 14:12:08 发布

阅读量252

点赞数

分类专栏： Python学习

本文链接：https://blog.csdn.net/weixin_43866408/article/details/104374802

版权

Python学习专栏收录该内容

18 篇文章 0 订阅

订阅专栏

Series（一维容器）

Series特点

一种key-value型数据类型
显式index(用户定义) 和隐式index(系统分配,类似于列表的下标)

pandas下载

pip install pandas -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com

如果出错可能是numpy版本过低
可以更新一下numpy

pip install --upgrade numpy -i https://pypi.tuna.tsinghua.edu.cn/simple

Series定义：pd.Series(data = ,index = )
data与index等长。

import pandas as pd

mySeries = pd.Series(data=[11, 12, 13, 14, 15, 16], index=['a', 'b', 'c', 'd', 'e', 'f'])
print(mySeries)
#a    11
#b    12
#c    13
#d    14
#e    15
#f    16
#dtype: int64

当data只有一个标量值的时候，pandas会自动将其广播（重复）到与index等长

import pandas as pd

mySeries = pd.Series(11, index=['a', 'b', 'c', 'd', 'e', 'f'])
print(mySeries)

'''

a    11
b    11
c    11
d    11
e    11
f    11
dtype: int64

'''

series的操作方法

import pandas as pd

mySeries = pd.Series([i for i in range(1, 7)], index=['a', 'b', 'c', 'd', 'e', 'f'])
print(mySeries)
'''
a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64
'''

查看显式index部分

print(mySeries.index)
'''
Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')
'''

查看values部分

print(mySeries.values)
'''
[1 2 3 4 5 6]
'''

对显式index进行切片操作（按标签切片时，结果包含end对应的元素）

import pandas as pd

mySeries = pd.Series([i for i in range(1, 7)], index=['a', 'b', 'c', 'd', 'e', 'f'])

print(mySeries[['b', 'c']])
'''
b    2
c    3
dtype: int64
'''
print(mySeries['b':'d'])  # 有start和end，没有step
'''
b    2
c    3
d    4
dtype: int64
'''

对隐式index进行索引操作

import pandas as pd

mySeries = pd.Series([i for i in range(1, 7)], index=['a', 'b', 'c', 'd', 'e', 'f'])

print(mySeries[1:4:2])  # start,end,step
'''
b    2
d    4
dtype: int64
'''

支持显式index的in操作

import pandas as pd

mySeries = pd.Series([i for i in range(1, 7)], index=['a', 'b', 'c', 'd', 'e', 'f'])
print('a' in mySeries)
# True
print('g' in mySeries)
# False

这里的reindex是按给定的标签从原Series中重新取值（可以改变原来index的次序），而不是给原数据重新命名一个全新的index；如果新标签在原来的index中没有，系统会自动为该标签填充NaN值（此时dtype会变为float64）

import pandas as pd

mySeries = pd.Series([i for i in range(1, 7)], index=['a', 'b', 'c', 'd', 'e', 'f'])
mySeries1 = mySeries.reindex(['a', 'b', 'j', 'f', 'l', 'm'])
print(mySeries)
'''
a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64
'''
print(mySeries1)
'''
a    1.0
b    2.0
j    NaN
f    6.0
l    NaN
m    NaN
dtype: float64
'''
mySeries1 = mySeries.reindex(['h', 'i', 'j', 'k', 'l', 'm', 'o', 'p', 'q'])
print(mySeries1)
'''
h   NaN
i   NaN
j   NaN
k   NaN
l   NaN
m   NaN
o   NaN
p   NaN
q   NaN
dtype: float64
'''

DataFrame（二维容器）

定义

直接定义（很少用）
导入定义（很常用）

直接定义

import pandas as pd
import  numpy as np

df1 = pd.DataFrame(np.arange(10).reshape(2,5))
print(df1)
'''
   0  1  2  3  4
0  0  1  2  3  4
1  5  6  7  8  9
'''

在这里插入图片描述
导入定义
当pandas包导入一个外部文件时,自动转化为DataFrame对象

df2 = pd.read_csv('src') #src为csv格式的文件路径

import pandas as pd
import numpy as np

df2 = pd.read_csv('src')  # src为csv格式的文件路径
df2 = df2[['id', 'diagnosis']]  # 投影列index为'id'、'diagnosis'的内容

head()、tail()用于显示数据框的前几条和后几条

查看行和列

# The result comments below are for df1 = pd.DataFrame(np.arange(10).reshape(2, 5)).
# Show the first rows (head() returns 5 rows by default)
print(df1.head())
# Show the last rows (tail() returns 5 rows by default)
print(df1.tail())
# Row labels, i.e. the explicit row index
print(df1.index)  # RangeIndex(start=0, stop=2, step=1)
# Number of rows
print(df1.index.size)  # 2
print(df1.shape[0])  # 2
# Column labels
print(df1.columns)  # RangeIndex(start=0, stop=5, step=1)
# Number of columns
print(df1.columns.size)  # 5
print(df1.shape[1])  # 5
# Row count and column count together
print(df1.shape)  # (2, 5)

引用行或列

按照列名读取

import numpy as np
import pandas as pd

# Build a frame from two Series of different length: the row labels are the
# union of both indexes, and the missing ('d', 'one') cell becomes NaN.
dt = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
      'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
   one  two
a  1.0    9
b  2.0    8
c  3.0    7
d  NaN    6
'''
# Rows and columns can be restricted at construction time; 'three' is not a
# key of dt, so that column is filled with NaN.
print(pd.DataFrame(dt, index=['b', 'c', 'd'], columns=['two', 'three']))
'''
   two three
b    8   NaN
c    7   NaN
d    6   NaN
'''
# Method 1: column name used as a subscript.
print(d['two'])
# Method 2: column name used as an attribute.
print(d.two)
'''
a    9
b    8
c    7
d    6
Name: two, dtype: int64
'''
# Method 3: column name plus row position. `[]` on a DataFrame selects a
# column first; the row is then taken from the resulting Series. The Series
# is labelled 'a'..'d', so an integer must go through .iloc: a bare integer
# key such as d['two'][2] is treated as a label by newer pandas versions.
print(d['two'].iloc[2])  # 7
# Method 4: attribute access plus row position.
print(d.two.iloc[2])  # 7
# Method 5: positional slice (end position excluded); the result is a Series.
print(d.two.iloc[1:3])
'''
b    8
c    7
Name: two, dtype: int64
'''

按照index读取

import pandas as pd

dt = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
      'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d.loc['b', 'one'])  # 2.0
print(d.loc['b'])
'''
one    2.0
two    8.0
Name: b, dtype: float64
'''

删除或过滤行/列
删除行
drop()删除显式索引，不改变对象本身

import pandas as pd

dt = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
      'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
   one  two
a  1.0    9
b  2.0    8
c  3.0    7
d  NaN    6
'''
print(d.drop(['a']))#删除index为'a'的数据，可为列表或值
'''
  one  two
b  2.0    8
c  3.0    7
d  NaN    6
'''

axis = 0含义

计算前后的列数不变
以列为单位计算
逐列计算
inplace

import pandas as pd

df2 = pd.read_csv("bc_data.csv")
df2 = df2[['id','diagnosis','area_mean']]
df2.drop([3,4],axis=0,inplace=True)

删除列
del

import pandas as pd

dt = {'one': pd.Series([1, 2, 3], index=['a', 'b', 'c']),
      'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
   one  two
a  1.0    9
b  2.0    8
c  3.0    7
d  NaN    6
'''
del d['one']
print(d)  # the column named 'one' has been removed in place
'''
   two
a    9
b    8
c    7
d    6
'''

import pandas as pd

df2 = pd.read_csv("bc_data.csv")
df2 = df2[['id','diagnosis','area_mean']]
df2.drop(['id','diagnosis'],axis=1,inplace=True)

按条件过滤

import pandas as pd

df2 = pd.read_csv("bc_data.csv")
df2 = df2[['id','diagnosis','area_mean']]
df2[df2.area_mean>100]
df2[df2.area_mean>100][['id','diagnosis']]#过滤加切片

算术运算

规则

先补齐显式index(新增索引对应值为NaN)，变成相同结构再运算

import pandas as pd
import numpy as np

df4 = pd.DataFrame(np.arange(6).reshape(2,3))
print(df4)
'''
   0  1  2
0  0  1  2
1  3  4  5
'''
df5 = pd.DataFrame(np.arange(10).reshape(2,5))
print(df5)
'''
   0  1  2  3  4
0  0  1  2  3  4
1  5  6  7  8  9
'''
print(df4+df5)
'''
   0   1   2   3   4
0  0   2   4 NaN NaN
1  8  10  12 NaN NaN
'''

为了不产生NAN，可以不使用算术运算符，而改用如add()、sub()、mul()、div()等成员方法

import pandas as pd
import numpy as np

df4 = pd.DataFrame(np.arange(6).reshape(2,3))
print(df4)
'''
   0  1  2
0  0  1  2
1  3  4  5
'''
df5 = pd.DataFrame(np.arange(10).reshape(2,5))
print(df5)
'''
   0  1  2  3  4
0  0  1  2  3  4
1  5  6  7  8  9
'''
print(df4.add(df5,fill_value=10))
'''
   0   1   2     3     4
0  0   2   4  13.0  14.0
1  8  10  12  18.0  19.0
'''

数据框与Series运算时默认按行广播：Series的index先与数据框的列名对齐，然后Series逐行作用于数据框的每一行；对不上的列结果为NaN，不进行跨行广播

import pandas as pd
import numpy as np

df4 = pd.DataFrame(np.arange(6).reshape(2,3))
print(df4)
'''
   0  1  2
0  0  1  2
1  3  4  5
'''
df5 = pd.DataFrame(np.arange(10).reshape(2,5))
print(df5)
'''
   0  1  2  3  4
0  0  1  2  3  4
1  5  6  7  8  9
'''
df6 = df4.add(df5,fill_value=10)
s1 = pd.Series(np.arange(3))
print(df6 - s1)
'''
     0    1     2   3   4
0  0.0  1.0   2.0 NaN NaN
1  8.0  9.0  10.0 NaN NaN
'''

等价于

import pandas as pd
import numpy as np

df5 = pd.DataFrame(np.arange(10).reshape(2, 5))
s1 = pd.Series(np.arange(3))
print(df5.sub(s1, axis=1))
'''
     0    1    2   3   4
0  0.0  0.0  0.0 NaN NaN
1  5.0  5.0  5.0 NaN NaN
'''

按列计算

import pandas as pd
import numpy as np

df5 = pd.DataFrame(np.arange(10).reshape(2, 5))
s1 = pd.Series(np.arange(3))
print(df5.sub(s1, axis=0))
'''
     0    1    2    3    4
0  0.0  1.0  2.0  3.0  4.0
1  4.0  5.0  6.0  7.0  8.0
2  NaN  NaN  NaN  NaN  NaN
'''

import pandas as pd
import numpy as np

df7 = pd.DataFrame(np.arange(20).reshape(4,5))
print(df7)
'''
    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
'''
print(df7+2)
'''
    0   1   2   3   4
0   2   3   4   5   6
1   7   8   9  10  11
2  12  13  14  15  16
3  17  18  19  20  21
'''

一些函数

import pandas as pd
import numpy as np

# 4x5 frame holding 0..19, used to demonstrate common DataFrame functions.
df7 = pd.DataFrame(np.arange(20).reshape(4, 5))
print(df7)
'''
    0   1   2   3   4
0   0   1   2   3   4
1   5   6   7   8   9
2  10  11  12  13  14
3  15  16  17  18  19
'''
print(df7.cumsum())  # running (prefix) sum down each column
'''
    0   1   2   3   4
0   0   1   2   3   4
1   5   7   9  11  13
2  15  18  21  24  27
3  30  34  38  42  46
'''
# rolling(x).sum() adds up each window of x adjacent values; windows run down
# each column. The first x-1 rows have no full window and are NaN.
print(df7.rolling(2).sum())
'''
      0     1     2     3     4
0   NaN   NaN   NaN   NaN   NaN
1   5.0   7.0   9.0  11.0  13.0
2  15.0  17.0  19.0  21.0  23.0
3  25.0  27.0  29.0  31.0  33.0
'''
# Row-wise windows: the `axis` keyword of rolling() is deprecated, so
# transpose, roll down the columns, and transpose back.
print(df7.T.rolling(2).sum().T)
'''
    0     1     2     3     4
0 NaN   1.0   3.0   5.0   7.0
1 NaN  11.0  13.0  15.0  17.0
2 NaN  21.0  23.0  25.0  27.0
3 NaN  31.0  33.0  35.0  37.0
'''
print(df7.cov())  # covariance matrix of the columns
'''
           0          1          2          3          4
0  41.666667  41.666667  41.666667  41.666667  41.666667
1  41.666667  41.666667  41.666667  41.666667  41.666667
2  41.666667  41.666667  41.666667  41.666667  41.666667
3  41.666667  41.666667  41.666667  41.666667  41.666667
4  41.666667  41.666667  41.666667  41.666667  41.666667
'''
print(df7.corr())  # correlation-coefficient matrix of the columns
'''
     0    1    2    3    4
0  1.0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0  1.0
'''
print(df7.T)  # transpose of the frame
'''
   0  1   2   3
0  0  5  10  15
1  1  6  11  16
2  2  7  12  17
3  3  8  13  18
4  4  9  14  19
'''

大小比较运算

import pandas as pd
import numpy as np

df4 = pd.DataFrame(np.arange(6).reshape(2,3))
df5 = pd.DataFrame(np.arange(10).reshape(2,5))
df6 = df4.add(df5,fill_value=10)
s1 = pd.Series(np.arange(3))
print(df6)
'''
   0   1   2     3     4
0  0   2   4  13.0  14.0
1  8  10  12  18.0  19.0
'''
print(df6>5)#逐个比较
'''
       0      1      2     3     4
0  False  False  False  True  True
1   True   True   True  True  True
'''
s1 = pd.Series(np.arange(3))
print(s1)
'''
0    0
1    1
2    2
dtype: int32
'''
print(df6>s1)#只比较每一行对应的数，其余均为False
'''
       0     1     2      3      4
0  False  True  True  False  False
1   True  True  True  False  False
'''

统计信息

describe()是数据分析常用的描述性统计方法
import pandas as pd
import numpy as np

df4 = pd.DataFrame(np.arange(6).reshape(2,3))
df5 = pd.DataFrame(np.arange(10).reshape(2,5))
df6 = df4.add(df5,fill_value=10)
s1 = pd.Series(np.arange(3))
print(df6)
'''
   0   1   2     3     4
0  0   2   4  13.0  14.0
1  8  10  12  18.0  19.0
'''
print(df6.describe())
'''
              0          1          2          3          4
count  2.000000   2.000000   2.000000   2.000000   2.000000
mean   4.000000   6.000000   8.000000  15.500000  16.500000
std    5.656854   5.656854   5.656854   3.535534   3.535534
min    0.000000   2.000000   4.000000  13.000000  14.000000
25%    2.000000   4.000000   6.000000  14.250000  15.250000
50%    4.000000   6.000000   8.000000  15.500000  16.500000
75%    6.000000   8.000000  10.000000  16.750000  17.750000
max    8.000000  10.000000  12.000000  18.000000  19.000000
'''

import pandas as pd
import numpy as np

dt = {'one': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
      'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
   one  two
a    1    9
b    2    8
c    3    7
d    4    6
'''
print(d[d.one == 1])#筛选
'''
   one  two
a    1    9
'''
print(d[d.one == 1].count())#统计频数
'''
one    1
two    1
dtype: int64
'''
print(d['one'])#切片
'''
a    1
b    2
c    3
d    4
Name: one, dtype: int64
'''

排序

按值排序
sort_values()
by= 表示按哪个列名进行排序
axis=0表示按列进行排序，可不写
ascending表示升序还是降序，True为升序（默认），False为降序

import pandas as pd
import numpy as np

dt = {'one': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
      'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
   one  two
a    1    9
b    2    8
c    3    7
d    4    6
'''
print(d.sort_values(by='one',axis=0,ascending=False))
'''
   one  two
d    4    6
c    3    7
b    2    8
a    1    9
'''

按显式index排序
sort_index()，不写axis默认为0
axis=0，按行索引(index)的标签排序
axis=1，按列名(columns)的标签排序

import pandas as pd
import numpy as np

dt = {'one': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
      'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d)
'''
   one  two
a    1    9
b    2    8
c    3    7
d    4    6
'''
print(d.sort_index(axis=0,ascending=False))
'''
   one  two
d    4    6
c    3    7
b    2    8
a    1    9
'''
print(d.sort_index(axis=1,ascending=False))
'''
   two  one
a    9    1
b    8    2
c    7    3
d    6    4
'''
print(d.sort_index(ascending=False))
'''
   one  two
d    4    6
c    3    7
b    2    8
a    1    9
'''

导入/导出
查看当前工作目录

import os
print(os.getcwd())

读入文件
读入用 pd.read_***()，例如 pd.read_csv()、pd.read_excel()
导出用 DataFrame.to_***()，例如 df.to_csv()、df.to_excel()

pd.read_csv()

导出文件

import pandas as pd

# Round-trip a small frame through an Excel workbook.
dt = {'one': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
      'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
# Export. Use .xlsx: pandas 2.x no longer ships a writer for the legacy
# .xls format, so writing "d1.xls" fails there.
d.to_excel("d1.xlsx")
# Read it back. to_excel() writes the row labels as the first sheet column,
# so index_col=0 restores them as the index instead of a data column.
d2 = pd.read_excel("d1.xlsx", index_col=0)

缺失数据处理

判断是否为空数据框

import pandas as pd

dt = {'one': pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd']),
      'two': pd.Series([9, 8, 7, 6], index=['a', 'b', 'c', 'd'])}
d = pd.DataFrame(dt)
print(d.empty)  # False

pandas中None和NaN都可以参与计算：None会自动转化为np.nan，而np.nan参与算术运算的结果仍为NaN（例如 np.nan + 1 的结果仍为 NaN）

import pandas as pd
import numpy as np
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
    a   b
S  10  10
W  20  20
'''
B = pd.DataFrame(np.array([1,1,1,2,2,2,3,3,3]).reshape(3,3),columns=list("abc"),index=list("SWT"))
print(B)
'''
   a  b  c
S  1  1  1
W  2  2  2
T  3  3  3
'''
print(A+B)#先补列索引，并补NaN，再按索引计算
'''
      a     b   c
S  11.0  11.0 NaN
T   NaN   NaN NaN
W  22.0  22.0 NaN
'''

import pandas as pd
import numpy as np
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
    a   b
S  10  10
W  20  20
'''
B = pd.DataFrame(np.array([1,1,1,2,2,2,3,3,3]).reshape(3,3),columns=list("abc"),index=list("SWT"))
print(B)
'''
   a  b  c
S  1  1  1
W  2  2  2
T  3  3  3
'''
print(A.add(B,fill_value = 0))#缺省值补为0
'''
      a     b    c
S  11.0  11.0  1.0
T   3.0   3.0  3.0
W  22.0  22.0  2.0
'''
print(A.add(B,fill_value = A.stack().mean()))#缺省值补为A的均值15
'''
      a     b     c
S  11.0  11.0  16.0
T  18.0  18.0  18.0
W  22.0  22.0  17.0
'''

A.mean()为按列计算，要想计算数据框内所有均值要加stack()
stack()为建立多级索引

print(A.mean())
'''
a    15.0
b    15.0
dtype: float64
'''
print(A.stack())
'''
S  a    10
   b    10
W  a    20
   b    20
dtype: int32
'''
print(A.stack().mean())#15.0

缺失值处理的重要函数

import pandas as pd
import numpy as np
# Two frames with different labels; adding them yields NaN wherever a
# row/column label exists in only one operand.
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
    a   b
S  10  10
W  20  20
'''
B = pd.DataFrame(np.array([1,1,1,2,2,2,3,3,3]).reshape(3,3),columns=list("abc"),index=list("SWT"))
print(B)
'''
   a  b  c
S  1  1  1
W  2  2  2
T  3  3  3
'''
C = A + B
print(C)
'''
      a     b   c
S  11.0  11.0 NaN
T   NaN   NaN NaN
W  22.0  22.0 NaN
'''
print(C.isnull())  # element-wise test: is the value missing?
'''
       a      b     c
S  False  False  True
T   True   True  True
W  False  False  True
'''
print(C.notnull())  # element-wise test: is the value present?
'''
       a      b      c
S   True   True  False
T  False  False  False
W   True   True  False
'''
# Drop every row that contains a missing value. Column 'c' is NaN in every
# row of C, so no row survives.
print(C.dropna(axis='index'))
'''
Empty DataFrame
Columns: [a, b, c]
Index: []
'''
# Drop every column that contains a missing value. Row 'T' is NaN in every
# column of C, so no column survives.
print(C.dropna(axis='columns'))
'''
Empty DataFrame
Columns: []
Index: [S, T, W]
'''
print(C.fillna(0))  # replace missing values with 0
'''
      a     b    c
S  11.0  11.0  0.0
T   0.0   0.0  0.0
W  22.0  22.0  0.0
'''
# Forward fill: carry the last valid value downwards within each column
# (axis=0 is the default). ffill() replaces the deprecated
# fillna(method='ffill').
print(C.ffill())
'''
      a     b   c
S  11.0  11.0 NaN
T  11.0  11.0 NaN
W  22.0  22.0 NaN
'''
# Backward fill along each row (axis=1): a NaN takes the next valid value to
# its right. Nothing valid lies to the right of any NaN in C, so C is
# unchanged. bfill() replaces the deprecated fillna(method='bfill').
print(C.bfill(axis=1))
'''
      a     b   c
S  11.0  11.0 NaN
T   NaN   NaN NaN
W  22.0  22.0 NaN
'''

分组统计

在这里插入图片描述

import pandas as pd
import numpy as np
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
    a   b
S  10  10
W  20  20
'''
print(A.groupby('a')['b'].mean())#按a分组后，按b进行统计均值
'''
a
10    10.0
20    20.0
Name: b, dtype: float64
'''

同时计算多个函数值，用aggregate以列表形式枚举

import pandas as pd
import numpy as np
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
    a   b
S  10  10
W  20  20
'''
print(A.groupby('a')['b'].aggregate(['mean','max','min','sum']))
'''
    mean  max  min  sum
a                      
10  10.0   10   10   10
20  20.0   20   20   20
'''
print(A.groupby('a')['b'].aggregate(['mean','max','min','sum']).unstack())
#unstack()将关系表转换为二级索引
'''
      a 
mean  10    10
      20    20
max   10    10
      20    20
min   10    10
      20    20
sum   10    10
      20    20
dtype: int32
'''

可用apply()把函数换成自定义函数

import pandas as pd
import numpy as np
A = pd.DataFrame(np.array([10,10,20,20]).reshape(2,2),columns=list("ab"),index=list("SW"))
print(A)
'''
    a   b
S  10  10
W  20  20
'''
def myfunc(x):
    """Return a copy of ``x`` with column 'b' divided by that column's sum.

    Written to be passed to ``groupby(...).apply``: each group's 'b' values
    become that group's shares, which add up to 1.

    Args:
        x: DataFrame (one group) that has a numeric column 'b'.

    Returns:
        A new DataFrame with the same columns in the same order; only 'b'
        is replaced. If the 'b' values sum to 0 the shares come out as
        inf/NaN, as with any pandas division by zero.
    """
    # assign() builds a new frame rather than writing into the object handed
    # in by groupby.apply; mutating that object is discouraged by pandas and
    # would also alter the caller's data when called directly.
    return x.assign(b=x['b'] / x['b'].sum())
print(A.groupby('a').apply(myfunc))