【Pandas】1.2.DataFrame

LouHerGetUp

已于 2023-11-29 08:45:09 修改

阅读量651

点赞数 23

分类专栏：机器学习　文章标签：pandas

于 2023-11-25 22:41:37 首次发布

本文链接：https://blog.csdn.net/CSDNLHCC/article/details/134620208

版权

机器学习专栏收录该内容

48 篇文章 0 订阅

订阅专栏

包含全部示例的代码仓库见GitHub

1 导入库

import pandas as pd
import numpy as np

2 新建DataFrame

2.1 示例1

新建DataFrame

data = pd.DataFrame({'a':[1,2,3,4],
                     'b':list('abcd'),
                     'c':9,
                     'd':pd.Series([5,6,7,8])})
data  # abcd列索引
# output
	a	b	c	d
0	1	a	9	5
1	2	b	9	6
2	3	c	9	7
3	4	d	9	8

列索引

data.columns # 列索引
# output
Index(['a', 'b', 'c', 'd'], dtype='object')

行索引

data.index # 行索引
# output
RangeIndex(start=0, stop=4, step=1)

2.2 示例2

新建DataFrame

xuhao = ['one','two','three','four']
df = pd.DataFrame(np.random.randn(4,3), index=xuhao, columns=['a','b','c'])
df
# output
       a	b	c
one	-0.327567	-0.190464	0.489604
two	0.577296	0.846016	-1.272735
three	0.297807	1.177698	-0.182066
four	-1.125733	-0.765275	1.492720

行索引和列索引

df.index
# output
Index(['one', 'two', 'three', 'four'], dtype='object')

df.columns
# output
Index(['a', 'b', 'c'], dtype='object')

返回单列

df['a']
# output
one     -0.327567
two      0.577296
three    0.297807
four    -1.125733
Name: a, dtype: float64

df.b
# output
one     -0.190464
two      0.846016
three    1.177698
four    -0.765275
Name: b, dtype: float64

返回多列

df[['a', 'c']]  #返回多列
# output
       a	c
one	-0.327567	0.489604
two	0.577296	-1.272735
three	0.297807	-0.182066
four	-1.125733	1.492720

单列是Series，多列是DataFrame

type(df.b)
# output
pandas.core.series.Series

type(df[['a', 'c']])
# output
pandas.core.frame.DataFrame

返回前3行

df.head(3)
# output
	a	b	c
one	-0.327567	-0.190464	0.489604
two	0.577296	0.846016	-1.272735
three	0.297807	1.177698	-0.182066

返回最后2行

df.tail(2)
# output
           a	b	c
three	0.297807	1.177698	-0.182066
four	-1.125733	-0.765275	1.492720

返回array

df.values  #返回array
# output
array([[-0.3275668 , -0.19046443,  0.48960352],
       [ 0.57729616,  0.8460159 , -1.27273538],
       [ 0.29780732,  1.17769787, -0.18206624],
       [-1.1257328 , -0.76527496,  1.49272042]])

3 loc按索引取值

df
# output
           a	b	c
one	-0.327567	-0.190464	0.489604
two	0.577296	0.846016	-1.272735
three	0.297807	1.177698	-0.182066
four	-1.125733	-0.765275	1.492720

返回单行

df.loc['one']
# output
a   -0.327567
b   -0.190464
c    0.489604
Name: one, dtype: float64

返回多行

df.loc[['one','four']]  #取多个索引，需要输入一个列表
# output
	a	b	c
one	-0.327567	-0.190464	0.489604
four	-1.125733	-0.765275	1.492720

切片返回多行

df.loc['one':'three']  # 标签切片，包含末端标签'three'所在的行
# output
	a	b	c
one	-0.327567	-0.190464	0.489604
two	0.577296	0.846016	-1.272735
three	0.297807	1.177698	-0.182066

切片返回多行多列

df.loc['one':'three',['a','b']]
# output
	a	b
one	-0.327567	-0.190464
two	0.577296	0.846016
three	0.297807	1.177698

返回单列

df.a
# output
one     -0.327567
two      0.577296
three    0.297807
four    -1.125733
Name: a, dtype: float64

切片返回多行多列

df.loc['one':'three','a':'c']
# output
       a	b	c
one	-0.327567	-0.190464	0.489604
two	0.577296	0.846016	-1.272735
three	0.297807	1.177698	-0.182066

切片返回多行多列

df.loc[['one','four'], 'a':'c']
# output
       a	b	c
one	-0.327567	-0.190464	0.489604
four	-1.125733	-0.765275	1.492720

返回单个值，前面是行，后面是列

df.loc['two','b']  # 前面是行，后面是列
# output
0.8460159027597443

切片返回多行多列，前后都包含

df.loc[:'two','b':]
# output
       b	c
one	-0.190464	0.489604
two	0.846016	-1.272735

4 iloc

df
# output
           a	b	c
one	-0.327567	-0.190464	0.489604
two	0.577296	0.846016	-1.272735
three	0.297807	1.177698	-0.182066
four	-1.125733	-0.765275	1.492720

按位取值，返回行

df.iloc[1]  # 按位取值
# output
a    0.577296
b    0.846016
c   -1.272735
Name: two, dtype: float64

切片返回多行多列

df.iloc[1:]
# output
       a	b	c
two	0.577296	0.846016	-1.272735
three	0.297807	1.177698	-0.182066
four	-1.125733	-0.765275	1.492720

切片返回单列的多行（结果是Series）

df.iloc[1:, 0] 
# output
two      0.577296
three    0.297807
four    -1.125733
Name: a, dtype: float64

切片返回多行多列，前包后不包

df.iloc[:2, 1:]  # 按位取值,不包含最后一位
# output
       b	c
one	-0.190464	0.489604
two	0.846016	-1.272735

切片返回多行多列

df.iloc[:2, [0,2]]
# output
	a	c
one	-0.327567	0.489604
two	0.577296	-1.272735

返回单个值

df.iloc[2,1]
# output
1.1776978729086762

单个值赋值

df.iloc[2,1]=1000
df
# output
       a	b	c
one	-0.327567	-0.190464	0.489604
two	0.577296	0.846016	-1.272735
three	0.297807	1000.000000	-0.182066
four	-1.125733	-0.765275	1.492720

5 添加和删除行和列

df
# output
       a	b	c
one	-0.327567	-0.190464	0.489604
two	0.577296	0.846016	-1.272735
three	0.297807	1000.000000	-0.182066
four	-1.125733	-0.765275	1.492720

筛选，返回boolean值

df>0
# output
       a	b	c
one	False	False	True
two	True	True	False
three	True	True	False
four	False	False	True

筛选

df[df>0]
# output
   a	b	c
one	NaN	NaN	0.489604
two	0.577296	0.846016	NaN
three	0.297807	1000.000000	NaN
four	NaN	NaN	1.492720

筛选赋值

df[df<0] = 0
df
# output
       a	b	c
one	0.000000	0.000000	0.489604
two	0.577296	0.846016	0.000000
three	0.297807	1000.000000	0.000000
four	0.000000	0.000000	1.492720

按列筛选

df[df.b > 1]
# output
	       a	b	c
three	0.297807	1000.0	0.0

添加列

df
# output
       a	b	c
one	0.000000	0.000000	0.489604
two	0.577296	0.846016	0.000000
three	0.297807	1000.000000	0.000000
four	0.000000	0.000000	1.492720

df['d'] = 4  # 添加列
df['e'] = np.arange(4)
df
# output
       a	b	c	d	e
one	0.000000	0.000000	0.489604	4	0
two	0.577296	0.846016	0.000000	4	1
three	0.297807	1000.000000	0.000000	4	2
four	0.000000	0.000000	1.492720	4	3

用Series添加列：按行索引对齐，缺少的索引（two）填NaN，多余的索引（five）被丢弃

df['f'] = pd.Series([2,3,4,5], index=['one', 'three', 'four', 'five'])
df
# output
       a	b	c	d	e	f
one	0.000000	0.000000	0.489604	4	0	2.0
two	0.577296	0.846016	0.000000	4	1	NaN
three	0.297807	1000.000000	0.000000	4	2	3.0
four	0.000000	0.000000	1.492720	4	3	4.0

删除列

del df['e']  # 删除列
df
# output
	a	b	c	d	f
one	0.000000	0.000000	0.489604	4	2.0
two	0.577296	0.846016	0.000000	4	NaN
three	0.297807	1000.000000	0.000000	4	3.0
four	0.000000	0.000000	1.492720	4	4.0

这样不会添加

df.h = 5  # 属性赋值不会新增列；新增列要用中括号加列名，如 df['h'] = 5
df
# output
       a	b	c	d	f
one	0.000000	0.000000	0.489604	4	2.0
two	0.577296	0.846016	0.000000	4	NaN
three	0.297807	1000.000000	0.000000	4	3.0
four	0.000000	0.000000	1.492720	4	4.0

添加行

df.loc['five'] = 10  # 添加行
df
# output
       a	b	c	d	f
one	0.000000	0.000000	0.489604	4	2.0
two	0.577296	0.846016	0.000000	4	NaN
three	0.297807	1000.000000	0.000000	4	3.0
four	0.000000	0.000000	1.492720	4	4.0
five	10.000000	10.000000	10.000000	10	10.0

删除行

df.drop('one')  # 删除行
# output
	a	b	c	d	f
two	0.577296	0.846016	0.00000	4	NaN
three	0.297807	1000.000000	0.00000	4	3.0
four	0.000000	0.000000	1.49272	4	4.0
five	10.000000	10.000000	10.00000	10	10.0

删除要赋值回去

df = df.drop('one')
df
# output
	a	b	c	d	f
two	0.577296	0.846016	0.00000	4	NaN
three	0.297807	1000.000000	0.00000	4	3.0
four	0.000000	0.000000	1.49272	4	4.0
five	10.000000	10.000000	10.00000	10	10.0

删除多行

df = df.drop(['four','five'])

取多列，效果和删除多列相同

df[['a','b']]
# output
	a	b
two	0.577296	0.846016
three	0.297807	1000.000000

删除多列

df.drop(columns=['c','f'])  # 删除列
# output
       a	b	d
two	0.577296	0.846016	4
three	0.297807	1000.000000	4

但是不赋值回去不会删除成功

df
# output
	a	b	c	d	f
two	0.577296	0.846016	0.0	4	NaN
three	0.297807	1000.000000	0.0	4	3.0

6 排序与数据对齐

num = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']
data = pd.DataFrame(np.random.randn(7,4), index=num, columns=list('abcd'))
data
# output

           a	b	c	d
one	0.695007	0.712372	0.739108	-0.932024
two	-1.524550	0.945240	1.424037	-0.262905
three	0.110267	0.872907	-1.592445	-0.063497
four	-1.507138	0.020152	-0.145890	0.673170
five	-0.398889	1.113829	0.503900	0.288276
six	-0.852355	0.654533	1.220746	2.075212
seven	-0.551279	1.460979	-1.770640	-0.379884

data.shape
# output
(7, 4)

np.shape(data)
# output
(7, 4)

转置

data.T
# output
     one	two	three	four	five	six	seven
a	0.695007	-1.524550	0.110267	-1.507138	-0.398889	-0.852355	-0.551279
b	0.712372	0.945240	0.872907	0.020152	1.113829	0.654533	1.460979
c	0.739108	1.424037	-1.592445	-0.145890	0.503900	1.220746	-1.770640
d	-0.932024	-0.262905	-0.063497	0.673170	0.288276	2.075212	-0.379884

data.T.shape
# output
(4, 7)

排序

data.sort_index(axis=1, ascending=False)  # 默认升序，ascending=False 降序
# output
       d	c	b	a
one	-0.932024	0.739108	0.712372	0.695007
two	-0.262905	1.424037	0.945240	-1.524550
three	-0.063497	-1.592445	0.872907	0.110267
four	0.673170	-0.145890	0.020152	-1.507138
five	0.288276	0.503900	1.113829	-0.398889
six	2.075212	1.220746	0.654533	-0.852355
seven	-0.379884	-1.770640	1.460979	-0.551279

不赋值回去不会覆盖原dataframe

data
# output
       a	b	c	d
one	0.695007	0.712372	0.739108	-0.932024
two	-1.524550	0.945240	1.424037	-0.262905
three	0.110267	0.872907	-1.592445	-0.063497
four	-1.507138	0.020152	-0.145890	0.673170
five	-0.398889	1.113829	0.503900	0.288276
six	-0.852355	0.654533	1.220746	2.075212
seven	-0.551279	1.460979	-1.770640	-0.379884

inplace=True立即替换，完成排序

data.sort_index(axis=1, ascending=False, inplace=True)
data
# output
       d	c	b	a
one	-0.932024	0.739108	0.712372	0.695007
two	-0.262905	1.424037	0.945240	-1.524550
three	-0.063497	-1.592445	0.872907	0.110267
four	0.673170	-0.145890	0.020152	-1.507138
five	0.288276	0.503900	1.113829	-0.398889
six	2.075212	1.220746	0.654533	-0.852355
seven	-0.379884	-1.770640	1.460979	-0.551279

axis=0按行索引（标签）排序，ascending=False为降序

data.sort_index(axis=0, ascending=False)
# output
	d	c	b	a
two	-0.262905	1.424037	0.945240	-1.524550
three	-0.063497	-1.592445	0.872907	0.110267
six	2.075212	1.220746	0.654533	-0.852355
seven	-0.379884	-1.770640	1.460979	-0.551279
one	-0.932024	0.739108	0.712372	0.695007
four	0.673170	-0.145890	0.020152	-1.507138
five	0.288276	0.503900	1.113829	-0.398889

根据'c'列，按axis=0排序

data.sort_values(by='c')
# output
       d	c	b	a
seven	-0.379884	-1.770640	1.460979	-0.551279
three	-0.063497	-1.592445	0.872907	0.110267
four	0.673170	-0.145890	0.020152	-1.507138
five	0.288276	0.503900	1.113829	-0.398889
one	-0.932024	0.739108	0.712372	0.695007
six	2.075212	1.220746	0.654533	-0.852355
two	-0.262905	1.424037	0.945240	-1.524550

7 索引重新排序

data
# output
       d	c	b	a
one	-0.932024	0.739108	0.712372	0.695007
two	-0.262905	1.424037	0.945240	-1.524550
three	-0.063497	-1.592445	0.872907	0.110267
four	0.673170	-0.145890	0.020152	-1.507138
five	0.288276	0.503900	1.113829	-0.398889
six	2.075212	1.220746	0.654533	-0.852355
seven	-0.379884	-1.770640	1.460979	-0.551279

按列重新排序索引

data.reindex(columns=['b','c','a','e','d'])
# output
       b	c	a	e	d
one	0.712372	0.739108	0.695007	NaN	-0.932024
two	0.945240	1.424037	-1.524550	NaN	-0.262905
three	0.872907	-1.592445	0.110267	NaN	-0.063497
four	0.020152	-0.145890	-1.507138	NaN	0.673170
five	1.113829	0.503900	-0.398889	NaN	0.288276
six	0.654533	1.220746	-0.852355	NaN	2.075212
seven	1.460979	-1.770640	-0.551279	NaN	-0.379884

data.reindex(columns=['b','c','e','d'])
# output
       b	c	e	d
one	0.712372	0.739108	NaN	-0.932024
two	0.945240	1.424037	NaN	-0.262905
three	0.872907	-1.592445	NaN	-0.063497
four	0.020152	-0.145890	NaN	0.673170
five	1.113829	0.503900	NaN	0.288276
six	0.654533	1.220746	NaN	2.075212
seven	1.460979	-1.770640	NaN	-0.379884

8 同时行索引和列索引对齐

data1 = data.reindex(columns=['b','c','e','d'], index=['one', 'two', 'three', 'four', 'six', 'seven'])
data1
# output
       b	c	e	d
one	0.712372	0.739108	NaN	-0.932024
two	0.945240	1.424037	NaN	-0.262905
three	0.872907	-1.592445	NaN	-0.063497
four	0.020152	-0.145890	NaN	0.673170
six	0.654533	1.220746	NaN	2.075212
seven	1.460979	-1.770640	NaN	-0.379884

data
# output
	d	c	b	a
one	-0.932024	0.739108	0.712372	0.695007
two	-0.262905	1.424037	0.945240	-1.524550
three	-0.063497	-1.592445	0.872907	0.110267
four	0.673170	-0.145890	0.020152	-1.507138
five	0.288276	0.503900	1.113829	-0.398889
six	2.075212	1.220746	0.654533	-0.852355
seven	-0.379884	-1.770640	1.460979	-0.551279

dataframe相加：按行索引和列索引对齐，只在一方出现的行或列（如a、e列和five行）结果为NaN

data2 = data + data1
data2
# output
       a	b	c	d	e
five	NaN	NaN	NaN	NaN	NaN
four	NaN	0.040303	-0.291781	1.346339	NaN
one	NaN	1.424744	1.478216	-1.864048	NaN
seven	NaN	2.921959	-3.541281	-0.759769	NaN
six	NaN	1.309065	2.441493	4.150424	NaN
three	NaN	1.745814	-3.184889	-0.126993	NaN
two	NaN	1.890480	2.848073	-0.525810	NaN

求平均值，对axis=1

np.mean(data2, axis=1)
# output
five          NaN
four     0.364954
one      0.346304
seven   -0.459697
six      2.633661
three   -0.522023
two      1.404248
dtype: float64

data2.mean(1)
# output
five          NaN
four     0.364954
one      0.346304
seven   -0.459697
six      2.633661
three   -0.522023
two      1.404248
dtype: float64

求和，对axis=1（默认skipna=True会跳过NaN，所以全为NaN的five行求和结果是0）

np.sum(data2, axis=1)
# output
five     0.000000
four     1.094862
one      1.038912
seven   -1.379091
six      7.900982
three   -1.566068
two      4.212743
dtype: float64

data2.sum(1)
# output
five     0.000000
four     1.094862
one      1.038912
seven   -1.379091
six      7.900982
three   -1.566068
two      4.212743
dtype: float64

求最大值

data2.max(1)
# output
five          NaN
four     1.346339
one      1.478216
seven    2.921959
six      4.150424
three    1.745814
two      2.848073
dtype: float64

LouHerGetUp

关注

23
点赞
踩
8

收藏

觉得还不错? 一键收藏
打赏
0
评论
【Pandas】1.2.DataFrame

单列是Series，多列是DataFrame。不赋值回去不会覆盖原dataframe。返回单个值，前面是行，后面是列。切片返回多行多列，前后都包含。切片返回多行多列，前包后不包。筛选，返回boolean值。取多列，效果和删除多列相同。但是不赋值回去不会删除成功。新建DataFrame。新建DataFrame。dataframe相加。
复制链接

扫一扫