Python Pandas DataFrame

本文详细介绍了Pandas库中DataFrame的数据操作,包括获取行列名称、按行名和列名选择数据、通过索引访问、混合调用、添加删除修改列、行以及数据合并与拼接。示例涵盖了基本操作和进阶技巧,帮助读者深入理解Pandas数据处理的各个方面。
摘要由CSDN通过智能技术生成

DataFrame


行列名称获取与修改

import pandas as pd
# Column-oriented sample data: every key becomes one DataFrame column.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)
df = pd.DataFrame(data)

# Read the row labels and the column labels.
print(df.index, df.columns, sep='\n')

# Relabel both axes by assignment: one new label per existing column / row.
df.columns = ['one', 'two', 'three']
df.index = [0, 1, 'a', 'b', '3']
print(df)

输出:
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
RangeIndex(start=0, stop=5, step=1)
Index(['state', 'year', 'pop'], dtype='object')
one two three
0 Ohio 2000 1.5
1 Ohio 2001 1.7
a Ohio 2002 3.6
b Nevada 2001 2.4
3 Nevada 2002 2.9

按照行名和列名获取数据

# Select data by row labels and column labels.
# Label 2 is not in the index at this point (the rows are 0, 1, 'a', 'b', '3').
# pandas >= 1.0 raises KeyError when .loc is given a list that contains a
# missing label, so reindex() is used to get NaN-filled rows for labels that
# do not exist.
print(df.reindex([1, 2]))
print(df.loc[:, ['one']])
print(df.reindex(index=[1, 2], columns=['one']))

# Shorthand forms
print(df[['one', 'two']])  # a list of names selects columns and stays a DataFrame
# With a single label one pair of brackets is enough; the result is a Series,
# not a DataFrame.
print(type(df.loc[1]))  # one row
print(df['one'])  # one column

输出:
one two three
1 Ohio 2001.0 1.7
2 NaN NaN NaN
one
0 Ohio
1 Ohio
a Ohio
b Nevada
3 Nevada
one
1 Ohio
2 NaN
one two
0 Ohio 2000
1 Ohio 2001
a Ohio 2002
b Nevada 2001
3 Nevada 2002
<class 'pandas.core.series.Series'>
0 Ohio
1 Ohio
a Ohio
b Nevada
3 Nevada
Name: one, dtype: object

按照索引获取数据

# Select data by integer position with .iloc (positions, not labels).
# NOTE(review): the original remarks on the next three calls said that missing
# entries are filled with NaN. .iloc does no such filling -- the positions used
# here all exist; out-of-range positions are presumably an error (confirm
# against the pandas indexing documentation).
# Rows at positions 1 and 2, all columns.
print(df.iloc[[1,2]])
# All rows, the column at position 1.
print(df.iloc[:,[1]])
# Rows at positions 1 and 2, columns at positions 0 and 2.
print(df.iloc[[1,2],[0,2]])
# Shorthand forms. These are bare expressions: nothing is printed when the
# file runs as a script (an interactive session echoes only the last one).
# Single cell: row position 1, column position 2.
df.iloc[1,2]
# Scalar position: use when selecting exactly one row or one column.
df.iloc[2]
# The single column at position 2.
df.iloc[:,2]

输出:
one two three
1 Ohio 2001 1.7
a Ohio 2002 3.6
two
0 2000
1 2001
a 2002
b 2001
3 2002
one three
1 Ohio 1.7
a Ohio 3.6
0 1.5
1 1.7
a 3.6
b 2.4
3 2.9
Name: three, dtype: float64

混合调用

# Chained selection: combine column-name access with positional slicing.
#print(type(df['one']))
# df['one'] is a Series; .iloc[1:3] then takes its entries at positions 1 and 2.
print(df['one'].iloc[1:3])
# df.iloc[1:3] is a DataFrame (rows at positions 1 and 2); ['two'] then picks one column.
print(df.iloc[1:3]['two'])

输出:
1 Ohio
a Ohio
Name: one, dtype: object
1 2001
a 2002
Name: two, dtype: int64

添加删除修改

# Fresh sample frame for the add / delete / modify walkthrough.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)
df = pd.DataFrame(data)
print(df)

# --- Add a column: .loc creates 'four' when it is missing and overwrites it
# otherwise. Shorthand: df['four'] = [1, 2, 3, 4, 5]
df.loc[:, 'four'] = [1, 2, 3, 4, 5]
print(df)

# --- Add a row: same rule, created when missing, overwritten otherwise.
df.loc['last'] = [5, 5, 5, 5]
print(df)

# --- Delete a column (a label that does not exist is an error).
df = df.drop(columns=['pop'])
print(1, df)

# --- Delete rows (a label that does not exist is an error).
df = df.drop(index=[3, 4])
print(df)

# --- Insert a column at position 1.
# Build the wanted column order: drop the trailing column, slot 'city' in at
# position 1, then reindex. reindex keeps existing labels, fills labels that
# are new with NaN, and discards labels that are no longer listed.
col_name = df.columns.tolist()[:-1]
col_name.insert(1, 'city')
df = df.reindex(columns=col_name)
print(2, df, sep='\n')
df['city'] = [8, 8, 8, 8]
print(df)

# --- Insert a row at position 1 (same reindex technique, applied to the index).
ind_name = df.index.tolist()
ind_name[1:1] = ['charu']
df = df.reindex(index=ind_name)
print(df)
df.loc['charu'] = [9, 9, 9]
print(df)

输出:
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
state year pop four
0 Ohio 2000 1.5 1
1 Ohio 2001 1.7 2
2 Ohio 2002 3.6 3
3 Nevada 2001 2.4 4
4 Nevada 2002 2.9 5
state year pop four
0 Ohio 2000 1.5 1
1 Ohio 2001 1.7 2
2 Ohio 2002 3.6 3
3 Nevada 2001 2.4 4
4 Nevada 2002 2.9 5
last 5 5 5.0 5
1 state year four
0 Ohio 2000 1
1 Ohio 2001 2
2 Ohio 2002 3
3 Nevada 2001 4
4 Nevada 2002 5
last 5 5 5
state year four
0 Ohio 2000 1
1 Ohio 2001 2
2 Ohio 2002 3
last 5 5 5
2
state city year
0 Ohio NaN 2000
1 Ohio NaN 2001
2 Ohio NaN 2002
last 5 NaN 5
state city year
0 Ohio 8 2000
1 Ohio 8 2001
2 Ohio 8 2002
last 5 8 5
state city year
0 Ohio 8.0 2000.0
charu NaN NaN NaN
1 Ohio 8.0 2001.0
2 Ohio 8.0 2002.0
last 5 8.0 5.0
state city year
0 Ohio 8.0 2000.0
charu 9 9.0 9.0
1 Ohio 8.0 2001.0
2 Ohio 8.0 2002.0
last 5 8.0 5.0

合并拼接追加

concat

"""
pd.concat(objs, axis=0, join='outer', ignore_index=False,
          keys=None, levels=None, names=None, verify_integrity=False,
          copy=True)
"""
# Concatenate two frames.
import numpy as np
df1 = pd.DataFrame(np.ones((4, 4))*1, columns=list('DCBA'), index=list('4321'))
df2 = pd.DataFrame(np.ones((4, 4))*2, columns=list('FEDC'), index=list('6543'))
# df3 is built exactly like df2 and is not used by the calls below.
df3 = pd.DataFrame(np.ones((4, 4))*2, columns=list('FEDC'), index=list('6543'))
print(df1)
print(df2)
print(pd.concat([df1, df2]))
print(pd.concat([df1, df2], axis=1))

# Default: axis=0.
#
# axis=0: stack vertically (along the index). Row labels are simply appended,
#         so they may repeat; the columns become the union of both frames.
#
# axis=1: stack horizontally (along the columns). Column labels are simply
#         appended, so they may repeat; the index becomes the union.
#
# Caveat: inside each input frame, the labels on the axis that gets unioned
# must be unique -- columns for axis=0, index for axis=1.
df1.columns = list('DDBA')
print(df1)
# This call is expected to fail because df1 now has two 'D' columns. Older
# pandas reported "ValueError: Plan shapes are not aligned"; newer releases
# raise pandas.errors.InvalidIndexError instead. The error is caught and
# printed so that the rest of the script still runs.
try:
    pd.concat([df1, df2], axis=0)
except (ValueError, pd.errors.InvalidIndexError) as err:
    print(type(err).__name__, err, sep=': ')

输出:
D C B A
4 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
F E D C
6 2.0 2.0 2.0 2.0
5 2.0 2.0 2.0 2.0
4 2.0 2.0 2.0 2.0
3 2.0 2.0 2.0 2.0
A B C D E F
4 1.0 1.0 1.0 1.0 NaN NaN
3 1.0 1.0 1.0 1.0 NaN NaN
2 1.0 1.0 1.0 1.0 NaN NaN
1 1.0 1.0 1.0 1.0 NaN NaN
6 NaN NaN 2.0 2.0 2.0 2.0
5 NaN NaN 2.0 2.0 2.0 2.0
4 NaN NaN 2.0 2.0 2.0 2.0
3 NaN NaN 2.0 2.0 2.0 2.0
D C B A F E D C
1 1.0 1.0 1.0 1.0 NaN NaN NaN NaN
2 1.0 1.0 1.0 1.0 NaN NaN NaN NaN
3 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
4 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
5 NaN NaN NaN NaN 2.0 2.0 2.0 2.0
6 NaN NaN NaN NaN 2.0 2.0 2.0 2.0
D D B A
4 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0

merge

"""
merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None)
基于一列或多列,列名要在两个矩阵中都能找到,进行拼接
"""
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})

print(left)
print(right)

# With no `on`, the columns the two frames share (key1, key2) are the join
# keys. Going through the rows of `left`:
#   (K0, K0) exists in right            -> one match
#   (K0, K1) does not exist in right    -> dropped
#   (K1, K0) exists twice in right      -> two matches
#   (K2, K1) does not exist in right    -> dropped
# Expected result:
#     key1 key2   A   B   C   D
#   0   K0   K0  A0  B0  C0  D0
#   1   K1   K0  A2  B2  C1  D1
#   2   K1   K0  A2  B2  C2  D2   <- left has (K1, K0) once, so A2/B2 repeat
print(left.merge(right))

join_keys = ['key1', 'key2']
print(left.merge(right, on=join_keys))

# Right join: keep every row of `right`; values missing on the left are NaN.
print(left.merge(right, how='right', on=join_keys))

# Outer join: keep the rows of both frames; values missing on either side are NaN.
result = left.merge(right, how='outer', on=join_keys)
print(result)

输出:
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
3 K2 K0 NaN NaN C3 D3
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
5 K2 K0 NaN NaN C3 D3

update

"""
update主要用于数据更新:按行名和列名对齐,就地修改原DataFrame,尽可能多地更新

"""
df = pd.DataFrame({'A': [1, 2, 3], 'B': [400, 500, 600]})
print(df)
# update() aligns on column name and row label and changes df in place.
# Only column 'B' and rows 0-2 exist in both frames, so only those are touched.
new_df = pd.DataFrame({'B': [4, 5, 6, 7, 8], 'C': [7, 8, 9, 10, 11]})
df.update(new_df)
print(df)

df = pd.DataFrame({'A': [1, 2, 3], 'B': [400, 500, 600]})
df.index = list("ABC")
print(df.index)
# Rows are matched by index label, not by position: new_df is labelled 0..4
# while df is labelled 'A'..'C', so nothing lines up and df stays unchanged.
new_df = pd.DataFrame({'B': [4, 5, 6, 7, 8], 'C': [7, 8, 9, 10, 11]})
df.update(new_df)
print(df)

# Update from a Series: its name picks the column, its index picks the rows.
new_column = pd.Series(['d', 'e'], name='B', index=list("AB"))
print(new_column)
# Column 'B' holds integers and is about to receive strings. pandas 2.1+
# deprecates that implicit upcast (FutureWarning, planned to become an error),
# so make the column object-typed explicitly first.
df['B'] = df['B'].astype(object)
# NaN values in the updating data never overwrite existing values, so row 'C'
# keeps 600.
df.update(new_column)
print(df)

输出:
A B
0 1 400
1 2 500
2 3 600
A B
0 1 4.0
1 2 5.0
2 3 6.0
Index(['A', 'B', 'C'], dtype='object')
A B
A 1 400
B 2 500
C 3 600
A d
B e
Name: B, dtype: object
A B
A 1 d
B 2 e
C 3 600

append

"""
append

"""
# Appending the rows of one frame to another.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the replacement and produces the same result.
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df2 = pd.DataFrame([[5, 6, 7], [7, 8, 9], [9, 10, 11]], columns=list('ABC'))
print(df)
print(df2)

# The result's columns are the union of both frames' columns; cells that one
# frame lacks become NaN. Original row labels are kept, so they may repeat.
appended = pd.concat([df, df2])
print(appended)
# ignore_index=True discards the original row labels and renumbers from 0.
renumbered = pd.concat([df, df2], ignore_index=True)
print(renumbered)

输出:
A B
0 1 2
1 3 4
A B C
0 5 6 7
1 7 8 9
2 9 10 11
A B C
0 1 2 NaN
1 3 4 NaN
0 5 6 7.0
1 7 8 9.0
2 9 10 11.0
A B C
0 1 2 NaN
1 3 4 NaN
2 5 6 7.0
3 7 8 9.0
4 9 10 11.0

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值