【课程2.7】 Pandas数据结构Dataframe:基本技巧
数据查看、转置 / 添加、修改、删除值 / 对齐 / 排序
1.数据查看、转置
df = pd.DataFrame(np.random.rand(16).reshape(8,2)*100,
columns = ['a','b'])
print(df.head(2))
print(df.tail())
# .head()查看头部数据
# .tail()查看尾部数据
# 默认查看5条
print(df.T)
# .T 转置
-----------------------------------------------------------------------
a b
0 5.777208 18.374283
1 85.961515 55.120036
a b
3 21.236577 15.902872
4 46.137564 29.350647
5 70.157709 58.972728
6 8.368292 42.011356
7 29.824574 87.062295
0 1 2 3 4 5 \
a 5.777208 85.961515 11.005284 21.236577 46.137564 70.157709
b 18.374283 55.120036 35.595598 15.902872 29.350647 58.972728
6 7
a 8.368292 29.824574
b 42.011356 87.062295
2.添加与修改
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
columns = ['a','b','c','d'])
print(df)
df['e'] = 10
df.loc[4] = 20
print(df)
# 新增列/行并赋值
df['e'] = 20
df[['a','c']] = 100
print(df)
# 索引后直接修改值
-----------------------------------------------------------------------
a b c d
0 17.148791 73.833921 39.069417 5.675815
1 91.572695 66.851601 60.320698 92.071097
2 79.377105 24.314520 44.406357 57.313429
3 84.599206 61.310945 3.916679 30.076458
a b c d e
0 17.148791 73.833921 39.069417 5.675815 10
1 91.572695 66.851601 60.320698 92.071097 10
2 79.377105 24.314520 44.406357 57.313429 10
3 84.599206 61.310945 3.916679 30.076458 10
4 20.000000 20.000000 20.000000 20.000000 20
a b c d e
0 100 73.833921 100 5.675815 20
1 100 66.851601 100 92.071097 20
2 100 24.314520 100 57.313429 20
3 100 61.310945 100 30.076458 20
4 100 20.000000 100 20.000000 20
3.删除 del / drop()
df = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
columns = ['a','b','c','d'])
print(df)
del df['a']
print(df)
print('-----')
# del语句 - 删除列
print(df.drop(0))
print(df.drop([1,2]))
print(df)
print('-----')
# drop()删除行,inplace=False → 删除后生成新的数据,不改变原数据
print(df.drop(['d'], axis = 1))
print(df)
# drop()删除列,需要加上axis = 1,inplace=False → 删除后生成新的数据,不改变原数据
-----------------------------------------------------------------------
a b c d
0 91.866806 88.753655 18.469852 71.651277
1 64.835568 33.844967 6.391246 54.916094
2 75.930985 19.169862 91.042457 43.648258
3 15.863853 24.788866 10.625684 82.135316
b c d
0 88.753655 18.469852 71.651277
1 33.844967 6.391246 54.916094
2 19.169862 91.042457 43.648258
3 24.788866 10.625684 82.135316
-----
b c d
1 33.844967 6.391246 54.916094
2 19.169862 91.042457 43.648258
3 24.788866 10.625684 82.135316
b c d
0 88.753655 18.469852 71.651277
3 24.788866 10.625684 82.135316
b c d
0 88.753655 18.469852 71.651277
1 33.844967 6.391246 54.916094
2 19.169862 91.042457 43.648258
3 24.788866 10.625684 82.135316
-----
b c
0 88.753655 18.469852
1 33.844967 6.391246
2 19.169862 91.042457
3 24.788866 10.625684
b c d
0 88.753655 18.469852 71.651277
1 33.844967 6.391246 54.916094
2 19.169862 91.042457 43.648258
3 24.788866 10.625684 82.135316
4.对齐
df1 = pd.DataFrame(np.random.randn(10, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(7, 3), columns=['A', 'B', 'C'])
print(df1 + df2)
# DataFrame对象之间的数据自动按照列和索引(行标签)对齐
-----------------------------------------------------------------------
A B C D
0 -0.281123 -2.529461 1.325663 NaN
1 -0.310514 -0.408225 -0.760986 NaN
2 -0.172169 -2.355042 1.521342 NaN
3 1.113505 0.325933 3.689586 NaN
4 0.107513 -0.503907 -1.010349 NaN
5 -0.845676 -2.410537 -1.406071 NaN
6 1.682854 -0.576620 -0.981622 NaN
7 NaN NaN NaN NaN
8 NaN NaN NaN NaN
9 NaN NaN NaN NaN
5.排序1 - 按值排序 .sort_values
# 同样适用于Series
df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
columns = ['a','b','c','d'])
print(df1)
print(df1.sort_values(['a'], ascending = True)) # 升序
print(df1.sort_values(['a'], ascending = False)) # 降序
print('------')
# ascending参数:设置升序降序,默认升序
# 单列排序
df2 = pd.DataFrame({'a':[1,1,1,1,2,2,2,2],
'b':list(range(8)),
'c':list(range(8,0,-1))})
print(df2)
print(df2.sort_values(['a','c']))
# 多列排序,按列顺序排序
-----------------------------------------------------------------------
a b c d
0 16.519099 19.601879 35.464189 58.866972
1 34.506472 97.106578 96.308244 54.049359
2 87.177828 47.253416 92.098847 19.672678
3 66.673226 51.969534 71.789055 14.504191
a b c d
0 16.519099 19.601879 35.464189 58.866972
1 34.506472 97.106578 96.308244 54.049359
3 66.673226 51.969534 71.789055 14.504191
2 87.177828 47.253416 92.098847 19.672678
a b c d
2 87.177828 47.253416 92.098847 19.672678
3 66.673226 51.969534 71.789055 14.504191
1 34.506472 97.106578 96.308244 54.049359
0 16.519099 19.601879 35.464189 58.866972
------
a b c
0 1 0 8
1 1 1 7
2 1 2 6
3 1 3 5
4 2 4 4
5 2 5 3
6 2 6 2
7 2 7 1
a b c
3 1 3 5
2 1 2 6
1 1 1 7
0 1 0 8
7 2 7 1
6 2 6 2
5 2 5 3
4 2 4 4
排序2 - 索引排序 .sort_index
df1 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
index = [5,4,3,2],
columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.random.rand(16).reshape(4,4)*100,
index = ['h','s','x','g'],
columns = ['a','b','c','d'])
print(df1)
print(df1.sort_index())
print(df2)
print(df2.sort_index())
# 按照index排序
# 默认 ascending=True, inplace=False
-----------------------------------------------------------------------
a b c d
5 57.327269 87.623119 93.655538 5.859571
4 69.739134 80.084366 89.005538 56.825475
3 88.148296 6.211556 68.938504 41.542563
2 29.248036 72.005306 57.855365 45.931715
a b c d
2 29.248036 72.005306 57.855365 45.931715
3 88.148296 6.211556 68.938504 41.542563
4 69.739134 80.084366 89.005538 56.825475
5 57.327269 87.623119 93.655538 5.859571
a b c d
h 50.579469 80.239138 24.085110 39.443600
s 30.906725 39.175302 11.161542 81.010205
x 19.900056 18.421110 4.995141 12.605395
g 67.760755 72.573568 33.507090 69.854906
a b c d
g 67.760755 72.573568 33.507090 69.854906
h 50.579469 80.239138 24.085110 39.443600
s 30.906725 39.175302 11.161542 81.010205
x 19.900056 18.421110 4.995141 12.605395