import pandas as pd
import numpy as np
- 索引的相关操作
data = 'http://www.gairuo.com/file/data/dataset/team.xlsx'
df = pd.read_excel(data)
df
name | team | Q1 | Q2 | Q3 | Q4 | |
---|---|---|---|---|---|---|
0 | Liver | E | 89 | 21 | 24 | 64 |
1 | Arry | C | 36 | 37 | 37 | 57 |
2 | Ack | A | 57 | 60 | 18 | 84 |
3 | Eorge | C | 93 | 96 | 71 | 78 |
4 | Oah | D | 65 | 49 | 61 | 86 |
... | ... | ... | ... | ... | ... | ... |
95 | Gabriel | C | 48 | 59 | 87 | 74 |
96 | Austin7 | C | 21 | 31 | 30 | 43 |
97 | Lincoln4 | C | 98 | 93 | 1 | 20 |
98 | Eli | E | 11 | 74 | 58 | 91 |
99 | Ben | E | 21 | 43 | 41 | 74 |
100 rows × 6 columns
# 设置name作为索引
df.set_index('name')# 若后面添加 inplace=True则改变原表的结构
team | Q1 | Q2 | Q3 | Q4 | |
---|---|---|---|---|---|
name | |||||
Liver | E | 89 | 21 | 24 | 64 |
Arry | C | 36 | 37 | 37 | 57 |
Ack | A | 57 | 60 | 18 | 84 |
Eorge | C | 93 | 96 | 71 | 78 |
Oah | D | 65 | 49 | 61 | 86 |
... | ... | ... | ... | ... | ... |
Gabriel | C | 48 | 59 | 87 | 74 |
Austin7 | C | 21 | 31 | 30 | 43 |
Lincoln4 | C | 98 | 93 | 1 | 20 |
Eli | E | 11 | 74 | 58 | 91 |
Ben | E | 21 | 43 | 41 | 74 |
100 rows × 5 columns
df
name | team | Q1 | Q2 | Q3 | Q4 | |
---|---|---|---|---|---|---|
0 | Liver | E | 89 | 21 | 24 | 64 |
1 | Arry | C | 36 | 37 | 37 | 57 |
2 | Ack | A | 57 | 60 | 18 | 84 |
3 | Eorge | C | 93 | 96 | 71 | 78 |
4 | Oah | D | 65 | 49 | 61 | 86 |
... | ... | ... | ... | ... | ... | ... |
95 | Gabriel | C | 48 | 59 | 87 | 74 |
96 | Austin7 | C | 21 | 31 | 30 | 43 |
97 | Lincoln4 | C | 98 | 93 | 1 | 20 |
98 | Eli | E | 11 | 74 | 58 | 91 |
99 | Ben | E | 21 | 43 | 41 | 74 |
100 rows × 6 columns
# 重置索引 df.reset_index(drop = True ) 表示重置成功
df.head().rank()
name | team | Q1 | Q2 | Q3 | Q4 | |
---|---|---|---|---|---|---|
0 | 4.0 | 5.0 | 4.0 | 1.0 | 2.0 | 2.0 |
1 | 2.0 | 2.5 | 1.0 | 2.0 | 3.0 | 1.0 |
2 | 1.0 | 1.0 | 2.0 | 4.0 | 1.0 | 4.0 |
3 | 3.0 | 2.5 | 5.0 | 5.0 | 5.0 | 3.0 |
4 | 5.0 | 4.0 | 3.0 | 3.0 | 4.0 | 5.0 |
df.get('name',0)
0 Liver
1 Arry
2 Ack
3 Eorge
4 Oah
...
95 Gabriel
96 Austin7
97 Lincoln4
98 Eli
99 Ben
Name: name, Length: 100, dtype: object
# df.truncate(before=2,after=4)
- 数据类型的转换
# astype() 对于单个列转换利用astype(''),对于多个列进行转换利用astype(字典)
# 排序 sort_values(by = [],ascending=[])
# df.assign(列名= 数据)
查找过滤
df.where(df>60,不及格)
np.where(df>=60,及格,不及格)
df.where(df==100100000,np.where(df>=60,及格,不及格))
数据透视表
pd.pivot_table(data,index=[‘A’,‘B’],columns=[‘C’],aggfunc=np.sum,fill_value=0,margin=True)
df1 = pd.DataFrame({
'A':['a1','a1','a1','a2','a2','a2'],
'B':['b1','b1','b1','b2','b2','b2'],
'C':['c1','c1','c1','c2','c2','c2'],
'D':[1,2,3,4,5,6,]
})
df1
A | B | C | D | |
---|---|---|---|---|
0 | a1 | b1 | c1 | 1 |
1 | a1 | b1 | c1 | 2 |
2 | a1 | b1 | c1 | 3 |
3 | a2 | b2 | c2 | 4 |
4 | a2 | b2 | c2 | 5 |
5 | a2 | b2 | c2 | 6 |
df1.pivot_table(index='A',columns='B',values='D')
B | b1 | b2 |
---|---|---|
A | ||
a1 | 2.0 | NaN |
a2 | NaN | 5.0 |
数据清洗基本操作
1 缺失值
df.isna()
df.isnull()
缺失值筛选
缺失的行 df.loc[df.isna().any(1)]
\ \ 缺失的列 df.loc[:,df.isna().any()]
缺失值的填充
填充平均值 df.fillna(df.mean())
# 缺失值的删除
# df.dropna(axis=0,inplace=True)
指定值的替换
ser = pd.Series([0,1,2,3])
ser.replace(0,5)
0 5
1 1
2 2
3 3
dtype: int64
重复值的识别
# df.duplicated()
重复值的删除
# df.drop_duplicates(subset=None,keep='first',inplace=False,ignore_index=False)