数据清洗
数据清洗是数据分析关键的一步,直接影响之后的处理工作
数据需要修改吗?有什么需要修改的吗?数据应该怎么调整才能适用于接下来的分析和挖掘?
是一个迭代的过程,实际项目中可能需要不止一次地执行这些清洗操作
处理缺失数据:pd.fillna(),pd.dropna()
数据连接(pd.merge)
pd.merge
根据单个或多个键将不同DataFrame的行连接起来
类似数据库的连接操作
import pandas as pd
import numpy as np
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
'data2' : np.random.randint(0,10,3)})
print(df_obj1)
print(df_obj2)
运行结果:
data1 key
data1 key
0 8 b
1 8 b
2 3 a
3 5 c
4 4 a
5 9 a
6 6 b
data2 key
0 9 a
1 0 b
2 3 d
1. 默认将重叠列的列名作为“外键”进行连接
# 默认将重叠列的列名作为“外键”进行连接
print(pd.merge(df_obj1, df_obj2))
2. on显示指定“外键”
# on显示指定“外键”
print(pd.merge(df_obj1, df_obj2, on='key'))
3. left_on,左侧数据的“外键”,right_on,右侧数据的“外键”
# left_on,right_on分别指定左侧数据和右侧数据的“外键”
# 更改列名
df_obj1 = df_obj1.rename(columns={'key':'key1'})
df_obj2 = df_obj2.rename(columns={'key':'key2'})
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2'))
默认是“内连接”(inner),即结果中的键是交集how指定连接方式
4. “外连接”(outer),结果中的键是并集
# “外连接”
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer'))
5. “左连接”(left)
# 左连接
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left'))
6. “右连接”(right)
# 右连接
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right'))
完整代码:
import pandas as pd
import numpy as np
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': np.random.randint(0, 10, 7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
'data2': np.random.randint(0, 10, 3)})
print(df_obj1)
print(df_obj2)
# 默认将重叠列的列名作为“外键”进行连接
print(pd.merge(df_obj1, df_obj2))
# on显示指定“外键”
print(pd.merge(df_obj1, df_obj2, on='key'))
# left_on,right_on分别指定左侧数据和右侧数据的“外键”
# 更改列名
df_obj1 = df_obj1.rename(columns={'key': 'key1'})
df_obj2 = df_obj2.rename(columns={'key': 'key2'})
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2'))
# “外连接”
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer'))
# 左连接
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left'))
# 右连接
print(pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right'))
输出:
key data1
0 b 0
1 b 3
2 a 8
3 c 9
4 a 4
5 a 4
6 b 2
key data2
0 a 7
1 b 0
2 d 1
key data1 data2
0 b 0 0
1 b 3 0
2 b 2 0
3 a 8 7
4 a 4 7
5 a 4 7
key data1 data2
0 b 0 0
1 b 3 0
2 b 2 0
3 a 8 7
4 a 4 7
5 a 4 7
key1 data1 key2 data2
0 b 0 b 0
1 b 3 b 0
2 b 2 b 0
3 a 8 a 7
4 a 4 a 7
5 a 4 a 7
key1 data1 key2 data2
0 b 0.0 b 0.0
1 b 3.0 b 0.0
2 b 2.0 b 0.0
3 a 8.0 a 7.0
4 a 4.0 a 7.0
5 a 4.0 a 7.0
6 c 9.0 NaN NaN
7 NaN NaN d 1.0
key1 data1 key2 data2
0 b 0 b 0.0
1 b 3 b 0.0
2 a 8 a 7.0
3 c 9 NaN NaN
4 a 4 a 7.0
5 a 4 a 7.0
6 b 2 b 0.0
key1 data1 key2 data2
0 b 0.0 b 0
1 b 3.0 b 0
2 b 2.0 b 0
3 a 8.0 a 7
4 a 4.0 a 7
5 a 4.0 a 7
6 NaN NaN d 1
7. 处理重复列名
suffixes,默认为_x, _y
import pandas as pd
import numpy as np
# 处理重复列名
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
'data' : np.random.randint(0,10,3)})
print(pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right')))
输出:
key data_left data_right
0 b 0 4
1 b 2 4
2 b 6 4
3 a 2 4
4 a 1 4
5 a 7 4
8. 按索引连接
left_index=True或right_index=True
# 按索引连接
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
print(pd.merge(df_obj1, df_obj2, left_on='key', right_index=True))
数据合并
沿轴方向将多个对象合并到一起
import numpy as np
import pandas as pd
arr1 = np.random.randint(0, 10, (3, 4))
arr2 = np.random.randint(0, 10, (3, 4))
print(arr1)
print(arr2)
print(np.concatenate([arr1, arr2]))
print(np.concatenate([arr1, arr2], axis=1))
输出:
key data1 data2
0 b 2 1
1 b 1 1
6 b 9 1
2 a 9 0
4 a 1 0
5 a 5 0
2. pd.concat
注意指定轴方向,默认axis=0
join指定合并方式,默认为outer
Series合并时查看行索引有无重复
1) index 没有重复的情况
# index 没有重复的情况
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(0,5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(5,9))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(9,12))
print(ser_obj1)
print(ser_obj2)
print(ser_obj3)
print(pd.concat([ser_obj1, ser_obj2, ser_obj3]))
print(pd.concat([ser_obj1, ser_obj2, ser_obj3], axis=1))
0 5
1 9
2 1
3 9
4 8
dtype: int32
5 2
6 1
7 0
8 8
dtype: int32
9 8
10 3
11 4
dtype: int32
0 5
1 9
2 1
3 9
4 8
5 2
6 1
7 0
8 8
9 8
10 3
11 4
dtype: int32
0 1 2
0 5.0 NaN NaN
1 9.0 NaN NaN
2 1.0 NaN NaN
3 9.0 NaN NaN
4 8.0 NaN NaN
5 NaN 2.0 NaN
6 NaN 1.0 NaN
7 NaN 0.0 NaN
8 NaN 8.0 NaN
9 NaN NaN 8.0
10 NaN NaN 3.0
11 NaN NaN 4.0
2) index 有重复的情况
# index 有重复的情况
ser_obj1 = pd.Series(np.random.randint(0, 10, 5), index=range(5))
ser_obj2 = pd.Series(np.random.randint(0, 10, 4), index=range(4))
ser_obj3 = pd.Series(np.random.randint(0, 10, 3), index=range(3))
print(ser_obj1)
print(ser_obj2)
print(ser_obj3)
print(pd.concat([ser_obj1, ser_obj2, ser_obj3]))
效果:
0 5
1 5
2 5
3 7
4 2
dtype: int32
0 0
1 7
2 5
3 0
dtype: int32
0 3
1 5
2 9
dtype: int32
0 5
1 5
2 5
3 7
4 2
0 0
1 7
2 5
3 0
0 3
1 5
2 9
dtype: int32
3) DataFrame合并时同时查看行索引和列索引有无重复
import pandas as pd
import numpy as np
df_obj1 = pd.DataFrame(np.random.randint(0, 10, (3, 2)), index=['a', 'b', 'c'],
columns=['A', 'B'])
df_obj2 = pd.DataFrame(np.random.randint(0, 10, (2, 2)), index=['a', 'b'],
columns=['C', 'D'])
print(df_obj1)
print(df_obj2)
print(pd.concat([df_obj1, df_obj2],sort=False))
print(pd.concat([df_obj1, df_obj2], axis=1, join='inner'))
输出:
A B
a 8 4
b 5 2
c 5 0
C D
a 2 8
b 8 4
A B C D
a 8.0 4.0 NaN NaN
b 5.0 2.0 NaN NaN
c 5.0 0.0 NaN NaN
a NaN NaN 2.0 8.0
b NaN NaN 8.0 4.0
A B C D
a 8 4 2 8
b 5 2 8 4
数据重构
1. stack
将列索引旋转为行索引,完成层级索引
DataFrame->Series
import numpy as np
import pandas as pd
df_obj = pd.DataFrame(np.random.randint(0,10, (5,2)), columns=['data1', 'data2'])
print(df_obj)
stacked = df_obj.stack()
print(stacked)
输出:
# print(df_obj)
data1 data2
0 7 9
1 7 8
2 8 9
3 4 1
4 1 2
# print(stacked)
0 data1 7
data2 9
1 data1 7
data2 8
2 data1 8
data2 9
3 data1 4
data2 1
4 data1 1
data2 2
dtype: int64
2. unstack
将层级索引展开
Series->DataFrame
认操作内层索引,即level=-1
import numpy as np
import pandas as pd
df_obj = pd.DataFrame(np.random.randint(0,10, (5,2)), columns=['data1', 'data2'])
print(df_obj)
stacked = df_obj.stack()
# 默认操作内层索引
print(stacked.unstack())
# 通过level指定操作索引的级别
print(stacked.unstack(level=0))
效果:
data1 data2
0 9 8
1 8 5
2 1 7
3 5 8
4 9 9
data1 data2
0 9 8
1 8 5
2 1 7
3 5 8
4 9 9
0 1 2 3 4
data1 9 8 1 5 9
data2 8 5 7 8 9
数据转换
一、 处理重复数据
1 duplicated() 返回布尔型Series表示每行是否为重复行
import numpy as np
import pandas as pd
df_obj = pd.DataFrame({'data1' : ['a'] * 4 + ['b'] * 4,
'data2' : np.random.randint(0, 4, 8)})
print(df_obj)
print(df_obj.duplicated())
输出:
# print(df_obj)
data1 data2
0 a 3
1 a 2
2 a 3
3 a 3
4 b 1
5 b 0
6 b 3
7 b 0
# print(df_obj.duplicated())
0 False
1 False
2 True
3 True
4 False
5 False
6 False
7 True
dtype: bool
2 drop_duplicates()
过滤重复行
默认判断全部列
可指定按某些列判断
print(df_obj.drop_duplicates())
print(df_obj.drop_duplicates('data2'))
输出:
# print(df_obj.drop_duplicates())
data1 data2
0 a 3
1 a 2
4 b 1
5 b 0
6 b 3
# print(df_obj.drop_duplicates('data2'))
data1 data2
0 a 3
1 a 2
4 b 1
5 b 0
3. 根据map传入的函数对每行或每列进行转换 Series根据map传入的函数对每行或每列进行转换 示例代码:
ser_obj = pd.Series(np.random.randint(0,10,10))
print(ser_obj)
print(ser_obj.map(lambda x : x ** 2))
输出:
# print(ser_obj)
0 1
1 4
2 8
3 6
4 8
5 6
6 6
7 4
8 7
9 3
dtype: int64
# print(ser_obj.map(lambda x : x ** 2))
0 1
1 16
2 64
3 36
4 64
5 36
6 36
7 16
8 49
9 9
dtype: int64
数据替换
replace根据值的内容进行替换
# 单个值替换单个值
print(ser_obj.replace(1, -100))
# 多个值替换一个值
print(ser_obj.replace([6, 8], -100))
# 多个值替换多个值
print(ser_obj.replace([4, 7], [-100, -200]))
输出:
# print(ser_obj.replace(1, -100))
0 -100
1 4
2 8
3 6
4 8
5 6
6 6
7 4
8 7
9 3
dtype: int64
# print(ser_obj.replace([6, 8], -100))
0 1
1 4
2 -100
3 -100
4 -100
5 -100
6 -100
7 4
8 7
9 3
dtype: int64
# print(ser_obj.replace([4, 7], [-100, -200]))
0 1
1 -100
2 8
3 6
4 8
5 6
6 6
7 -100
8 -200
9 3
dtype: int64
完
码上加油站
一起来加油
长按扫码关注
![695a464f76993d86fe59354f3db4b560.png](https://img-blog.csdnimg.cn/img_convert/695a464f76993d86fe59354f3db4b560.png)