文章目录
Pandas处理丢失数据
使用dropna()除去NaN的行或列
dates = pd.date_range('20180901',periods=6)
df = pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A', 'B', 'C', 'D'])
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)
print("除去异常值:")
# 使用pd.dropna()去掉有NaN的行或列
df1 = df.dropna(axis=1, how='any') # ‘any’:只要存在NaN就drop掉;’all‘:必须全部是NaN才drop掉#0:对行进行操作;1:对列进行操作
print(df1)
A B C D
2018-09-01 0 NaN 2.0 3
2018-09-02 4 5.0 NaN 7
2018-09-03 8 9.0 10.0 11
2018-09-04 12 13.0 14.0 15
2018-09-05 16 17.0 18.0 19
2018-09-06 20 21.0 22.0 23
除去异常值:
A D
2018-09-01 0 3
2018-09-02 4 7
2018-09-03 8 11
2018-09-04 12 15
2018-09-05 16 19
2018-09-06 20 23
使用fillna指定值代替NaN
df2 = df.fillna(value=0) # 使用0代替nan
print(df2)
A B C D
2018-09-01 0 0.0 2.0 3
2018-09-02 4 5.0 0.0 7
2018-09-03 8 9.0 10.0 11
2018-09-04 12 13.0 14.0 15
2018-09-05 16 17.0 18.0 19
2018-09-06 20 21.0 22.0 23
##使用isnull判断是否是NaN
A B C D
2018-09-01 False True False False
2018-09-02 False False True False
2018-09-03 False False False False
2018-09-04 False False False False
2018-09-05 False False False False
2018-09-06 False False False False
Pandas导入导出
pd.read_csv读取csv文件
data = pd.read_csv('student.csv') # 读入csv
print(data)
print(data.index)
print(data.columns)
print(data.values)
print(data.describe())
姓名 性别 分数
0 李召 男 96
1 李明 男 89
2 王红 女 95
RangeIndex(start=0, stop=3, step=1)
Index(['姓名 性别 分数'], dtype='object')
[['李召 男 96']
['李明 男 89']
['王红 女 95']]
姓名 性别 分数
count 3
unique 3
top 王红 女 95
freq 1
将资料存取成pickle
data.to_pickle('student.pickle') # 将资料存取成pickle
Pandas合并concat
pandas
处理多组数据需要用到数据的合并处理,concat
是一种基本的合并方式,concat
中有很多参数可以调整。
axis方向合并
# 定义资料集
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
# concat纵向
print(df1)
print(df2)
print(df3)
# concat纵向合并
res = pd.concat([df1, df2, df3], axis=0)
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
ignore_index
# 纵向合并,并重置index
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
join(合并方式)
# 定义资料集
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
# 纵向“外”合并df1和df2,扩展
res = pd.concat([df1, df2], axis=0, join='outer', sort=False)
print(res)
# 纵向“内”合并df1和df2,找交集
res = pd.concat([df1, df2], axis=0, join='inner')
print(res)
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
b c d
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 0.0 0.0 0.0
2 1.0 1.0 1.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
join_axes(依照axes合并)
# 依照'df1.index'进行横向合并
res = pd.concat([df1, df2], axis=1, join_axes=[df1.index])
print(res)
# 移除join_axes
res = pd.concat([df1, df2], axis=1)
print(res)
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
append(添加数据)
# 定义资料集
df1 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4))*1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4))*2, columns=['a', 'b', 'c', 'd'])
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
# 将df2合并到df1的下面,以及重置index,并打印出结果
res = df1.append(df2, ignore_index=True)
print(res)
# 合并多个df,讲df2与df3合并至df1的下面,以及重置index
res = df1.append([df2, df3], ignore_index=True)
print(res)
# 合并Series,将s1合并至df1,以及重置index,
res = df1.append(s1, ignore_index=True)
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
Pandas合并merge
pandas
中的merge
和concat
类似,但主要是用于两组有key column的数据,统一索引的数据,通常也被用在Database的处理中
依据一组key合并
left = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']
})
right = pd.DataFrame({
'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']
})
print(left)
print(right)
# 依据key_column合并
res = pd.merge(left, right, on='key')
print(res)
key A B
0 K0 A0 B0
1 K1 A1 B1
2 K2 A2 B2
3 K3 A3 B3
key C D
0 K0 C0 D0
1 K1 C1 D1
2 K2 C2 D2
3 K3 C3 D3
key A B C D
0 K0 A0 B0 C0 D0
1 K1 A1 B1 C1 D1
2 K2 A2 B2 C2 D2
3 K3 A3 B3 C3 D3
依据两组key合并
left = pd.DataFrame({
'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']
})
right = pd.DataFrame({
'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']
})
print(left)
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K2 K1 A3 B3
print(right)
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
# 依据key1与key2 column进行合并
res = pd.merge(left, right, on=['key1', 'key2'], how='inner')
print(res)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
res = pd.merge(left, right, on=['key1', 'key2'], how='outer')
print(res)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
5 K2 K0 NaN NaN C3 D3
res = pd.merge(left, right, on=['key1', 'key2'], how='left')
print(res)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K0 K1 A1 B1 NaN NaN
2 K1 K0 A2 B2 C1 D1
3 K1 K0 A2 B2 C2 D2
4 K2 K1 A3 B3 NaN NaN
res = pd.merge(left, right, on=['key1', 'key2'], how='right')
print(res)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
3 K2 K0 NaN NaN C3 D3
Indicator
Indicator=True
会将合并的记录放在新的一列
df1 = pd.DataFrame({
'col1': [0, 1], 'col_left': ['a', 'b']
})
df2 = pd.DataFrame({
'col1': [1, 2, 2], 'col_right': [2, 2, 2]
})
print(df1)
print(df2)
col1 col_left
0 0 a
1 1 b
col1 col_right
0 1 2
1 2 2
2 2 2
# 依据col1进行合并,并启用indicator=True
res = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
print(res)
col1 col_left col_right _merge
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
# 自定义indicator column的名称
res = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res)
col1 col_left col_right indicator_column
0 0 a NaN left_only
1 1 b 2.0 both
2 2 NaN 2.0 right_only
3 2 NaN 2.0 right_only
依据index合并
# 定义资料集
left = pd.DataFrame({
'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({
'C': ['C0', 'C1', 'C2'],
'D': ['D0', 'D1', 'D2']},
index=['K0', 'K2', 'K3']
)
print(left)
print(right)
A B
K0 A0 B0
K1 A1 B1
K2 A2 B2
C D
K0 C0 D0
K2 C1 D1
K3 C2 D2
# 依据左右资料集的index进行合并,how='outer'
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C1 D1
K3 NaN NaN C2 D2
# 依据左右资料集的index进行合并,how='inner'
res = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res)
A B C D
K0 A0 B0 C0 D0
K2 A2 B2 C1 D1
解决overlapping
# 解决overlapping的问题
boys = pd.DataFrame({'K': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'K': ['K0', 'K1', 'K2'], 'age': [4, 5, 6]})
print(boys)
print(girls)
K age
0 K0 1
1 K1 2
2 K2 3
K age
0 K0 4
1 K1 5
2 K2 6
# 使用suffixes解决overlapping问题
res = pd.merge(boys, girls, on='K', suffixes=['_boy', '_girl'], how='inner')
print(res)
K age_boy age_girl
0 K0 1 4
1 K1 2 5
2 K2 3 6
Pandas plot绘图
创建一个Series
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 随机生成1000个数据
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# 累加数据
data1 = data.cumsum()
# pandas数据可以直接观察可视化形式
data1.plot()
plt.show()
Dataframe可视化
data = pd.DataFrame(
np.random.randn(1000, 4),
index=np.arange(1000),
columns=list("ABCD")
)
data1 = data.cumsum()
data1.plot()
plt.show()