Python Pandas
1.Pandas基本介绍
import pandas as pd
import numpy as np
# np.nan marks a missing entry; the recorded output below shows the
# resulting Series has dtype float64.
raw_values = [1, 3, 5, np.nan, 44, 1]
s = pd.Series(raw_values)
print(s)
# 0     1.0
# 1     3.0
# 2     5.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64

# Six consecutive daily timestamps starting at 2022-03-01.
dates = pd.date_range(start="20220301", periods=6)
print(dates)
# DatetimeIndex(['2022-03-01', '2022-03-02', '2022-03-03', '2022-03-04',
#                '2022-03-05', '2022-03-06'],
#               dtype='datetime64[ns]', freq='D')

# index:   labels for the rows
# columns: labels for the columns
random_values = np.random.randn(6, 4)
df = pd.DataFrame(random_values, index=dates, columns=list("abcd"))
print(df)
# Example output (the numbers are random and differ on every run):
#                    a         b         c         d
# 2022-03-01  0.194540 -0.553569 -0.966694 -0.759754
# 2022-03-02  0.381299 -0.916151 -1.332240  0.424302
# 2022-03-03 -2.277082  0.756127 -1.934086  0.772821
# 2022-03-04 -2.624071 -0.925296 -0.457871 -0.163357
# 2022-03-05  0.780169  0.883999  0.773308  2.210360
# 2022-03-06  0.451705 -2.097732 -0.286711  0.216228

# When no row/column labels are given they default to 0, 1, 2, 3, ...
grid = np.arange(12).reshape(3, 4)
df_1 = pd.DataFrame(grid)
print(df_1)
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# A frame built from a dict: each key becomes a column, and scalar values
# are repeated to match the length of the longer entries.
df_2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20220311'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
print(df_2)
#      A          B    C  D      E    F
# 0  1.0 2022-03-11  1.0  3   test  foo
# 1  1.0 2022-03-11  1.0  3  train  foo
# 2  1.0 2022-03-11  1.0  3   test  foo
# 3  1.0 2022-03-11  1.0  3  train  foo

# describe() computes summary statistics. In the recorded output only the
# numeric columns A, C and D appear; the date and string columns are left out.
summary = df_2.describe()
print(summary)
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0

# .T transposes the frame: rows become columns and vice versa.
transposed = df_2.T
print(transposed)
#                      0  ...                    3
# A                  1.0  ...                  1.0
# B  2022-03-11 00:00:00  ...  2022-03-11 00:00:00
# C                  1.0  ...                  1.0
# D                    3  ...                    3
# E                 test  ...                train
# F                  foo  ...                  foo
#
# [6 rows x 4 columns]

# sort_index(axis, ascending): sort by labels.
# axis=1 sorts the column labels, axis=0 sorts the row labels;
# ascending=True is ascending order, ascending=False is descending order.
columns_descending = df_2.sort_index(axis=1, ascending=False)
print(columns_descending)
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2022-03-11  1.0
# 1  foo  train  3  1.0 2022-03-11  1.0
# 2  foo   test  3  1.0 2022-03-11  1.0
# 3  foo  train  3  1.0 2022-03-11  1.0
rows_descending = df_2.sort_index(axis=0, ascending=False)
print(rows_descending)
#      A          B    C  D      E    F
# 3  1.0 2022-03-11  1.0  3  train  foo
# 2  1.0 2022-03-11  1.0  3   test  foo
# 1  1.0 2022-03-11  1.0  3  train  foo
# 0  1.0 2022-03-11  1.0  3   test  foo

# sort_values(by=...): sort by the values of a column (column E here).
sorted_by_e = df_2.sort_values(by='E')
print(sorted_by_e)
#      A          B    C  D      E    F
# 0  1.0 2022-03-11  1.0  3   test  foo
# 2  1.0 2022-03-11  1.0  3   test  foo
# 1  1.0 2022-03-11  1.0  3  train  foo
# 3  1.0 2022-03-11  1.0  3  train  foo
2.Pandas选择数据
import pandas as pd
import numpy as np
# Build a 6x4 integer frame (values 0..23) indexed by six consecutive days.
dates = pd.date_range('20220301', periods=6)
values = np.arange(24).reshape((6, 4))
df = pd.DataFrame(values, index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
#              A   B   C   D
# 2022-03-01   0   1   2   3
# 2022-03-02   4   5   6   7
# 2022-03-03   8   9  10  11
# 2022-03-04  12  13  14  15
# 2022-03-05  16  17  18  19
# 2022-03-06  20  21  22  23

# Select a single column by key ...
column_by_key = df['A']
print(column_by_key)
# 2022-03-01     0
# 2022-03-02     4
# 2022-03-03     8
# 2022-03-04    12
# 2022-03-05    16
# 2022-03-06    20
# Freq: D, Name: A, dtype: int32

# ... or by attribute access; the recorded output is the same.
column_by_attr = df.A
print(column_by_attr)
# 2022-03-01     0
# 2022-03-02     4
# 2022-03-03     8
# 2022-03-04    12
# 2022-03-05    16
# 2022-03-06    20
# Freq: D, Name: A, dtype: int32

# Slicing, variant 1: positional row slice (end position excluded).
first_three_rows = df[0:3]
print(first_three_rows)
#             A  B   C   D
# 2022-03-01  0  1   2   3
# 2022-03-02  4  5   6   7
# 2022-03-03  8  9  10  11

# Slicing, variant 2: row slice by date labels (both ends included).
rows_by_date = df['20220302':'20220304']
print(rows_by_date)
#              A   B   C   D
# 2022-03-02   4   5   6   7
# 2022-03-03   8   9  10  11
# 2022-03-04  12  13  14  15

# Selection by label with .loc.
# Both row labels and column labels can be used.
one_day = df.loc['20220301']
print(one_day)
# A    0
# B    1
# C    2
# D    3
# Name: 2022-03-01 00:00:00, dtype: int32

# All rows, columns A and B only.
all_rows_ab = df.loc[:, ['A', 'B']]
print(all_rows_ab)
#              A   B
# 2022-03-01   0   1
# 2022-03-02   4   5
# 2022-03-03   8   9
# 2022-03-04  12  13
# 2022-03-05  16  17
# 2022-03-06  20  21

# Label slice for the rows (from 2022-03-02 onward) combined with a column list.
later_rows_ab = df.loc['20220302':, ['A', 'B']]
print(later_rows_ab)
#              A   B
# 2022-03-02   4   5
# 2022-03-03   8   9
# 2022-03-04  12  13
# 2022-03-05  16  17
# 2022-03-06  20  21

# One specific row label and a subset of the column labels.
single_row_ab = df.loc['20220302', ['A', 'B']]
print(single_row_ab)
# A    4
# B    5
# Name: 2022-03-02 00:00:00, dtype: int32

# Selection by integer position with .iloc (positions are 0-based).
# Row at position 3, i.e. the fourth row.
row_at_3 = df.iloc[3]
print(row_at_3)
# A    12
# B    13
# C    14
# D    15
# Name: 2022-03-04 00:00:00, dtype: int32

# Single cell: row position 3, column position 1.
cell = df.iloc[3, 1]
print(cell)
# 13

# Positional slices: row positions 3-4 and column positions 1-2
# (the end of each slice is excluded).
sub_block = df.iloc[3:5, 1:3]
print(sub_block)
#              B   C
# 2022-03-04  13  14
# 2022-03-05  17  18

# Row positions do not have to be contiguous: pass a list of positions.
picked_rows = df.iloc[[1, 3, 5], 1:3]
print(picked_rows)
#              B   C
# 2022-03-02   5   6
# 2022-03-04  13  14
# 2022-03-06  21  22

# Boolean mask: keep only the rows whose A value is greater than 8.
a_above_8 = df.A > 8
print(df[a_above_8])
#              A   B   C   D
# 2022-03-04  12  13  14  15
# 2022-03-05  16  17  18  19
# 2022-03-06  20  21  22  23
3.pandas设置值
import pandas as pd
import numpy as np
# Build a 6x4 integer frame (values 0..23) indexed by six consecutive days.
dates = pd.date_range('20220301', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
#              A   B   C   D
# 2022-03-01   0   1   2   3
# 2022-03-02   4   5   6   7
# 2022-03-03   8   9  10  11
# 2022-03-04  12  13  14  15
# 2022-03-05  16  17  18  19
# 2022-03-06  20  21  22  23

# Overwrite one cell by integer position. Positions are 0-based, so
# (2, 2) is the third row and the third column.
df.iloc[2, 2] = 1111
print(df)
#              A   B     C   D
# 2022-03-01   0   1     2   3
# 2022-03-02   4   5     6   7
# 2022-03-03   8   9  1111  11
# 2022-03-04  12  13    14  15
# 2022-03-05  16  17    18  19
# 2022-03-06  20  21    22  23

# Overwrite one cell by row label and column label.
df.loc['20220301', 'B'] = 2222
print(df)
#              A     B     C   D
# 2022-03-01   0  2222     2   3
# 2022-03-02   4     5     6   7
# 2022-03-03   8     9  1111  11
# 2022-03-04  12    13    14  15
# 2022-03-05  16    17    18  19
# 2022-03-06  20    21    22  23

# Set every value in column A that is greater than 4 to 0.
# A single .loc[mask, column] assignment is used instead of the chained form
# df.A[mask] = 0: chained assignment may write to a temporary object and
# leave df unchanged (and triggers pandas' chained-assignment warnings).
df.loc[df.A > 4, 'A'] = 0
print(df)
#             A     B     C   D
# 2022-03-01  0  2222     2   3
# 2022-03-02  4     5     6   7
# 2022-03-03  0     9  1111  11
# 2022-03-04  0    13    14  15
# 2022-03-05  0    17    18  19
# 2022-03-06  0    21    22  23

# A boolean mask on the whole frame assigns to every column of the matching
# rows. After the previous step no value in A exceeds 4, so nothing changes.
df[df.A > 4] = 0
print(df)
#             A     B     C   D
# 2022-03-01  0  2222     2   3
# 2022-03-02  4     5     6   7
# 2022-03-03  0     9  1111  11
# 2022-03-04  0    13    14  15
# 2022-03-05  0    17    18  19
# 2022-03-06  0    21    22  23

# Set column B to 0 where A is greater than 4 (again via a single .loc call
# rather than chained indexing). The mask is still all False here, so the
# frame is unchanged.
df.loc[df.A > 4, 'B'] = 0
print(df)
#             A     B     C   D
# 2022-03-01  0  2222     2   3
# 2022-03-02  4     5     6   7
# 2022-03-03  0     9  1111  11
# 2022-03-04  0    13    14  15
# 2022-03-05  0    17    18  19
# 2022-03-06  0    21    22  23

# Add a new column F filled with NaN.
df['F'] = np.nan
print(df)
#             A     B     C   D   F
# 2022-03-01  0  2222     2   3 NaN
# 2022-03-02  4     5     6   7 NaN
# 2022-03-03  0     9  1111  11 NaN
# 2022-03-04  0    13    14  15 NaN
# 2022-03-05  0    17    18  19 NaN
# 2022-03-06  0    21    22  23 NaN

# Add a new column E from a Series; the Series is given the same date index
# as the frame so its values line up with the rows.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20220301', periods=6))
print(df)
#             A     B     C   D   F  E
# 2022-03-01  0  2222     2   3 NaN  1
# 2022-03-02  4     5     6   7 NaN  2
# 2022-03-03  0     9  1111  11 NaN  3
# 2022-03-04  0    13    14  15 NaN  4
# 2022-03-05  0    17    18  19 NaN  5
# 2022-03-06  0    21    22  23 NaN  6
4.pandas处理丢失数据
import pandas as pd
import numpy as np
# Build a 6x4 integer frame (values 0..23) indexed by six consecutive days.
dates = pd.date_range('20220301', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
# NaN is a float value. Cast the two columns that will receive it to float
# explicitly instead of relying on pandas silently upcasting an integer
# column on assignment (recent pandas versions deprecate that upcast).
df = df.astype({'B': 'float64', 'C': 'float64'})
# Punch two holes into the data: row 0 / column B and row 1 / column C.
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)
#              A     B     C   D
# 2022-03-01   0   NaN   2.0   3
# 2022-03-02   4   5.0   NaN   7
# 2022-03-03   8   9.0  10.0  11
# 2022-03-04  12  13.0  14.0  15
# 2022-03-05  16  17.0  18.0  19
# 2022-03-06  20  21.0  22.0  23

# Element-wise test: True where the value is NaN.
print(df.isnull())
#                 A      B      C      D
# 2022-03-01  False   True  False  False
# 2022-03-02  False  False   True  False
# 2022-03-03  False  False  False  False
# 2022-03-04  False  False  False  False
# 2022-03-05  False  False  False  False
# 2022-03-06  False  False  False  False

# Is there at least one NaN anywhere in the frame?
# .values.any() reduces the boolean mask to a single bool; comparing the
# result with `== True` is redundant.
print(df.isnull().values.any())
# True

# dropna(axis=..., how=...):
#   how='any': drop the row/column as soon as it contains a single NaN
#   how='all': drop it only when every value in it is NaN
#   axis=0 drops rows, axis=1 drops columns
print(df.dropna(axis=0, how='any'))
#              A     B     C   D
# 2022-03-03   8   9.0  10.0  11
# 2022-03-04  12  13.0  14.0  15
# 2022-03-05  16  17.0  18.0  19
# 2022-03-06  20  21.0  22.0  23

# Replace every NaN with 0 (returns a new frame; df itself is not modified).
print(df.fillna(value=0))
#              A     B     C   D
# 2022-03-01   0   0.0   2.0   3
# 2022-03-02   4   5.0   0.0   7
# 2022-03-03   8   9.0  10.0  11
# 2022-03-04  12  13.0  14.0  15
# 2022-03-05  16  17.0  18.0  19
# 2022-03-06  20  21.0  22.0  23
5.pandas导入导出数据
import pandas as pd
# Read a CSV file into a DataFrame ("xxx.csv" is a placeholder file name).
csv_path = "xxx.csv"
data = pd.read_csv(csv_path)
# Store the loaded data in a pickle file.
pickle_path = "xxx.pickle"
data.to_pickle(pickle_path)
6.pandas合并
import pandas as pd
import numpy as np
# concatenating
# Three 3x4 float frames with identical columns, filled with 0, 1 and 2.
column_names = ['a', 'b', 'c', 'd']
df_1 = pd.DataFrame(np.zeros((3, 4)), columns=column_names)
df_2 = pd.DataFrame(np.ones((3, 4)), columns=column_names)
df_3 = pd.DataFrame(np.full((3, 4), 2.0), columns=column_names)
print(df_1)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
print(df_2)
#      a    b    c    d
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0
print(df_3)
#      a    b    c    d
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0

# axis=0 stacks the frames vertically; axis=1 places them side by side.
frames = [df_1, df_2, df_3]
res = pd.concat(frames, axis=0)
print(res)
# Each frame keeps its own row labels, so 0, 1, 2 repeat:
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 0  1.0  1.0  1.0  1.0
# 1  1.0  1.0  1.0  1.0
# 2  1.0  1.0  1.0  1.0
# 0  2.0  2.0  2.0  2.0
# 1  2.0  2.0  2.0  2.0
# 2  2.0  2.0  2.0  2.0

# ignore_index=True discards the original row labels and renumbers from 0.
res = pd.concat(frames, axis=0, ignore_index=True)
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0
import pandas as pd
import numpy as np
# join: 'inner' or 'outer'
# Two frames whose columns and row labels only partly overlap.
zeros_block = np.ones((3, 4)) * 0
ones_block = np.ones((3, 4)) * 1
df1 = pd.DataFrame(zeros_block, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(ones_block, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(df1)
#      a    b    c    d
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  0.0  0.0  0.0
print(df2)
#      b    c    d    e
# 2  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0

# Without a join argument every column from either frame is kept and the
# cells a frame does not provide are filled with NaN.
pair = [df1, df2]
res = pd.concat(pair)
print(res)
#      a    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  0.0  0.0  0.0  0.0  NaN
# 2  NaN  1.0  1.0  1.0  1.0
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0

# join='inner': keep only the columns present in both frames.
res = pd.concat(pair, join='inner', ignore_index=True)
print(res)
#      b    c    d
# 0  0.0  0.0  0.0
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0
# 5  1.0  1.0  1.0
import numpy as np
import pandas as pd
# Appending rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so the examples use pd.concat, which produces the same results.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

# Append one frame below another and renumber the rows.
res = pd.concat([df1, df2], ignore_index=True)
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0

# Append several frames at once.
res = pd.concat([df1, df2, df3], ignore_index=True)
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0
# 5  1.0  1.0  1.0  1.0
# 6  2.0  2.0  2.0  2.0
# 7  2.0  2.0  2.0  2.0
# 8  2.0  2.0  2.0  2.0

# Append a single row given as a Series. to_frame().T turns the Series into
# a one-row frame whose columns are the Series labels, so its values line up
# with columns a-d of df1.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(res)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0
import pandas as pd
# Merge two DataFrames on a key column (comparable to a database join).
# Simple example: both frames share one column named 'key'.
left_columns = {
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
}
right_columns = {
    'key': ['K0', 'K1', 'K2', 'K3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
}
left = pd.DataFrame(left_columns)
right = pd.DataFrame(right_columns)
print(left)
#   key   A   B
# 0  K0  A0  B0
# 1  K1  A1  B1
# 2  K2  A2  B2
# 3  K3  A3  B3
print(right)
#   key   C   D
# 0  K0  C0  D0
# 1  K1  C1  D1
# 2  K2  C2  D2
# 3  K3  C3  D3

# Rows with equal 'key' values are combined into one row.
res = pd.merge(left=left, right=right, on='key')
print(res)
#   key   A   B   C   D
# 0  K0  A0  B0  C0  D0
# 1  K1  A1  B1  C1  D1
# 2  K2  A2  B2  C2  D2
# 3  K3  A3  B3  C3  D3
import pandas as pd
# Merge two DataFrames on key columns (comparable to a database join).
# This example matches rows on two key columns at once.
left_columns = {
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
}
right_columns = {
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
}
left = pd.DataFrame(left_columns)
right = pd.DataFrame(right_columns)
print(left)
#   key1 key2   A   B
# 0   K0   K0  A0  B0
# 1   K0   K1  A1  B1
# 2   K1   K0  A2  B2
# 3   K2   K1  A3  B3
print(right)
#   key1 key2   C   D
# 0   K0   K0  C0  D0
# 1   K1   K0  C1  D1
# 2   K1   K0  C2  D2
# 3   K2   K0  C3  D3

# Only rows whose (key1, key2) pair occurs in both frames are kept.
join_keys = ['key1', 'key2']
res = pd.merge(left, right, on=join_keys)
print(res)
#   key1 key2   A   B   C   D
# 0   K0   K0  A0  B0  C0  D0
# 1   K1   K0  A2  B2  C1  D1
# 2   K1   K0  A2  B2  C2  D2

# how='inner' is the default merge method: only keys found in both frames
# are considered, so the result equals the one above.
res = pd.merge(left, right, on=join_keys, how='inner')
print(res)
#   key1 key2   A   B   C   D
# 0   K0   K0  A0  B0  C0  D0
# 1   K1   K0  A2  B2  C1  D1
# 2   K1   K0  A2  B2  C2  D2

# how='outer': keys from both frames are considered; cells with no matching
# data are filled with NaN.
res = pd.merge(left, right, on=join_keys, how='outer')
print(res)
# Recorded output (NOTE: the row order of an outer merge may differ between
# pandas versions -- verify against the installed version):
#   key1 key2    A    B    C    D
# 0   K0   K0   A0   B0   C0   D0
# 1   K0   K1   A1   B1  NaN  NaN
# 2   K1   K0   A2   B2   C1   D1
# 3   K1   K0   A2   B2   C2   D2
# 4   K2   K1   A3   B3  NaN  NaN
# 5   K2   K0  NaN  NaN   C3   D3
import pandas as pd
# Merge two DataFrames on a key column (comparable to a database join).
# This example shows the `indicator` option.
left_data = {'col1': [0, 1], 'col_left': ['a', 'b']}
right_data = {'col1': [1, 2, 2], 'col_right': [2, 2, 2]}
df1 = pd.DataFrame(left_data)
df2 = pd.DataFrame(right_data)
print(df1)
#    col1 col_left
# 0     0        a
# 1     1        b
print(df2)
#    col1  col_right
# 0     1          2
# 1     2          2
# 2     2          2

# indicator=True adds a column telling where each row came from:
# left_only, right_only or both.
res = pd.merge(df1, df2, how='outer', on='col1', indicator=True)
print(res)
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only

# Passing a string names the indicator column; the default name is _merge.
res = pd.merge(df1, df2, how='outer', on='col1', indicator='indicator_column')
print(res)
#    col1 col_left  col_right indicator_column
# 0     0        a        NaN        left_only
# 1     1        b        2.0             both
# 2     2      NaN        2.0       right_only
# 3     2      NaN        2.0       right_only
import pandas as pd
# Merge two DataFrames (comparable to a database join).
# This example matches rows on the row index instead of a key column.
left = pd.DataFrame(
    {
        'A': ["A0", "A1", "A2"],
        'B': ['B0', 'B1', 'B2'],
    },
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {
        'C': ["C0", "C2", "C3"],
        'D': ['D0', 'D2', 'D3'],
    },
    index=['K0', 'K2', 'K3'],
)
print(left)
#      A   B
# K0  A0  B0
# K1  A1  B1
# K2  A2  B2
print(right)
#      C   D
# K0  C0  D0
# K2  C2  D2
# K3  C3  D3

# left_index / right_index: use the row labels of each frame as the join key.
# how='outer' keeps every label found in either frame.
res = pd.merge(left, right, how='outer', left_index=True, right_index=True)
print(res)
#       A    B    C    D
# K0   A0   B0   C0   D0
# K1   A1   B1  NaN  NaN
# K2   A2   B2   C2   D2
# K3  NaN  NaN   C3   D3

# how='inner' keeps only the labels present in both frames.
res = pd.merge(left, right, how='inner', left_index=True, right_index=True)
print(res)
#      A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2
import pandas as pd
# Merge two DataFrames on a key column (comparable to a database join).
# This example handles a non-key column name ('age') that exists in both frames.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
print(boys)
#     k  age
# 0  K0    1
# 1  K1    2
# 2  K2    3
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(girls)
#     k  age
# 0  K0    4
# 1  K0    5
# 2  K3    6

# suffixes are appended to the overlapping column name so that the two
# 'age' columns can be told apart in the result.
age_suffixes = ('_boy', '_girl')
res = pd.merge(boys, girls, on='k', how='inner', suffixes=age_suffixes)
print(res)
#     k  age_boy  age_girl
# 0  K0        1         4
# 1  K0        1         5
res = pd.merge(boys, girls, on='k', how='outer', suffixes=age_suffixes)
print(res)
#     k  age_boy  age_girl
# 0  K0      1.0       4.0
# 1  K0      1.0       5.0
# 2  K1      2.0       NaN
# 3  K2      3.0       NaN
# 4  K3      NaN       6.0
7.pandas plot图表
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Plotting data.
# Series: the running (cumulative) sum of 1000 random draws.
# Note: `data` is rebound to a DataFrame below before anything is plotted,
# so this Series itself never reaches the plot.
sample_index = np.arange(1000)
data = pd.Series(np.random.randn(1000), index=sample_index)
data = data.cumsum()

# DataFrame: four columns A-D of random draws, accumulated column by column.
random_block = np.random.randn(1000, 4)
data = pd.DataFrame(random_block, index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
preview = data.head()
print(preview)
# Example output (the numbers are random and differ on every run):
#           A         B         C         D
# 0  0.076246 -0.071643 -1.384471  1.114611
# 1  2.754119  1.074283 -2.139756  0.259078
# 2  2.364150  0.617473 -1.771385  1.417175
# 3  2.171843  0.738065 -1.409256  1.942316
# 4  1.688500  2.546109 -0.901585  1.028159

# Plot the frame with the default plot() call, then display the figure.
data.plot()
plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Plotting data.
# Series: the running (cumulative) sum of 1000 random draws.
# Note: `data` is rebound to a DataFrame below before anything is plotted,
# so this Series itself never reaches the plot.
sample_index = np.arange(1000)
data = pd.Series(np.random.randn(1000), index=sample_index)
data = data.cumsum()

# DataFrame: four columns A-D of random draws, accumulated column by column.
random_block = np.random.randn(1000, 4)
data = pd.DataFrame(random_block, index=np.arange(1000), columns=list("ABCD"))
data = data.cumsum()
preview = data.head()
print(preview)
# Example output (the numbers are random and differ on every run):
#           A         B         C         D
# 0  0.076246 -0.071643 -1.384471  1.114611
# 1  2.754119  1.074283 -2.139756  0.259078
# 2  2.364150  0.617473 -1.771385  1.417175
# 3  2.171843  0.738065 -1.409256  1.942316
# 4  1.688500  2.546109 -0.901585  1.028159

# Available plot methods:
# 'bar', 'hist', 'box', 'kde', 'area', 'scatter', 'hexbin', 'pie'
# Draw A-vs-B first and keep the returned axes, then pass it as ax= so that
# A-vs-C is drawn onto the same axes.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class 1')
data.plot.scatter(x='A', y='C', color='DarkGreen', label='Class 2', ax=ax)
plt.show()