import pandas as pd
# Two small frames sharing the key column 'col1' (built column by column).
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
print(df1)
#    col1 col_left
# 0     0        a
# 1     1        b
print(df2)
#    col1  col_right
# 0     1          2
# 1     2          2
# 2     2          2

# Outer merge on 'col1'. indicator=True appends a '_merge' column that
# records whether each row came from the left frame, the right frame, or both.
res = df1.merge(df2, how='outer', on='col1', indicator=True)
print(res)
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only

# Same merge, but the indicator column gets a custom name.
res = df1.merge(df2, how='outer', on='col1', indicator='indicator_column')
print(res)
#    col1 col_left  col_right indicator_column
# 0     0        a        NaN        left_only
# 1     1        b        2.0             both
# 2     2      NaN        2.0       right_only
# 3     2      NaN        2.0       right_only
# Two frames whose row labels (rather than a column) serve as the join keys.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
print(left)
#      A   B
# K0  A0  B0
# K1  A1  B1
# K2  A2  B2
print(right)
#      C   D
# K0  C0  D0
# K2  C2  D2
# K3  C3  D3

# Merge on the index of both sides with how='outer' and print the result.
res = left.merge(right, how='outer', left_index=True, right_index=True)
print(res)

# Merge on the index of both sides with how='inner': only the labels present
# in both frames survive.
res = left.merge(right, how='inner', left_index=True, right_index=True)
print(res)
#      A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2
boys = pd.DataFrame({'K': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'K': ['K0', 'K1', 'K3'], 'age': [4, 5, 6]})

# Both frames carry an 'age' column; suffixes resolves the overlapping name
# by tagging the left and right copies.
res = boys.merge(girls, how='inner', on='K', suffixes=('_boy', '_girl'))
print(res)
#     K  age_boy  age_girl
# 0  K0        1         4
# 1  K1        2         5
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Data visualization with matplotlib --> plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Generate 1000 random values, indexed 0..999.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# Accumulate the values so the plot is easier to read. cumsum() returns a new
# Series instead of modifying in place, so the result must be assigned back.
data = data.cumsum()
# pandas objects can be plotted directly.
data.plot()
plt.show()
# Build a 1000x4 DataFrame of random values and accumulate each column.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list('ABCD'),
)
# cumsum() returns a new DataFrame instead of modifying in place, so the
# result must be assigned back for the plots below to show the running totals.
data = data.cumsum()
data.plot()
plt.show()

# Besides plot and scatter there are also bar, hist, box, kde, area, hexbin.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
# Draw on the same axes (ax), picking a different data column for y.
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class2', ax=ax)
plt.show()
# Two small frames sharing the key column 'col1' (built column by column).
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
print(df1)
#    col1 col_left
# 0     0        a
# 1     1        b
print(df2)
#    col1  col_right
# 0     1          2
# 1     2          2
# 2     2          2

# Outer merge on 'col1'. indicator=True appends a '_merge' column that
# records whether each row came from the left frame, the right frame, or both.
res = df1.merge(df2, how='outer', on='col1', indicator=True)
print(res)
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only

# Same merge, but the indicator column gets a custom name.
res = df1.merge(df2, how='outer', on='col1', indicator='indicator_column')
print(res)
#    col1 col_left  col_right indicator_column
# 0     0        a        NaN        left_only
# 1     1        b        2.0             both
# 2     2      NaN        2.0       right_only
# 3     2      NaN        2.0       right_only
# Two frames whose row labels (rather than a column) serve as the join keys.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
print(left)
#      A   B
# K0  A0  B0
# K1  A1  B1
# K2  A2  B2
print(right)
#      C   D
# K0  C0  D0
# K2  C2  D2
# K3  C3  D3

# Merge on the index of both sides with how='outer' and print the result.
res = left.merge(right, how='outer', left_index=True, right_index=True)
print(res)

# Merge on the index of both sides with how='inner': only the labels present
# in both frames survive.
res = left.merge(right, how='inner', left_index=True, right_index=True)
print(res)
#      A   B   C   D
# K0  A0  B0  C0  D0
# K2  A2  B2  C2  D2
boys = pd.DataFrame({'K': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'K': ['K0', 'K1', 'K3'], 'age': [4, 5, 6]})

# Both frames carry an 'age' column; suffixes resolves the overlapping name
# by tagging the left and right copies.
res = boys.merge(girls, how='inner', on='K', suffixes=('_boy', '_girl'))
print(res)
#     K  age_boy  age_girl
# 0  K0        1         4
# 1  K1        2         5
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Data visualization with matplotlib --> plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Generate 1000 random values, indexed 0..999.
data = pd.Series(np.random.randn(1000), index=np.arange(1000))
# Accumulate the values so the plot is easier to read. cumsum() returns a new
# Series instead of modifying in place, so the result must be assigned back.
data = data.cumsum()
# pandas objects can be plotted directly.
data.plot()
plt.show()
# Build a 1000x4 DataFrame of random values and accumulate each column.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list('ABCD'),
)
# cumsum() returns a new DataFrame instead of modifying in place, so the
# result must be assigned back for the plots below to show the running totals.
data = data.cumsum()
data.plot()
plt.show()

# Besides plot and scatter there are also bar, hist, box, kde, area, hexbin.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
# Draw on the same axes (ax), picking a different data column for y.
data.plot.scatter(x='A', y='C', color='LightGreen', label='Class2', ax=ax)
plt.show()