0 M
1 F
10 NaN
11 NaN
Name: Gender, dtype: object 0 NaN
1 NaN
10 M
11 F
Name: Gender, dtype: object
0 173.0
1 192.0
10 NaN
11 NaN
Name: Height, dtype: float64 0 NaN
1 NaN
10 161.0
11 175.0
Name: Height, dtype: float64
df1 = pd.DataFrame({'A':['A0','A1'],'B':['B0','B1']},
index =[0,1])
df2 = pd.DataFrame({'A':['A2','A3'],'B':['B2','B3']},
index =[2,3])
df3 = pd.DataFrame({'A':['A1','A3'],'D':['D1','D3'],'E':['E1','E3']},
index =[1,3])
left = pd.DataFrame({'key1':['K0','K0','K1','K2'],'key2':['K0','K1','K0','K1'],'A':['A0','A1','A2','A3'],'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['K0','K1','K1','K2'],'key2':['K0','K0','K0','K0'],'C':['C0','C1','C2','C3'],'D':['D0','D1','D2','D3']})
right2 = pd.DataFrame({'key1':['K0','K1','K1','K2'],'key2':['K0','K0','K0','K0'],'C':['C0','C1','C2','C3']})
left = pd.DataFrame({'A':[1,2],'B':[2,2]})
right = pd.DataFrame({'A':[4,5,6],'B':[2,3,4]})#pd.merge(left, right, on='B', how='outer',validate='one_to_one') #报错
left = pd.DataFrame({'A':[1,2],'B':[2,1]})
pd.merge(left, right, on='B', how='outer',validate='one_to_one')
left = pd.DataFrame({'A':['A0','A1','A2'],'B':['B0','B1','B2']},
index=['K0','K1','K2'])
right = pd.DataFrame({'C':['C0','C2','C3'],'D':['D0','D2','D3']},
index=['K0','K2','K3'])
left.join(right)
A
B
C
D
K0
A0
B0
C0
D0
K1
A1
B1
NaN
NaN
K2
A2
B2
C2
D2
对于many_to_one模式下的合并,往往join更为方便
同样可以指定key:
left = pd.DataFrame({'A':['A0','A1','A2','A3'],'B':['B0','B1','B2','B3'],'key':['K0','K1','K0','K1']})
right = pd.DataFrame({'C':['C0','C1'],'D':['D0','D1']},
index=['K0','K1'])
left.join(right, on='key')
A
B
key
C
D
0
A0
B0
K0
C0
D0
1
A1
B1
K1
C1
D1
2
A2
B2
K0
C0
D0
3
A3
B3
K1
C1
D1
多层key:
left = pd.DataFrame({'A':['A0','A1','A2','A3'],'B':['B0','B1','B2','B3'],'key1':['K0','K0','K1','K2'],'key2':['K0','K1','K0','K1']})
index = pd.MultiIndex.from_tuples([('K0','K0'),('K1','K0'),('K2','K0'),('K2','K1')],names=['key1','key2'])
right = pd.DataFrame({'C':['C0','C1','C2','C3'],'D':['D0','D1','D2','D3']},
index=index)
left.join(right, on=['key1','key2'])
left = pd.DataFrame({'A':[1,2],'B':[2,2],'C':[9,8]}).set_index(['A','B'])
right = pd.DataFrame({'A':[2,2,6],'B':[2,3,4],'C':[7,8,9]}).set_index(['A','B'])
left.append(right)
C
A
B
1
2
9
2
2
8
2
7
3
8
6
4
9
left.combine(right,lambda x,y:x if x.mean()>y.mean()else y)
left = pd.DataFrame({'A':['A0','A1','A2'],'B':['B0','B1','B2']},
index=['K0','K1','K2'])
right = pd.DataFrame({'C':['C0','C2','C3'],'D':['D0','D2','D3']},
index=['K0','K2','K3'])
left.join(right,how='inner')
A
B
C
D
K0
A0
B0
C0
D0
K2
A2
B2
C2
D2
left.join(right,how='left')
A
B
C
D
K0
A0
B0
C0
D0
K1
A1
B1
NaN
NaN
K2
A2
B2
C2
D2
left.join(right,how='right')
A
B
C
D
K0
A0
B0
C0
D0
K2
A2
B2
C2
D2
K3
NaN
NaN
C3
D3
会根据连接方式不同而不同
2. 练习
【练习一】有2张公司的员工信息表,每个公司共有16名员工,共有五个公司,请解决如下问题:
df1=pd.read_csv('data/Employee1.csv')
df1.head()
Company
Name
Age
Height
Weight
Salary
0
A
a1
47
188
63.7
25819
1
A
a3
39
172
55.9
21983
2
A
a4
43
158
62.5
21755
3
A
a6
42
182
76.9
17354
4
A
a7
49
171
94.6
6177
df2=pd.read_csv('data/Employee2.csv')
df2.head()
Company
Name
Age
Height
Weight
Salary
0
A
a1
30
156
91.2
28133
1
A
a2
50
190
83.4
6673
2
A
a3
34
168
96.6
16503
3
A
a5
51
176
97.2
23294
4
A
a6
37
183
93.2
19256
(a) 每个公司有多少员工满足如下条件:既出现第一张表,又出现在第二张表。
n =set(df1['Name'].values.tolist())&set(df2['Name'].values.tolist())
n
df1 = pd.read_csv('data/Employee1.csv')
df2 = pd.read_csv('data/Employee2.csv')
df1['重复']=['Y_1'if df1.loc[i,'Name']in n else'N'for i inrange(df1.shape[0])]
df2['重复']=['Y_2'if df2.loc[i,'Name']in n else'N'for i inrange(df2.shape[0])]
df1 = df1.set_index(['Name','重复'])
df2 = df2.set_index(['Name','重复'])
df_c = pd.concat([df1,df2])
result = pd.DataFrame({'Company':[],'Name':[],'Age':[],'Height':[],'Weight':[],'Salary':[]})
group = df_c.groupby(['Company','重复'])for i in n:
first = group.get_group((i[0].upper(),'Y_1')).reset_index(level=1).loc[i,:][-4:]
second = group.get_group((i[0].upper(),'Y_2')).reset_index(level=1).loc[i,:][-4:]
mean = group.get_group((i[0].upper(),'N')).reset_index(level=1).mean()
final =[i[0].upper(),i]for j inrange(4):
final.append(first[j]ifabs(first[j]-mean[j])<abs(second[j]-mean[j])else second[j])
result = pd.concat([result,pd.DataFrame({result.columns.tolist()[k]:[final[k]]for k inrange(6)})])
result = pd.concat([result.set_index('Name'),b])for i inlist('abcde'):for j inrange(1,17):
item = i+str(j)if item notin result.index:
result = pd.concat([result,pd.DataFrame({'Company':[i.upper()],'Name':[item],'Age':[np.nan],'Height':[np.nan],'Weight':[np.nan],'Salary':[np.nan]}).set_index('Name')])print(result.index)
result['Number']=[i for i in result.index]
result.reset_index().drop(columns='Name').set_index(['Company','Number']).sort_index()
/home/myth/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:18: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
/home/myth/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:24: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.
Index(['e10', 'd10', 'c13', 'a6', 'b1', 'c12', 'c10', 'c3', 'b15', 'a1',
...
'e4', 'e5', 'e6', 'e7', 'e9', 'e12', 'e13', 'e14', 'e15', 'e16'],
dtype='object', length=125)
df_a11,df_a12,df_a21,df_a22 =0,0,0,0
df_a11= df1.query('课程类别 in ["学科基础课","专业必修课","专业选修课"]')
df_a12= df1.query('课程类别 not in ["学科基础课","专业必修课","专业选修课"]')
df_a21= df2.query('课程类别 in ["学科基础课","专业必修课","专业选修课"]')
df_a22= df2.query('课程类别 not in ["学科基础课","专业必修课","专业选修课"]')
df_a11.head()