# Author:Nimo_Ding
import pandas as pd
import numpy as np
from pandas import Series,DataFrame
# 数据表的合并
df1=DataFrame({
'name':['ZhangFei','GuanYu','a','b','c'],
'data1':range(5)
})
df2=DataFrame({
'name':['ZhangFei','GuanYu','A','B','c'],
'data2':range(5)
})
print('\n1、基于指定列进行连接:')
print(df1)
print(df2)
'''
结果为:
name data1
0 ZhangFei 0
1 GuanYu 1
2 a 2
3 b 3
4 c 4
name data2
0 ZhangFei 0
1 GuanYu 1
2 A 2
3 B 3
4 c 4
'''
df3=pd.merge(df1,df2,on='name')
print(df3)
'''
结果为:
name data1 data2
0 ZhangFei 0 0
1 GuanYu 1 1
2 c 4 4
'''
print('\n2、inner内连接:是merge合并的默认模式')
df3=pd.merge(df1,df2,how='inner')
print(df3)
'''
结果为:
name data1 data2
0 ZhangFei 0 0
1 GuanYu 1 1
2 c 4 4
'''
print('\n3、left左连接')
df3=pd.merge(df1,df2,how='left')
print(df3)
'''
结果为:
name data1 data2
0 ZhangFei 0 0.0
1 GuanYu 1 1.0
2 a 2 NaN
3 b 3 NaN
4 c 4 4.0
'''
print('\n4、right右连接')
df3=pd.merge(df1,df2,how='right')
print(df3)
'''
结果为:
name data1 data2
0 ZhangFei 0.0 0
1 GuanYu 1.0 1
2 c 4.0 4
3 A NaN 2
4 B NaN 3
'''
print('\n5、outer外连接')
df3=pd.merge(df1,df2,how='outer')
print(df3)
'''
结果为:
name data1 data2
0 ZhangFei 0.0 0.0
1 GuanYu 1.0 1.0
2 a 2.0 NaN
3 b 3.0 NaN
4 c 4.0 4.0
5 A NaN 2.0
6 B NaN 3.0
'''
print('\n如何用SQL方式打开pandas:')
# 借助pandasql工具,pandasql中的主要函数式sqldf,
# 它接收两个参数:一个SQL查询语句,还有一组环境变量globals()或locals()
# 这样就可以直接用SQL语句对DataFrame进行操作。
import pandas as pd
from pandas import DataFrame
from pandasql import sqldf,load_meat,load_births
df1=DataFrame({
'name':['ZhangFei','GuanYu','a','b','c'],
'data1':range(5)
})
pysqldf=lambda sql:sqldf(sql,globals())
sql="select * from df1 where name='ZhangFei'"
print(pysqldf(sql))
'''
结果为:
name data1
0 ZhangFei 0
'''
print('\n练习题:根据表数据对数据进行清洗,同时新增一列"总和"计算每个人的三科成绩之和')
data={
'语文':[66,95,95,90,80,80],
'英语':[65,85,92,88,90,90],
'数学':[None,98,96,77,90,90]
}
df1=DataFrame(data,
index=['张飞','关羽','赵云','黄忠','典韦','典韦'],
columns=['语文','英语','数学'])
# 去除重复行
df1.drop_duplicates()
# 补充缺失值
# 将数学成绩均值作为张飞的缺失值。
df1['数学'].fillna(df1['数学'].mean(),inplace=True)
# 定义函数:增加一列
def add_col(df1):
df1['总和']=df1['语文']+df1['英语']+df1['数学']
return df1
df1=df1.apply(add_col,axis=1)
print(df1)