# Missing-data walkthrough. The triple-quoted strings below are inert
# expression statements that record the output printed by the preceding calls.

# Sample data: a 3x4 integer frame; reindexing with the unknown label 'four'
# produces an all-NaN row (and upcasts the values to float).
df = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['first', 'two', 'three'],
    columns=list('abcd'),
)
df = df.reindex(['first', 'two', 'four'])
print(df)
'''
a b c d
first 0.0 1.0 2.0 3.0
two 4.0 5.0 6.0 7.0
four NaN NaN NaN NaN'''

# Detect nulls
print(df.isnull())
print(df.a.notnull())
'''
a b c d
first False False False False
two False False False False
four True True True True
first True
two True
four False
'''

# Sum: NA values are skipped, so an all-NA row sums to 0
print(df.loc['four'].sum())
'''0.0'''

# Drop the rows that contain NA
data = df.dropna()
print(data)
'''
a b c d
first 0.0 1.0 2.0 3.0
two 4.0 5.0 6.0 7.0'''

# Fill NA with a constant
data = df.fillna(0)
print(data)
'''
a b c d
first 0.0 1.0 2.0 3.0
two 4.0 5.0 6.0 7.0
four 0.0 0.0 0.0 0.0'''

# Fill using a method: forward fill copies the last valid row downward.
# The older spelling df.fillna(method='ffill') is deprecated since pandas 2.1;
# df.ffill() is the supported equivalent.
data = df.ffill()
print(data)
'''
a b c d
first 0.0 1.0 2.0 3.0
two 4.0 5.0 6.0 7.0
four 4.0 5.0 6.0 7.0
'''

# Move the all-NaN row into the middle before demonstrating replace()
df = df.reindex(['first', 'four', 'two'])
print(df)
'''
a b c d
first 0.0 1.0 2.0 3.0
four NaN NaN NaN NaN
two 4.0 5.0 6.0 7.0'''

# Replace values with replace(): NaN -> 0 and 3 -> 33 in a single pass
data = df.replace({np.nan: 0, 3: 33})
print(data)
'''
a b c d
first 0.0 1.0 2.0 33.0
four 0.0 0.0 0.0 0.0
two 4.0 5.0 6.0 7.0'''
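### Grouping
# | Function / attribute | Description |
# | --- | --- |
# | groupby() | The key argument may be a label or an array of labels; axis selects the direction |
# | get_group() | Selects one group |
# | agg() | Aggregation; the argument may be an array (e.g. a list of aggregation functions) |
# | transform() | Transformation over a group or column; returns an object whose index has the same size as that of the grouped object; the argument is the transform function |
# | filter() | Filtering; filters the data by the given criterion and returns a subset of the data; the argument is the filter function |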
# Group-by walkthrough. The triple-quoted strings below are inert expression
# statements that record the output printed by the preceding calls.
# NOTE(review): the recorded reprs come from an older pandas release; details
# such as the GroupBy class path and Int64Index differ in pandas 2.x.

# Sample data
df = pd.DataFrame({
    'name': ['Tom', 'Tom', 'Andy', 'tony', 'Andy', 'Tom', 'tony'],
    'rank': [1, 2, 3, 1, 4, 3, 2],
    'year': [2016, 2014, 2015, 2013, 2012, 2009, 2011],
})
print(df)
'''
name rank year
0 Tom 1 2016
1 Tom 2 2014
2 Andy 3 2015
3 tony 1 2013
4 Andy 4 2012
5 Tom 3 2009
6 tony 2 2011'''

# Split into groups
grouped = df.groupby(by='name')  # by: mapping, function, str, or iterable
print(grouped)
'''<pandas.core.groupby.DataFrameGroupBy object at 0x110ea99e8>'''

# Inspect the groups (group key -> row labels)
print(grouped.groups)
'''
{'tony': Int64Index([3, 6], dtype='int64'), 'Andy': Int64Index([2, 4], dtype='int64'), 'Tom': Int64Index([0, 1, 5], dtype='int64')}
'''

# Select a single group
print(grouped.get_group('Tom'))
'''
name rank year
0 Tom 1 2016
1 Tom 2 2014
5 Tom 3 2009
'''

# Iterate over the groups: each item is a (key, sub-frame) pair
for name, values in grouped:
    print(f'{name}:')
    print(values)
'''
Andy:
name rank year
2 Andy 3 2015
4 Andy 4 2012
Tom:
name rank year
0 Tom 1 2016
1 Tom 2 2014
5 Tom 3 2009
tony:
name rank year
3 tony 1 2013
6 tony 2 2011
'''

# Aggregation: arithmetic mean of rank per group.
# String aliases are used instead of NumPy callables (np.mean, np.sum, ...),
# which pandas 2.1+ deprecates as agg() arguments; the result labels are the same.
data = grouped['rank'].agg('mean')
print(data)
'''
name
Andy 3.5
Tom 2.0
tony 1.5
Name: rank, dtype: float64'''

# Several aggregations of rank at once: one output column per function
data = grouped['rank'].agg(['sum', 'size', 'mean'])
print(data)
'''
sum size mean
name
Andy 7 2 3.5
Tom 6 3 2.0
tony 3 2 1.5
'''

# Transform with a custom function: the result keeps the original row index
func = lambda x: x**2
data = grouped['rank'].transform(func)
print(data)
'''
0 1
1 4
2 9
3 1
4 16
5 9
6 4
Name: rank, dtype: int64
'''

# Filter: keep only the rows of groups that have more than two members
func = lambda x: len(x)>2
data = grouped['name'].filter(func)
print(data)
'''
0 Tom
1 Tom
5 Tom
Name: name, dtype: object
'''
# Merge walkthrough: two small frames whose only fully matching row is ('Marry', 8).
# The triple-quoted strings are inert records of the printed output.
left = pd.DataFrame({'name': ['Tom', 'Tim', 'Marry'], 'age': [10, 12, 8]})
right = pd.DataFrame({'name': ['Sum', 'Andy', 'Marry'], 'age': [11, 9, 8]})
print(left)
print(right)
'''
age name
0 10 Tom
1 12 Tim
2 8 Marry
age name
0 11 Sum
1 9 Andy
2 8 Marry
'''

# Merge on several keys; with no `how` given the join is an inner join
data = left.merge(right, on=['name', 'age'])
print(data)
'''
age name
0 8 Marry
'''

# Merge on a single key with an explicit join type (outer keeps rows from both sides)
data = left.merge(right, how='outer', on='name')
print(data)
'''
age_x name age_y
0 10.0 Tom NaN
1 12.0 Tim NaN
2 8.0 Marry 8.0
3 NaN Sum 11.0
4 NaN Andy 9.0'''

# Left join: keep every row of `left`, matching rows of `right` where present
data = left.merge(right, how='left', on='name')
print(data)
'''
age_x name age_y
0 10 Tom NaN
1 12 Tim NaN
2 8 Marry 8.0'''
# Concatenation walkthrough. The triple-quoted strings are inert records of
# the printed output.
left = pd.DataFrame({
    'name': ['Tom', 'Tim', 'Marry'],
    'age': [10, 12, 8],
})
right = pd.DataFrame({
    'name': ['Sum', 'Andy', 'Marry'],
    'age': [11, 9, 8],
})

# Stack the frames row-wise; each frame keeps its own index labels
data = pd.concat([left, right])
print(data)
'''
age name
0 10 Tom
1 12 Tim
2 8 Marry
0 11 Sum
1 9 Andy
2 8 Marry
'''

# Rebuild the index: ignore_index=True renumbers the rows 0..n-1
data = pd.concat([left, right], ignore_index=True)
print(data)
'''
age name
0 10 Tom
1 12 Tim
2 8 Marry
3 11 Sum
4 9 Andy
5 8 Marry
'''

# Choose the direction: axis=1 places the frames side by side
data = pd.concat([left, right], axis=1)
print(data)
'''
age name age name
0 10 Tom 11 Sum
1 12 Tim 9 Andy
2 8 Marry 8 Marry
'''

# Append along the row direction. DataFrame.append was removed in pandas 2.0,
# so left.append(right, ignore_index=True) is written with pd.concat instead.
data = pd.concat([left, right], ignore_index=True)
print(data)
'''
age name
0 10 Tom
1 12 Tim
2 8 Marry
3 11 Sum
4 9 Andy
5 8 Marry
'''