有时候我们只需要数据集中的一部分,而不需要全部数据,这时就要对数据集进行随机抽样。随机抽样,是指从数据中随机抽取指定行数或指定比例的数据。
import numpy as np
import pandas as pd
# Demo table: six players, each with a numeric id and a name.
test_dict = {
    'id': list(range(1, 7)),
    'name': ['Alice', 'Bob', 'Kaka', 'Raul', 'Fabregas', 'Grace'],
}
test_df = pd.DataFrame(data=test_dict)

# Sample by row count: draw two rows at random.
sam_df = test_df.sample(n=2)

# Sample by proportion: draw half of the rows at random.
sam_df2 = test_df.sample(frac=0.5)

# Sample with replacement: a row may be drawn more than once, which is
# what lets n (8) exceed the number of rows in the table (6).
sam_df3 = test_df.sample(replace=True, n=8)
# Demo table: nine players, each tagged with the team that serves as the
# grouping key below.
test_dict2 = {
    'id': list(range(1, 10)),
    'name': [
        'Alice', 'Bob', 'Kaka', 'Raul', 'Fabregas',
        'Grace', 'Torres', 'Villa', 'Van Persi',
    ],
    'team': [
        'Man City', 'Arsenal', 'Man City', 'Arsenal', 'Man City',
        'Arsenal', 'Arsenal', 'Arsenal', 'Man City',
    ],
}
test_df2 = pd.DataFrame(data=test_dict2)

# Grouped sampling: split the rows by the ``team`` column.
gbr = test_df2.groupby(by='team')
# Bare expression: has no effect in a script; in an interactive session it
# shows which row labels belong to each team.
gbr.groups

# Number of rows to draw from each team.
type_train = {
    'Man City': 1,
    'Arsenal': 2,
}
def teamSampling(group, typicalNDict):
    """Draw a random sample of rows from a single group.

    The group's key is read from ``group.name`` and used to look up how
    many rows to draw for that group.

    Args:
        group: The rows of one group; must expose the group key as its
            ``name`` attribute and provide a ``sample`` method.
        typicalNDict: Mapping from group key to the number of rows to
            draw from that group.

    Returns:
        The result of ``group.sample(n=...)`` for this group's row count.

    Raises:
        KeyError: If ``typicalNDict`` is a dict and has no entry for the
            group's key.
    """
    # Look up the requested sample size for this particular group.
    n = typicalNDict[group.name]
    return group.sample(n=n)
# Run the per-team sampler on every group; ``type_train`` is forwarded to
# ``teamSampling`` as its ``typicalNDict`` argument.
result_train = gbr.apply(teamSampling, typicalNDict=type_train)
# Bare expression: has no effect in a script; in an interactive session it
# displays the sampled rows.
result_train