5.5 数据抽样
5.5.1 采样函数
def get_sample(df, sampling="simple_random", k=1, stratified_col=None):
    """Draw a sample of rows from a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The population to sample from.
    sampling : str
        Sampling method, one of ``"simple_random"``, ``"stratified"`` or
        ``"systematic"``.
    k : int or float
        Sample size or sampling fraction; must be greater than 0.
        If ``0 < k < 1`` it is the fraction of rows to draw (rounded up);
        for stratified sampling the fraction is applied to every stratum.
        If ``k >= 1`` it must be an integer and is the number of rows to
        draw; for stratified sampling it is the number of rows per stratum.
    stratified_col : list of str, optional
        Column names that define the strata. Only used (and then required)
        when ``sampling="stratified"``. A single column name given as a
        plain string is accepted as well.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.

    Raises
    ------
    AssertionError
        If ``k`` is not positive, ``k >= 1`` is not an integer, ``sampling``
        is not a known method, ``stratified_col`` is missing or names an
        unknown column, or (stratified, ``k >= 1``) ``k`` times the number
        of strata is not smaller than the number of rows.
    ValueError
        Raised by ``random.sample`` when more rows are requested than the
        population (or a single stratum) contains.

    Notes
    -----
    Random draws use the standard-library ``random`` module. The simple
    random and systematic branches print a short message naming the method.
    """
    import math
    import numbers
    import random

    import pandas as pd

    n_rows = len(df)

    # Validation raises explicitly (instead of `assert`) so that it still
    # runs under `python -O`; the exception type stays AssertionError.
    if k <= 0:
        raise AssertionError("k不能为负数")
    sample_by_n = k >= 1
    if sample_by_n:
        if not isinstance(k, numbers.Integral):
            raise AssertionError("选择抽样个数时, k必须为正整数")
        k = int(k)

    # Strings are compared by value; identity (`is`) is not reliable here.
    if sampling not in ("simple_random", "stratified", "systematic"):
        raise AssertionError("sampling is illegal")

    if sampling == "stratified":
        # Check the stratification columns before anything indexes them.
        if isinstance(stratified_col, str):
            stratified_col = [stratified_col]
        if stratified_col is None or len(stratified_col) == 0:
            raise AssertionError("请传入包含需要分层的列名的列表")
        stratified_col = list(stratified_col)
        if not all(col in df.columns for col in stratified_col):
            raise AssertionError("请检查输入的列名")

        # One sub-frame per stratum, in sorted key order.
        strata = [stratum for _, stratum in df.groupby(by=stratified_col)]
        if sample_by_n and k * len(strata) >= n_rows:
            raise AssertionError("请确认k乘以层数不能超过总样本量")

        sampled_parts = []
        for stratum in strata:
            size = k if sample_by_n else math.ceil(len(stratum) * k)
            positions = random.sample(range(len(stratum)), size)
            sampled_parts.append(stratum.iloc[positions, :])
        if not sampled_parts:
            # No strata (e.g. empty input): return an empty frame with the
            # same columns.
            return df.iloc[0:0, :].copy()
        return pd.concat(sampled_parts)

    # Remaining methods work on the whole frame: turn a fraction into a count.
    if not sample_by_n:
        k = math.ceil(n_rows * k)

    if sampling == "simple_random":
        print("使用简单随机抽样")
        positions = random.sample(range(n_rows), k)
        return df.iloc[positions, :].copy()

    print("使用系统抽样")
    # Asking for more rows than exist returns every row once.
    k = min(k, n_rows)
    # Evenly spaced positions starting at row 0; always exactly k distinct
    # positions, and an empty list when k is 0 (no division happens then).
    positions = [i * n_rows // k for i in range(k)]
    return df.iloc[positions, :].copy()
5.5.2 采样方案
- 对客户数据分别用简单随机抽样、分层抽样(按地区 district_id 分层)、系统抽样三种方式抽取样本,每种方式各按个数和按比例抽取一次
import pandas as pd
# Load the client table, decoding the file as GBK. The path uses a forward
# slash so the cell works on Windows and on POSIX systems alike.
clients = pd.read_csv("./clients.csv", encoding="gbk")
clients.head()
|   | client_id | sex | birth_date | district_id |
|---|---|---|---|---|
| 0 | 1 | 女 | 1970-12-13 | 18 |
| 1 | 2 | 男 | 1945-02-04 | 1 |
| 2 | 3 | 女 | 1940-10-09 | 1 |
| 3 | 4 | 男 | 1956-12-01 | 5 |
| 4 | 5 | 女 | 1960-07-03 | 5 |
# Derive a string label for each district by prefixing its id with "id".
clients["district_id_c"] = clients["district_id"].map(
    lambda district: f"id{district}"
)
clients["district_id_c"].head()
0 id18
1 id1
2 id1
3 id5
4 id5
Name: district_id_c, dtype: object
1、简单随机抽样
# Simple random sampling: once by absolute count (22 rows) and once by
# fraction (10% of the rows).
srn = get_sample(clients, "simple_random", 22)
srp = get_sample(clients, "simple_random", 0.1)
(len(srn), len(srp))
使用简单随机抽样
使用简单随机抽样
(22, 537)
2、分层抽样
# Stratified sampling with district_id as the stratum: once with 2 rows per
# stratum and once with 10% of each stratum.
strn = get_sample(clients, "stratified", 2, ["district_id"])
strp = get_sample(clients, "stratified", 0.1, ["district_id"])
(len(strn), len(strp))
(154, 573)
3、系统抽样
# Systematic sampling: once by absolute count (4 rows) and once by
# fraction (10% of the rows).
sysn = get_sample(clients, "systematic", 4)
sysp = get_sample(clients, "systematic", 0.1)
(len(sysn), len(sysp))
使用系统抽样
使用系统抽样
(4, 537)