目标编码的技巧:Smoothing 和 Hierarchical Bayesian
**公众号:ChallengeHub**
本文简单介绍两种目标编码方式,大家有兴趣的可以参考原文链接进行理解与尝试使用,祝大家取得好成绩~
Target encoding with smoothing
def add_noise(series, noise_level):
    """Return *series* perturbed by multiplicative Gaussian noise.

    Each element is multiplied by ``1 + noise_level * z`` where ``z`` is an
    independent draw from ``np.random.randn``. With ``noise_level == 0`` the
    values are returned unchanged (multiplied by exactly 1), although random
    numbers are still drawn from NumPy's global generator.

    series : values to perturb (anything supporting ``len`` and ``*``)
    noise_level : scale of the relative noise
    """
    noise = np.random.randn(len(series))
    return series * (1 + noise_level * noise)
def _lookup_target_means(series, encoding, prior, out_name):
    """Map every category in *series* to its smoothed target mean."""
    merged = pd.merge(
        series.to_frame(series.name),
        encoding,
        on=series.name,
        how="left",
    )
    # Categories without a row in `encoding` come back as NaN from the left
    # join; they fall back to the global prior.
    encoded = merged["average"].rename(out_name).fillna(prior)
    # pd.merge does not keep the index so restore it
    encoded.index = series.index
    return encoded


def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Encode a categorical feature by a smoothed per-category target mean.

    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    For a category seen ``count`` times the encoded value is
    ``prior * (1 - w) + category_mean * w`` with
    ``w = 1 / (1 + exp(-(count - min_samples_leaf) / smoothing))``,
    where ``prior`` is the mean of the whole target.

    Note that the training feature is encoded with statistics computed from
    the very same rows; this function does no out-of-fold splitting.

    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series (same name as trn_series)
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level : relative scale of the Gaussian noise applied by add_noise

    Returns a tuple ``(encoded_train, encoded_test)`` of pd.Series named
    ``<feature name>_mean``, each carrying the index of its input series.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Per-category target mean and number of observations
    stats = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Sigmoid weight of the category mean; kept in its own name so the
    # `smoothing` argument is not overwritten.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    # Global target mean, used for rare and unseen categories
    prior = target.mean()
    # The bigger the count the less the prior is taken into account
    encoding = (
        (prior * (1 - weight) + stats["mean"] * weight)
        .rename("average")
        .reset_index()
    )
    out_name = trn_series.name + '_mean'
    # Apply averages to trn and tst series
    ft_trn_series = _lookup_target_means(trn_series, encoding, prior, out_name)
    ft_tst_series = _lookup_target_means(tst_series, encoding, prior, out_name)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)
原文:https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features
Hierarchical Bayesian Target Encoding
class GaussianTargetEncoder:
    """Target encoder that shrinks group means towards a prior.

    ``fit`` collects, for every group defined by ``group_cols``, the sample
    count ``n``, the target mean ``mu_mle``, the target variance ``sig2_mle``
    (pandas sample variance, ddof=1) and the mean prior ``mu_prior``.
    ``transform`` then combines them as

        precision = prior_precision + n / sig2_mle
        mean      = (prior_precision * mu_prior + n / sig2_mle * mu_mle) / precision

    group_cols : column name, or list of column names, defining the groups
    target_col : name of the target column
    prior_cols : optional list of columns whose row-wise mean is used as the
        prior; when None the mean of the target column is used instead
    """

    def __init__(self, group_cols, target_col="target", prior_cols=None):
        self.group_cols = group_cols
        self.target_col = target_col
        self.prior_cols = prior_cols
        # Training target mean, remembered by fit() so that frames without a
        # target column can still be transformed.
        self._train_target_mean = np.nan

    def _get_prior(self, df):
        """Return one prior value per row of *df*."""
        if self.prior_cols is not None:
            return df[self.prior_cols].mean(axis=1)
        if self.target_col in df.columns:
            level = df[self.target_col].mean()
        else:
            # No target available (e.g. unlabeled data): use the mean seen
            # during fit instead of failing with a KeyError.
            level = self._train_target_mean
        return np.full(len(df), level)

    def fit(self, df):
        """Compute per-group statistics from *df* and return ``self``."""
        self._train_target_mean = df[self.target_col].mean()
        frame = df.assign(mu_prior=self._get_prior(df), y=df[self.target_col])
        # String aggregation names pin pandas' own groupby mean/var (ddof=1);
        # passing np.mean/np.var is deprecated and would change the variance.
        self.stats = frame.groupby(self.group_cols).agg(
            n=("y", "count"),
            mu_mle=("y", "mean"),
            sig2_mle=("y", "var"),
            mu_prior=("mu_prior", "mean"),
        )
        return self

    def transform(self, df, prior_precision=1000, stat_type="mean"):
        """Encode the rows of *df* using the fitted group statistics.

        prior_precision : weight given to the prior
        stat_type : "mean", "var" (1 / precision) or "precision"

        Returns a float np.ndarray with one value per row. Rows whose value
        is missing or not finite (unseen group, single-sample group, zero
        variance, ...) receive the row's prior, whatever the stat_type.
        Raises ValueError for an unknown stat_type.
        """
        stats = self.stats
        data_precision = stats["n"] / stats["sig2_mle"]
        precision = prior_precision + data_precision
        if stat_type == "mean":
            encoded = (
                prior_precision * stats["mu_prior"]
                + data_precision * stats["mu_mle"]
            ) / precision
        elif stat_type == "var":
            encoded = 1.0 / precision
        elif stat_type == "precision":
            encoded = precision
        else:
            raise ValueError(f"stat_type={stat_type} not recognized.")
        mapper = dict(zip(stats.index, encoded))
        if isinstance(self.group_cols, str):
            keys = df[self.group_cols].values.tolist()
        elif len(self.group_cols) == 1:
            keys = df[self.group_cols[0]].values.tolist()
        else:
            # Multi-column groups are keyed by tuples, like the MultiIndex
            keys = zip(*[df[col] for col in self.group_cols])
        values = np.array([mapper.get(key, np.nan) for key in keys], dtype=float)
        prior = np.asarray(self._get_prior(df), dtype=float)
        fallback = ~np.isfinite(values)
        values[fallback] = prior[fallback]
        return values

    def fit_transform(self, df, *args, **kwargs):
        """Fit on *df*, then return ``transform(df, *args, **kwargs)``."""
        self.fit(df)
        return self.transform(df, *args, **kwargs)
原文:https://www.kaggle.com/mmotoki/hierarchical-bayesian-target-encoding
扫码关注ChallengeHub公众号:
欢迎加入ChallengeHub学习交流群