已废弃的 OHE(One-Hot 编码)代码:自带 rare combine(稀有类别合并)功能

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact    : tqichun@gmail.com
import numpy as np
from category_encoders import OneHotEncoder as OriginOneHotEncoder
from category_encoders.utils import convert_input
from sklearn.base import BaseEstimator, TransformerMixin

__all__ = ["OneHotEncoder"]


class OneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encoder that can merge rare categories before encoding.

    The estimator wraps ``category_encoders.OneHotEncoder``.  When
    ``minimum_fraction`` is set, every value whose relative frequency in a
    column is below that threshold is replaced by one representative rare
    value of the same column (the first rare value in ``np.unique`` order)
    before the data is cast to ``str`` and handed to the wrapped encoder.

    Parameters
    ----------
    minimum_fraction : float or None
        Minimum relative frequency a value needs to keep its own category.
        ``None`` disables the merging step entirely.
    verbose, cols, drop_invariant, return_df, handle_missing,
    handle_unknown, use_cat_names
        Forwarded unchanged to ``category_encoders.OneHotEncoder``.

    Attributes
    ----------
    ohe : category_encoders.OneHotEncoder
        The wrapped encoder; rebuilt from the current parameters on ``fit``.
    do_not_replace_by_other_ : list of set
        Per column, the values that were frequent enough to be kept
        (empty list when ``minimum_fraction`` is ``None``).
    converted_values_ : list
        Per column, the value rare/unseen entries are mapped to, or ``None``
        when the column had no rare value during ``fit``.
    """

    def __init__(
            self,
            minimum_fraction=None,
            verbose=0,
            cols=None,
            drop_invariant=False,
            return_df=True,
            handle_missing='value',
            handle_unknown='value',
            use_cat_names=False,
    ):
        self.minimum_fraction = minimum_fraction
        self.use_cat_names = use_cat_names
        self.handle_unknown = handle_unknown
        self.handle_missing = handle_missing
        self.return_df = return_df
        self.drop_invariant = drop_invariant
        self.cols = cols
        self.verbose = verbose
        # Kept for backward compatibility: ``ohe`` exists before ``fit``.
        self.ohe = self._build_encoder()

    def _build_encoder(self):
        """Create the wrapped encoder from the current parameter values."""
        return OriginOneHotEncoder(
            verbose=self.verbose,
            cols=self.cols,
            drop_invariant=self.drop_invariant,
            return_df=self.return_df,
            handle_missing=self.handle_missing,
            handle_unknown=self.handle_unknown,
            use_cat_names=self.use_cat_names
        )

    @staticmethod
    def _collapse(values, candidates, keep, replacement):
        """Return a copy of ``values`` with non-kept candidates replaced.

        Every entry equal to a member of ``candidates`` that is not in
        ``keep`` is overwritten with ``replacement``.  The input array is
        never modified.
        """
        collapsed = values.copy()
        for candidate in candidates:
            if candidate not in keep:
                # Mask against the untouched input so that earlier
                # replacements cannot influence later comparisons.
                collapsed[values == candidate] = replacement
        return collapsed

    def fit(self, X, y=None, **kwargs):
        """Learn the rare-value mapping (if enabled) and fit the encoder.

        Parameters
        ----------
        X : array-like
            Input data; passed through ``category_encoders``'
            ``convert_input``.
        y : array-like, optional
            Forwarded to the wrapped encoder's ``fit``.
        **kwargs
            Forwarded to the wrapped encoder's ``fit``.

        Returns
        -------
        self
        """
        X = convert_input(X)
        do_not_replace_by_other = list()
        converted_values = list()
        if self.minimum_fraction is not None:
            # Work on a private copy.  Writing into ``X.values`` is unsafe:
            # it can be a copy (the merge is silently lost), a view of the
            # caller's data (the caller's frame is modified) or read-only.
            X = X.copy()
            colsize = X.shape[0]
            # Iterate over every column.
            for column in range(X.shape[1]):
                values = X.iloc[:, column].to_numpy()
                unique, counts = np.unique(values, return_counts=True)
                keep = {
                    unique_value
                    for unique_value, count in zip(unique, counts)
                    if float(count) / colsize >= self.minimum_fraction
                }
                # The first rare value (in np.unique order) represents all
                # rare values of this column.
                converted_value = None
                for unique_value in unique:
                    if unique_value not in keep:
                        converted_value = unique_value
                        break
                if converted_value is not None:
                    X.iloc[:, column] = self._collapse(
                        values, unique, keep, converted_value
                    )
                do_not_replace_by_other.append(keep)
                converted_values.append(converted_value)
        self.do_not_replace_by_other_ = do_not_replace_by_other
        self.converted_values_ = converted_values
        # Rebuild the wrapped encoder so that parameters changed through
        # ``set_params`` after construction are honoured.
        self.ohe = self._build_encoder()
        self.ohe.fit(X.astype(str), y, **kwargs)
        return self

    def transform(self, X):
        """Apply the learned rare-value mapping and one-hot encode ``X``.

        Values that were not kept during ``fit`` (rare or unseen) are mapped
        to the column's representative rare value.  Columns that had no rare
        value during ``fit`` are passed through unchanged, so unseen values
        there are handled by the wrapped encoder's ``handle_unknown``.

        Parameters
        ----------
        X : array-like
            Input data with the same column layout as used in ``fit``.

        Returns
        -------
        The output of the wrapped encoder's ``transform``.
        """
        X = convert_input(X)
        if self.minimum_fraction is not None:
            # Private copy: never modify the caller's data.
            X = X.copy()
            for column in range(X.shape[1]):
                converted_value = self.converted_values_[column]
                if converted_value is None:
                    # No rare bucket was learned for this column; writing
                    # ``None`` into the data would fail on integer arrays
                    # and create a bogus category otherwise.
                    continue
                values = X.iloc[:, column].to_numpy()
                X.iloc[:, column] = self._collapse(
                    values,
                    np.unique(values),
                    self.do_not_replace_by_other_[column],
                    converted_value,
                )
        return self.ohe.transform(X.astype(str))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值