需求:把类别型或字符串型的特征转换为多热(multi-hot)编码,其中每个特征值由逗号、竖线等分隔符分隔的多个类别组成
import numpy as np
import pandas as pd
from scipy import sparse
class MultiHotEncoder:
"""
Encode categorical features as a multi-hot numeric array.
Parameters
----------
sep : string, default='|', the separation string.
Attributes
----------
categories_ : a dictionary of encoding results.
Examples
--------
>>> enc = MultiHotEncoder()
>>> X = ['red|green', 'green', 'red|yellow']
>>> enc.fit(X)
>>> enc.categories_
{'red': 0, 'green': 1, 'yellow': 2}
>>> enc.transform(['green', 'yellow|red', None, 'red|green|yellow'])
array([[0., 1., 0.],
[1., 0., 1.],
[0., 0., 0.],
[1., 1., 1.]])
"""
def __init__(self, sep='|')