DataTypes_Auto_infer
bool类型转category
查int64和float64中的id列
处理这种情况:pandas把int的列转成了float,因为nan
对于int64的变量,视基数情况转成float64或类别变量
float64,如果基数=2,转OHE
# Replace +inf / -inf with NaN, in place.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
data.replace([np.inf, -np.inf], np.nan, inplace=True)
# Drop columns whose name duplicates an earlier column; columns.duplicated()
# flags every repeat after the first occurrence, so the first one is kept.
data = data.loc[:, ~data.columns.duplicated()]
from pyod.models.knn import KNN
from pyod.models.iforest import IForest
from pyod.models.pca import PCA as PCA_od
# Outlier
class Outlier(BaseEstimator,TransformerMixin):
'''
- Removes outlier using ABOD,KNN,IFO,PCA & HOBS using hard voting
- Only takes numerical / One Hot Encoded features
'''
def __init__(self,target,contamination=.20, random_state=42, methods=['knn','iso','pca']):
self.target = target
self.contamination = contamination
self.random_state = random_state
self.methods = methods
return(None)
def fit(self, data, y=None):
    '''
    No-op fit: nothing is learned from ``data`` here.

    Parameters
    ----------
    data : ignored.
    y : ignored.

    Returns
    -------
    self, following the scikit-learn estimator convention so that calls
    can be chained (``est.fit(X).transform(X)``). Previously this
    returned ``None``, which made such chaining fail.
    '''
    return self
def transform(self, data, y=None):
    '''
    Pass-through transform: hands back the very object it was given.

    Parameters
    ----------
    data : returned unchanged (same object, no copy).
    y : ignored.
    '''
    return data
def fit_transform(self,dataset,y=None):
# dummify if there are any objects
if len(dataset.select_dtypes(include='object').columns)>0:
self.dummy = Dummify(self.target)
data = self.dummy.fit_transform(dataset)
else:
data = dataset.copy()
# reduce data size for faster processing
# try:
# data = data.astype('float16')
# except:
# None
if 'knn' in self.methods:
self.knn = KNN