# 在机器学习中,常量特征指的是在训练数据中取值始终相同的特征。这种特征对建模和预测任务不提供任何信息,因为它们无助于区分不同的样本,也不包含任何有关目标变量的信息。如果一个特征的方差很小(比如低于某个阈值,如下文使用的0.01),它就可以被视为低方差特征(也称准常量特征)。低方差特征的取值变化非常有限,无论对目标变量还是对其他特征来说,它们提供的信息量都非常少。
# 常量特征和低方差特征都可能对机器学习模型产生负面影响,因为它们要么完全没有变化、不具备区分性,要么很难提供有用的区分性信息。
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from sklearn.feature_selection import SelectKBest,RFE,SelectFromModel
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import VarianceThreshold
# Load the dataset. 'glucose' is the prediction target; both 'glucose' and
# 'name' are excluded from the model inputs.
data = pd.read_csv('data')
target = data['glucose']
features = data.drop(columns=['glucose', 'name'])

# Hold out 20% of the rows for testing; random_state pins the split so the
# same partition is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

# Quick look at the raw table (bare expressions: only shown in an interactive
# session, they have no effect when run as a script).
data.head()
data.shape
# Remove constant features (zero variance).
# The filter is fitted on the training split only: fitting on the full
# feature table would let the held-out test rows influence which columns are
# selected (data leakage). The fitted selection is then applied unchanged to
# both splits.
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(x_train)

# Boolean mask over the columns: True = feature is kept by the filter.
constant_support = constant_filter.get_support()
constant_support.sum()  # number of features that survive the filter

# Inverted mask: True = feature is constant in the training data and dropped.
constant_list = (~constant_support).tolist()
x_train.columns[constant_list]  # names of the dropped (constant) features
x_train.columns[constant_support]  # names of the retained features

x_train_filter = constant_filter.transform(x_train)
x_test_filter = constant_filter.transform(x_test)
print(x_train_filter.shape, x_test_filter.shape, x_train.shape, x_test.shape)
# Remove quasi-constant features (those that do not clear the 0.01 variance
# threshold).
# As with any learned preprocessing step, the filter is fitted on the training
# split only so the held-out test rows cannot influence feature selection
# (data leakage); the same selection is then applied to both splits.
quasi_constant_filter = VarianceThreshold(threshold=0.01)
quasi_constant_filter.fit(x_train)

# Boolean mask over the columns: True = feature is kept by the filter.
quasi_constant_support = quasi_constant_filter.get_support()
quasi_constant_support.sum()  # number of features that survive the filter

x_train_quasi_filter = quasi_constant_filter.transform(x_train)
x_test_quasi_filter = quasi_constant_filter.transform(x_test)
print(x_train_filter.shape, x_test_filter.shape, x_train_quasi_filter.shape, x_test_quasi_filter.shape)

# transform() returns the selected columns without their names, so keep the
# surviving column names for later labelling.
x_train_quasi_filter_features = x_train.columns[quasi_constant_support]
x_train_quasi_filter_features
# Remove duplicated features.
# DataFrame.duplicated() compares rows, so the filtered matrices are
# transposed to put one feature per row, labelled with the names of the
# features that survived the quasi-constant filter.
x_train_T = x_train_quasi_filter.T
x_test_T = x_test_quasi_filter.T
type(x_train_T)
x_train_T = pd.DataFrame(x_train_T, index=x_train_quasi_filter_features)
x_test_T = pd.DataFrame(x_test_T, index=x_train_quasi_filter_features)
print(x_train_T.shape, x_test_T.shape)

# Duplicates are detected on the training data only; with the default
# settings the first occurrence of each repeated feature is not flagged.
duplicated_features = x_train_T.duplicated()
print(duplicated_features.sum())
print(duplicated_features)

# Apply the same keep-mask to both splits, then transpose back so that rows
# are samples and columns are features again.
features_to_keep = (~duplicated_features).tolist()
x_train_unique = x_train_T.loc[features_to_keep].T
x_test_unique = x_test_T.loc[features_to_keep].T
print(x_train_unique.shape, x_train.shape)