# 方法一：删除缺失值
import pandas as pd
# Load the raw table from disk.
df = pd.read_csv('data.csv')

# Keep only complete rows: a row is discarded as soon as any one of its
# cells is missing (axis=0 / how='any' are the dropna defaults, spelled out).
df_dropna = df.dropna(axis=0, how='any')
# 方法二：使用前一行或者后一行的值填充
import pandas as pd
# Load the raw table from disk.
df = pd.read_csv('data.csv')

# Forward fill: each missing cell takes the value from the previous row.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method`
# keyword is deprecated in recent pandas releases.
df_ffill = df.ffill()

# Backward fill: each missing cell takes the value from the next row.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
df_bfill = df.bfill()
# 方法三：使用均值、中位数或者众数填充
import pandas as pd
from sklearn.impute import SimpleImputer
# Load the raw table from disk.
df = pd.read_csv('data.csv')

# Replace missing values with the per-column mean.
# NOTE(review): strategy='mean' presumably requires every column to be
# numeric -- confirm against the contents of data.csv.
imputer = SimpleImputer(strategy='mean')

# fit_transform returns a plain array, so re-attach the original column and
# index labels to keep the result aligned with `df`.
# NOTE(review): assumes no column is entirely missing (the imputer may drop
# such columns, which would make the labels mismatch) -- TODO confirm.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)
# 方法四：用预测模型填充缺失值
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
# Load the raw table from disk.
df = pd.read_csv('data.csv')

# Split the rows by whether the target column 'feature' is missing.
# The mask is computed once and reused for the final assignment.
missing_mask = df['feature'].isna()
train = df[~missing_mask]
test = df[missing_mask]

# Predictors are every column except 'feature'; the target is 'feature'.
# NOTE(review): assumes the remaining columns are numeric and contain no
# missing values themselves -- confirm against the contents of data.csv.
X_train = train.drop(columns=['feature'])
y_train = train['feature']
X_test = test.drop(columns=['feature'])

# Fit a random forest on the rows where 'feature' is known.
model = RandomForestRegressor()
model.fit(X_train, y_train)

if missing_mask.any():
    # Predict 'feature' for the rows where it is missing and write the
    # predictions back into those same rows.
    predicted_values = model.predict(X_test)
    df.loc[missing_mask, 'feature'] = predicted_values
else:
    # Nothing to fill: skip predict() on an empty frame and keep
    # `predicted_values` defined as an empty array.
    predicted_values = y_train.to_numpy()[:0]
# 方法五：使用K近邻填充缺失值
import pandas as pd
from sklearn.impute import KNNImputer
# 假设df是你的数据框
# Load the data frame to be imputed.
df = pd.read_csv('your_data.csv')

# Build a K-nearest-neighbours imputer configured with 5 neighbours.
imputer = KNNImputer(n_neighbors=5)

# Fit on the frame and fill its missing values in one step; the result is
# wrapped back into a DataFrame carrying the original column labels.
imputed_values = imputer.fit_transform(df)
df_imputed = pd.DataFrame(data=imputed_values, columns=df.columns)