Python异常值处理

我爱鸢尾花

已于 2024-08-05 14:21:33 修改

阅读量243

点赞数 4

文章标签： python 数据分析 pandas

于 2024-08-05 11:41:49 首次发布

本文链接：https://blog.csdn.net/m0_46670850/article/details/140918377

版权

异常值处理

在做数据分析时，常常需要处理异常值。异常值需要先识别，再进行处理，这里提供一种我觉得还不错的方式。

异常值识别

step 1 原始数据标准化
在这里我用标准化去识别，也就是 Z-score（Z 分数）：$z = \dfrac{x - \mu}{\sigma}$，其中 $\mu$ 为该列数据的均值，$\sigma$ 为该列数据的标准差。重点见红框，其实我就是百度的。

在这里插入图片描述
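step 2 定义阈值：这里采用 3σ 原则——标准化后的数据均值为 0、标准差为 1，若原始数据近似服从正态分布，则约 99.7% 的数据满足 $|z| \le 3$，超出该范围的点可视为异常值！！！注意：下面代码中 detect_outliers 的默认阈值是 m=2（即 2σ），若要严格按 3σ 原则识别，请传入 m=3。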

代码

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font so Chinese labels display correctly
plt.rcParams['axes.unicode_minus'] = False  # so that minus signs display correctly
import warnings
warnings.filterwarnings("ignore")  # silence every warning raised from this point on

# Load the sample workbook ("XXXXXXX" is presumably a placeholder for the real directory).
data = pd.read_excel("XXXXXXX\\异常值处理.xlsx",engine='openpyxl')   
# Bare expression: only has a visible effect in an interactive/notebook session.
data

随便示例的数据
在这里插入图片描述

# Untouched copy of the loaded data; the loop below plots it as the "before" series.
data_yichang = data.copy()
def detect_outliers(data, m=2.):
    """Return the positions of the outliers in a 1-D sequence using Z-scores.

    Each value is standardized as ``z = (x - mean) / std`` (population
    standard deviation) and flagged when ``abs(z) > m``.

    Args:
        data: 1-D sequence of numbers (list, NumPy array, pandas Series, ...).
            NaN entries are ignored when computing the mean and standard
            deviation and are never reported as outliers.
        m: Threshold in standard deviations. The default ``2.`` flags values
            farther than two standard deviations from the mean; pass ``3``
            for the 3-sigma rule.

    Returns:
        A list of 0-based integer positions of the flagged values, in
        ascending order. Empty when the input is empty, contains only NaN,
        or has zero spread.
    """
    values = np.asarray(data, dtype=float)
    valid = ~np.isnan(values)
    if not valid.any():
        # Nothing to standardize (empty or all-NaN input).
        return []

    # Statistics are taken over the non-NaN values only; a single NaN would
    # otherwise turn the mean, the std and every z-score into NaN.
    mean = values[valid].mean()
    std = values[valid].std()
    if std == 0:
        # Constant data: no value deviates from the mean, and dividing by
        # zero would produce NaN/inf scores.
        return []

    z_score = (values - mean) / std
    with np.errstate(invalid="ignore"):
        # NaN scores compare as False, so missing values are never flagged.
        flagged = np.abs(z_score) > m
    return np.flatnonzero(flagged).tolist()

# Detect the outliers of every column with the Z-score rule (detect_outliers),
# replace them with the mean of the column's remaining values, and plot the
# column before and after the replacement.
row = 1  # subplot grid: one row ...
col = 3  # ... and three columns, of which the first two are used
for m in range(len(data.columns)):
    column_name = data.columns[m]
    outliers = detect_outliers(data.iloc[:, m])

    # The returned values are row positions inside the column.
    print(f"异常值的行索引: {outliers}")
    if not outliers:
        print(f"{column_name}列-没有-个异常值")
        continue

    print(f"{column_name}--列有异常值--{data.iloc[outliers, m].tolist()}")

    # Blank out only this column's outlier cells. DataFrame.replace would
    # also wipe equal values in every other column. Work on a float copy so
    # NaN and the fractional mean can be stored even for integer columns.
    cleaned = data.iloc[:, m].astype(float)
    cleaned.iloc[outliers] = np.nan
    # Series.mean skips NaN, so the outliers do not contribute to the value
    # that replaces them.
    fill_value = cleaned.mean()
    cleaned.iloc[outliers] = fill_value
    data[column_name] = cleaned.to_numpy()
    print("将异常值替换为", fill_value, '\n')

    # Plot the column before (outliers marked in red) and after the fix.
    plt.figure(figsize=(12, 2))
    ax = plt.subplot(row, col, 1)
    ax.plot(np.arange(len(data_yichang.index)), data_yichang.iloc[:, m],
            label=f'{data_yichang.columns[m]}', c='green')
    # Positional lookup so the markers are right for any index labels.
    ax.scatter(outliers, data_yichang.iloc[outliers, m],
               c='red', s=80, label='异常点')
    ax.legend()
    ax = plt.subplot(row, col, 2)
    ax.plot(np.arange(len(data.index)), data.iloc[:, m],
            label=f'{data.columns[m]}', c='green')
    ax.legend()
    plt.show()
# Bare expression: only has a visible effect in an interactive/notebook session.
data