多指标上下界界定

问题描述:

多个指标,每个指标之间的数值互相不相关,需要通过机器学习方式自动对数值的上下界进行界定,有以下问题:

  1. 不知道何种范围内的数值是正常的;
  2. 不清楚数据的分布情况;
  3. 暂定用KMeans对数据进行聚类,将数据的最高和最低0.5%的数据视为异常值。将正常值中的最大值作为上界,最小值作为下界。

以下为代码示例:

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from yellowbrick.cluster.elbow import kelbow_visualizer
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype


def reduce_mem_usage(df, use_float16=False):
    """Downcast every numeric column of *df* (in place) to the smallest
    dtype that can hold its value range, to reduce memory usage.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to optimize; mutated in place.
    use_float16 : bool, optional
        Also allow downcasting floats to float16 (lossy for many values).

    Returns
    -------
    pd.DataFrame
        The same (mutated) frame, returned for chaining convenience.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        # Datetime and categorical columns have no numeric downcast; skip them.
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            # NOTE(review): unsigned int dtypes ('uint...') fail the 'int'
            # prefix test below and fall into the float branch — confirm no
            # uint columns are expected, or handle them explicitly.
            if str(col_type)[:3] == 'int':
                # Inclusive bounds so values exactly at a dtype's min/max
                # still fit (the original strict comparisons skipped them,
                # e.g. a [-128, 127] column was not downcast to int8).
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                else:
                    df[col] = df[col].astype(np.int64)
            else:
                if use_float16 and c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            # Object columns are assumed to be low-cardinality strings.
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df


def get_downedge(clu_center, label):
    """Return the lowest cluster center whose cumulative sample share,
    counted from the smallest center upward, reaches at least 0.5%.

    Clusters that together cover less than 0.5% of the samples at the
    bottom are treated as outliers and discarded; the smallest surviving
    center becomes the lower bound.

    Parameters
    ----------
    clu_center : list
        Cluster centers as a list (e.g. ``cluster_centers_.tolist()``);
        each element's position matches its label index.
    label : np.ndarray
        Cluster label per sample (``KMeans.labels_``).

    Returns
    -------
    The smallest remaining element of ``clu_center``; if every cluster is
    below the threshold, the final remaining candidate is returned instead
    of raising (the original could call ``min`` on an empty list for
    degenerate inputs).
    """
    data = clu_center.copy()
    total = len(label)
    # Sample count of the current smallest cluster.
    num_label = (label == clu_center.index(min(data))).sum()
    while data:
        if num_label / total >= 0.005:
            return min(data)
        if len(data) == 1:
            # Nothing left to fall back to; return the last candidate.
            return min(data)
        # Discard the smallest cluster and accumulate the next one's count
        # (cumulative bottom share).  NOTE(review): ``list.index`` finds the
        # first occurrence, so duplicate center values would double-count.
        data.remove(min(data))
        num_label += (label == clu_center.index(min(data))).sum()

def get_upedge(clu_center, label):
    """Return the highest cluster center whose cumulative sample share,
    counted from the largest center downward, reaches at least 0.5%.

    Mirror of ``get_downedge``: top clusters covering less than 0.5% of
    the samples are treated as outliers; the largest surviving center
    becomes the upper bound.  (The original inline comment said "1%",
    but the threshold here is 0.005 = 0.5%.)

    Parameters
    ----------
    clu_center : list
        Cluster centers as a list; element positions match label indices.
    label : np.ndarray
        Cluster label per sample (``KMeans.labels_``).

    Returns
    -------
    The largest remaining element of ``clu_center``, or the final
    candidate if every cluster falls below the threshold (instead of
    raising on an emptied list).
    """
    data = clu_center.copy()
    total = len(label)
    # Sample count of the current largest cluster.
    num_label = (label == clu_center.index(max(data))).sum()
    while data:
        if num_label / total >= 0.005:
            return max(data)
        if len(data) == 1:
            # Last candidate left; return it rather than crash.
            return max(data)
        # Discard the largest cluster and add the next-largest cluster's
        # sample count (cumulative top share below the 0.5% threshold).
        data.remove(max(data))
        num_label += (label == clu_center.index(max(data))).sum()

def kmeans_(dt, n=10, t=1):
    """Cluster 1-D samples with KMeans and derive lower/upper bounds.

    The elbow method (yellowbrick) picks a cluster count k in [2, n);
    the samples are then clustered into k*t groups and extreme clusters
    covering <0.5% of samples on each side are dropped as outliers via
    ``get_downedge`` / ``get_upedge``.

    Parameters
    ----------
    dt : np.ndarray
        1-D sample array (reshaped to a column internally).
    n : int
        Exclusive upper limit for candidate cluster counts.
    t : int
        Multiplier applied to the elbow k for the final clustering.

    Returns
    -------
    tuple
        ``(downedge, upedge, data_)`` where the edges are single-element
        lists (cluster centers) and ``data_`` holds the samples of the
        shared cluster when both edges coincide, else 0.
    """
    dt = dt.reshape(-1, 1)
    # Pick k with the elbow heuristic.
    oz = kelbow_visualizer(KMeans(random_state=1, n_init='auto'), dt, k=(2, n), show=False)
    k = oz.elbow_value_
    if k is None:
        # No elbow detected (e.g. near-flat inertia curve): fall back to
        # the smallest candidate instead of raising TypeError on k*t below.
        k = 2
    # Final clustering.
    kmeans = KMeans(n_clusters=k * t, random_state=1, n_init='auto')
    n_clu = kmeans.fit(dt)
    # Cluster centers and per-sample labels.
    clu_center = n_clu.cluster_centers_.tolist()
    label = n_clu.labels_
    # Derive the bounds.
    data_ = 0
    downedge = get_downedge(clu_center, label)
    upedge = get_upedge(clu_center, label)
    if downedge == upedge:
        # Both bounds collapsed onto a single cluster: hand its raw
        # samples back so the caller can re-cluster them.
        slabel = clu_center.index(downedge)
        tlabel = np.where(label == slabel)
        data_ = dt[tlabel]
    return downedge, upedge, data_


# Decide whether a sample column varies at all.
def ifnext(data):
    """Return False when every value in *data* is identical (exactly one
    distinct value), True otherwise — including for empty input."""
    distinct_values = set(data)
    return len(distinct_values) != 1


def run1(file):
    """Compute bounds for every column of *file*.

    Returns a dict mapping each column name to one of:
      * ``[downedge, upedge]`` — the clustered lower/upper bound,
      * ``1`` — the column could not be split further (pass rate > 99%),
      * ``0`` — the column is constant (useless for outlier detection;
        the caller drops such columns afterwards).
    """
    dic = {}
    for i in file.columns:
        print(i)
        c_data = file[i].dropna()
        if ifnext(np.array(c_data)):
            downedge, upedge, data = kmeans_(np.array(c_data))
            if downedge == upedge:
                # BUG FIX: the original tested len(set(pd.DataFrame(data))),
                # which iterates *column names* and therefore always yields 1,
                # making the re-clustering branch unreachable.  Count the
                # distinct sample values instead.
                if len(np.unique(data)) == 1:
                    # Cluster is constant after one split: treat the metric
                    # as unsplittable, i.e. pass rate > 99%.
                    dic[i] = 1
                    continue
                else:
                    # Edges coincided but values still vary: re-cluster the
                    # shared cluster's samples to refine the bounds.
                    downedge, upedge, data = kmeans_(data)
            print(downedge, upedge)
            dic[i] = [downedge, upedge]
        else:
            # Constant column: contributes nothing to outlier detection;
            # mark with 0 so it is dropped downstream.
            dic[i] = 0
    return dic

if __name__ == '__main__':
    # Load the sample data.
    file_path = r'data/demo.csv'
    file = pd.read_csv(file_path)
    # Downcast dtypes in place to shrink memory usage (return value unused).
    reduce_mem_usage(file)

    # Filter empty columns and duplicated samples.
    file_p = file
    file_p = file_p.dropna(axis=1,how='all')   # drop all-NaN columns
    file_p.drop_duplicates(inplace=True)  # drop duplicate rows

    # Derive the lower/upper bound for every metric column.
    ret = run1(file_p)
    # NOTE(review): `ret` values are mixed (lists, 0, 1); pd.DataFrame raises
    # "If using all scalar values..." when no column produced a list pair —
    # assumes at least one metric yields [down, up].  TODO confirm.
    edge = pd.DataFrame(ret)
    col = ['downedge', 'upedge']
    edge.index = col
    print(edge)

    # Drop constant metrics, i.e. columns whose cells are all 0.
    df = edge.loc[:, (edge != 0).any(axis=0)]

    # Persist the bounds (index labels 'downedge'/'upedge' are discarded).
    df.to_csv('data/edge.csv',index=False)
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值