问题描述:
多个指标,每个指标之间的数值互相不相关,需要通过机器学习方式自动对数值的上下界进行界定,有以下问题:
- 不知道何种范围内的数值是正常的;
- 不清楚数据的分布情况;
- 暂定用KMeans对数据进行聚类,将数据的最高和最低0.5%的数据视为异常值。将正常值中的最大值作为上界,最小值作为下界。
以下为代码示例:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from yellowbrick.cluster.elbow import kelbow_visualizer
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype
def reduce_mem_usage(df, use_float16=False):
    """Downcast every column of *df* in place to the smallest dtype that
    holds its value range, to reduce memory usage.

    Datetime and categorical columns are skipped; object columns are
    converted to ``category``.  Memory statistics are printed before and
    after the conversion.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.
    use_float16 : bool, optional
        Also allow float16 downcasting (lossy precision), default False.

    Returns
    -------
    pandas.DataFrame
        The same (mutated) frame, returned for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type: astype below would break them
            continue
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                # pick the narrowest signed integer dtype that fits the range
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # float column: float16 only when explicitly allowed (lossy)
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            # object columns become category, which stores each distinct value once
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
def get_downedge(clu_center, label, threshold=0.005):
    """Return the smallest cluster center whose cumulative sample share
    reaches *threshold*.

    Centers are consumed from the bottom up: the smallest center's sample
    count is accumulated, and as long as the cumulative share of samples is
    below *threshold* the center is discarded and the next-smallest one is
    added.  The first center at which the cumulative share reaches the
    threshold is returned as the lower edge.

    Parameters
    ----------
    clu_center : list
        Cluster centers (entries from ``cluster_centers_.tolist()``).
    label : numpy.ndarray
        Cluster label of every sample.
    threshold : float, optional
        Minimum cumulative sample fraction, default 0.005 (0.5%).

    Returns
    -------
    The qualifying entry of *clu_center*, or None if no center qualifies
    (unreachable for threshold <= 1, since the shares sum to 1).
    """
    data = clu_center.copy()
    # samples assigned to the current smallest center
    num_label = (label == clu_center.index(min(data))).sum()
    for _ in range(len(clu_center)):
        if num_label / len(label) >= threshold:
            return min(data)
        # share too small: drop this center and accumulate the next one's samples
        data.remove(min(data))
        if not data:
            # all centers consumed without reaching the threshold
            return None
        num_label += (label == clu_center.index(min(data))).sum()
def get_upedge(clu_center, label, threshold=0.005):
    """Return the largest cluster center whose cumulative sample share
    reaches *threshold*.

    Mirror image of ``get_downedge``: centers are consumed from the top
    down, accumulating sample counts until the cumulative share of samples
    reaches *threshold*; that center is returned as the upper edge.

    Parameters
    ----------
    clu_center : list
        Cluster centers (entries from ``cluster_centers_.tolist()``).
    label : numpy.ndarray
        Cluster label of every sample.
    threshold : float, optional
        Minimum cumulative sample fraction, default 0.005 (0.5%).

    Returns
    -------
    The qualifying entry of *clu_center*, or None if no center qualifies
    (unreachable for threshold <= 1, since the shares sum to 1).
    """
    data = clu_center.copy()
    # samples assigned to the current largest center
    num_label = (label == clu_center.index(max(data))).sum()
    for _ in range(len(clu_center)):
        if num_label / len(label) >= threshold:
            return max(data)
        # share below 0.5%: drop this center and accumulate the next one's samples
        data.remove(max(data))
        if not data:
            # all centers consumed without reaching the threshold
            return None
        num_label += (label == clu_center.index(max(data))).sum()
def kmeans_(dt, n=10, t=1):
    """Cluster 1-D data with KMeans and derive lower/upper bounds from the
    cluster centers via ``get_downedge`` / ``get_upedge``.

    Parameters
    ----------
    dt : numpy.ndarray
        Metric values; reshaped to a single-feature column internally.
    n : int, optional
        Exclusive upper limit of the elbow search over k, default 10.
    t : int, optional
        Multiplier applied to the elbow k for the final clustering, default 1.

    Returns
    -------
    tuple
        ``(downedge, upedge, data_)`` — edges are entries of the cluster
        center list; ``data_`` is the samples of the shared cluster when the
        two edges coincide, otherwise the sentinel 0.
    """
    # find the most suitable k with the elbow heuristic
    dt = dt.reshape(-1, 1)
    oz = kelbow_visualizer(KMeans(random_state=1, n_init='auto'), dt, k=(2, n), show=False)
    k = oz.elbow_value_
    if k is None:
        # BUG FIX: kelbow_visualizer leaves elbow_value_ as None when no elbow
        # is detected; k*t would then raise TypeError. Fall back to the
        # smallest candidate k.
        k = 2
    # final clustering
    kmeans = KMeans(n_clusters=k * t, random_state=1, n_init='auto')
    n_clu = kmeans.fit(dt)
    # cluster centers and the label of every sample
    clu_center = n_clu.cluster_centers_.tolist()
    label = n_clu.labels_
    # derive the lower and upper bounds
    data_ = 0
    downedge = get_downedge(clu_center, label)
    upedge = get_upedge(clu_center, label)
    if downedge == upedge:
        # both edges collapsed onto one cluster: expose that cluster's samples
        slabel = clu_center.index(downedge)
        tlabel = np.where(label == slabel)
        data_ = dt[tlabel]
    return downedge, upedge, data_
# Does the sample vary at all?
def ifnext(data):
    """Return True when *data* holds more than one distinct value."""
    return len(set(data)) > 1
def run1(file):
    """Determine per-column lower/upper bounds for every metric in *file*.

    Parameters
    ----------
    file : pandas.DataFrame
        One metric per column; NaNs are dropped per column.

    Returns
    -------
    dict
        column -> ``[downedge, upedge]``, or ``1`` when the column cannot be
        split further after one clustering pass (treated as >99% qualified),
        or ``0`` when the column is constant (useless, dropped downstream).
    """
    dic = {}
    for i in file.columns:
        print(i)
        c_data = file[i].dropna()
        values = np.array(c_data)
        if not ifnext(values):
            # constant column: no help for outlier detection; mark for removal
            dic[i] = 0
            continue
        downedge, upedge, data = kmeans_(values)
        if downedge == upedge:
            # BUG FIX: the original tested len(set(pd.DataFrame(data))), which
            # iterates the DataFrame's COLUMN LABELS (always exactly one for a
            # single-column array), so the check was always true and the second
            # clustering pass was unreachable. Count distinct sample values.
            if len(np.unique(data)) == 1:
                # the shared cluster is constant: treat as >99% qualified
                dic[i] = 1
                continue
            # cluster the shared cluster's samples once more to split the edges
            downedge, upedge, data = kmeans_(data)
        print(downedge, upedge)
        dic[i] = [downedge, upedge]
    return dic
if __name__ == '__main__':
    # Load the sample data
    file_path = r'data/demo.csv'
    file = pd.read_csv(file_path)
    # Shrink dtypes in place to cut memory usage
    reduce_mem_usage(file)
    # Filter empty columns and duplicated samples
    file_p = file
    file_p = file_p.dropna(axis=1,how='all') # drop columns that are entirely NaN
    file_p.drop_duplicates(inplace=True) # drop duplicated rows
    # Determine the per-metric lower/upper bounds
    ret = run1(file_p)
    # Build a 2-row frame: rows 'downedge'/'upedge', one column per metric
    edge = pd.DataFrame(ret)
    col = ['downedge', 'upedge']
    edge.index = col  # NOTE(review): assumes every dict value yields exactly 2 rows — verify for the scalar 0/1 entries run1 can return
    print(edge)
    # Drop metrics whose values never vary, i.e. the columns run1 marked 0
    df = edge.loc[:, (edge != 0).any(axis=0)]
    # Persist the bounds (index labels 'downedge'/'upedge' are dropped — confirm intended)
    df.to_csv('data/edge.csv',index=False)