# 1. Use k-means to classify the labels
import pandas as pd
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
import numpy as np
# 2. Load the data, cluster UF_RATIO, derive a threshold and save the labelled CSV
# Read the wide-table CSV that contains the UF_RATIO column.
path = 'C:/Users/S/Desktop/其他案例数据/UF_RATIO(EAC1)/data/4CleanData--csv大宽表+0-1标签/csvAll.csv'
df = pd.read_csv(path, encoding='utf8')
df  # notebook-style echo; has no effect when run as a plain script

# Pull out the ratio column and turn it into an (n, 1) column vector,
# the 2-D layout that is handed to the clustering step.
ratio = df['UF_RATIO']
a = np.reshape(np.array(ratio), (-1, 1))
a  # notebook-style echo
# Split the ratios into two clusters. random_state is fixed so that
# repeated runs on the same data produce the same assignment.
kmeans = KMeans(n_clusters=2, random_state=10).fit(a)

# Pair every ratio with its cluster label. The frame is built in one step
# rather than by creating an empty frame and filling it through
# ``.loc[:, col] = ...``: writing through a null slice into a frame that has
# no rows depends on pandas-version-specific behaviour.
# NOTE(review): newer pandas releases appear to treat that assignment as a
# no-op or raise -- confirm against the pandas version in use.
df22 = pd.DataFrame({'ratio': ratio, 'label': kmeans.labels_})
kmeans.labels_  # notebook-style echo; has no effect when run as a script

# Order the rows by ratio so the boundary between the clusters can be found
# by comparing neighbouring rows.
df33 = df22.sort_values('ratio')
df33  # notebook-style echo
# Locate the cluster boundary in the ratio-sorted frame and put the
# threshold halfway between the two ratios on either side of it.
#
# Neighbouring rows are compared for *any* change of label, not only 1 -> 0:
# which cluster k-means numbers 0 and which 1 is arbitrary, so the boundary
# may show up in either direction. Comparing labels[:-1] with labels[1:]
# also guarantees that no row past the end of the frame is ever read.
sorted_ratios = df33['ratio'].to_numpy()
sorted_labels = df33['label'].to_numpy()
boundaries = np.flatnonzero(sorted_labels[:-1] != sorted_labels[1:])

if boundaries.size == 0:
    # Without a label change there is nothing to derive a threshold from;
    # fail here instead of with a NameError on `threshold` further down.
    raise ValueError('no boundary between cluster labels found; cannot derive a threshold')

# If several boundaries exist, the last one determines the threshold.
for pos in boundaries:
    ratio1 = sorted_ratios[pos]
    ratio2 = sorted_ratios[pos + 1]
    print('ratio1:', ratio1, ' ratio2:', ratio2)
    threshold = (ratio1 + ratio2) / 2
    print('阈值:', threshold)

# Scatter plot of ratio against assigned cluster label.
plt.scatter(a, kmeans.labels_)
# Label every row from its UF_RATIO: 0 when the ratio is below the threshold,
# 1 otherwise. The comparison is done on the whole column at once, so it does
# not depend on the frame having a default 0..n-1 index.
#
# `~(x < threshold)` is used instead of `x >= threshold` so that a row whose
# ratio is NaN is labelled 1, exactly as a row-by-row `<` / else test does.
label = (~(df['UF_RATIO'] < threshold)).astype('int64')

# The label becomes the fourth column (position 3); the values are already
# int64, so no placeholder column or later cast is needed.
df.insert(loc=3, column='label', value=label)

# The threshold is embedded in the output file name. utf_8_sig prepends a
# BOM (presumably so spreadsheet tools detect the encoding -- confirm).
df.to_csv('C:/Users/S/Desktop/其他案例数据/UF_RATIO(EAC1)/data/4CleanData--csv大宽表+0-1标签/UF_RATIO-' + str(threshold) + '.csv', index=False, encoding='utf_8_sig')