非均衡性问题
0 导入相关库
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics # 评估
from sklearn.linear_model import LogisticRegression
1 加载数据
# Column layout of the file: the feature columns followed by the label column.
features = ["x1", "x2"]
labels = ["y"]

# The CSV is read without a header row, so the columns are named afterwards.
df = pd.read_csv('Regression/Regression7/imbalance.csv', header=None)
df.columns = features + labels

# Print a summary of the loaded frame.
df.info()

# Show 5 random rows; df.sample(frac=0.05) would draw a 5% fraction
# (row count rounded) instead.
df.sample(n=5)
2 不调整权重
# Baseline: logistic regression without any class re-weighting.
# In scikit-learn C is the inverse of the regularization strength, so
# C=1e4 makes the penalty term negligible.
model = LogisticRegression(C=1e4)
# The target is passed as a 1-D Series: df[labels] would be an (n, 1)
# DataFrame, for which scikit-learn emits a DataConversionWarning.
model.fit(df[features], df['y'])
2.1 分类概率
# Predicted class probabilities for the training rows, shown as a table.
prob = model.predict_proba(df.loc[:, features])
pd.DataFrame(data=prob)
2.2 分类汇总情况
# Hard class predictions of the baseline model on the training rows,
# followed by the per-class precision / recall / F1 text report.
pred = model.predict(df.loc[:, features])
print(metrics.classification_report(y_true=df['y'], y_pred=pred))
2.3 混淆矩阵
# Confusion matrix of the baseline model. scikit-learn puts the true labels
# on the rows and the predicted labels on the columns.
confusion = metrics.confusion_matrix(df['y'], pred)
confusion
# matshow draws matrix rows along the y-axis, so the y-axis carries the
# actual class ('实际') and the x-axis the predicted class ('预测').
plt.matshow(confusion)
plt.title('混淆矩阵')
plt.colorbar()
plt.ylabel('实际')
plt.xlabel('预测')
plt.show()
3 加权方法#1
使用各类别样本占比的倒数手动设置类别权重
# Manual class weights: each class is weighted by the inverse of its share
# of the samples, so the rarer class gets the larger weight.
y = df['y']
# Fraction of rows whose label is positive (y > 0).
positiveWeight = float((y > 0).sum()) / len(y)
classWeight = {
    1: 1.0 / positiveWeight,
    0: 1.0 / (1.0 - positiveWeight),
}
为了消除惩罚项(正则化)的干扰,将参数 C 设为很大;在 scikit-learn 中 C 是正则化强度的倒数,C 越大惩罚越弱
3.1 分类汇总情况
# Weighted model #1: same setup as the baseline, but with the manually
# computed inverse-frequency class weights. C=1e4 keeps the penalty negligible.
model1 = LogisticRegression(class_weight=classWeight, C=1e4)
# The target is passed as a 1-D Series: df[labels] would be an (n, 1)
# DataFrame, for which scikit-learn emits a DataConversionWarning.
model1.fit(df[features], df['y'])
pred1 = model1.predict(df[features])
# pred1 stays a single-column DataFrame so later cells see the same type.
pred1 = pd.DataFrame(pred1)
print(metrics.classification_report(df['y'], pred1))
3.2 混淆矩阵
# Confusion matrix of the manually weighted model, computed once and reused
# for both the table display and the plot. Rows are the true labels and
# columns are the predicted labels (scikit-learn convention).
confusion1 = metrics.confusion_matrix(df['y'], pred1)
confusion1
# matshow draws matrix rows along the y-axis, so the y-axis carries the
# actual class ('实际') and the x-axis the predicted class ('预测').
plt.matshow(confusion1)
plt.title('混淆矩阵')
plt.colorbar()
plt.ylabel('实际')
plt.xlabel('预测')
plt.show()
4 加权方法#2
balanced方法
4.1 分类汇总情况
# Weighted model #2: let scikit-learn derive the class weights itself via
# class_weight='balanced'. C=1e4 keeps the penalty negligible.
model2 = LogisticRegression(class_weight='balanced', C=1e4)
# The target is passed as a 1-D Series: df[labels] would be an (n, 1)
# DataFrame, for which scikit-learn emits a DataConversionWarning.
model2.fit(df[features], df['y'])
pred2 = model2.predict(df[features])
# pred2 stays a single-column DataFrame so later cells see the same type.
pred2 = pd.DataFrame(pred2)
print(metrics.classification_report(df['y'], pred2))
4.2 混淆矩阵
# Confusion matrix of the 'balanced' model, computed once and reused for
# both the table display and the plot. Rows are the true labels and columns
# are the predicted labels (scikit-learn convention).
confusion2 = metrics.confusion_matrix(df['y'], pred2)
confusion2
# matshow draws matrix rows along the y-axis, so the y-axis carries the
# actual class ('实际') and the x-axis the predicted class ('预测').
plt.matshow(confusion2)
plt.title('混淆矩阵')
plt.colorbar()
plt.ylabel('实际')
plt.xlabel('预测')
plt.show()