目录
本代码对名为 data.xls 的数据集进行逻辑回归预测。数据集的列依次为:年龄、教育水平、当前工作年限、当前居住年限、家庭收入、债务占收入比例、信用卡负债、其他负债、还款拖欠情况。其中“还款拖欠情况”为标签,其余各列为特征;训练前先对特征进行标准化。
1.导入库
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
2.数据处理
# Load the data set. Every column except the last is used as a feature and
# the last column is used as the label.
df = pd.read_excel('data.xls')
X = df.iloc[:, :-1]  # feature columns
y = df.iloc[:, -1]  # label column

# Split BEFORE scaling so the held-out rows never contribute to the scaler's
# mean and variance (fitting the scaler on all rows leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
3.交叉验证选择较优惩罚因子
# Mean 10-fold cross-validated recall for each candidate value of C,
# stored in the same order as c_param_range.
scores = []
c_param_range = [0.01, 0.1, 1, 10, 100]
for c_value in c_param_range:
    lr = LogisticRegression(
        C=c_value, penalty='l2', solver='lbfgs', max_iter=1000
    )
    # One recall value per fold.
    fold_recalls = cross_val_score(
        lr, X_train, y_train, cv=10, scoring='recall'
    )
    score_mean = fold_recalls.mean()
    scores.append(score_mean)
    print(score_mean)  # show how each candidate C performs

# np.argmax returns the first index of the maximum, so when several
# candidates tie on mean recall the earliest (smallest) C is chosen.
best_c = c_param_range[np.argmax(scores)]
print("最优惩罚因子为: {}".format(best_c))
4.训练和预测
# Fit the final model on the training split with the selected C. The solver
# is named explicitly so this model uses the same configuration that was
# cross-validated, whatever the installed scikit-learn's default solver is.
model = LogisticRegression(
    C=best_c, penalty='l2', solver='lbfgs', max_iter=1000
)
model.fit(X_train, y_train)

# Predict labels for the test split.
y_pred = model.predict(X_test)
5.评估模型
# Fraction of test-split predictions that equal the true labels,
# printed as a percentage with two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'模型准确率: {accuracy * 100:.2f}%')