"""Predict LendingClub loan repayment with logistic regression.

The script loads the raw loan export, cleans it, encodes the features, fits
a logistic-regression model and prints its true/false positive rates: first
for predictions on the training data, then for a class-weighted model
evaluated with 10-fold cross-validation.
"""
import csv
import sys
import warnings

import pandas as pd

# Suppress all warnings for the whole run.
warnings.filterwarnings('ignore')

# Default location of the raw export; can be overridden on the command line.
DATA_PATH = '/Users/yee/Desktop/LoanStats3a.csv'

# Columns that are not wanted as features (only features and the label stay).
UNUSED_COLUMNS = [
    "id", "member_id", "funded_amnt", "funded_amnt_inv", "grade",
    "sub_grade", "emp_title", "issue_d", "zip_code", "out_prncp",
    "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp",
    "total_rec_int", "total_rec_late_fee", "recoveries",
    "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt",
]

# Label encoding: 1 = loan fully repaid, 0 = loan charged off (default).
LOAN_STATUS_LABELS = {
    "Fully Paid": 1,
    "Charged Off": 0,
}

# Text columns that are dropped instead of being encoded.
TEXT_COLUMNS_TO_DROP = [
    "last_credit_pull_d",  # most recent month the credit was pulled
    "earliest_cr_line",    # date of the first credit line
    "addr_state",          # borrower address
    "title",               # title of the listing
]

# Employment length as a number of years ("10+ years" -> 10, "9 years" -> 9).
EMP_LENGTH_YEARS = {
    "10+ years": 10,
    "9 years": 9,
    "8 years": 8,
    "7 years": 7,
    "6 years": 6,
    "5 years": 5,
    "4 years": 4,
    "3 years": 3,
    "2 years": 2,
    "1 year": 1,
    "< 1 year": 0,
    "n/a": 0,
}

# Categorical columns turned into 0/1 dummy columns:
#   home_ownership      - home ownership
#   verification_status - verification status
#   purpose             - purpose of the loan
#   term                - repayment term
# emp_length is deliberately NOT listed: it has already been mapped to a
# number of years, and get_dummies() passes numeric columns through
# unchanged, so listing it here made the later drop() remove the feature.
ONE_HOT_COLUMNS = ["home_ownership", "verification_status", "purpose", "term"]


def load_raw_loans(csv_path):
    """Read the raw loan CSV and drop columns that are mostly empty.

    Args:
        csv_path: Path of the CSV export.

    Returns:
        A DataFrame without the ``desc`` and ``url`` columns and without any
        column that is null in more than half of the rows.
    """
    # The first line of the file is a text note, not the header: skip it.
    loans = pd.read_csv(csv_path, skiprows=1)
    # dropna(thresh=...) expects an integer count of non-null values.
    # Rounding len/2 up keeps exactly the columns the fractional threshold
    # would have kept.
    min_non_null = (len(loans) + 1) // 2
    loans = loans.dropna(thresh=min_non_null, axis=1)
    # Free-text description and URL carry no usable signal.
    return loans.drop(['desc', 'url'], axis=1)


def encode_loan_status(loans):
    """Keep only finished loans and encode ``loan_status`` as 1/0.

    Rows whose status is neither "Fully Paid" nor "Charged Off" (for example
    the "Late (16-30 days)" or "Does not meet the credit policy" statuses)
    have no final outcome yet and are removed.

    Args:
        loans: DataFrame with a ``loan_status`` text column.

    Returns:
        A new DataFrame with the remaining rows (original index kept) and an
        integer ``loan_status`` column.
    """
    finished = loans["loan_status"].isin(list(LOAN_STATUS_LABELS))
    loans = loans[finished].copy()
    # map() yields a numeric column directly instead of relying on the dtype
    # inference of DataFrame.replace().
    loans["loan_status"] = loans["loan_status"].map(LOAN_STATUS_LABELS)
    return loans


def drop_constant_columns(loans):
    """Drop every column that holds exactly one distinct non-null value."""
    constant = [
        col for col in loans.columns
        if len(loans[col].dropna().unique()) == 1
    ]
    return loans.drop(constant, axis=1)


def convert_text_columns(loans):
    """Turn the remaining text columns into numbers or drop them.

    scikit-learn does not accept string features, so the percentage columns
    are parsed to floats, ``emp_length`` is mapped to years, and the columns
    in ``TEXT_COLUMNS_TO_DROP`` are removed.

    Args:
        loans: DataFrame without missing values.

    Returns:
        A new DataFrame with the converted columns.

    Raises:
        ValueError: If ``emp_length`` holds a value missing from
            ``EMP_LENGTH_YEARS``.
    """
    loans = loans.drop(TEXT_COLUMNS_TO_DROP, axis=1)
    # int_rate (interest rate) looks like "10.65%": strip the percent sign.
    loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")
    # revol_util: revolving credit used as a share of the credit limit.
    loans["revol_util"] = loans["revol_util"].str.rstrip("%").astype("float")

    years = loans["emp_length"].map(EMP_LENGTH_YEARS)
    if years.isnull().any():
        unknown = sorted(set(loans.loc[years.isnull(), "emp_length"].astype(str)))
        raise ValueError(f"Unmapped emp_length values: {unknown}")
    loans["emp_length"] = years.astype("int64")
    return loans


def one_hot_encode(loans):
    """Replace the categorical columns with 0/1 dummy columns.

    Also drops ``pymnt_plan`` (whether a payment plan is in place), which is
    "n" for practically every row.
    """
    dummy_df = pd.get_dummies(loans[ONE_HOT_COLUMNS])
    loans = pd.concat([loans, dummy_df], axis=1)
    loans = loans.drop(ONE_HOT_COLUMNS, axis=1)
    return loans.drop("pymnt_plan", axis=1)


def confusion_counts(predictions, actual):
    """Count the four confusion-matrix cells for binary 0/1 labels.

    The two inputs are compared by position, never by index label, so a
    freshly built prediction array can safely be compared with a Series
    that still carries the index of the filtered frame.

    Args:
        predictions: Predicted labels (array, list or Series).
        actual: True labels (array, list or Series) of the same length.

    Returns:
        The tuple ``(tp, fp, fn, tn)``.

    Raises:
        ValueError: If the inputs differ in length.
    """
    predicted = pd.Series(predictions).to_numpy()
    truth = pd.Series(actual).to_numpy()
    if len(predicted) != len(truth):
        raise ValueError("predictions and actual must have the same length")

    tp = int(((predicted == 1) & (truth == 1)).sum())  # positive, predicted positive
    fp = int(((predicted == 1) & (truth == 0)).sum())  # negative, predicted positive
    fn = int(((predicted == 0) & (truth == 1)).sum())  # positive, predicted negative
    tn = int(((predicted == 0) & (truth == 0)).sum())  # negative, predicted negative
    return tp, fp, fn, tn


def positive_rates(tp, fp, fn, tn):
    """Return ``(tpr, fpr)`` for the given confusion-matrix counts.

    * True positive rate: share of actual 1s predicted as 1 (higher is
      better).
    * False positive rate: share of actual 0s predicted as 1 (lower is
      better).

    A rate whose denominator is zero is returned as NaN.
    """
    nan = float("nan")
    tpr = tp / float(tp + fn) if (tp + fn) else nan
    fpr = fp / float(fp + tn) if (fp + tn) else nan
    return tpr, fpr


def main(csv_path=DATA_PATH):
    """Run the whole pipeline on ``csv_path`` and print the results."""
    # Imported here so the helpers above can be imported without
    # scikit-learn being installed.
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_val_predict

    # Load the data source and clean it.
    loans_2020 = load_raw_loans(csv_path)
    print("原始数据 = ", loans_2020.shape)

    loans_2020 = loans_2020.drop(UNUSED_COLUMNS, axis=1)
    print(loans_2020.iloc[0])  # first row of data
    print("现在的列数 = ", loans_2020.shape[1])

    # Label column: show how often each status occurs, then encode it.
    print(loans_2020['loan_status'].value_counts())
    loans_2020 = encode_loan_status(loans_2020)

    # Columns with a single value carry no information.
    loans_2020 = drop_constant_columns(loans_2020)
    print(loans_2020.shape)

    # Missing values: report them per column, drop the column with too many
    # of them, then drop every row that still has one.
    null_counts = loans_2020.isnull().sum()
    print("每列的缺失值", null_counts)
    loans = loans_2020.drop("pub_rec_bankruptcies", axis=1)
    loans = loans.dropna(axis=0)
    print(loans.shape)

    # Show the text columns that still need converting.
    object_columns_df = loans.select_dtypes(include=["object"])
    print(object_columns_df.iloc[0])
    loans = convert_text_columns(loans)

    print("-----------------------------特征工程:对这些列的数据行转列,打上0/1的独热标签----------------------------------------")
    loans = one_hot_encode(loans)
    print(loans.iloc[0])

    print("-----------------------------使用逻辑回归算法库---------------------------------------")
    features = loans.drop("loan_status", axis=1)  # feature matrix
    target = loans["loan_status"]                 # label vector

    lr = LogisticRegression()
    lr.fit(features, target)
    predictions = lr.predict(features)  # ndarray of 0/1 labels
    proba = lr.predict_proba(features)
    print(proba[:10])        # class probabilities of the first ten rows
    print(predictions[:10])  # 0: will not repay, 1: will repay

    print("-----------------------------验证模型的好坏---------------------------------------")
    tp, fp, fn, tn = confusion_counts(predictions, target)
    print(fp)
    print("----------------------------------------")
    print(tp)
    print("----------------------------------------")
    print(fn)
    print("----------------------------------------")
    print(tn)

    tpr, fpr = positive_rates(tp, fp, fn, tn)
    print(tpr)  # true positive rate
    print(fpr)  # false positive rate

    print("-------------------改变权重重新计算--------------------------------------------------")
    # Class weights are free to choose: here class 0 weighs five times as
    # much as class 1.
    penalty = {
        0: 5,
        1: 1,
    }
    lr = LogisticRegression(class_weight=penalty)
    predictions = cross_val_predict(lr, features, target, cv=10)

    # Confusion matrix, compared by position (the prediction array has no
    # index, while target keeps the index of the filtered rows).
    tp, fp, fn, tn = confusion_counts(predictions, target)
    tpr, fpr = positive_rates(tp, fp, fn, tn)
    print(tpr)
    print(fpr)


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DATA_PATH)
逻辑回归简单应用
最新推荐文章于 2024-10-11 16:17:26 发布
本文通过Python实现贷款违约预测:首先对CSV数据进行预处理,包括数据清洗、特征选择、标签转换和缺失值处理;接着通过特征工程将类别特征转换为哑变量(独热编码),并用逻辑回归模型进行训练;随后调整类别权重,结合交叉验证重新训练。最后,以真正率和假正率评估模型,并对比类别权重对结果的影响。
摘要由CSDN通过智能技术生成