1.去掉一些明显没有用的特征,如'desc','url',并将剩下的特征保存到一个新的csv文件中
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Suppress every warning category for the remainder of the run.
warnings.filterwarnings("ignore")

# Step 1: load the raw export. skiprows=1 makes pandas skip the first line of
# the file before parsing the header.
loans_2007 = pd.read_csv(
    "C:/Users/lb/Desktop/test/LoanStats3a.csv",
    skiprows=1,
)
# Preview expression: only echoes output in an interactive session; its
# result is not stored.
loans_2007.head(5)

# A column is kept only if it holds at least half_count non-NA values,
# i.e. it is populated in at least half of the rows.
half_count = len(loans_2007) / 2
loans_2007 = (
    loans_2007
    .dropna(axis=1, thresh=half_count)
    .drop(columns=['desc', 'url'])
)

# NOTE(review): the '.scv' extension looks like a typo for '.csv'. It is kept
# because the later read step opens the file under this exact name.
loans_2007.to_csv('loans_2007.scv', index=False)
2.输出数据标签,初步判断无用特征
# Step 2: load the intermediate file from the working directory, inspect it,
# and drop columns judged not useful as features.
loans_2007 = pd.read_csv("./loans_2007.scv")

# drop_duplicates() returns a new DataFrame instead of modifying the caller,
# so the result must be assigned back for the de-duplication to take effect.
loans_2007 = loans_2007.drop_duplicates()

# Print the first record (each column label with one sample value), a
# separator line, and the number of columns.
print(loans_2007.iloc[0])
print("--------------------------------------------")
print(loans_2007.shape[1])

# Remove the listed columns from the frame.
loans_2007 = loans_2007.drop(
    columns=[
        "id",
        "member_id",
        "funded_amnt",
        "funded_amnt_inv",
        "grade",
        "sub_grade",
        "emp_title",
        "issue_d",
    ],
)
"""
#删除:被模型预测后的值
#zip_code:邮编 美国常用
#out_prncp和out_prncp_inv都是一样的:总资金中剩余的未偿还本金
#out_prncp_inv:实际未偿还的本金
#total_rec_prncp:迄今收到的本金
"""
loans_2007 = loans_2007.drop(["zip_code", "out_prncp&