删除重复行
# Fields that are compared when deciding whether two rows are duplicates:
# rows count as duplicates only if they agree on every one of these columns.
dedup_columns = [
    'MonthlyIncome_rf',
    'age',
    'DebtRatio',
    'SeriousDlqin2yrs',
    'RevolvingUtilizationOfUnsecuredLines',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
]

# Inspect duplicates: boolean Series marking rows that repeat an earlier row.
isDuplicated = df.duplicated(subset=dedup_columns)
# To list the flagged rows: print(isDuplicated[isDuplicated==True])

# Remove the duplicated rows, judged on the same set of fields.
df = df.drop_duplicates(subset=dedup_columns)
# To check the resulting frame: df.info()
用箱型图查看数个属性的分布情况
# Box plots used to eyeball the distributions before removing outliers.
# Each spec is (columns to plot, tick labels, y-axis limits, optional title).
#   1. The three past-due counters (30-59, 60-89 and 90+ days); per the
#      original author's note, a few abnormally large points show up here.
#   2. The age distribution.
boxplot_specs = [
    (
        ['NumberOfTime30-59DaysPastDueNotWorse',
         'NumberOfTime60-89DaysPastDueNotWorse',
         'NumberOfTimes90DaysLate'],
        ['30-59', '60-89', '90'],
        (0, 100),
        None,
    ),
    (
        ['age'],
        ['age'],
        (20, 120),
        "age distribution",
    ),
]

for plot_columns, tick_names, (y_low, y_high), plot_title in boxplot_specs:
    plt.figure(figsize=(12, 12))
    plt.ylim(y_low, y_high)
    plt.boxplot(df[plot_columns], labels=tick_names)
    # Only the age plot carries a title.
    if plot_title is not None:
        plt.title(plot_title, fontsize=20)
    plt.grid(linestyle='--')
    plt.show()
发现 30-59 天、60-89 天、90 天以上这三项逾期次数中存在个别异常极大值,这里作删除处理;另外还发现存在 age=0 的记录,也一并删除。
# Remove anomalous rows: records whose age is not positive and records with
# an implausibly high past-due count (50 or more) in any of the three
# past-due counters.
past_due_columns = [
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfTimes90DaysLate',
]
has_valid_age = df['age'] > 0
has_plausible_past_due = (df[past_due_columns] < 50).all(axis=1)
df = df[has_valid_age & has_plausible_past_due]
# Rows with a very high debt ratio are outliers but not anomalies, so they
# are kept; the corresponding filter stays disabled:
# df = df[df['DebtRatio'] < 100000]