import pandas as pd
import numpy as np
# Load the raw loan statistics file; skiprows=1 skips the first line of the
# file before the header row is parsed.
loans_2007 = pd.read_csv("LoanStats3a.csv", skiprows=1)

# Keep only the columns (axis=1) in which at least half of the values are
# present. `thresh` is documented as an int, so the half-way mark is rounded
# up; because non-NA counts are integers this selects exactly the same
# columns as the fractional len / 2 would.
half_count = (len(loans_2007) + 1) // 2
half_count  # notebook display only; no effect when run as a script
loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
loans_2007.head()  # notebook display only

# Further preprocessing (removing irrelevant features and features that are
# too strongly correlated with others) produced loans_2007.csv.
loans_2007 = pd.read_csv("loans_2007.csv")
# drop_duplicates() returns a new frame, so the result must be assigned
# back; otherwise the duplicate rows stay in loans_2007.
loans_2007 = loans_2007.drop_duplicates()

# Notebook inspection of the data (no effect when run as a script).
loans_2007.iloc[0]
loans_2007["grade"].value_counts()
loans_2007.shape[1]

# Features such as "id" are of no use for the analysis, so remove them.
# drop() removes rows by default; axis=1 makes it remove columns.
loans_2007 = loans_2007.drop(["id"], axis=1)

# loan_status is used as the label. Encode it as 1/0 with map(); any status
# that is not listed in the mapping becomes NaN.
status_map = {"Fully Paid": 1, "Charged Off": 0}
loans_2007["loan_status"] = loans_2007["loan_status"].map(status_map)
loans_2007["loan_status"]

# dropna(axis=0) removes every row that has a NaN in any column, which
# includes the rows whose status was not mapped above.
loans_2007 = loans_2007.dropna(axis=0)
loans_2007
loans_2007["loan_status"].value_counts()
# Remove features whose values are all identical.
orig_columns = loans_2007.columns
# A column is dropped when, after discarding missing values, it contains
# exactly one distinct value.
drop_columns = [
    col
    for col in orig_columns
    if len(loans_2007[col].dropna().unique()) == 1
]
loans_2007 = loans_2007.drop(drop_columns, axis=1)
import pandas as pd
loans = pd.read_csv("filtered_loans_2007.csv")

# Count the missing values in every column.
null_counts = loans.isnull().sum()
print(null_counts)

# Remove the pub_rec_bankruptcies column, then every row that still contains
# a missing value.
loans = loans.drop("pub_rec_bankruptcies", axis=1)
loans = loans.dropna(axis=0)

# Pick out the string (object dtype) columns; they have to be converted to
# numeric values during preprocessing.
object_columns_df = loans.select_dtypes(include=["object"])
print(object_columns_df.iloc[0])

# Show the distribution of values in each of these string columns.
cols = ["home_ownership", "verification_status", "emp_length", "term", "addr_state"]
for c in cols:
    print(loans[c].value_counts())

# purpose and title carry similar information.
print(loans["purpose"].value_counts())
print(loans["title"].value_counts())
# Numeric encoding for emp_length. The outer key is the column name, so
# DataFrame.replace() applies the inner mapping to that column only.
# NOTE(review): the labels are spelled as in the public LendingClub export
# ("10+ years", "1 year", "< 1 year"); the earlier spellings "10+years",
# "1 years" and "<1 years" would not match those values. Confirm against the
# emp_length value_counts() output.
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0,
    }
}

loans = loans.drop(
    ["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"],
    axis=1,
)

# These columns hold strings: strip the trailing percent sign with rstrip()
# and convert the remainder to float.
loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")
loans["revol_util"] = loans["revol_util"].str.rstrip("%").astype("float")

# Encode emp_length exactly once. Calling
# loans["emp_length"].map(mapping_dict["emp_length"]) instead of replace() is
# an alternative, but the two must not be chained: once the column holds
# numbers, none of them is a key of the mapping and map() would turn every
# value into NaN.
loans = loans.replace(mapping_dict)

# Accuracy must not be used as the measure of how good the results are.
# Source article: "python -- bank anti-fraud model -- data preprocessing"
# (site footer: latest recommended article published 2022-12-04 12:20:27)