加密数据的预处理:将数据预处理的程序以及注释

目录

缺失值处理

去噪

删除冗余特征和一些日期特征

百分比字符串转浮点型

数值型特征归一化

字符串转数字

保存


# Imports first: the original used `pd`, `os` and `ZipFile` without importing
# them, and called pd.set_option before any import statement.
import os
from zipfile import ZipFile

import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib import font_manager
from matplotlib.ticker import FormatStrFormatter

# Show every column and up to 200 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)

# root = os.path.dirname(os.getcwd())
data_path = '.'

# Unpack the bundled archive into data_path ('.', i.e. the working directory,
# which is also where the original bare extractall() wrote its output).
with ZipFile('data/data24164/lendingclub.zip') as z:
    z.extractall(data_path)
def read_data(path):
    """
    Load every zipped CSV found directly inside a folder.

    :params path: str, folder that holds the zipped data sets

    :return data: list with one DataFrame per zip file [df1, df2, ...]
    """
    data = []
    for f in os.listdir(path):
        # Only files whose name ends in 'zip' are treated as data sets.
        if not f.endswith('zip'):
            continue
        # Build the file path from the `path` argument; the original joined
        # against the global `data_path`, so any other folder was read wrongly.
        # The first line of each file is skipped and the last two rows are
        # discarded (presumably non-data header/footer lines -- confirm
        # against the raw export).
        df = pd.read_csv(os.path.join(path, f),
                         compression='zip', low_memory=False, skiprows=1)[:-2]

        data.append(df)
        print('读取{}, {:6d}个样本, {}个特征'.format(f, df.shape[0], df.shape[1]))
    return data

# Load one DataFrame per zipped CSV found in data_path.
data = read_data(data_path)
读取LoanStats_securev1_2017Q2.csv.zip, 105451个样本, 150个特征
读取LoanStats_securev1_2016Q4.csv.zip, 103546个样本, 150个特征
读取LoanStats_securev1_2017Q1.csv.zip, 96779个样本, 150个特征
读取LoanStats_securev1_2019Q4.csv.zip, 128258个样本, 150个特征
读取LoanStats_securev1_2019Q2.csv.zip, 131139个样本, 150个特征
读取LoanStats_securev1_2018Q4.csv.zip, 128412个样本, 150个特征
读取LoanStats_securev1_2018Q1.csv.zip, 107864个样本, 150个特征
读取LoanStats_securev1_2017Q4.csv.zip, 118648个样本, 150个特征
读取LoanStats_securev1_2016Q1.csv.zip, 133887个样本, 150个特征
读取LoanStats_securev1_2019Q3.csv.zip, 143035个样本, 150个特征
读取LoanStats_securev1_2018Q3.csv.zip, 128194个样本, 150个特征
读取LoanStats_securev1_2020Q1.csv.zip, 105012个样本, 150个特征
读取LoanStats_securev1_2018Q2.csv.zip, 130772个样本, 150个特征
读取LoanStats_securev1_2016Q2.csv.zip, 97854个样本, 150个特征
读取LoanStats_securev1_2016Q3.csv.zip, 99120个样本, 150个特征
读取LoanStats_securev1_2017Q3.csv.zip, 122701个样本, 150个特征
读取LoanStats_securev1_2019Q1.csv.zip, 115675个样本, 150个特征

# Stack the per-file frames into one DataFrame and renumber the rows from 0.
data = pd.concat(data).reset_index(drop=True)

lending club 的数据集会随时间改变

loan_status(贷款状态):

  • Current
  • Fully Paid (全部偿还)
  • Charged Off (冲销,投资人有损失)
  • Default 违约
  • In Grace Period(在宽限期)
  • Late (16-30 days)(延期16-30天)
  • Late (31-120 days)(延期31-120天)

状态Current(贷款还款中),不能确定是否违约,所以这部分数据不是有效数据,应该去掉

https://www.lendingclub.com/info/demand-and-credit-profile.action

剔除 Current、In Grace Period、Issued 三种状态的数据;Fully Paid 是好账,其他的作为坏账。

data.groupby('loan_status').size()

loan_status
Charged Off           205754
Current               974958
Default                  410
Fully Paid            786868
In Grace Period         9539
Issued                   518
Late (16-30 days)       2604
Late (31-120 days)     15696
dtype: int64

data.groupby('loan_status').size()
 

# Label encoding: 1 = bad loan, 0 = good loan, -1 = rows to discard.
loan_status_dict = {"Fully Paid": 0,
                    "Charged Off": 1,
                    "Late (31-120 days)": 1,
                    "Late (16-30 days)": 1,
                    "Default": 1,
                    "Current": -1,
                    "In Grace Period": -1,
                    "Issued": -1}
# NOTE: a status that is not a key of the dict maps to NaN and is NOT removed
# by the filter below.
data["loan_status"] = data["loan_status"].map(loan_status_dict)
# Discard the -1 rows. Take an explicit copy: later steps modify `data` in
# place (in-place drops, column assignment), and doing that on a filtered
# selection of another frame triggers pandas' SettingWithCopyWarning.
data = data[data["loan_status"] != -1].copy()

缺失值处理

# Number of missing values per feature, most-missing first.
total_misval = data.isna().sum().sort_values(ascending=False)
# Keep only the features that actually have missing values.
total_misval = total_misval[total_misval != 0]
# Fraction of rows in which each feature is missing. The original divided by
# total_misval.max(), which is only a true ratio when some feature is missing
# in every row; dividing by the row count is correct in all cases (and gives
# the same numbers whenever such a fully-missing feature exists).
per_misval = total_misval / len(data)

# Plot the share of missing values per feature.
def draw_per_misval():
    """Draw a horizontal bar chart of the features in the module-level
    ``per_misval`` whose value is above 0.1, on a percentage axis."""
    # Set the style before the axes are created so that it applies to them.
    sns.set_style("whitegrid")
    _, ax = plt.subplots(figsize=(10, 10), dpi=100)

    # Only show features with more than 10% missing values.
    shown = per_misval[per_misval > 0.1]
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally.
    sns.barplot(x=shown * 100,
                y=shown.index,
                ax=ax,
                palette="GnBu_r")

    # Render the x tick labels as percentages.
    ax.xaxis.set_major_formatter(FormatStrFormatter("%2.f%%"))
    ax.set_title("missing value")
    plt.show()
draw_per_misval()

# Remove every feature whose missing-value ratio is above the threshold.
def drop_misval_ft(data, threshold, per_misval):
    """Drop, in place, the columns of ``data`` whose entry in ``per_misval``
    is greater than ``threshold``; print how many were dropped and return
    the same ``data`` object."""
    too_sparse = per_misval.loc[per_misval > threshold].index
    data.drop(columns=too_sparse, inplace=True)
    print("删掉了{}个特征".format(len(too_sparse)))
    return data

# For features whose missing ratio is below the threshold, drop the samples
# (rows) that have a missing value in any of those features.
def drop_samples(data, threshold, per_misval):
    """Drop, in place, the rows of ``data`` that are missing a value in any
    feature whose entry in ``per_misval`` is below ``threshold``.

    Prints the row count before and after, and returns the same ``data``
    object.
    """
    candidates = per_misval[per_misval < threshold].index
    # per_misval may still list columns that have already been removed from
    # `data`; passing those to dropna(subset=...) raises KeyError, so keep
    # only the ones that are present.
    features = [ft for ft in candidates if ft in data.columns]

    print("当前共有{}个样本".format(data.shape[0]))
    if features:
        data.dropna(subset=features, inplace=True)
    print("删除完毕,当前共有{}个样本".format(data.shape[0]))
    return data


# Drop features whose per_misval value is above 0.15, then drop the rows that
# are missing a value in any feature whose per_misval value is below 0.05.
data = drop_misval_ft(data, 0.15, per_misval)
data = drop_samples(data, 0.05, per_misval)
删掉了44个特征
当前共有1011332个样本
删除完毕,当前共有892821个样本

# Histogram of il_util (the original line was garbled into `.hi()st`,
# which is a syntax error).
data["il_util"].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f16232fdc50>

data["mths_since_recent_inq"].hist()
<matplotlib.axes._subplots.AxesSubplot at 0x7f1623337550>

# Fill the remaining gaps in these columns with 0. The filled column is
# assigned back rather than calling fillna(inplace=True) on data[col]: that
# chained in-place form operates on a temporary object under pandas
# copy-on-write and leaves `data` unchanged.
data["il_util"] = data["il_util"].fillna(0)
data["mths_since_recent_inq"] = data["mths_since_recent_inq"].fillna(0)
data.drop("emp_title", axis=1, inplace=True)


# Map the employment-length strings to numbers ("< 1 year" becomes 0.5).
emp_length_dict = {"10+ years": 10, "2 years": 2, "< 1 year": 0.5, "3 years": 3, "1 year": 1, "5 years": 5,
                   "4 years": 4, "6 years": 6, "7 years": 7, "8 years": 8, "9 years": 9}
# Values missing from the dict (including NaN) map to NaN and are set to 0.
data["emp_length"] = data["emp_length"].map(emp_length_dict).fillna(0)

去噪

# Remove features that are dominated by a single value.
def drop_high_freq_features(df, freq_limit):
    """Drop, in place, every column of ``df`` whose most common value accounts
    for more than ``freq_limit`` of all rows; print how many columns were
    dropped and how many remain, and return the same ``df`` object."""
    total_rows = df.shape[0]
    dominated = [
        column
        for column in df.columns
        # count of the most frequent value, as a share of all rows
        if df[column].value_counts().max() / total_rows > freq_limit
    ]

    df.drop(columns=dominated, inplace=True)
    print("删掉了{}个特征".format(len(dominated)))
    print("还剩{}个特征".format(df.shape[1]))
    return df


# Drop every feature whose most common value covers more than 95% of the rows.
data = drop_high_freq_features(data, freq_limit=0.95)
删掉了14个特征
还剩91个特征

# Drop the month and keep only the year: the last four characters of the
# issue date string. The vectorized .str slice leaves missing values as NaN,
# where the per-element lambda raised on them.
data["issue_d"] = data["issue_d"].str[-4:]

删除冗余特征和一些日期特征

这几个特征和借款人信息无关吧

  • total_rec_prncp(迄今收到的本金)
  • total_rec_int (迄今收到的利息)
  • out_prncp (总资金中剩余的未偿还本金)

手动删除一些特征

# Redundant features: addr_state, zip_code
# Highly correlated features: loan_amnt, funded_amnt_inv, total_pymnt_inv, out_prncp_inv
# Columns removed by hand. loan_status is in the list because it is the
# target; it is copied into `labels` below before being dropped.
dorp_features = [
                "id",
                "funded_amnt",
                "funded_amnt_inv",
                # "issue_d",  # month the loan was issued
                "url",
                "zip_code",
                "addr_state",
                "earliest_cr_line",  # month the borrower's earliest reported credit line was opened
                "total_pymnt",
                "total_pymnt_inv",
                "total_rec_prncp",
                "total_rec_int",
                "total_rec_late_fee",
                
                "last_pymnt_amnt",  # last total payment amount received
                "last_pymnt_d",  # last month a payment was received
                "last_credit_pull_d",  # most recent month credit was pulled for this loan
                "loan_status"
                # "out_prncp_inv",
                 ]
# Keep a copy of the target column before it is dropped with the others.
labels = data['loan_status'].copy()


# Remove all listed columns from `data` in place.
data.drop(dorp_features, axis=1, inplace=True)

百分比字符串转浮点型

# Convert percentage strings to floats.
def per2float(df):
    """Convert the text columns of ``df`` that hold percentage strings
    (e.g. ``"13.5%"``) to float fractions (``0.135``), in place.

    A column is treated as a percentage column when its first non-missing
    value contains ``"%"``. The name of each converted column is printed.

    :params df: DataFrame to convert
    :return df: the same DataFrame object
    """
    for feature in df.columns:
        column = df[feature]
        # Only text columns can hold percentage strings. The original tested
        # the global `data` here instead of the `df` argument.
        if not (pd.api.types.is_object_dtype(column)
                or pd.api.types.is_string_dtype(column)):
            continue
        # Inspect the first non-missing value, so that an empty column or a
        # leading NaN neither raises nor hides a percentage column.
        present = column.dropna()
        if present.empty or "%" not in str(present.iloc[0]):
            continue
        print(feature)
        # Strip surrounding whitespace and the percent sign, then convert.
        # Missing values stay NaN; a non-numeric string still raises.
        cleaned = column.str.strip().str.strip("%")
        df[feature] = pd.to_numeric(cleaned) / 100
    # Return the frame that was passed in (the original returned the global
    # `data`).
    return df


# Convert the percentage-string columns of `data` to floats.
data = per2float(data)
int_rate
revol_util

数值型特征归一化

# Standardize every numeric column: subtract its mean, divide by its
# standard deviation.
num_features = data.select_dtypes('number').columns
feature_mean = data[num_features].mean()
# A constant column has a standard deviation of 0, and 0/0 would turn the
# whole column into NaN. Dividing by 1 instead leaves it at 0.
feature_std = data[num_features].std().replace(0, 1)
data[num_features] = (data[num_features] - feature_mean) / feature_std

字符串转数字

from sklearn.preprocessing import LabelEncoder


def _encode_column(column):
    """Encode one column with a LabelEncoder fitted on that column alone."""
    return LabelEncoder().fit_transform(column)


# Replace every object-dtype column with its integer codes.
category_features = data.select_dtypes('object').columns
data[category_features] = data[category_features].apply(_encode_column)

保存

# Re-attach the label column that was set aside before the feature
# transformations, as the last column.
data = pd.concat([data, labels], axis=1)


# Optional train/test split, left disabled:
# test = data.sample(10000, random_state=2020)
# train = data.drop(axis=0, index=test.index)
# train.to_csv(os.path.join(data_path, 'train.csv.zip'), index=None, compression='zip')
# test.to_csv(os.path.join(data_path, 'test.csv.zip'), index=None, compression='zip')


# Write the processed data set as a zip-compressed CSV, without the index.
output_file = os.path.join(data_path, 'dataset.csv.zip')
data.to_csv(output_file, index=None, compression='zip')

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

懒羊羊夸夸~

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值