python-缺失值处理

#coding:utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import  train_test_split
from multiprocessing import Pool
import xgboost as xgb

# Create a pool of 8 worker processes.
# NOTE(review): `pool` is never used, closed, or joined in the visible code --
# confirm it is needed; if not, it only spawns idle worker processes.
pool = Pool(8)
# Column-name lists used by the imputation loop. The per-source groups are the
# single source of truth; the combined `feature` list is derived from them so
# the two can never drift apart.

_STATS = (u'max', u'min', u'mean', u'std')


def _with_stats(bases):
    """Expand each base name into its `_max`, `_min`, `_mean`, `_std` columns, in that order."""
    return [base + u'_' + stat for base in bases for stat in _STATS]


# features from user_info
feature1 = [
    u'age', u'sex', u'expect_quota', u'max_month_repay', u'occupation',
    u'education', u'marital_status_min', u'live_info', u'local_hk',
    u'money_function_mean', u'company_type', u'salary_max', u'school_type',
    u'flow', u'gross_profit', u'business_type', u'business_year',
    u'personnel_num', u'pay_type', u'product_id', u'tm_encode', u'info_num',
    u'time_gap', u'tm_encode_max_max_month_repay_min',
    u'education_max_info_num',
]

# features from user_consumption: u'num' followed by max/min/mean/std columns
# for every base name below
feature2 = [u'num'] + _with_stats([
    u'prior_period_bill_amt',
    u'prior_period_repay_amt',
    u'credit_lmt_amt',
    u'curt_jifen',
    u'current_bill_bal',
    u'current_bill_min_repay_amt',
    u'is_cheat_bill',
    u'cost_cnt',
    u'current_bill_amt',
    u'adj_amt',
    u'circle_interest',
    u'prior_period_jifen_bal',
    u'nadd_jifen',
    u'current_adj_jifen',
    u'avlb_bal_usd',
    u'avlb_bal',
    u'card_type',
    u'pre_borrow_cash_amt_usd',
    u'credit_lmt_amt_usd',
    u'pre_borrow_cash_amt',
    u'curr',
    u'repay_stat',
    u'current_min_repay_amt_usd',
    u'current_repay_amt_usd',
    u'current_convert_jifen',
    u'current_award_jifen',
])

# features from relation1
feature3 = [u'relation1_num']

# features from relation2
feature4 = [
    u'type0_num', u'type0_sum', u'type1_num', u'type1_sum',
    u'type2_num', u'type2_sum', u'type3_num', u'type3_sum',
    u'time_mean', u'time_min', u'time_max', u'time_std',
]

# Tag columns that are themselves imputation targets.
_tag_targets = [
    u'tag_num', u'tag_3', u'tag_4', u'tag_5', u'tag_6', u'tag_7', u'tag_8',
    u'tag_9', u'300028', u'300196', u'300301', u'300385', u'300469',
    u'300658', u'301036', u'301211', u'301687',
]

# features from tag: the targets above plus two extra columns that are only
# ever used as predictors (they are not part of `feature`).
feature5 = _tag_targets + ['300028_301687', '300196_2']

# Every column that will be imputed, in processing order.
feature = feature1 + feature2 + feature3 + feature4 + _tag_targets

# Model-based imputation: every column in `feature` that has missing values is
# predicted with XGBoost from the columns of the *other* feature groups, and
# the completed table is written to xgbimpute.csv.

# Read both tables, dropping the 'label' column from train and the
# 'probability' column from test so that they share the same columns.
df = pd.read_csv('completetrain.csv')
train = df.loc[:, df.columns != 'label']
df = pd.read_csv('completetest.csv')
df = df.drop_duplicates(subset='user_id')
test = df.loc[:, df.columns != 'probability']

# ignore_index gives every row a unique label; train and test would otherwise
# both start at 0 and share index values.
df = pd.concat([train, test], ignore_index=True)

# A relation1_num of 0 is treated like the -1 "missing" marker; after that
# every -1 in the table becomes NaN.
df.loc[df.relation1_num == 0, 'relation1_num'] = -1
df = df.replace(-1, np.nan)

feature_groups = [feature1, feature2, feature3, feature4, feature5]
fallback_value = -100  # written when no usable prediction is available
min_r2 = 0.2           # regressors scoring below this are not used

# Check the features one by one and use the features from the other feature
# files to predict the missing ones. Features from the same file are not used
# as predictors because they are almost always missing together.
# Columns imputed earlier in the loop serve as predictors for later ones.
for label in feature:
    print(label)
    usedfeature = [col
                   for group in feature_groups if label not in group
                   for col in group]

    missing = df[label].isnull()
    if not missing.any():
        # Nothing to impute for this column, so no model is needed.
        continue

    new = df[~missing]
    if new.empty:
        # No known value to learn from.
        df.loc[missing, label] = fallback_value
        continue

    # Predictor gaps in the training rows are filled with the column mean.
    new = new.fillna(new.mean(numeric_only=True))
    X_train, X_test, y_train, y_test = train_test_split(
        new[usedfeature], new[label], test_size=0.3, random_state=200)

    if len(np.unique(new[label])) < 3:
        # At most two distinct values: treat the column as categorical.
        estimator = xgb.XGBClassifier(n_estimators=123, max_depth=3, learning_rate=0.05)
        estimator.fit(X_train, y_train)
        score = estimator.score(X_test, y_test)
        use_model = True
    else:
        estimator = xgb.XGBRegressor(n_estimators=123, max_depth=3, learning_rate=0.05)
        estimator.fit(X_train, y_train)
        score = estimator.score(X_test, y_test)
        # Only trust the regressor when its hold-out score reaches min_r2;
        # otherwise the fallback value is used for the missing entries.
        use_model = not (score < min_r2)
    print(score)

    if use_model:
        try:
            # Predict on a DataFrame with the same columns the model was
            # fitted on, so feature names stay consistent.
            df.loc[missing, label] = estimator.predict(df.loc[missing, usedfeature])
        except Exception as exc:
            # Best effort: a failed prediction must not abort the whole run,
            # but it is reported instead of being silently swallowed.
            print('prediction failed for %s: %r' % (label, exc))
            df.loc[missing, label] = fallback_value
    else:
        df.loc[missing, label] = fallback_value

# Written once at the end: the script always restarts from the raw CSVs, so
# per-feature rewrites of the same file added I/O without enabling a resume.
df.to_csv('xgbimpute.csv', index=False)


方法2:使用随机森林回归(RandomForestRegressor)填补缺失值


from sklearn.ensemble import RandomForestRegressor
 
### Fill in the missing Age values with a RandomForestRegressor
def set_missing_ages(df):
    """Impute missing ``Age`` values with a random-forest regressor.

    The regressor is fitted on the rows whose ``Age`` is known, using
    ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` as predictors, and its
    predictions are written into the rows whose ``Age`` is missing.

    Args:
        df: DataFrame containing the columns ``Age``, ``Fare``, ``Parch``,
            ``SibSp`` and ``Pclass``. It is modified in place.
            NOTE(review): the predictor columns are passed to the forest
            unchanged, so they are presumably expected to be free of missing
            values -- confirm against the caller.

    Returns:
        A tuple ``(df, rfr)``: the same DataFrame object with ``Age`` filled
        in, and the fitted ``RandomForestRegressor``.
    """
    # Take the numeric columns that are fed to the regressor; Age comes first
    # so that it ends up in column 0 of the arrays below.
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]

    # Split the rows into those with a known age and those without.
    age_missing = age_df.Age.isnull()
    known_age = age_df[~age_missing].values
    unknown_age = age_df[age_missing].values

    # y is the target (age), X holds the predictor columns.
    y = known_age[:, 0]
    X = known_age[:, 1:]

    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # predict() rejects an empty matrix, so only predict when at least one
    # age is actually missing.
    if len(unknown_age) > 0:
        predictedAges = rfr.predict(unknown_age[:, 1:])
        # Write the predictions back into the rows that lacked an age.
        df.loc[age_missing, 'Age'] = predictedAges

    return df, rfr

def set_Cabin_type(df):
    """Collapse ``Cabin`` into a two-valued indicator, in place.

    Rows that have any cabin value become ``"Yes"``; rows whose cabin is
    missing become ``"No"``. The same DataFrame object is returned.
    """
    # Evaluate the mask once, before either assignment touches the column.
    cabin_missing = df['Cabin'].isnull()
    df.loc[~cabin_missing, 'Cabin'] = "Yes"
    df.loc[cabin_missing, 'Cabin'] = "No"
    return df

# Impute the missing ages, then turn Cabin into a "Yes"/"No" indicator.
# NOTE(review): data_train is not defined anywhere in the visible code --
# presumably a DataFrame loaded earlier; confirm before running.
data_train, rfr = set_missing_ages(data_train)
data_train = set_Cabin_type(data_train)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值