python-缺失值处理

最新推荐文章于 2024-06-09 13:19:08 发布

老三是只猫

最新推荐文章于 2024-06-09 13:19:08 发布

阅读量449

点赞数

分类专栏：机器学习算法代码脚本

本文链接：https://blog.csdn.net/zhonglongshen/article/details/96377318

版权

代码脚本同时被 2 个专栏收录

51 篇文章 2 订阅

订阅专栏

机器学习算法

25 篇文章 0 订阅

订阅专栏

#coding:utf-8
import pandas as pd
import numpy as np
from sklearn.model_selection import  train_test_split
from multiprocessing import Pool
import xgboost as xgb

# Create a multiprocessing pool with 8 worker processes.
# NOTE(review): `pool` is not referenced again (and never closed) in the
# visible part of this script — confirm whether it is still needed.
pool = Pool(8)
# Features from user_info.
feature1 = [
    u'age', u'sex', u'expect_quota', u'max_month_repay', u'occupation',
    u'education', u'marital_status_min', u'live_info', u'local_hk',
    u'money_function_mean', u'company_type', u'salary_max', u'school_type',
    u'flow', u'gross_profit', u'business_type', u'business_year',
    u'personnel_num', u'pay_type', u'product_id', u'tm_encode', u'info_num',
    u'time_gap', u'tm_encode_max_max_month_repay_min',
    u'education_max_info_num',
]

# Base column names of the user_consumption features. Every base name appears
# in feature2 with each of the four suffixes in _STAT_SUFFIXES, in this order.
_CONSUMPTION_BASES = (
    u'prior_period_bill_amt', u'prior_period_repay_amt', u'credit_lmt_amt',
    u'curt_jifen', u'current_bill_bal', u'current_bill_min_repay_amt',
    u'is_cheat_bill', u'cost_cnt', u'current_bill_amt', u'adj_amt',
    u'circle_interest', u'prior_period_jifen_bal', u'nadd_jifen',
    u'current_adj_jifen', u'avlb_bal_usd', u'avlb_bal', u'card_type',
    u'pre_borrow_cash_amt_usd', u'credit_lmt_amt_usd', u'pre_borrow_cash_amt',
    u'curr', u'repay_stat', u'current_min_repay_amt_usd',
    u'current_repay_amt_usd', u'current_convert_jifen',
    u'current_award_jifen',
)
_STAT_SUFFIXES = (u'max', u'min', u'mean', u'std')

# Features from user_consumption: u'num' followed by <base>_<suffix> names.
feature2 = [u'num'] + [
    base + u'_' + suffix
    for base in _CONSUMPTION_BASES
    for suffix in _STAT_SUFFIXES
]

# Features from relation1.
feature3 = [u'relation1_num']

# Features from relation2.
feature4 = [
    u'type0_num', u'type0_sum', u'type1_num', u'type1_sum',
    u'type2_num', u'type2_sum', u'type3_num', u'type3_sum',
    u'time_mean', u'time_min', u'time_max', u'time_std',
]

# Features from tag.
feature5 = [
    u'tag_num', u'tag_3', u'tag_4', u'tag_5', u'tag_6', u'tag_7', u'tag_8',
    u'tag_9', u'300028', u'300196', u'300301', u'300385', u'300469',
    u'300658', u'301036', u'301211', u'301687',
    '300028_301687', '300196_2',
]

# The features that will be used: all five groups in order, except the last
# two entries of feature5, which are not part of this list.
feature = feature1 + feature2 + feature3 + feature4 + feature5[:-2]

# Load the training data and keep every column except the 'label' column.
# `.loc` replaces the `.ix` indexer, which has been removed from pandas.
df = pd.read_csv('completetrain.csv')
train = df.loc[:, df.columns != 'label']

# Load the test data, keep one row per user_id, and drop the 'probability'
# column. `subset=` replaces the removed `cols=` keyword of drop_duplicates.
df = pd.read_csv('completetest.csv')
df = df.drop_duplicates(subset='user_id')
test = df.loc[:, df.columns != 'probability']

# Stack train and test so both are processed together from here on.
df = pd.concat([train, test])
# A relation1_num of 0 is mapped to -1, and then every -1 in the frame is
# turned into NaN, so both values end up marked as missing.
df.loc[df.relation1_num == 0, 'relation1_num'] = -1
df = df.replace(-1, np.nan)
# Check the features one by one: the rows where a feature is known are used to
# train a model on the features that come from the *other* source files, and
# that model predicts the missing values. Features from the same file are not
# used as predictors because they are almost always missing together.
for label in feature:
    print(label)
    # The concatenation order of each branch is kept unchanged because it
    # determines the column order of the training matrix.
    if label in feature1:
        usedfeature = feature2 + feature3 + feature4 + feature5
    elif label in feature2:
        usedfeature = feature1 + feature3 + feature4 + feature5
    elif label in feature3:
        usedfeature = feature2 + feature1 + feature4 + feature5
    elif label in feature4:
        usedfeature = feature2 + feature3 + feature1 + feature5
    elif label in feature5:
        usedfeature = feature2 + feature3 + feature4 + feature1
    else:
        # Not a member of any group, so there is no predictor set for it.
        continue

    # The imputation below runs for every group (it used to be nested under
    # the feature5 branch, so only tag features were ever processed).
    missing = pd.isnull(df[label])
    if not missing.any():
        # Nothing to fill in for this column; skip the model training.
        continue

    # Train only on rows where the target is known; gaps in the remaining
    # numeric columns of those rows are filled with the column mean.
    known = df[~missing]
    known = known.fillna(known.mean(numeric_only=True))
    X_train, X_test, y_train, y_test = train_test_split(
        known[usedfeature], known[label], test_size=0.3, random_state=200)

    # Targets with fewer than three distinct values get a classifier,
    # everything else a regressor.
    is_categorical = len(np.unique(known[label])) < 3
    if is_categorical:
        estimator = xgb.XGBClassifier(n_estimators=123, max_depth=3, learning_rate=0.05)
    else:
        estimator = xgb.XGBRegressor(n_estimators=123, max_depth=3, learning_rate=0.05)
    # Both estimators must be fitted before score()/predict() can be called.
    estimator.fit(X_train, y_train)
    score = estimator.score(X_test, y_test)

    # -100 is the fallback fill value. A regressor whose hold-out score is
    # below 0.2 is not trusted; a classifier is always used.
    fill_value = -100
    score_too_low = (not is_categorical) and score < 0.2
    if not score_too_low:
        try:
            # Predict from a frame with the same columns the model was
            # fitted on, so the feature names line up.
            fill_value = estimator.predict(df.loc[missing, usedfeature])
        except Exception as exc:
            # Best effort: keep the fallback value but report the failure
            # instead of swallowing it silently.
            print('prediction failed for %s, using -100: %s' % (label, exc))
    df.loc[missing, label] = fill_value
    print(score)

# Write the imputed frame once, after every feature has been processed.
df.to_csv('xgbimpute.csv', index=None)

方法2


from sklearn.ensemble import RandomForestRegressor
 
### 使用 RandomForestRegressor 填补缺失的年龄属性
def set_missing_ages(df):
    """Fill missing ``Age`` values with random-forest predictions.

    A RandomForestRegressor is fitted on the rows whose ``Age`` is known,
    using ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` as predictors, and its
    predictions are written into the rows whose ``Age`` is missing.

    Args:
        df: DataFrame containing the columns ``Age``, ``Fare``, ``Parch``,
            ``SibSp`` and ``Pclass``. It is modified in place.

    Returns:
        A ``(df, rfr)`` tuple: the same DataFrame object with ``Age`` filled
        in, and the fitted RandomForestRegressor.
    """
    # Take the numeric features that are fed to the regressor; Age comes
    # first so it can be split off as the target by position.
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]

    # Split the rows into known-age and unknown-age parts. `.values` replaces
    # `as_matrix()`, which has been removed from pandas.
    age_missing = age_df.Age.isnull()
    known_age = age_df[~age_missing].values
    unknown_age = age_df[age_missing].values

    # y is the target age, X holds the predictor columns.
    y = known_age[:, 0]
    X = known_age[:, 1:]

    # Fit the regressor on the rows with a known age.
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # Predict and fill in only when there is something to fill; calling
    # predict() on an empty matrix would raise.
    if len(unknown_age) > 0:
        predictedAges = rfr.predict(unknown_age[:, 1:])
        df.loc[df.Age.isnull(), 'Age'] = predictedAges

    return df, rfr

def set_Cabin_type(df):
    """Replace ``Cabin`` with a "Yes"/"No" flag for whether a value was present.

    The DataFrame is modified in place and the same object is returned.
    """
    # Capture which rows have a cabin before any value is overwritten.
    has_cabin = df['Cabin'].notnull()
    df.loc[has_cabin, 'Cabin'] = "Yes"
    df.loc[~has_cabin, 'Cabin'] = "No"
    return df

# Fill in the missing ages and keep the fitted regressor that is returned.
# NOTE(review): data_train is not defined anywhere in the visible source —
# presumably it is loaded before this point; confirm.
data_train, rfr = set_missing_ages(data_train)
# Turn the Cabin column into a "Yes"/"No" flag.
data_train = set_Cabin_type(data_train)

老三是只猫

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python-缺失值处理

#coding:utf-8 import pandas as pd import numpy as np from sklearn.model_selection import train_test_split from multiprocessing import Pool import xgboost as xgb pool = Pool(8) feature = [u'age', u's...
复制链接

扫一扫

专栏目录