Pandas多维特征数据预处理及sklearn数据不均衡处理相关技术实践-大数据ML样本集案例实战

本文链接：https://blog.csdn.net/shenshouniu/article/details/85137544

1 机器学习调优步骤（第一行不平衡问题处理）

2 Pandas多维特征数据预处理

数据初始化展示

  import pandas as pd
  
  取出第一行
  loans_2007 = pd.read_csv('C:\\ML\\MLData\\filtered_loans_2007.csv', skiprows=1)
  print(len(loans_2007))
  half_count = len(loans_2007) / 2
  loans_2007 = loans_2007.dropna(thresh=half_count, axis=1)
  #loans_2007 = loans_2007.drop(['desc', 'url'],axis=1)
  loans_2007.to_csv('loans_2007.csv', index=False)
  
  loans_2007.head(3)

显示第0行数据

  import pandas as pd
  loans_2007 = pd.read_csv("loans_2007.csv")
  #loans_2007.drop_duplicates()
  print(loans_2007.iloc[0])
  print(loans_2007.shape[1])
  
  id                                1077501
  member_id                      1.2966e+06
  loan_amnt                            5000
  funded_amnt                          5000
  funded_amnt_inv                      4975
  term                            36 months
  int_rate                           10.65%
  installment                        162.87
  grade                                   B
  sub_grade                              B2
  emp_title                             NaN
  emp_length                      10+ years
  home_ownership                       RENT
  annual_inc                          24000
  verification_status              Verified
  issue_d                          Dec-2011
  loan_status                    Fully Paid
  pymnt_plan                              n
  purpose                       credit_card
  title                            Computer
  zip_code                            860xx
  addr_state                             AZ
  dti                                 27.65
  delinq_2yrs                             0
  earliest_cr_line                 Jan-1985
  inq_last_6mths                          1
  open_acc                                3
  pub_rec                                 0
  revol_bal                           13648
  revol_util                          83.7%
  total_acc                               9
  initial_list_status                     f
  out_prncp                               0
  out_prncp_inv                           0
  total_pymnt                       5863.16
  total_pymnt_inv                   5833.84
  total_rec_prncp                      5000
  total_rec_int                      863.16
  total_rec_late_fee                      0
  recoveries                              0
  collection_recovery_fee                 0
  last_pymnt_d                     Jan-2015
  last_pymnt_amnt                    171.62
  last_credit_pull_d               Nov-2016
  collections_12_mths_ex_med              0
  policy_code                             1
  application_type               INDIVIDUAL
  acc_now_delinq                          0
  chargeoff_within_12_mths                0
  delinq_amnt                             0
  pub_rec_bankruptcies                    0
  tax_liens                               0
  Name: 0, dtype: object
  52

删除无意义列

  loans_2007 = loans_2007.drop(["id", "member_id", "funded_amnt", "funded_amnt_inv", "grade", "sub_grade", "emp_title", "issue_d"], axis=1)
  loans_2007 = loans_2007.drop(["zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp"], axis=1)
  loans_2007 = loans_2007.drop(["total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt"], axis=1)
  print(loans_2007.iloc[0])
  print(loans_2007.shape[1])

查看预测值的状态类型

  print(loans_2007['loan_status'].value_counts())
  
  Fully Paid                                             33902
  Charged Off                                             5658
  Does not meet the credit policy. Status:Fully Paid      1988
  Does not meet the credit policy. Status:Charged Off      761
  Current                                                  201
  Late (31-120 days)                                        10
  In Grace Period                                            9
  Late (16-30 days)                                          5
  Default                                                    1
  Name: loan_status, dtype: int64

根据贷款状态，舍弃部分不清晰结论，给出明确分类0和1，进行替换

  loans_2007 = loans_2007[(loans_2007['loan_status'] == "Fully Paid") | (loans_2007['loan_status'] == "Charged Off")]
  
  status_replace = {
      "loan_status" : {
          "Fully Paid": 1,
          "Charged Off": 0,
      }
  }
  
  loans_2007 = loans_2007.replace(status_replace)

去除每一列值都相同的列

  #let's look for any columns that contain only one unique value and remove them
  
  orig_columns = loans_2007.columns
  drop_columns = []
  for col in orig_columns:
      col_series = loans_2007[col].dropna().unique()
      if len(col_series) == 1:
          drop_columns.append(col)
  loans_2007 = loans_2007.drop(drop_columns, axis=1)
  print(drop_columns)
  print loans_2007.shape
  loans_2007.to_csv('filtered_loans_2007.csv', index=False)

  ['initial_list_status', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens']
  (39560, 24)

空值处理

  import pandas as pd
  loans = pd.read_csv('filtered_loans_2007.csv')
  null_counts = loans.isnull().sum()
  print(null_counts)
  
  loan_amnt                 0
  term                      0
  int_rate                  0
  installment               0
  emp_length                0
  home_ownership            0
  annual_inc                0
  verification_status       0
  loan_status               0
  pymnt_plan                0
  purpose                   0
  title                    10
  addr_state                0
  dti                       0
  delinq_2yrs               0
  earliest_cr_line          0
  inq_last_6mths            0
  open_acc                  0
  pub_rec                   0
  revol_bal                 0
  revol_util               50
  total_acc                 0
  last_credit_pull_d        2
  pub_rec_bankruptcies    697
  dtype: int64
  
  loans = loans.drop("pub_rec_bankruptcies", axis=1)
  loans = loans.dropna(axis=0)

String类型分布

  print(loans.dtypes.value_counts())
  object     12
  float64    10
  int64       1
  dtype: int64
  
  object_columns_df = loans.select_dtypes(include=["object"])
  print(object_columns_df.iloc[0])
  
  term                     36 months
  int_rate                    10.65%
  emp_length               10+ years
  home_ownership                RENT
  verification_status       Verified
  pymnt_plan                       n
  purpose                credit_card
  title                     Computer
  addr_state                      AZ
  earliest_cr_line          Jan-1985
  revol_util                   83.7%
  last_credit_pull_d        Nov-2016
  Name: 0, dtype: object
  
  cols = ['home_ownership', 'verification_status', 'emp_length', 'term', 'addr_state']
  for c in cols:
      print(loans[c].value_counts())
      
  RENT        18780
  MORTGAGE    17574
  OWN          3045
  OTHER          96
  NONE            3
  Name: home_ownership, dtype: int64
  
  Not Verified       16856
  Verified           12705
  Source Verified     9937
  Name: verification_status, dtype: int64
  
  10+ years    8821
  < 1 year     4563
  2 years      4371
  3 years      4074
  4 years      3409
  5 years      3270
  1 year       3227
  6 years      2212
  7 years      1756
  8 years      1472
  9 years      1254
  n/a          1069
  Name: emp_length, dtype: int64
  
   36 months    29041
   60 months    10457
  Name: term, dtype: int64
  
  CA    7070
  NY    3788
  FL    2856
  TX    2714
  NJ    1838
  IL    1517
  PA    1504
  VA    1400
  GA    1393
  MA    1336
  OH    1208
  MD    1049
  AZ     874
  WA     834
  CO     786
  NC     780
  CT     747
  MI     722
  MO     682
  MN     611
  NV     492
  SC     470
  WI     453
  AL     446
  OR     445
  LA     435
  KY     325
  OK     298
  KS     269
  UT     256
  AR     243
  DC     211
  RI     198
  NM     188
  WV     176
  HI     172
  NH     172
  DE     113
  MT      84
  WY      83
  AK      79
  SD      63
  VT      54
  MS      19
  TN      17
  IN       9
  ID       6
  IA       5
  NE       5
  ME       3
  Name: addr_state, dtype: int64

String类型分布2

print(loans["purpose"].value_counts())
print(loans["title"].value_counts())

debt_consolidation    18533
credit_card            5099
other                  3963
home_improvement       2965
major_purchase         2181
small_business         1815
car                    1544
wedding                 945
medical                 692
moving                  581
vacation                379
house                   378
educational             320
renewable_energy        103
Name: purpose, dtype: int64

Debt Consolidation                         2168
Debt Consolidation Loan                    1706
Personal Loan                               658
Consolidation                               509
debt consolidation                          502
Credit Card Consolidation                   356
Home Improvement                            354
Debt consolidation                          333
Small Business Loan                         322
Credit Card Loan                            313
Personal                                    308
Consolidation Loan                          255
Home Improvement Loan                       246
personal loan                               234
personal                                    220
Loan                                        212
Wedding Loan                                209
consolidation                               200
Car Loan                                    200
Other Loan                                  190
Credit Card Payoff                          155
Wedding                                     152
Major Purchase Loan                         144
Credit Card Refinance                       143
Consolidate                                 127
Medical                                     122
Credit Card                                 117
home improvement                            111
My Loan                                      94
Credit Cards                                 93
                                           ... 
DebtConsolidationn                            1
 Freedom                                      1
Credit Card Consolidation Loan - SEG          1
SOLAR PV                                      1
Pay on Credit card                            1
To pay off balloon payments due               1
Paying off the debt                           1
Payoff ING PLOC                               1
Josh CC Loan                                  1
House payoff                                  1
Taking care of Business                       1
Gluten Free Bakery in ideal town for it       1
Startup Money for Small Business              1
FundToFinanceCar                              1
getting ready for Baby                        1
Dougs Wedding Loan                            1
d rock                                        1
LC Loan 2                                     1
swimming pool repair                          1
engagement                                    1
Cut the credit cards Loan                     1
vinman                                        1
working hard to get out of debt               1
consolidate the rest of my debt               1
Medical/Vacation                              1
2BDebtFree                                    1
Paying Off High Interest Credit Cards!        1
Baby on the way!                              1
cart loan                                     1
Consolidaton                                  1
Name: title, dtype: int64

类型转换

  mapping_dict = {
      "emp_length": {
          "10+ years": 10,
          "9 years": 9,
          "8 years": 8,
          "7 years": 7,
          "6 years": 6,
          "5 years": 5,
          "4 years": 4,
          "3 years": 3,
          "2 years": 2,
          "1 year": 1,
          "< 1 year": 0,
          "n/a": 0
      }
  }
  loans = loans.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)
  loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")
  loans["revol_util"] = loans["revol_util"].str.rstrip("%").astype("float")
  loans = loans.replace(mapping_dict)

独热编码

  cat_columns = ["home_ownership", "verification_status", "emp_length", "purpose", "term"]
  dummy_df = pd.get_dummies(loans[cat_columns])
  loans = pd.concat([loans, dummy_df], axis=1)
  loans = loans.drop(cat_columns, axis=1)
  loans = loans.drop("pymnt_plan", axis=1)

查看转换类型

  import pandas as pd
  loans = pd.read_csv("cleaned_loans2007.csv")
  print(loans.info())
  
  <class 'pandas.core.frame.DataFrame'>
  RangeIndex: 39498 entries, 0 to 39497
  Data columns (total 37 columns):
  loan_amnt                              39498 non-null float64
  int_rate                               39498 non-null float64
  installment                            39498 non-null float64
  annual_inc                             39498 non-null float64
  loan_status                            39498 non-null int64
  dti                                    39498 non-null float64
  delinq_2yrs                            39498 non-null float64
  inq_last_6mths                         39498 non-null float64
  open_acc                               39498 non-null float64
  pub_rec                                39498 non-null float64
  revol_bal                              39498 non-null float64
  revol_util                             39498 non-null float64
  total_acc                              39498 non-null float64
  home_ownership_MORTGAGE                39498 non-null int64
  home_ownership_NONE                    39498 non-null int64
  home_ownership_OTHER                   39498 non-null int64
  home_ownership_OWN                     39498 non-null int64
  home_ownership_RENT                    39498 non-null int64
  verification_status_Not Verified       39498 non-null int64
  verification_status_Source Verified    39498 non-null int64
  verification_status_Verified           39498 non-null int64
  purpose_car                            39498 non-null int64
  purpose_credit_card                    39498 non-null int64
  purpose_debt_consolidation             39498 non-null int64
  purpose_educational                    39498 non-null int64
  purpose_home_improvement               39498 non-null int64
  purpose_house                          39498 non-null int64
  purpose_major_purchase                 39498 non-null int64
  purpose_medical                        39498 non-null int64
  purpose_moving                         39498 non-null int64
  purpose_other                          39498 non-null int64
  purpose_renewable_energy               39498 non-null int64
  purpose_small_business                 39498 non-null int64
  purpose_vacation                       39498 non-null int64
  purpose_wedding                        39498 non-null int64
  term_ 36 months                        39498 non-null int64
  term_ 60 months                        39498 non-null int64
  dtypes: float64(12), int64(25)
  memory usage: 11.1 MB

3 准确率理论及不均衡处理

初始定义

  import pandas as pd
  # False positives.
  fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
  fp = len(predictions[fp_filter])
  
  # True positives.
  tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
  tp = len(predictions[tp_filter])
  
  # False negatives.
  fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
  fn = len(predictions[fn_filter])
  
  # True negatives
  tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
  tn = len(predictions[tn_filter])

逻辑回归不处理不均衡

      from sklearn.linear_model import LogisticRegression
      lr = LogisticRegression()
      cols = loans.columns
      train_cols = cols.drop("loan_status")
      features = loans[train_cols]
      target = loans["loan_status"]
      lr.fit(features, target)
      predictions = lr.predict(features)
      
      from sklearn.linear_model import LogisticRegression
      from sklearn.cross_validation import cross_val_predict, KFold
      lr = LogisticRegression()
      kf = KFold(features.shape[0], random_state=1)
      predictions = cross_val_predict(lr, features, target, cv=kf)
      predictions = pd.Series(predictions)
      
      # False positives.
      fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
      fp = len(predictions[fp_filter])
      
      # True positives.
      tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
      tp = len(predictions[tp_filter])
      
      # False negatives.
      fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
      fn = len(predictions[fn_filter])
      
      # True negatives
      tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
      tn = len(predictions[tn_filter])
      
      # Rates
      tpr = tp / float((tp + fn))
      fpr = fp / float((fp + tn))
      
      print(tpr)
      print(fpr)
      print predictions[:20]
      
      0.999084438406
      0.998049299521
      0     1
      1     1
      2     1
      3     1
      4     1
      5     1
      6     1
      7     1
      8     1
      9     1
      10    1
      11    1
      12    1
      13    1
      14    1
      15    1
      16    1
      17    1
      18    1
      19    1
      dtype: int64

逻辑回归balanced处理不均衡

  from sklearn.linear_model import LogisticRegression
  from sklearn.cross_validation import cross_val_predict
  lr = LogisticRegression(class_weight="balanced")
  kf = KFold(features.shape[0], random_state=1)
  predictions = cross_val_predict(lr, features, target, cv=kf)
  predictions = pd.Series(predictions)
  
  # False positives.
  fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
  fp = len(predictions[fp_filter])
  
  # True positives.
  tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
  tp = len(predictions[tp_filter])
  
  # False negatives.
  fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
  fn = len(predictions[fn_filter])
  
  # True negatives
  tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
  tn = len(predictions[tn_filter])
  
  # Rates
  tpr = tp / float((tp + fn))
  fpr = fp / float((fp + tn))
  
  print(tpr)
  print(fpr)
  print predictions[:20]
  
  0.670781771464
  0.400780280192
  0     1
  1     0
  2     0
  3     1
  4     1
  5     0
  6     0
  7     0
  8     0
  9     0
  10    1
  11    0
  12    1
  13    1
  14    0
  15    0
  16    1
  17    1
  18    1
  19    0
  dtype: int64

逻辑回归penalty处理不均衡

  from sklearn.linear_model import LogisticRegression
  from sklearn.cross_validation import cross_val_predict
  penalty = {
      0: 5,
      1: 1
  }
  
  lr = LogisticRegression(class_weight=penalty)
  kf = KFold(features.shape[0], random_state=1)
  predictions = cross_val_predict(lr, features, target, cv=kf)
  predictions = pd.Series(predictions)
  
  # False positives.
  fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
  fp = len(predictions[fp_filter])
  
  # True positives.
  tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
  tp = len(predictions[tp_filter])
  
  # False negatives.
  fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
  fn = len(predictions[fn_filter])
  
  # True negatives
  tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
  tn = len(predictions[tn_filter])
  
  # Rates
  tpr = tp / float((tp + fn))
  fpr = fp / float((fp + tn))
  
  print(tpr)
  print(fpr)

  0.731799521545
  0.478985635751

随机森林balanced处理不均衡

  from sklearn.ensemble import RandomForestClassifier
  from sklearn.cross_validation import cross_val_predict
  rf = RandomForestClassifier(n_estimators=10,class_weight="balanced", random_state=1)
  #print help(RandomForestClassifier)
  kf = KFold(features.shape[0], random_state=1)
  predictions = cross_val_predict(rf, features, target, cv=kf)
  predictions = pd.Series(predictions)
  
  # False positives.
  fp_filter = (predictions == 1) & (loans["loan_status"] == 0)
  fp = len(predictions[fp_filter])
  
  # True positives.
  tp_filter = (predictions == 1) & (loans["loan_status"] == 1)
  tp = len(predictions[tp_filter])
  
  # False negatives.
  fn_filter = (predictions == 0) & (loans["loan_status"] == 1)
  fn = len(predictions[fn_filter])
  
  # True negatives
  tn_filter = (predictions == 0) & (loans["loan_status"] == 0)
  tn = len(predictions[tn_filter])
  
  # Rates
  tpr = tp / float((tp + fn))
  fpr = fp / float((fp + tn))