def calculateIV(train_data, label_columnName):
    """
    @description
        Compute a smoothed Information Value (IV) for every feature column.

        Every column other than the label column is treated as categorical.
        For each distinct value ``v`` of a column:

            good_pcnt = (# rows with column == v and label == 1) / (# rows with label == 1)
            bad_pcnt  = (# rows with column == v and label == 0) / (# rows with label == 0)

        and ``(good_pcnt - bad_pcnt) * log((good_pcnt + 1) / (bad_pcnt + 1))``
        is added to that column's IV.

        NOTE(review): the ``+ 1`` is applied to the proportions, not to the
        counts. It keeps the logarithm finite when a value occurs in only one
        class, but the result is not the textbook
        ``(good - bad) * ln(good / bad)`` IV. It is kept as-is so existing
        outputs do not change; confirm it is intended.

        Missing feature values never compare equal to anything, so they add
        nothing to the sum. If the label column has no 0 rows or no 1 rows,
        the proportions are undefined and the returned values are not
        meaningful.
    @param train_data: pd.DataFrame holding the feature columns and the label column
    @param label_columnName: column name of label (rows are counted where it equals 0 or 1)
    @return
        dict mapping each feature column name to its IV value
    """
    import math

    print("calculate IV")

    labels = train_data[label_columnName]
    # The label masks and class totals do not depend on the feature or value
    # being examined, so compute them once rather than per distinct value.
    is_negative = labels == 0
    is_positive = labels == 1
    total_count0 = is_negative.sum()  # total number of label == 0 rows
    total_count1 = is_positive.sum()  # total number of label == 1 rows

    # column names except label column
    feature_columns = [c for c in train_data.columns if c != label_columnName]

    IV_value = dict()
    for col in feature_columns:
        values = train_data[col]
        iv_sum = 0  # running IV total for this column
        for value in values.unique().tolist():
            is_value = values == value
            # Combine masks with `&` on the full-length Series. Chaining two
            # boolean indexers (df[m1][m2]) forces pandas to re-align the
            # second mask and breaks when the index has duplicate labels.
            tmp_count0 = (is_value & is_negative).sum()
            tmp_count1 = (is_value & is_positive).sum()
            good_pcnt = tmp_count1 / total_count1
            bad_pcnt = tmp_count0 / total_count0
            k = (good_pcnt + 1) / (bad_pcnt + 1)
            iv_sum += (good_pcnt - bad_pcnt) * math.log(k)
        IV_value[col] = iv_sum
    return IV_value
# Compute the IV of every feature in iv_data0 and rank them, highest IV first.
iv_values = calculateIV(iv_data0, "label")
iv_values_sort = sorted(iv_values.items(), key=lambda x: x[1], reverse=True)
arr = [list(i) for i in iv_values_sort]  # one [feature_name, iv] row per feature
# Build the frame from the row lists directly. Passing them through np.array()
# first forces the names and the IV numbers into a single common dtype, which
# turns the IV column into strings when the feature names are strings.
iv_name_value = pd.DataFrame(arr, columns=["features", "IV"])
# pandas 计算IV值方法
# 最新推荐文章于 2024-01-21 17:04:37 发布