The most complete pandas syntax cheat sheet: master pandas in one article

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author   : honshaofeng
# @Time     : 2021/7/28 17:51
# @File     : notes.py
# @Project  : tianjikit-master

Summary of common sklearn-related imports
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.ensemble import RandomForestRegressor

[Ensemble learning] XGBClassifier from the xgboost package (sklearn-style API)
# General parameters
booster: 'gbtree' uses tree models as base learners (default); 'gblinear' uses linear models as base learners.
silent: silent=0 prints messages during training (default); silent=1 suppresses them.
nthread: nthread=-1 uses all CPU cores in parallel (default); nthread=1 uses a single core.
scale_pos_weight: weight of the positive class. For imbalanced binary classification, setting it improves results; e.g., with a positive:negative ratio of 1:10, set scale_pos_weight=10.
# Booster (model) parameters
n_estimators: total number of boosting rounds, i.e. the number of trees.
early_stopping_rounds: stop training early when the validation score has not improved for n consecutive rounds. Tuning: helps prevent overfitting.
max_depth: tree depth, default 6, typical values 3-10. Tuning: larger values overfit more easily; smaller values underfit more easily.
min_child_weight: default 1. Tuning: larger values underfit more easily; smaller values overfit more easily (a larger value keeps the model from fitting overly local patterns).
subsample: fraction of the training rows used per tree. Default 1, typical values 0.5-1. Tuning: helps prevent overfitting.
colsample_bytree: fraction of the features used per tree. Default 1, typical values 0.5-1. Tuning: helps prevent overfitting.
gamma: penalty coefficient; the minimum loss reduction required to make a split.
alpha: L1 regularization coefficient, default 0.
lambda: L2 regularization coefficient, default 1.
# Learning task parameters
learning_rate: step size used when updating weights at each iteration, default 0.3. Tuning: smaller values train more slowly; typical values are 0.01-0.2, and around 0.1 usually works well.
objective: objective function
Regression: reg:linear (default)    reg:logistic
Binary classification: binary:logistic (outputs probabilities)    binary:logitraw (outputs raw scores before the logistic transform)
Multi-class: multi:softmax, num_class=n (returns the predicted class)    multi:softprob, num_class=n (returns class probabilities)
Ranking: rank:pairwise
eval_metric: evaluation metric
Regression (default rmse): rmse -- root mean squared error    mae -- mean absolute error
Classification (default error): auc -- area under the ROC curve    error -- error rate (binary)    merror -- error rate (multi-class)
logloss -- negative log-likelihood (binary)    mlogloss -- negative log-likelihood (multi-class)
A short usage sketch follows right after this list.
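# A minimal sketch tying the parameters above to XGBClassifier. The data comes from
# sklearn's make_classification (an assumption for illustration, not the article's data).
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

X, y = make_classification(n_samples=2000, n_features=20, weights=[0.9, 0.1], random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)
clf = XGBClassifier(n_estimators=300, max_depth=4, learning_rate=0.1,
                    subsample=0.8, colsample_bytree=0.8, min_child_weight=1,
                    gamma=0, reg_alpha=0, reg_lambda=1,
                    scale_pos_weight=9,              # roughly the negative/positive ratio above
                    objective='binary:logistic')
# Depending on the xgboost version, eval_metric / early_stopping_rounds are passed
# either to fit() (older versions) or to the constructor (2.x).
clf.fit(X_tr, y_tr)
print('validation AUC: {:.4f}'.format(roc_auc_score(y_va, clf.predict_proba(X_va)[:, 1])))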

data = pd.DataFrame({'label': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
                         'pred': [0.5, 0.6, 0.7, 0.6, 0.6, 0.8, 0.4, 0.2, 0.1, 0.4, 0.3, 0.9]})  # 0.83
import pandas as pd
from psi import psi
on = ['name', 'idcard_md5', 'phone_md5', 'loan_dt']        on = ['name', 'idcard', 'phone', 'loan_dt']
# Formatted output with print
Decimals: print('percent: %.2f' % (42/50))            print('percent: {:.2f}'.format(42/50))
Percentages: print('percent: {:.2%}'.format(42/50))        print('percent: {:.2f}%'.format(42/50*100))
# PyCharm: Ctrl+Shift+E runs with the project root as the default working directory, e.g. 'E:\\rong360'
# Create a new one-column DataFrame and name the column at the same time
dfpsi = pd.DataFrame([np.nan], columns=['psi'])
# Create a DataFrame
df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
df = pd.DataFrame(data=['Apple','Banana','Cherry','Dates', 'Eggfruit'], index = [1,2,3,4,5],columns=['Fruits'])
pd.DataFrame(fruits_list)
pd.DataFrame({ 'Fruits':['Apple','Banana','Cherry','Dates','Eggfruit'],'Quantity': [5, 10, 8, 3, 7],  'Color': ['Red', 'Yellow', 'Red', 'Brown', 'Yellow']})
# Read files
import pandas as pd
df = pd.read_csv('zhongxin_1206.txt', sep='\t')      df = pd.read_csv('zhongxin_1206.txt', sep=',')      fruits = pd.read_excel('fruits.xlsx', sheet_name='Sheet1')
df = pd.read_csv(path, sep=',', names=columns_name)  # rename the columns while reading the file
# Read a file that has no header row
ranking_name = ['name', 'idcard_md5', 'phone_md5', 'loan_dt', 'score', 'resource_id', 'customer', 'feature_new']
df = pd.read_csv('rong360v1_20210815.txt', header=None, names=ranking_name, sep='\t')
# Read only specific columns with usecols
df = pd.read_csv('rong360_fq1_10w.txt', sep='\t', usecols=['name', 'idcard_md5', 'phone_md5', 'loan_dt', 'n21_score'])
df = pd.read_csv("1217_1out.csv", usecols=[i for i in range(1, 10000)])  # split by columns!
# Read only the first 5 rows
data = pd.read_csv('data.csv', nrows=5)
# Select several columns
df = df[['name', 'idcard_md5', 'phone_md5', 'loan_dt']]  # note the double brackets: selecting multiple columns needs a list inside the indexer
# Convert to a DataFrame
pd.DataFrame(miss_data1, index=list_score1, columns=['shape']).round(4)  # keep 4 decimal places
# Save a DataFrame to csv
df.to_csv('zhongxin_1208.txt', index=False, sep='\t', encoding='utf-8')
# Rename columns
df.rename(columns={'order': 'id'}, inplace=True)
df = df.rename(index={"TianJin": "tj", "ShangHai": "sh"}, columns={"a": "A"})
df = df.rename(columns={"n21_score": "n21_score_tmp", "n21_score_new_empty": "n21_score"})
df = df.rename(columns=lambda c: c + "_test")
df = df.rename(columns={"name_sha256": "name", "identity_id_sha256": "idcard", "phone_sha256": "phone", "apply_dt": "loan_dt"})
# Drop rows or columns
df.drop(['id', 'phone'], axis=1, inplace=True)
df = df.drop(columns='A')  # drop the column named A
df.drop(df.index[0], inplace=True)  # drop the first row
df.drop(df.columns[0], axis=1, inplace=True)  # drop the first column
df.drop(df.columns[0:3], axis=1, inplace=True)  # drop the first 3 columns
df.drop(['label_x'], axis=1, inplace=True)  # axis=0 (default) drops rows, axis=1 drops columns
df = df_new.drop(df[df.label == -1].index)  # drop rows by condition, e.g. df = df.drop(df[(df.score < 50) & (df.score > 20)].index)
df.drop(df.columns[[0, 4, 8]], axis=1, inplace=True)  # df.columns[[0, 4, 8]] looks up columns directly by position
del df['密度']  # del can only delete one column at a time; del df[['密度', '含糖率']] raises an error
# Drop duplicates
df.drop_duplicates(subset=['name','idcard','phone','loan_dt'], inplace=True)  # keeps the first occurrence by default
df.drop_duplicates(subset=['name','idcard_md5','phone_md5','loan_dt'], inplace=True)
df.drop_duplicates(subset=['A','B','C'], keep=False, inplace=True)  # drop every duplicated row
# Keep only the columns with at least 5 non-NA values (thresh is the minimum number of non-NA values required)
print(df.dropna(axis='columns', thresh=5))
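# A small sketch of dropna(thresh=...) on toy data (not from the article):
# thresh=5 keeps only the columns that have at least 5 non-NA values.
import numpy as np
import pandas as pd
tmp = pd.DataFrame({'a': range(6),                      # 6 non-NA values -> kept
                    'b': [1, np.nan, 3, np.nan, 5, 6],  # 4 non-NA values -> dropped
                    'c': [np.nan] * 6})                 # 0 non-NA values -> dropped
print(tmp.dropna(axis='columns', thresh=5).columns.tolist())  # ['a']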
# Sorting
df.sort_values(by='sort_column', axis=0, ascending=True, inplace=False, na_position='last')  # ascending
df.sort_values(by='loan_dt', inplace=True, ascending=False)  # descending
# Reset the index with reset_index
df = df.sort_values(by='y_pred')  # after sorting, the index is out of order
df = df.reset_index(drop=True)  # now the index is contiguous again; drop=True discards the old, shuffled index
result = result.reset_index(drop=True)  # without reset_index(drop=True), the index stays non-contiguous after sorting or concatenation
# Drop NA: delete a row if it contains any missing value
df.dropna(axis=0, how='any', inplace=True)
# merge with how='inner' keeps only the rows whose keys appear in both frames; the columns of both frames are kept
df = pd.merge(df1, df2, how='inner', on=['name', 'idcard', 'phone', 'loan_dt'])
df=df1.merge(df2[['query_name','query_iden_num','query_mbl_num','etl_dt', 'credit_score']],left_on=['name-正常','iden_num-正常','phone_num-正常','etl_dt-正常'],right_on=['query_name','query_iden_num','query_mbl_num','etl_dt'],how='left')
# Convert mixed types to numeric
df_inner[['score2','score']] = df_inner[['score2', 'score']].apply(pd.to_numeric, errors='ignore')
# Convert between DataFrame and NumPy array
test_x = np.array(df_test_x)  # drops the index; equivalent to df.values (or the deprecated df.as_matrix())
df = pd.DataFrame(test_x)  # array back to DataFrame
# iloc
train, test = res.iloc[:90131], res.iloc[90131:]  # slice by row position
test = N_data.iloc[45000:]['n30_hj_score'].to_list()  # iloc by row position, then select a column by name and convert to a list
df.loc[:, 'a'] = df.loc[:, 'b']  # label-based selection; roughly equivalent to positional assignment via data.iloc[:, 0]
df['sum'] = df.loc[df['x'] > 0, ['x','y']].sum(axis=1)  # sum columns x and y where x > 0; the other rows get NaN, which can be filled with df['sum'].fillna(0, inplace=True)
# loc
df.loc[['a', 'f']]  # .loc selects by label, not by integer position
# Select rows and columns by position
df = df.iloc[:-1, :-2]  # drop the last row and the last two columns (plain df[:-1, :-2] raises an error)
# Inspect the distribution / unique values
print('month:', df['loan_dt'].str[:7].unique())
# Count values
df['label'].value_counts()
# Count and rate of missing values in a column
print(df_raw['v30_dz_huisuV'].isnull().sum(axis=0))  # 4423
print('Vscore missing rate', '{:.2%}'.format(df_raw['v30_dz_huisuV'].isnull().sum(axis=0) / df_raw.shape[0]))
## Drop outliers; remember to reset the index afterwards
df_train.drop(df_train[(df_train['OverallQual']<5) & (df_train['SalePrice']>200000)].index, inplace=True)
df_data = df_data[df_data['loanamount']<1000000]
# Conditional replacement with mask: where flag == 1, replace the values of column sl with those of sl_new
df['sl'] = df['sl'].mask(df['flag']==1, df['sl_new'])
df['flag'] = 1
df = df[df['flag'].notnull()]
# select_dtypes: pick columns by dtype (float64 here; use include=['object'] for string columns)
df.select_dtypes(include=['float64'])
# Deep copy: changes to the copy's data or index are not reflected in the original object
df_all = df[['name', 'idcard', 'phone','idcard_rsa','phone_rsa','loan_dt']].copy(deep=True)
df_all = df[['name', 'idcard', 'phone','idcard_rsa','phone_rsa','loan_dt']].copy()  # deep copy is the default, same as above
# Row-wise concatenation; deduplicate first and make sure the key columns and features are aligned
df = pd.concat([df1, df2])
# Column-wise concatenation
df = pd.concat([df1, df2], axis=1)
df_igno_idx = pd.concat([df_aa, df_zz], ignore_index=True)  # reset the index, similar to pd.concat([df1, df2]).reset_index(drop=True)
# Insert a new column with insert; it validates the length, which helps catch mistakes
V_test.insert(5, 'n30_hj_score_new', res, allow_duplicates=True)  # inserted right after the key columns
# Random sampling with a fixed seed
V_N_data1 = df_raw.sample(n=df_raw.shape[0]-5000, random_state=1)
# df.replace: string replacement
df['feature_new'].replace('\\N', np.nan, inplace=True)
df['feature_new'] = df['feature_new'].str.replace("%", "").astype("float")
items_df["Cost"] = pd.to_numeric(items_df["Cost"].str.replace('$', '', regex=False))  # strip the $ while converting to numeric
result['prediction'].replace([0,1,2,3,4,5,6,7,8],[870,870,880,898,1300,13117,13298,13690,13691],inplace=True)  # replace each value in the first list with the corresponding value in the second
data['v'].replace(["\\N",0,'0','None','null'], np.nan, inplace=True)  # replace all the empty-like string values in one go
# More replace() usages:
s.replace(0, 5)       df.replace([0, 1, 2, 3], 4)     df.replace({0: 10, 1: 100})     df.replace({'A': {0: 100, 4: 400}})
# Regular expressions: df.replace(to_replace=r'^ba.$', value='new', regex=True)  # match cells whose whole string is 'ba.' and replace them with 'new'
df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True)  # same, but only in column A
df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})
df.replace(regex=[r'^ba.$', 'foo'], value='new')
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.replace.html?highlight=replace#pandas.DataFrame.replace
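# A self-contained sketch of the replace() variants above, on a toy frame (not from the article):
import pandas as pd
tmp = pd.DataFrame({'A': ['bat', 'foo', 'bait'], 'B': ['abc', 'bar', 'xyz']})
print(tmp.replace('foo', 'new'))                                  # scalar replacement
print(tmp.replace(['bat', 'foo'], 'new'))                         # list of values -> one value
print(tmp.replace({'A': {'bat': 'new'}}))                         # per-column dict replacement
print(tmp.replace(to_replace=r'^ba.$', value='new', regex=True))  # '^ba.$' matches the whole cell 'bat' but not 'bait'
print(tmp.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'}))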

# notnull(): keep the non-null values
fq1 = fq1[fq1['phone_md5'].notnull()]
# .isin() conditional filtering; fine for a single column, but it cannot guarantee that all four key columns match at once
df = df[df['feature_new'] != '\\N']
df.drop(index=df[df['A'].isin([4])].index[0])  # drop the row containing 4
df.drop(index=df[df['A'] == 4].index[0])  # drop the row containing 4
df[df.isin({'D': [0, 3], 'E': ['aa', 'cc']})]  # multi-condition filtering; presumably the same as chaining conditions with &?
# Date formatting with apply
import datetime
df['loan_dt'] = pd.to_datetime(df['loan_dt']).apply(lambda x: datetime.datetime.strftime(x, "%Y-%m-%d"))  # format dates as yyyy-mm-dd
print(df.Math_B.apply(lambda x: x * 2))
def x(a, b):
    return a - b
df['d - a'] = df.apply(lambda f: x(f['d'], f['a']), axis=1)  # compare/combine values from two columns row by row
# Convert numbers to strings, and strings to int
employees_df["Age"] = employees_df["Age"].apply(str)
df['HIS_DW_DATE'] = df['HIS_DW_DATE'].apply(int)
df['Full Name'] = df[['First', 'Last']].apply(' '.join, axis=1)  # concatenate two string columns
df['Full Name'] = df['First'].str.cat(df['Last'], sep=" ")  # concatenate two string columns
df['Full Name'] = df[['First', 'Last']].agg(' '.join, axis=1)  # concatenate two string columns

df[df.label.str.startswith('199')]  # keep the rows whose string value starts with '199'
df.apply(np.sum, axis=1)  # axis=1: aggregate across columns, one result per row
df.apply(np.sqrt)  # element-wise
# Keep only the columns whose names end or start with 'e' (or satisfy some other condition)
df.filter(regex='e$', axis=1)
df.filter(regex='^e', axis=1)
df.filter(like='201', axis=1)
# columns which have the letter 'a' or 'A' in their name
df.filter(regex='[aA]')
# Drop the columns whose names end or start with 'e': three approaches
df = df[df.columns.drop(list(df.filter(regex='e$')))]
df = df[[c for c in df.columns if c.lower()[:1] != 'e']]
df = df.loc[:, ~df.columns.str.contains(r'Test\d+')]  # (DataFrame.select was removed in newer pandas, so use a boolean column mask instead)
# Select the rows whose index contains 'bbi'
df.filter(like='bbi', axis=0)
# map
data['result'] = data['result'].map(lambda x: x.lstrip('+-').rstrip('aAbBcC'))
Strip the last character: data['result'] = data['result'].map(lambda x: str(x)[:-1])
Strip the first two characters: data['result'] = data['result'].map(lambda x: str(x)[2:])
data['result'].replace(regex=True, inplace=True, to_replace=r'\D', value=r'')
# dict.get()
print('Salary: %s' % tinydict.get('Salary', 0.0))  # returns the value if the key exists, otherwise 0.0
print('Salary: %s' % tinydict.get('Salary'))  # returns the value if the key exists, otherwise None
# Date/time conversion
temp['year'] = pd.to_datetime(temp['Date']).dt.year
# unique
print(len(data_te['v30_dz'].unique()), len(data_te['n30_hj_score'].unique()), len(data_te['n30_hj_score_new'].unique()))
# Conditional filtering: isin, ~ for negation, &, |, map
df = df[~(df['y2'].isin([2]) | df['y1'].isin([2]) | df['y3'].isin([2]) | df['y4'].isin([2]))]
V_test = V_N_raw[~V_N_raw['phone'].isin(V_N_data['phone'].values.tolist())]  # keep the rows whose phone number is not in the training set
df[(df['label'] == 1) & (df['month'] == '2021-01')].shape
df[[x.startswith('张') for x in df['姓名']]]  # equivalent to df['姓名'].map(lambda x: x.startswith('张'))
altered_series = my_series.map(lambda x: str(x) + ".00")  # only works on a single Series
df = data[(data['a']<=2) | (data['b']>=5)]
tmp = tmp[tmp['year'].isin([1950, 1960, 1970, 1980, 1990, 2000, 2010])]  # keep a few selected years for display
# groupby and map
groups = df.groupby(by='month')  # every group is itself a DataFrame
print(groups.get_group('2021-05'))
agg = groups['n21_score'].agg([np.sum, np.mean, np.std])
groups = df.groupby('Team')
score = lambda x: (x - x.mean()) / x.std() * 10
print(groups.transform(score))
df = df.groupby('Team').filter(lambda x: len(x) >= 3)
for name, group in groups:
    print(name, group)
    print('{:.2%}'.format(group['n21_score'].isnull().sum(axis=0) / group.shape[0]))  # 2.28%

target_mean_dict = df_train.groupby(col)[target_col].mean()  # turn the groupby result into a dict-like lookup
df_val[f'{col}_mean_target'] = df_val[col].map(target_mean_dict)  # the variable takes part in the new column name!
y_true, y_pred = np.array(y_true).reshape(-1), np.array(y_pred).reshape(-1)  # -1 lets NumPy infer the missing dimension automatically
df['data'].groupby(time_list).sum()  # sum by month (grouping by an external key list)
# np.where: count the rows where two columns are both NaN
res = (np.where(np.isnan(df['n30_hj_score_new_empty']) & np.isnan(df['cy20_hj_score']), 1, 0))  # 1 where the condition holds, 0 otherwise
# Find the columns that need filling, i.e. the columns with missing values
print(df.columns[df.isnull().sum() > 0])
full.isnull().sum()[full.isnull().sum() > 0]  # inspect the columns with missing values
# Fill missing values (NaN) with each column's mean
for column in list(df.columns[df.isnull().sum() > 0]):
    df[column].fillna(df[column].mean(), inplace=True)

ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
# pd.cut returns a special Categorical object: a set of bin labels
cats = pd.cut(ages, bins)
pd.cut(df_f.积分, bins=3, labels=["低","中","高"])  # cut into 3 bins and attach labels
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']  # attach labels
cats1 = pd.cut(ages, bins, labels=group_names)
aa = pd.value_counts(cats)  # count per bin
s = pd.Series(np.random.randn(1000))
cats = pd.qcut(s, 4)  # cut at the quartiles
a = pd.value_counts(cats)
# np.where
np.where(condition, x, y)  # where the condition holds, output x; otherwise output y
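# A tiny example of np.where on a toy array (not from the article):
import numpy as np
arr = np.array([3, -1, 7, -5])
print(np.where(arr > 0, arr, 0))  # [3 0 7 0] -- keep the positive values, replace the rest with 0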

print(min(train['loan_dt']), max(train['loan_dt']))  # min/max give the earliest and latest dates directly, no sorting needed
# Binning
N_new1["N_range1"] = pd.cut(x=N_new1["n21_score"], bins=[300, 400, 500, 600, 700, 800, 900])
# Split by date
sep_date = df1['loan_dt'][int(df1.shape[0] * 0.8)]  # the date at the 80% split point
before = df1[df1['loan_dt'] < sep_date]  # split into train/test and OOT sets; either both OOT and test exist, or only OOT
# Yearly histogram: group by year and mean temperature, aggregate with 'count'
temp = temp.groupby(['year', 'Mean_TemperatureC']).agg({'Mean_TemperatureC': 'count'}).rename(columns={'Mean_TemperatureC': 'count'}).reset_index()
# Use np.isnan(np.nan) to test for NaN; == and `is` do not work
dftrain = pd.DataFrame(train_data, columns=['x']).dropna().sort_values('x')
dftrain.index = range(len(dftrain))
# utf-8 decoding of column names
df_test.columns = df_test.columns.map(lambda x: x.decode('utf-8'))
features = df_test[[col for col in model.features if col in df_test.columns]]
# Round to 10 decimal places
df_test['diff'] = [round(x, 10) for x in df_test['diff']]
# Check dtypes: separate string columns from numeric ones for later processing
objectList=[]
classList=[]
numericalList=[]
for i in train_data.columns:
    if train_data[i].dtype=='O':
        objectList.append(i)
for i in list(train_data.select_dtypes(exclude=['object']).columns):
    temp=train_data[i].unique()
    if len(temp)<=10:
        classList.append(i)
    else:
        numericalList.append(i)
        
# Use LabelEncoder to encode ordinal/grade variables
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['OPBAL_FLAG']=le.fit_transform(df['OPBAL_FLAG'])
df['CLSBAL_FLAG']=le.fit_transform(df['CLSBAL_FLAG'])

# PSI computation logic: bin the scores, compute per-bin frequencies, then sum up the frequency differences
# (fragment: dftrain, parts, indexmax, __get_hist, testf and trainf come from the author's psi script)
quantile_points = [dftrain['x'][int(np.ceil(float(indexmax * i) / parts))] for i in range(0, parts + 1)]
cut_points = list(pd.Series(quantile_points).drop_duplicates().values)  # de-duplicate the quantile points to get the final bin edges
train_frequencies = __get_hist(list(dftrain['x'].values), cut_points)
psi_value = sum([(testf[i] - trainf[i]) * math.log(testf[i] / trainf[i]) for i in range(len(testf))])
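# A self-contained sketch of the same PSI logic (quantile bins on the expected sample,
# then sum((actual% - expected%) * ln(actual% / expected%))). The open-ended outer bins
# and the epsilon guard are assumptions, not the author's CalPSI implementation.
import numpy as np
import pandas as pd

def calc_psi(expected, actual, parts=10, eps=1e-6):
    expected = pd.Series(expected, dtype=float).dropna()
    actual = pd.Series(actual, dtype=float).dropna()
    # quantile cut points computed on the expected sample, de-duplicated
    cuts = expected.quantile(np.linspace(0, 1, parts + 1)).drop_duplicates().tolist()
    cuts[0], cuts[-1] = -np.inf, np.inf            # make the outer bins open-ended
    exp_pct = pd.cut(expected, cuts).value_counts(normalize=True, sort=False) + eps
    act_pct = pd.cut(actual, cuts).value_counts(normalize=True, sort=False) + eps
    return float(((act_pct - exp_pct) * np.log(act_pct / exp_pct)).sum())

rng = np.random.default_rng(0)
print('%.6f' % calc_psi(rng.normal(600, 50, 10000), rng.normal(610, 55, 10000)))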
# Compute PSI on the same keys
print('%.4f' % (psi(df_inner['score'], df_inner['score2'])))
# Or call the CALPSI.py script directly to compute PSI
result_psi2 = psi_analysis(n21_old_ls, n21_new_ls, parts=10)
print(result_psi2.columns)  # 4.65228805e-03
print(result_psi2.values)
# Correlation coefficients
corr_te = data_te[['n21_score_new', 'n21_score', 'v30_dz_huisuV']].corr(method='spearman')
df_corr = pd.DataFrame(corr_te)
print(df_corr)
# Strip the quotation marks from a string and keep the number
import re
print(float(re.split(r'[\"]?([0-9\.]*)[\"]?','1151226468812.22')[1]))
print(float(re.split(r'[\"]?([0-9\.]*)[\"]?','"1151226468812.22"')[1]))

for col in train.select_dtypes(include=['object']).columns:  # inspect the unique values of the string columns
    print("Column {} has {} unique instances".format(col, len(train[col].unique())))
Plotting with matplotlib
# Scatter plot; configure the font first (so Chinese labels render)
plt.rcParams['font.sans-serif'] = ['SimHei']  # step 1: replace the sans-serif font
plt.rcParams['axes.unicode_minus'] = False  # step 2: fix the rendering of the minus sign on the axes

fig1 = plt.figure(figsize=(5, 5), dpi=100)
plt.title('HBXJ product: scatter of V vs N_new', fontsize='xx-large')
plt.scatter(v30_dz_huisuV_ls, n21_new_ls, s=2)
plt.xlabel('V')
plt.ylabel('N_new')
# plt.xlim(250, 900)
# plt.ylim(250, 900)
corr_val = "%.6f" % (corr_te.loc['n21_score_new', 'v30_dz_huisuV'])
font = {'family': 'serif', 'style': 'italic', 'weight': 'normal', 'color': 'red', 'size': 20}
ax = plt.gca()
plt.text(0.25, 0.8, r'Pearson: ' + str(corr_val), fontdict=font, transform=ax.transAxes)  # place the text using normalized axes coordinates
plt.show()

corr_df = employees_df.corr(method='pearson')
# Correlation heatmap, option 1
import seaborn as sns
plt.figure(figsize=(8, 6))
sns.heatmap(corr_df, annot=True)
plt.show()
# option 2
corr_df = employees_df.corr(method='pearson')
corr_df.style.background_gradient(cmap='coolwarm')
# option 3
plt.matshow(corr_df)
plt.show()
# Kernel density plot
# plot both on the same axes, without the rug
fig6 = plt.figure(figsize=(6, 5), dpi=100)
plt.title('HBXJ product validation set: density of N_old vs N_new', fontsize='xx-large')
p1 = sns.kdeplot(data_te['n21_score'], shade=True, color="r", label='n21_score')
p1 = sns.kdeplot(data_te['n21_score_new'], shade=True, color="b", label='n21_score_new')
plt.ylabel('density')
plt.xlabel('n21_score + n21_score_new')
plt.legend(loc=2)
# corr_val = "%.6f" % (corr_te.loc['score', 'score_new'])
psi = "%.6f" % (result_psi2.values[0][0])
font = {'family': 'serif','style': 'italic','weight': 'normal','color': 'red','size': 20}
ax = plt.gca()
plt.text(0.02, 0.8, r'PSI: ' + str(psi), fontdict=font, transform=ax.transAxes)  # place the text using normalized axes coordinates
plt.show()

# Inspect numeric variables: kde subplots comparing train and test
dis_cols = 6
dist_rows = len(numericalList)
plt.figure(figsize=(4*dis_cols, 4*dist_rows))
i = 1
for col in numericalList:
    ax = plt.subplot(dist_rows, dis_cols, i)
    ax = sns.kdeplot(train_data[col], color='Red', shade=True)
    ax = sns.kdeplot(test_data[col], color='Blue', shade=True)
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    ax = ax.legend(["train", "test"])
    i += 1
plt.show()
# Split the features by dtype
# object-dtype columns            ==> objectList
# low-cardinality numeric columns ==> classList
# continuous numeric columns      ==> numericalList
objectList = []
classList = []
numericalList = []
for i in train_data.columns:
    if train_data[i].dtype == 'O':
        objectList.append(i)
for i in list(train_data.select_dtypes(exclude=['object']).columns):  # drop the object columns; what remains is numeric (float/int) and categorical-like
    temp = train_data[i].unique()
    if len(temp) <= 10:
        classList.append(i)
    else:
        numericalList.append(i)
# Or, more simply:
numerical_fea = list(train.select_dtypes(exclude=['object']).columns)
category_fea = list(train.select_dtypes(include=['object']).columns)
# String replacement
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
# Plot the missing rates. The training set has quite a few missing values, but their share is not high. Tree models such as xgb and lgb handle missing values natively and optimize the split direction themselves; other models need explicit handling.
missing = train.isnull().sum()
missing = missing[missing > 0]
missing_rate = missing/len(train)
missing_rate.plot.bar()
### Summarize the missing values in the dataset
missingDf = data.isnull().sum().sort_values(ascending=False).reset_index()
missingDf.columns = ['feature', 'miss_num']
missingDf['miss_percentage'] = missingDf['miss_num'] / data.shape[0]    # missing-value ratio

# Drop the features that have only a single unique value
one_value_fea = [col for col in train.columns if train[col].nunique() <= 1]
# 'policyCode'
one_value_fea_test = [col for col in testA.columns if testA[col].nunique() <= 1]
# 'policyCode'


# Visualize the distribution of every numeric feature
f = pd.melt(train, value_vars=numerical_fea)
g = sns.FacetGrid(f, col="variable",  col_wrap=2, sharex=False, sharey=False)
g = g.map(sns.distplot, "value")


def employmentLength_to_int(s):
    if pd.isnull(s):
        return s  # pass NaN through unchanged
    else:
        return np.int8(s.split()[0])  # split on whitespace and take the first token

data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data['employmentLength'].value_counts()
# Box plots
column=data.columns.tolist()
fig=plt.figure(figsize=(80,60),dpi=75)
for i in range(len(column)):
    plt.subplot(7,8,i+1)
    sns.boxplot(data[column[i]].tolist(),orient="v",width=0.5)
    plt.ylabel(column[i],fontsize=36)
plt.show()
# Flag outliers with the 3-sigma rule
def find_outliers_by_3segama(data,fea):
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    data[fea+'_outliers'] = data[fea].apply(lambda x:str('异常值') if x > upper_rule or x < lower_rule else '正常值')
    return data

for fea in column:
    data_train = find_outliers_by_3segama(train_data,fea)
    print(train_data[fea+'_outliers'].value_counts())
    print(train_data.groupby(fea+'_outliers')['isDefault'].sum())
    print('*'*10)
    train_data = train_data[train_data[fea+'_outliers']=='正常值']
    train_data = train_data.reset_index(drop=True) 
    train_data.drop([fea+'_outliers'],axis=1,inplace=True)
train_data.info()
# Plot a ROC curve
from sklearn.metrics import roc_curve
y_pred = [0, 1, 1, 0, 1, 1, 0, 1, 1, 1]
y_true = [0, 1, 1, 0, 1, 0, 1, 1, 0, 1]
FPR,TPR,thresholds=roc_curve(y_true, y_pred)
plt.title('ROC')
plt.plot(FPR, TPR,'b')
plt.plot([0,1],[0,1],'r--')
plt.ylabel('TPR')
plt.xlabel('FPR')

import pandas as pd
import numpy as np
import datetime
from CalPSI import *  # CalPSI lives in the project directory
import matplotlib.pyplot as plt
import seaborn as sns
#  plot both together, showing the rug and the PSI value
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/12/17 13:33
# @Author  : hongshaofeng
# @Project :rong360 
# @File    : 一致性分析.py

import pandas as pd
import numpy as np
import datetime
from CalPSI import *  # CalPSI lives in the project directory
import matplotlib.pyplot as plt
import seaborn as sns


def plot_kde_psi(f_online, f_offline, left_on, right_on, model_score_on, model_score_off):
    df1 = pd.read_csv(f_online, sep='\t')  # encoding='utf-8','GB2312'
    df2 = pd.read_csv(f_offline, sep='\t')
    for i in ['loan_dt', 'apply_dt']:
        if i in df1.columns:
            df1[i] = pd.to_datetime(df1[i]).apply(lambda x: datetime.datetime.strftime(x, "%Y-%m-%d"))  # normalize the date format
        if i in df2.columns:
            df2[i] = pd.to_datetime(df2[i]).apply(lambda x: datetime.datetime.strftime(x, "%Y-%m-%d"))  # normalize the date format
    print(df1.head(), df2.head(), df1.columns,df2.columns)
    df = df1.merge(df2, how='inner', left_on=left_on, right_on=right_on)
    print(df.shape, df, df.columns)
    print('online coverage: {:.2%}'.format(1 - df[model_score_on].isnull().sum() / df.shape[0]),
          'offline coverage: {:.2%}'.format(1 - df[model_score_off].isnull().sum() / df.shape[0]))
    # df.rename(columns={'信用风险等级_x':'model_score_on','信用风险等级_y':'model_score_off'},inplace=True)
    print(df[model_score_on].dtype, df[model_score_off].dtype)  # check the dtypes (e.g. whether the scores are still strings)
    df[model_score_on].replace(["\\N",0,'0','None','null'],np.nan,inplace=True)
    df[model_score_off].replace(["\\N",0,'0','None','null'],np.nan,inplace=True)

    # df[model_score_on] = df[model_score_on].apply(pd.to_numeric, errors='ignore')
    # df[model_score_off] = df[model_score_off].apply(pd.to_numeric, errors='ignore')
    # print(df[model_score_on].isnull().sum(),df[model_score_off].isnull().sum())
    # df[model_score_on] = df[df[model_score_on].notnull()]
    # df[model_score_off] = df[df[model_score_off].notnull()]

    score_online_ls = df[model_score_on].values.tolist()
    score_offline_ls = df[model_score_off].to_list()  # either .values.tolist() or .to_list() works
    print(df[model_score_on].describe,df[model_score_off].describe,)

    print(score_offline_ls,'\n',score_online_ls)
    result_psi2 = psi_analysis(score_online_ls, score_offline_ls, parts=10)
    # print(result_psi2.columns)
    # print(result_psi2.values)
    print('sample score PSI: {:.6f}'.format(result_psi2.values[0][0]))

    plt.rcParams['font.sans-serif'] = ['SimHei']  # step 1: replace the sans-serif font
    plt.rcParams['axes.unicode_minus'] = False  # step 2: fix the rendering of the minus sign on the axes
    fig = plt.figure(figsize=(7, 5), dpi=100)
    # plt.title(str(f_offline[7:-4]) + '_density plot', fontsize='xx-large')
    plt.title(str(model_score_off) + '_density plot', fontsize='xx-large')
    sns.distplot(df[model_score_on], bins=30, rug=False, kde_kws={"label": "model_score_online"}, color="r")
    sns.distplot(df[model_score_off], bins=30, rug=False, kde_kws={"label": "model_score_offline"},
                 color="b")  # plot both on the same axes; the PSI is shown as text
    plt.ylabel('density')
    plt.xlabel('online vs offline distribution')
    plt.legend()

    ax = plt.gca()
    psi = '{:.6f}'.format(result_psi2.values[0][0])
    font = {'family': 'serif', 'style': 'italic', 'weight': 'normal', 'color': 'red', 'size': 20}
    # plt.text(0.6, 0.75, r'PSI: ' + str(psi), fontdict=font, transform=ax.transAxes)  # place the text using normalized axes coordinates
    plt.text(0.1, 0.7, r'PSI: ' + str(psi), fontdict=font, transform=ax.transAxes)  # place the text using normalized axes coordinates
    plt.show()
    # plt.savefig('fig/' + str(column_online) + '.png')  # to save the figure, call savefig instead of show


# plot_kde_psi('中信消金_fql_n30xj_1k_93.3%.csv', 'fql_n30xj.txt',
#               ['name', 'phone', 'idcard', 'loan_dt'], ['name', 'phone', 'idcard', 'loan_dt'],
#              'n30_xj_score_x', 'n30_xj_score_y')
# plot_kde_psi('中信消金_fql_v71ty_1k_99.5%.csv', 'fql_v71ty.txt',
#               ['name', 'phone', 'idcard', 'loan_dt'],
#              ['name', 'phone', 'idcard', 'loan_dt',],
#              'v71_ty_x', 'v71_ty_y')
# plot_kde_psi('../ppd/中信消金_ppd_v71ty_1k_98.2%.csv', '../ppd/ppd_v71_ty.txt',
#               ['name', 'phone', 'idcard', 'loan_dt'],
#              ['name', 'phone', 'idcard', 'loan_dt'],
#              'v71_ty_x', 'v71_ty_y')
plot_kde_psi('fql_cy21xj_1k.txt', 'fql_cy21xj_10w.txt',
              ['name','phone','idcard','loan_dt'],
             ['name', 'phone', 'idcard', 'loan_dt'],
             'cy21_xj_score_x', 'cy21_xj_score_y')

    
# Utility functions:
def drop_col(df, cutoff=0.1):  # drop a column if more than 90% of its values are missing
    n = len(df)
    cnt = df.count()  # count the non-NaN values per column
    cnt = cnt / n  # fraction of non-NaN values
    # Using the fractions in cnt, keep the columns whose non-NaN fraction is >= cutoff (i.e. drop the
    # columns with more than 90% NaN), then select all rows and only those columns.
    return df.loc[:, cnt[cnt >= cutoff].index]
df = drop_col(df)
print(df)
# Take the latest 1k samples from the online scoring log
def sample_online_1k():
    df = pd.read_csv('FQL_key_0111.txt', sep='\t')
    df.drop_duplicates(subset=['name','idcard','phone','loan_dt'], inplace=True)
    df.sort_values(by='loan_dt', inplace=True, ascending=False)  # descending
    print(df, df.iloc[:1000, :])
    df.iloc[:1000, :].to_csv('fql_new_1k.txt', sep='\t', index=False, encoding='utf-8')

sample_online_1k()

def func():
    import os
    print(os.getcwd())  # current working directory
    # Merge multiple csv files with pandas
    files = os.listdir(path)                                    # list all file names in the folder
    df1 = pd.read_csv(path + '/' + files[0], encoding='gbk')    # read the first csv file into df1

    for file in files[1:]:
        df2 = pd.read_csv(path + '/' + file, encoding='gbk')    # open each csv file (mind the encoding) into df2
        df1 = pd.concat([df1, df2], axis=0, ignore_index=True)  # append df2 to df1

    df1 = df1.drop_duplicates()           # drop duplicates
    df1 = df1.reset_index(drop=True)      # regenerate the index
    df1.to_csv(path + '/' + 'total.csv')  # save the result as a new csv file

def func():
    print(str(list(df1[['loan_dt']].count())[0]))  # total number of samples
    print(str(list(df1[['label']].sum())[0]))  # number of bad samples; take the first element, then convert everything to strings and join with tabs before writing to a file
    a = df1[['label']].groupby(df1['loan_dt']).count()  # one count, one sum
    b = df1[['label']].groupby(df1['loan_dt']).sum()
    c = pd.concat([a, b], axis=1)
    c.columns = ['cnt', 'ovd']
    c['ovd_ratio'] = c['ovd'] * 1.0 / c['cnt']

def get_pri_key(path,new_path):
    import pandas as pd
    rankings_colname=['name','idcard_md5','phone_md5','loan_dt']
    df = pd.read_csv(path, header=None, names=rankings_colname, sep='\t', usecols=[0, 1, 2, 3])
    print(df.shape)
    df.to_csv(new_path, sep='\t', index=False, encoding='utf-8')
    
import pandas as pd
import json
def dict_to_df(file, file_new):
    # The header must match the extraction logic exactly; do not change it!
    rankings_colname = ['name', 'idcard_md5', 'phone_md5', 'loan_dt', 'score', 'resource_id', 'customer', 'feature_new', 'feature_old']
    df = pd.read_csv(file, header=None, names=rankings_colname, sep='\t')  # hubei_xj adds a source field; jryk has no feature_old and needs a drop later
    print(df.shape, '\n', df.head())
    # df.drop(['feature_old'], axis=1, inplace=True)
    print(df.shape, '\n', df.head())
    # Turn the feature_new dict column into a wide table
    feature_new_list = df['feature_new'].apply(lambda x: eval(x)).tolist()
    file_name1 = 'feature_new_' + str(file) + '.json'
    with open(file_name1, 'w+') as f1:
        json.dump(feature_new_list, f1)
    feature_new = pd.read_json(file_name1, orient='records')
    print(feature_new.shape, '\n', feature_new.head())
    # Turn the feature_old dict column into a wide table
    feature_old_list = df['feature_old'].apply(lambda x: eval(x)).tolist()
    file_name2 = 'feature_old_' + str(file) + '.json'
    with open(file_name2, 'w+') as f2:
        json.dump(feature_old_list, f2)
    feature_old = pd.read_json(file_name2, orient='records')
    # Concatenate the three parts
    df2 = pd.concat([df.iloc[:, :-2], feature_new, feature_old], axis=1)  # works for wjwl
    # df2 = pd.concat([df.iloc[:, :-1], feature_new], axis=1)  # when feature_old is empty, drop it up front and skip the feature_old conversion, otherwise this raises an error
    print(df2.shape, df2.head())
    df.drop_duplicates(subset=['name', 'idcard_md5', 'phone_md5', 'loan_dt'], inplace=True)  # deduplicate last, otherwise the keys and feature values would no longer line up
    print(df2.shape, df2.head())
    df2.to_csv(file_new, sep='\t', encoding='utf-8', index=False)
    return feature_new
    
import pandas as pd
import datetime
def get_month_miss(file):
    df = pd.read_csv(file, sep='\t')
    df.drop_duplicates(subset=['idcard_md5'], inplace=True)
    print(df.shape)
    print("n30_hj_score missing rate", '{:.2%}'.format(df['n30_hj_score'].isnull().sum(axis=0) / df.shape[0]),
          "v30_dz missing rate", '{:.2%}'.format(df['v30_dz'].isnull().sum(axis=0) / df.shape[0]))

    # Normalize the dates to yyyy-mm-dd
    df['loan_dt'] = pd.to_datetime(df['loan_dt']).apply(lambda x: datetime.datetime.strftime(x, "%Y-%m-%d"))
    df['month'] = df['loan_dt'].str[:7]
    groups = df.groupby(by='month')
    for name, group in groups:
        print(name, '\t', group.shape[0], '\t',
              '{:.2%}'.format(group['n30_hj_score'].isnull().sum(axis=0) / group.shape[0]), '\t',
              '{:.2%}'.format(group['v30_dz'].isnull().sum(axis=0) / group.shape[0]), '\t',
              # '{:.2%}'.format(group['n30_hj_score_new'].isnull().sum(axis=0) / group.shape[0]), '\t',
              # '{:.2%}'.format(group['n30_hj_score_new_empty'].isnull().sum(axis=0) / group.shape[0]), '\t',
              )
              
def diff_cnt():
    df = pd.read_csv(r"C:\Users\hongshaofeng\Desktop\XL_all.txt", sep='\t')
    print(df.shape, df.head())
    df["N_range1"] = pd.cut(x=df["Nscore"], bins=[300, 400, 500, 600, 700, 800, 900])
    df["N_range2"] = pd.cut(x=df["Nscore_new"], bins=[300, 400, 500, 600, 700, 800, 900])
    def function(a, b):
        if a == b:
            return 0
        else:
            return 1
    df['bool'] = df.apply(lambda x: function(x['N_range1'], x['N_range2']), axis=1)
    print('%.2f%%' % (df['bool'].sum()/df.shape[0]*100))
    print('{:.4f}'.format(df['bool'].sum()/df.shape[0]))
    # print(df.shape,df.head())
    df.to_csv('test.csv',encoding='utf-8',sep='\t',index=False)


# Map old feature names to new ones. Required files: the source file, the model feature list, the mapping file, and the output file with the corrected header.
def fea_map(file, model_lib_fea, fea_map, file_new):
    df = pd.read_csv(file, sep='\t')  # read the source file
    print(df.shape, df.head())
    res = df.columns.to_list()
    print(res)
    res_lib = []
    with open(model_lib_fea, 'r') as f:  # collect the model_lib features into res_lib
        for line in f:
            res_lib.append(line.strip('\n'))

    df2 = pd.read_csv(fea_map, header=None, names=['new_fea', 'fea'], sep='\t')  # read the mapping
    res_map1 = df2['new_fea'].to_list()
    res_map2 = df2['fea'].to_list()
    for i in res:
        if i in res_map2:
            res[res.index(i)] = res_map1[res_map2.index(i)]  # every matched name is replaced by the standard header name
    print(len(res), res)
    # Replace the old header with the new one and write to a new file
    df.columns = res  # swap the header
    print(df.columns)

    # # The following is only for the test set: rename n30_hj_score_new_empty so modellib can run --> jryk_test_empty_fea_new.txt
    # df = df.rename(columns={"n30_hj_score": "n30_hj_score_old", "n30_hj_score_new_empty": "n30_hj_score"})

    df.to_csv(file_new, encoding='utf-8', sep='\t', index=False)

# fea_map('jryk_test.txt', 'features.txt', 'fea_map.txt', 'jryk_test_fea_map.txt')

def jiemi():
    df1 = pd.read_csv('wjwl_20210902_fea.txt', sep='\t')
    print(df1.shape, df1.head())
    df1.drop_duplicates(subset=['name', 'idcard_md5', 'phone_md5', 'loan_dt'], inplace=True)
    print(df1.shape, df1.head())
    # df1[['name', 'idcard_md5', 'phone_md5', 'loan_dt']].to_csv('wjwl_20210902_key_jiemi.txt',encoding='utf-8', sep='\t', index=False)
    df1.to_csv('wjwl_20210902_fea_new.txt', encoding='utf-8', sep='\t', index=False)  # some users have empty features; drop them

def rand_sample(file,file_new):
    import pandas as pd
    df_raw = pd.read_csv(file, sep='\t') 
    print(df_raw.shape)
    df_raw.drop_duplicates(subset=['name','idcard_md5','phone_md5','loan_dt'], inplace=True)
    df_sample = df_raw.sample(n=2000, random_state=1)  # random sample
    print(df_sample.head(), df_sample.shape)
    df_sample.to_csv(file_new, encoding='utf-8', sep='\t', index=False)
    
    
print('V null rate: {:.4f}'.format(1 - data[V_columns].count() / data.shape[0]))

Switching the current working directory in Python
import os
print(os.getcwd())  # e.g. os.getcwd() == r'D:\tianjikit-master\tj_bank_tools_optimize'
if not os.getcwd().endswith('example9'):
    os.chdir('./example9/')  # for debugging
    print(os.getcwd())
os.path.join("just", "do", "python", "dot", "com")  # 'just\\do\\python\\dot\\com'
os.path.exists(".")
Python interval labels
span_list = ['[%.3f,%.3f]' % (min(data[:, 1]), round(cut_list[1], 3))]
np.vstack(tup)  # stack arrays vertically (row-wise)
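# A tiny example of np.vstack on toy arrays (not from the article):
import numpy as np
a, b = np.array([1, 2, 3]), np.array([4, 5, 6])
print(np.vstack((a, b)))  # [[1 2 3]
                          #  [4 5 6]]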

# Check that a label column exists
assert 'label' in dftrain.columns, 'illegal input, there should be a "label" column in dftrain!'
# Check that the label values are valid
assert set(dftrain['label']) == {0, 1}, 'illegal label values, label can only be 0 or 1!'

from datetime import datetime, date
from datetime import timedelta
now_date = (date.today())
one_week_before = (date.today() - timedelta(days=7)).strftime("%Y-%m-%d")
data = data[(data['time'] <= str(now_date)) & (data['time'] >= str(one_week_before))]
from datetime import datetime, date, timedelta

yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")    # yesterday's date
# Target variable analysis
# Plot the distribution of SalePrice together with its QQ plot (Quantile-Quantile plot). Briefly, a QQ plot is a scatter plot with the quantiles of the standard normal distribution on the x-axis and the sample values on the y-axis. If the points lie close to a straight line, the data is approximately normal;
# the slope of that line is the standard deviation and the intercept is the mean. For a detailed introduction to QQ plots see: https://blog.csdn.net/hzwwpgmwy/article/details/79178485
from scipy import stats
from scipy.stats import norm
def analyseAimVal(data):
    sns.distplot(data, fit=norm)  # distplot draws the histogram; fit=norm overlays a fitted normal distribution

    # Get the fitted parameters used by the function
    (mu, sigma) = norm.fit(data)  # mean and standard deviation of the fitted normal
    print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

    # Now plot the distribution
    plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
                loc='best')
    plt.ylabel('Frequency')
    plt.title('SalePrice distribution')

    # Get also the QQ-plot
    fig = plt.figure()
    # If the two distributions are similar, the Q-Q points fall near the line y=x. If they are linearly related, the points fall near some straight line, not necessarily y=x.
    # stats.probplot defaults to dist="norm", i.e. it compares the data against the normal distribution.
    # So to judge whether the sample is approximately normal, just check whether the Q-Q points lie near a straight line; its slope is the standard deviation and its intercept is the mean.
    res = stats.probplot(data, plot=plt)  # defaults to comparing against the normal distribution
    plt.show()
analyseAimVal(train['SalePrice'])
# SalePrice is right-skewed, while linear regression assumes a normally distributed target, so apply a log transform to bring it closer to normal.
# We use the numpy function log1p, which applies log(1+x) to all elements of the column
import pandas as pd
train = pd.read_csv(r'C:\Users\hongshaofeng\Desktop\jiufu_out_data_0512.csv')
train["cy20_hj_score"] = np.log1p(train["cy20_hj_score"])

analyseAimVal(train["SalePrice"])

Quick ways to open the environment variables dialog:
Option 1: open "Environment Variables" directly.
Option 2: press Win+R, type "sysdm.cpl" and hit Enter.
Option 3: press the Windows key, type "environment variables", and open "Edit the system environment variables".
Option 4: from Python
import os
os.environ["path"] += os.pathsep + r"D:\Graphviz\bin"
dot_data = xgb.to_graphviz(bst, num_trees=0)  # 1. install graphviz; 2. add graphviz\bin (graphviz\dot.exe) to the PATH


score_slice_arr = np.array_split(sort_score_ls, slice_num)  # np.array_split allows uneven splits
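# A tiny example of np.array_split on toy data (not from the article); unlike np.split it allows uneven pieces:
import numpy as np
print(np.array_split(np.arange(10), 3))  # [array([0, 1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]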
AUC (Area Under Curve) reflects classifier performance on imbalanced samples... computed with sklearn's metrics.roc_auc_score; TPR/FPR are computed over thresholds.
KS (Kolmogorov-Smirnov) measures how well a numeric or ordinal variable separates good and bad users. KS = max{abs(Fn(x) - F0(x))}
IV (information value) measures how well a binned variable separates good and bad users.
woe_i = ln( (N_bad_in_i / N_total_bad) / (N_good_in_i / N_total_good) ),  i.e.  woe_i = ln(bad_i / bad_T) - ln(good_i / good_T)
IV = sum( [ (N_bad_in_i / N_total_bad) - (N_good_in_i / N_total_good) ] * woe_i )
PSI (Population Stability Index) measures the stability of a variable or model. PSI = sum( (actual% - expected%) * ln(actual% / expected%) )

WOE = ln(bad count in bin i / good count in bin i) - ln(total bad count / total good count), i.e. the difference between each bin's bad/good odds and the overall bad/good odds.
WOE (weight of evidence) follows Bayesian reasoning: it measures how much the evidence in a bin updates the prior.
PSI measures the difference between an expected and an actual distribution; IV instantiates those two distributions as the good-user and bad-user distributions.
The IV metric compares the good and bad distributions from an information-entropy point of view; a short KS/IV sketch in pandas follows below.
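# A minimal sketch (toy labels/scores, assumed names; not the author's tools) computing KS and IV
# with pandas, following the formulas above.
import numpy as np
import pandas as pd

def ks_stat(y_true, y_score):
    # KS = max |F_bad(x) - F_good(x)| over the score distribution
    df = pd.DataFrame({'y': y_true, 's': y_score}).sort_values('s')
    bad_cum = df['y'].cumsum() / df['y'].sum()
    good_cum = (1 - df['y']).cumsum() / (1 - df['y']).sum()
    return float((bad_cum - good_cum).abs().max())

def iv_value(y_true, y_score, bins=10, eps=1e-6):
    # bin the score, then IV = sum((bad% - good%) * ln(bad% / good%)) over the bins
    df = pd.DataFrame({'y': y_true, 'bin': pd.qcut(y_score, bins, duplicates='drop')})
    grp = df.groupby('bin', observed=True)['y'].agg(['sum', 'count'])
    bad = grp['sum'] / grp['sum'].sum() + eps
    good = (grp['count'] - grp['sum']) / (grp['count'] - grp['sum']).sum() + eps
    return float(((bad - good) * np.log(bad / good)).sum())

rng = np.random.default_rng(0)
y = rng.integers(0, 2, 5000)
score = y * 0.8 + rng.normal(0, 0.5, 5000)   # a score loosely related to the label
print('KS=%.4f  IV=%.4f' % (ks_stat(y, score), iv_value(y, score)))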

Outlier detection: in feature_analysis_tools.py, values outside the allowed range (< 0) are all recorded as 1e-10; quantile-based detection is also used.
Missing value imputation: xgb handles missing values itself; at each split it tries sending them down each branch and picks the direction with the higher gain.

Feature selection: 1) PSI filter; 2) across 3 months, MaxPSI < 0.2 and avgPSI < 0.1; 3) IV filter: > 0.01, or top-N by relative discriminative power; 4) xgb feature importance; 5) evaluation: KS, AUC. A minimal threshold-filtering sketch follows below.
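# A minimal sketch of the threshold filtering in steps 1-3 above, on a hypothetical
# per-feature metrics table (column names assumed, not from the author's pipeline):
import pandas as pd

metrics = pd.DataFrame({'feature': ['f1', 'f2', 'f3'],
                        'max_psi': [0.05, 0.35, 0.12],
                        'avg_psi': [0.02, 0.20, 0.08],
                        'iv':      [0.04, 0.15, 0.005]})
keep = metrics[(metrics['max_psi'] < 0.2) &
               (metrics['avg_psi'] < 0.1) &
               (metrics['iv'] > 0.01)]['feature'].tolist()
print(keep)  # ['f1'] -- only these survive to the xgb-importance step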



import random
print(random.randint(1, 50))  # 18
print(random.uniform(1, 50))  # 8.315165416


def jiemi(file_raw,file_key):
    df1 = pd.read_csv(file_raw, sep='\t')
    print(df1.shape, df1.head())
    df1.drop_duplicates(subset=['name',	'idcard_md5', 'phone_md5', 'loan_dt'], inplace=True)
    print(df1.shape, df1.head())
    # df1[['name', 'idcard_md5', 'phone_md5', 'loan_dt']].to_csv('wjwl_20210902_key_jiemi.txt',
    #                                                          encoding='utf-8', sep='\t', index=False)
    df1.to_csv(file_key, encoding='utf-8', sep='\t', index=False)  # some users have empty features; drop them
    

When running directly in PyCharm: the save directory is not created automatically? The run uses the script's own directory, D:\tianjikit-master\tj_bank_tools_optimize\example10.
Alt + Shift + E runs with D:\tianjikit-master\tj_bank_tools_optimize, the project root, as the working directory.


Python regular expressions, part 3: ^ and $
#coding=utf-8
import re
# ^ matches the start of the string
re.findall('^ba', 'abacd')   # []
re.findall('^ab', 'abacd')   # ['ab']
# $ matches the end of the string
re.findall('ac$', 'abacd')   # []
re.findall('cd$', 'abacd')   # ['cd']
# match the whole string
m = re.findall('^abacd$', 'abacd')  # ['abacd']

print('---------------------------test1---------------------------------', file=open('log.txt', 'w'))  # redirect output to a file

import numpy as np
import pandas as pd
import sys
from feaAnalysis import *
#
def big_data_split(path):  # split the file into four parts; keys: name  idcard  phone  loan_dt  sha256_iden  sha256_phone
    # path = sys.argv[1]  # when running from the terminal; comment it out when calling the function locally
    # Alternatively, edit the keys in the source file directly and set flag to [0,1,2,3,4,5] + ...
    flag = 0
    for i in range(4):
        if i == 0:
            flag = [0, 5, 6, 3, 4] + [j for j in range(7, 10000)]  # skip columns 1 and 2; columns 5 and 6 are sha256_iden and sha256_phone
        elif i in (1, 2):
            flag = [0, 5, 6, 3, 4] + [j for j in range(10000*i, 10000*(1+i))]
        elif i == 3:
            flag = [0, 5, 6, 3, 4] + [j for j in range(30000, 34742)]  # 34742 columns in total
        df = pd.read_csv(path, sep='\t', encoding='utf-8', usecols=flag)
        df.rename(columns={'sha256_iden': 'idcard', 'sha256_phone': 'phone'}, inplace=True)  # rename the key columns
        file_name = 'yixin_xxx0' + str(i+1) + '.csv'
        print (df.shape, df.head(1))
        df.to_csv(file_name, sep='\t', encoding='utf-8', index=False)


def concat_to_real(inpath1, inpath2, outpath):    # pad the 98k users with the missing ones so that the result has the full 100k users
    df1 = pd.read_csv(inpath1, sep='\t')  # ~90k rows
    df2 = pd.read_csv(inpath2, sep=',')  # 100k rows
    print(df1.shape, df2.shape)
    df_new = df1.merge(df2, how='right', on=['name', 'idcard', 'phone', 'loan_dt', 'label'])  # df2 is the target; mind whether a label column exists
    print(df_new.shape, df_new.isnull().sum())
    df_new.to_csv(outpath, sep='\t', encoding='utf-8', index=False)


# Single-feature analysis has to be run file by file (parameters unchanged); the results cover roughly 6000 features across the 4 files in total.
# Merge the 6k-feature list back with each dataset to get 4 datasets containing those 6k features.
def xgb_select_data(inpath1, inpath2, outpath):
    import pandas as pd
    df2 = pd.read_csv(inpath2, sep=',')  # ~6k rows of features with 4 metric columns; read this first to catch errors early; the features are not the index and the separator is ","
    df1 = pd.read_csv(inpath1, sep='\t')  # ~90k users, ~10k feature columns, split over 4 files; must use '\t' or the columns stick together
    # pubilc_columns = df2.index.values.tolist()  # take the 6k features to match against
    tmp1, tmp2 = df1.columns.values.tolist(), df2['feature'].tolist()  # the feature names from each file
    tmp3 = list(set(tmp1) & set(tmp2))  # intersection; both must be sets, otherwise & is not defined
    print len(tmp3)
    pubilcData = df1[['name', 'idcard', 'phone', 'loan_dt', 'label'] + tmp3]  # select the matching columns
    print pubilcData.shape
    pubilcData.to_csv(outpath, sep='\t', encoding='utf-8', index=False)


# Concatenate the features from the 4 files
def concat_four_data():
    res = []
    for i in range(1, 5):  # files 01-04
        df1 = pd.read_csv('jiufu_meta02_xxx0' + str(i) + '_6000.csv', sep='\t')
        res.append(df1)
    df = pd.concat(res, axis=1, ignore_index=False, join="outer")  # column-wise concatenation
    print df.shape
    df.to_csv('jiufu_meta02_6000.csv', sep='\t', encoding='utf-8', index=False)


# Sort the samples by date and split into training and test sets
def huisu_data_split(path, out_trian, out_test):
    data = pd.read_csv(path, sep='\t')
    res = data.sort_values(by='loan_dt', ascending=True)
    # train, test = res.iloc[:90131], res.iloc[90131:]  # the first 90131 training samples run up to 2020-02-16; the test samples start at 2020-02-17
    train, test = res.iloc[:87303], res.iloc[87303:]  # the training samples run up to 2020-01; the test samples start at 2020-02
    print train.shape, test.shape
    train.to_csv(out_trian,  sep='\t')
    test.to_csv(out_test, sep='\t')  # run_model_lr / run_model_xgb can only read whitespace-separated files!


# The two functions above can be combined into the following one.
def split_to_concat(file1, file2, file3, file4, base_trian, base_test):
    df1 = pd.read_csv(file1, sep='\t')
    df2 = pd.read_csv(file2, sep='\t')
    df3 = pd.read_csv(file3, sep='\t')
    df4 = pd.read_csv(file4, sep='\t')
    # df = pd.concat([df1, df2, df3, df4], axis=1, join='left')  # concat join only supports 'outer' or 'inner'
    df = df1.merge(df2, how='right', on=['name', 'idcard', 'phone', 'loan_dt', 'label'])  # df2 is the target
    df = df.merge(df3, how='right', on=['name', 'idcard', 'phone', 'loan_dt', 'label'])
    df = df.merge(df4, how='right', on=['name', 'idcard', 'phone', 'loan_dt', 'label'])
    res = df.sort_values(by='loan_dt', ascending=True)
    # print(res.head())
    train, test = res.iloc[:90131], res.iloc[90131:]  # the first 90131 training samples run up to 2020-02-16; the test samples start at 2020-02-17
    print train.shape, test.shape
    train.to_csv(base_trian, sep='\t', encoding='utf-8', index=False)
    test.to_csv(base_test, sep='\t', encoding='utf-8', index=False)  # run_model_lr / run_model_xgb can only read whitespace-separated files!


# Concatenate the 832 features finally selected by xgb with the original 6k to build the dataset, then concatenate it with each external feature set and rerun xgb to get the feature-analysis results.
def base_outer_concat():
    import pandas as pd
    df_base = pd.read_csv('xgb_832_data.csv', sep='\t')
    # Merge the datasets first so the keys line up; merge the 16 external features into the 832-feature set.
    df_outer = pd.read_csv('jiufu_out_data_0512.csv', sep=',')
    df_base_outer1 = pd.concat([df_base, df_outer['cy20_hj_score']], axis=1, join='outer')
    print df_base_outer1.shape
    df_base_outer1.to_csv('base_outer_cy20.csv', sep='\t', encoding='utf-8', index=False)
    # df_base_outer = df_base.merge(df_outer.iloc[:, 8:17], how='outer', on=['name', 'idcard', 'phone', 'loan_dt', 'label'])
    # print df_base_outer.shape
    # df_base_outer.to_csv('base_outer_zt.csv', sep='\t', encoding='utf-8', index=False)

def concat_to_real2():    # pad the 98k users with the missing ones so that the result has the full 100k users
    df1 = pd.read_csv(r'C:\Users\hongshaofeng\Desktop\huisu.csv', sep='\t')  # ~90k rows
    df2 = pd.read_csv(r'C:\Users\hongshaofeng\Desktop\data_0804.csv', sep=',')  # 100k rows
    print(df1.shape, df2.shape)
    df_new = df1.merge(df2, how='outer', on=['name', 'idcard', 'phone', 'loan_dt'])  # df2 is the target
    print(df_new.shape, df_new.isnull().sum())
    df_new.to_csv(r'C:\Users\hongshaofeng\Desktop\huisu_new.csv', sep='\t', encoding='utf-8', index=False)

# huisu_data_split('xgb_832_data.csv', 'xgb_832__tr_cy20.csv', 'xgb_832_te_cy20.csv')
# huisu_data_split('base_outer_cy20.csv', 'base_outer_tr_cy20.csv', 'base_outer_te_cy20.csv')
# huisu_data_split('base_outer_n30.csv', 'base_outer_tr_n30.csv', 'base_outer_te_n30.csv')
# huisu_data_split('base_outer_v30.csv', 'base_outer_tr_v30.csv', 'base_outer_te_v30.csv')
# huisu_data_split('base_outer_zt.csv', 'base_outer_tr_zt.csv', 'base_outer_te_zt.csv')
# python2.7 run_model_xgb.py ../base_outer_tr_cy20.csv ../base_outer_te_cy20.csv base_outer_xgb_results/ ''
if __name__ == '__main__':
    concat_to_real2()
    
To create a Python 2.7 interpreter in PyCharm, use an Anaconda virtual environment; a fresh 2.7 env only ships the basic packages, so third-party libraries must be installed by hand.
pip freeze > requirements.txt  # export the dependencies
pip install -r requirements.txt  # install the dependencies
pip install ./matplotlib-2.2.3-cp36-cp36m-win32.whl

# Four ways to keep a fixed number of decimal places in Python
print('%.6f' % (psi(df['cy20_hj_score'], df['model_score'])))
print(round(1.32434, 3))
print('{:.3f}'.format(1.23456))
print(format(1.23456, '.2f'))
# Two ways to print a percentage in Python
print('percent: {:.2%}'.format(42/50))
print('%.2f%%' % (2.322*100))

import warnings
warnings.filterwarnings("ignore")  # suppress warnings


pip install numpy -i http://pypi.douban.com/simple/  --trusted-host pypi.douban.com   # install a third-party library from a mirror
pip install -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com pandas
pip install ./matplotlib-2.2.3-cp36-cp36m-win32.whl  # install from a .whl file

# Collect the feature list from a file and write it out
import pandas as pd
df = pd.read_csv(r'C:\Users\hongshaofeng\Desktop\sas\sg_inner_data.csv', sep=',')
df = df.rename(columns={'Unnamed: 0': 'seq'})
fea_list = df.columns.tolist()
with open('feature_list.txt', 'w') as f:
    for i in fea_list:
        f.writelines(str(i) + '\n')

# Read the list back
res = []
with open('feature_list.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        res.append(line.strip('\n'))  # strips the trailing newline
    print(res)
# Python regex: match the text between two specific markers
result = re.findall(".*entry(.*)for.*", string)
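# A tiny example of matching text between two markers (sample string assumed, not from the article):
import re
string = "log entry 12345 for user"
print(re.findall(r".*entry(.*)for.*", string))  # [' 12345 ']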

with open(f_list_file, 'r') as f:
        for line in f:
            line = line.strip()
            if line != '': 
                train_list.append(line)
                
                
# log_jryk
from datetime import datetime, date, timedelta
yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")    # yesterday's date
print(yesterday, '\t', df1.shape[0], '\t', df2.shape[0], '\t',  # original samples, samples after the switch
'{:.3%}'.format(df2.shape[0]/(df1.shape[0]+df2.shape[0])), '\t',  # share of samples after the switch
'{:.3%}'.format(df1['n30_hj_score'].isnull().sum(axis=0) / df1.shape[0]), '\t',  # original N missing rate (%)
'{:.3%}'.format(df2['t.v_af_score'].isnull().sum(axis=0) / df2.shape[0]), '\t',  # N missing rate after the switch (%)
'{:.3%}'.format(abs(df1['n30_hj_score'].isnull().sum(axis=0) / df1.shape[0] - df2['t.v_af_score'].isnull().sum(axis=0) / df2.shape[0])), '\t',  # diff_n (%)
'{:.3%}'.format(df1['score'].isnull().sum(axis=0) / df1.shape[0]), '\t',  # original score missing rate (%)
'{:.2%}'.format(df2['score'].isnull().sum(axis=0) / df2.shape[0]), '\t',  # score missing rate after the switch (%)
'{:.3%}'.format(abs(df1['score'].isnull().sum(axis=0) / df1.shape[0] - df2['score'].isnull().sum(axis=0) / df2.shape[0])), '\t',  # diff_score (%)
'{:.3%}'.format((df1['score'].isnull().sum(axis=0) / df1.shape[0]) * 0.75 + (df2['score'].isnull().sum(axis=0) / df2.shape[0]) * 0.25), '\t',  # overall score missing rate (%)
psi, '\t',  # N PSI before vs after the switch

abs(df1['n30_hj_score'].skew() - df2['t.v_af_score'].skew()), '\t',  # N skewness difference before vs after the switch
abs(df1['n30_hj_score'].skew() - 0.030766), '\t', abs(df2['t.v_af_score'].skew() - 0.030766),  # skewness difference vs the offline N, before and after the switch
psi2, '\t',  # score PSI before vs after the switch
file=open('log_jryk.txt', 'a'))


from datetime import datetime, date, timedelta
yesterday = (date.today() + timedelta(days=-1)).strftime("%Y-%m-%d")    # yesterday's date
print(yesterday, '\t', df1.shape[0], '\t', df2.shape[0], '\t',  # original samples, samples after the switch
'{:.3%}'.format(df2.shape[0]/(df1.shape[0]+df2.shape[0])), '\t',  # share of samples after the switch
'{:.3%}'.format(df1['n21_score'].isnull().sum(axis=0) / df1.shape[0]), '\t',  # original N missing rate (%)
'{:.3%}'.format(df2['t.v_af_score'].isnull().sum(axis=0) / df2.shape[0]), '\t',  # N missing rate after the switch (%)
'{:.3%}'.format(abs(df1['n21_score'].isnull().sum(axis=0) / df1.shape[0] - df2['t.v_af_score'].isnull().sum(axis=0) / df2.shape[0])), '\t',  # diff_n (%)
'{:.3%}'.format(df1['score'].isnull().sum(axis=0) / df1.shape[0]), '\t',  # original score missing rate (%)
'{:.2%}'.format(df2['t.y_score'].isnull().sum(axis=0) / df2.shape[0]), '\t',  # score missing rate after the switch (%)
'{:.3%}'.format(abs(df1['score'].isnull().sum(axis=0) / df1.shape[0] - df2['t.y_score'].isnull().sum(axis=0) / df2.shape[0])), '\t',  # diff_score (%)
'{:.3%}'.format((df1['score'].isnull().sum(axis=0) / df1.shape[0]) * 0.92 + (df2['t.y_score'].isnull().sum(axis=0) / df2.shape[0]) * 0.08), '\t',  # overall score missing rate (%)
psi, '\t',  # N PSI before vs after the switch
abs(df1['n21_score'].skew() - df2['t.v_af_score'].skew()), '\t',  # N skewness difference before vs after the switch
abs(df1['n21_score'].skew() + 0.211146), '\t', abs(df2['t.v_af_score'].skew() + 0.211146),  # skewness difference vs the offline N, before and after the switch
psi2, '\t',  # score PSI before vs after the switch
file=open('log_wjwl.txt', 'a'))


import logging  # logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO, filename='xf.log', filemode='w')
logging.info('data loading...')  # written to xf.log automatically

def concat_data():
    df1 = pd.read_csv('360_key_1w.txt', sep='\t')
    df1['flag'] = 1
    df1.drop_duplicates(subset=['name','idcard','phone','loan_dt'], inplace=True)
    df = df.merge(df1, how='left', on=['name', 'idcard', 'phone', 'loan_dt'])  # df: the full key set, loaded beforehand (not shown in these notes)
    print(df)
    df = df[df['flag'].isnull()]
    df[['name', 'idcard', 'phone', 'loan_dt']].to_csv('360_key_9w.txt', sep='\t', index=False, encoding='utf-8')
    print(df)