Old song_tools (plotting, data processing, machine-learning data sampling)

"""
    这是宋建国自己的简易工具包

    导入jupyter时
    from imp import reload   更改文件时重新导入函数
    import song_tools.song_tools     导入py文件
    reload(song_tools.song_tools)    重新导入py文件
    from song_tools.song_tools import *    只能通过导入函数的形式
"""
"""  运行库导入   """
import seaborn as sns
import pandas as pd
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE  # imbalanced-data handling (oversampling)
from sklearn import preprocessing

"""   测试函数  """
def ceshi():
    print("调用成功")



"""   图像化展示部分(SHOW)   """
# 混淆矩阵绘制
# 方法一 利用热力图绘制混淆矩阵
def SHOW_cm_hot(l_predict, l_test):
    """
    :param l_predict: predicted labels
    :param l_test: true labels of the test data
    :return: None (shows the plot)
    """
    # crosstab already returns a DataFrame whose index/columns explain the rows and columns
    cm = pd.crosstab(l_predict, l_test)
    # Draw the heatmap
    sns.heatmap(cm, annot=True, cmap='GnBu')
    # Axis labels: columns are the true labels, rows the predicted ones
    plt.xlabel('Real Label')
    plt.ylabel('Predicted Label')
    # Show the figure
    plt.show()
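
# Usage sketch (assumed names: clf is any fitted sklearn classifier,
# f_test / l_test are the held-out features and labels):
"""
l_predict = pd.Series(clf.predict(f_test))
SHOW_cm_hot(l_predict, pd.Series(l_test))
"""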


# Method 2: confusion matrix shown as per-class proportions
def SHOW_cm_proba(labels, y_true, y_pred):
    """
    :param labels: list of class labels
    :param y_true: true labels
    :param y_pred: predicted labels
    :return: None (shows the plot)
    """
    tick_marks = np.array(range(len(labels))) + 0.5

    def plot_confusion_matrix(cm, title='Confusion Matrix', cmap=plt.cm.binary):
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        xlocations = np.array(range(len(labels)))
        plt.xticks(xlocations, labels, rotation=90)
        plt.yticks(xlocations, labels)
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

    cm = confusion_matrix(y_true, y_pred)
    np.set_printoptions(precision=2)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print(cm_normalized)
    plt.figure(figsize=(8, 4), dpi=120)

    ind_array = np.arange(len(labels))
    x, y = np.meshgrid(ind_array, ind_array)

    for x_val, y_val in zip(x.flatten(), y.flatten()):
        c = cm_normalized[y_val][x_val]
        if c > 0.01:
            plt.text(x_val, y_val, "%0.2f" % (c,), color='red', fontsize=7, va='center', ha='center')
    # offset the tick
    plt.gca().set_xticks(tick_marks, minor=True)
    plt.gca().set_yticks(tick_marks, minor=True)
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.grid(True, which='minor', linestyle='-')
    plt.gcf().subplots_adjust(bottom=0.15)

    plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
    # show confusion matrix
    # plt.savefig('../Data/confusion_matrix.png', format='png')
    plt.show()
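
# Usage sketch (assumption: y_true / y_pred are label arrays from a fitted model,
# and the label list covers every class that occurs in them):
"""
labels = sorted(set(y_true))
SHOW_cm_proba(labels, y_true, y_pred)
"""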


# Plot the ROC curve and annotate it with the AUC value
def SHOW_ROC(test_label_true, test_label_predict):
    """
    :param test_label_true: true labels
    :param test_label_predict: predicted scores (e.g. the positive-class
        probability); hard 0/1 predictions give a degenerate curve
    :return: None (shows the plot)
    """
    # Compute the x (FPR) and y (TPR) coordinates of the ROC curve
    fpr, tpr, _ = metrics.roc_curve(test_label_true, test_label_predict)
    # Draw the ROC curve
    plt.plot(fpr, tpr, linestyle='solid', color='red')
    # Shade the area under the curve
    # plt.stackplot(fpr, tpr, color='steelblue')
    # Draw the diagonal reference line
    plt.plot([0, 1], [0, 1], linestyle='dashed', color='black')
    # Annotate the plot with the AUC value
    plt.text(0.6, 0.4, 'AUC=%.3f' % metrics.auc(fpr, tpr), fontdict=dict(size=18))

    plt.show()
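
# Usage sketch for a binary classifier (assumed name: clf exposes predict_proba;
# pass the positive-class probability, not the hard 0/1 prediction, or the
# curve collapses to a single corner point):
"""
scores = clf.predict_proba(f_test)[:, 1]
SHOW_ROC(l_test, scores)
"""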

def SHOW_tree_imp(tree, label):
    """
    Sort a tree model's feature importances, plot them, and return the sorted list.

    :param tree: fitted tree-based model exposing feature_importances_
    :param label: feature names, in the same order as the training columns
    :return: list of (feature, importance) tuples, most important first
    """
    res_label = list(label)
    res_imp = list(tree.feature_importances_)
    res = dict(zip(res_label, res_imp))

    # Sort by importance, descending
    res_sort = sorted(res.items(), key=lambda x: x[1], reverse=True)

    imp_series = pd.Series(tree.feature_importances_, index=label)
    imp_series.sort_values(ascending=True).plot(kind='barh')
    plt.show()
    return res_sort
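
# Usage sketch (assumption: a random forest trained on a feature DataFrame f_train):
"""
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100).fit(f_train, l_train)
sorted_imp = SHOW_tree_imp(rf, f_train.columns)
"""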

def SHOW_xiangguan(df):
    """
    :param df: raw DataFrame
    :return: None (shows a correlation-coefficient heatmap)
    """
    dfData = df.corr()
    plt.subplots(figsize=(9, 9))  # figure size
    sns.heatmap(dfData, annot=True, vmax=1, square=True, cmap="Blues")
    plt.show()

def SHOW_leida(datas, labels, maxnum, title):
    """
    :param datas: numeric values, one per axis
    :param labels: labels around the circle
    :param maxnum: maximum radial tick to display (axis runs 0..maxnum)
    :param title: figure title
    :return: None
    """
    if len(datas) != len(labels):
        print("Data and label lists differ in length!")
        return
    # Labels
    labels = np.array(labels)
    # Number of axes
    dataLenth = len(labels)
    # Data
    data = np.array(datas)
    # ======== end of user settings ============

    angles = np.linspace(0, 2 * np.pi, dataLenth, endpoint=False)
    data = np.concatenate((data, [data[0]]))  # close the polygon
    angles = np.concatenate((angles, [angles[0]]))  # close the polygon

    fig = plt.figure()
    ax = fig.add_subplot(111, polar=True)  # note the polar flag
    ax.plot(angles, data, 'bo-', linewidth=2)  # outline
    ax.fill(angles, data, facecolor='r', alpha=0.25)  # fill
    # drop the duplicated closing angle so the angle and label counts match
    ax.set_thetagrids(angles[:-1] * 180 / np.pi, labels, fontproperties="SimHei")
    ax.set_title(title, va='bottom', fontproperties="SimHei")
    ax.set_rlim(0, maxnum)
    ax.grid(True)
    plt.show()
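
# Usage sketch with made-up scores on a 0-100 scale:
"""
SHOW_leida([80, 65, 90, 70, 85],
           ['attack', 'defense', 'speed', 'stamina', 'skill'],
           maxnum=100, title='player profile')
"""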

# Show a feature-similarity heatmap and list variable pairs whose correlation exceeds a threshold
def SHOW_xiangsi(data, threshold):
    """
    :param data: numeric DataFrame
    :param threshold: absolute-correlation cut-off in [0, 1)
    :return: None (shows the heatmap and prints the highly correlated pairs)
    """
    # Show the similarity heatmap (absolute correlations)
    cols = data.columns
    data_corr = data.corr().abs()
    plt.subplots(figsize=(13, 9))
    sns.heatmap(data_corr, annot=True)
    # Overlay that blanks out everything except the all-1 diagonal
    sns.heatmap(data_corr, mask=data_corr < 1, cbar=False)
    plt.show()

    # Pick out variable pairs above the threshold, to guide data cleaning
    corr_list = []
    size = len(data.columns)
    # Search for the highly correlated pairs
    for i in range(0, size):
        for j in range(i + 1, size):  # upper triangle only, to avoid repetition
            # data_corr holds absolute values, so a single comparison suffices
            if threshold <= data_corr.iloc[i, j] < 1:
                corr_list.append([data_corr.iloc[i, j], i, j])  # store correlation and column indices

    # Sort to show the strongest pairs first
    s_corr_list = sorted(corr_list, key=lambda x: -abs(x[0]))

    # Print correlations and column names
    for v, i, j in s_corr_list:
        print("%s and %s = %.2f" % (cols[i], cols[j], v))

"""   数据处理部分(DATA)   """
#对于正负样本超不平衡的数据进行处理(过采样)
def DATA_unbalance(f_train, l_train, random_state=None):
    over_samples = SMOTE(random_state=random_state)
    over_samples_F, over_samples_L = over_samples.fit_sample(f_train, l_train)
    # over_samples_X, over_samples_y = over_samples.fit_sample(X_train.values,y_train.values.ravel())
    # 重抽样前的类别比例
    print(l_train.value_counts() / len(l_train))
    # 重抽样后的类别比例
    print(pd.Series(over_samples_L).value_counts() / len(over_samples_L))
    print(l_train.value_counts(), pd.Series(over_samples_L).value_counts())

    return over_samples_F, over_samples_L
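
# Usage sketch (assumption: f_train is a DataFrame and l_train a Series,
# since the function calls l_train.value_counts()):
"""
f_res, l_res = DATA_unbalance(f_train, l_train, random_state=42)
"""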

# Count the number of factor levels per variable (distinct values per column)
def DATA_yinzi(df):
    """
    :param df: DataFrame
    :return: Series of distinct-value counts, indexed by column name
    """
    colname_list = df.columns
    colnum_list = []
    for name in colname_list:
        colnum_list.append(len(df[name].unique()))
    # Assemble into a Series
    yinzi = pd.Series(colnum_list, index=colname_list)
    print(yinzi)
    return yinzi

# Integer-encode categorical DataFrame columns (label encoding, not separate one-hot columns)
# Works on a subset of the features or on all of them
def DATA_yabianliang(df, feature, flag=False):
    """
    :param df: full DataFrame
    :param feature: columns to encode, e.g. list(df.columns) or any subset
    :param flag: deep-copy first (True) or encode df in place (False)
    :return: DataFrame with the chosen columns replaced by integer category codes
    """
    if flag:
        from copy import deepcopy
        datas = deepcopy(df)
    else:
        datas = df
    for f in feature:
        datas[f] = pd.Categorical(datas[f]).codes
    return datas
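
# Usage sketch (assumed columns: df has string-typed categorical columns
# 'sex' and 'city'; flag=True keeps the original df untouched):
"""
encoded = DATA_yabianliang(df, ['sex', 'city'], flag=True)
"""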




"""   机器学习模型部分(ML)   """
# 指定机器学习模型的进行多折交叉验证
def ML_CV(model, feature, label, cv_num=3, shuffle=False, random_state=None):
    kf = StratifiedKFold(3, True, 10)  # n_splits='warn', shuffle=False, random_state=None
    model = model
    queue1 = []
    for train, test in kf.split(feature, label):
        print('Train: %s | test: %s' % (train, test))
        ff_train = feature.iloc[train]
        ll_train = label.iloc[train]
        ff_test = feature.iloc[test]
        ll_test = label.iloc[test]
        model.fit(ff_train, ll_train)
        predic_proba = model.score(ff_test, ll_test)
        print("准确率为", predic_proba)
        queue1.append(predic_proba)
    predic_proba = sum(queue1) / len(queue1)
    print("平均准确率为:", predic_proba)



"""   备用代码部分   """
# Decorator that reports a function's execution time
"""
import time
def set_func(func):
    print("decorating")
    def call_func(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        d_time = end_time - start_time
        print("function ran in:", d_time)
        return result
    return call_func
"""

# Keep the original structure after standardization
# (i.e. preserve the DataFrame/Series format)
"""
col_name = feature.columns
data = preprocessing.scale(feature)  # assumption: scale() stands in for whatever standardizer produced `data`
feature = pd.DataFrame(data, columns=col_name)

label = pd.Categorical(label).codes
label = pd.Series(label)
"""

# Stratified sampling
# (data -- original DataFrame, Class -- column to stratify on, with a dict of per-class sampling fractions)
"""
gbr = data.groupby("Class")
# print(gbr.count())
gbr.groups  # show the group info
typicalFracDict = {
    0: 0.2,  # fraction of that class to sample; only equal-sized classes give a true 2:8 split,
    1: 0.8   # otherwise the fractions must be adjusted by hand
}

def typicalSampling(group, typicalFracDict):
    name = group.name
    frac = typicalFracDict[name]
    return group.sample(frac=frac)

result = data.groupby(
    'Class', group_keys=False  # avoid adding the group key as an extra index level
).apply(typicalSampling, typicalFracDict)

result['Class'].count()  # total rows after stratified sampling
result.groupby('Class').count()  # 2:8 ratio (when the class sizes are equal)
"""

"""
import os
import shutil

tag = ".JPG"
copy_dir = "./copy/"
targetnames = os.listdir()
print(targetnames)

file_list = []
for name in targetnames:
    if tag == name[-4:]:
        file_list.append(name)

if file_list != []:
    #生成文件夹
    if not os.path.exists(copy_dir):
        os.makedirs(copy_dir)
    for name in file_list:
                print(name)
                if tag == name[-4:]:
                    shutil.copyfile(name, './copy/' + name)
else:
    print("没有匹配的后缀文件")

"""