Fitting CPU utilization over time from the container_usage table of the Alibaba cluster trace dataset


  • First, pick out of the container_usage table the records of containers running on machine m_3044; at the same time, the earlier code for finding the maximum and minimum of net_in and net_out is folded into the same pass:

    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt


    def __start__():
        columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
                   'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in', 'net_out',
                   'disk_io_percent']
        # Read the full trace in chunks so it never has to fit in memory at once.
        container_usage = pd.read_csv("D:\\experiment\\container_usage.csv",
                                      names=columns, chunksize=1000000)
        # net_in / net_out are percentages, so -1 and 101 are safe sentinels.
        max_in = -1
        min_in = 101
        max_out = -1
        min_out = 101

        for df in container_usage:
            # Incoming traffic: fold each chunk's extremes into the global ones.
            max_in = max(max_in, df['net_in'].max())
            min_in = min(min_in, df['net_in'].min())
            # Outgoing traffic.
            max_out = max(max_out, df['net_out'].max())
            min_out = min(min_out, df['net_out'].min())
            # Append the rows for machine m_3044 to a separate file.
            # (str.contains also matches IDs that merely contain the substring;
            # an exact test df['machine_id'] == 'm_3044' would be stricter.)
            df_1 = df.loc[df['machine_id'].str.contains('m_3044', na=False), :]
            df_1.to_csv("D:\\experiment\\container_usage_1.csv",
                        index=False, header=False, mode='a')

        print(min_in)
        print(max_in)
        print(min_out)
        print(max_out)
    

    Next, drop the rows that contain any null values:

    def __drop_na__():
        columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
                   'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in', 'net_out',
                   'disk_io_percent']
        container_usage = pd.read_csv("D:\\experiment\\container_usage_1.csv", names=columns)
        # Drop every row that has at least one missing value.
        container_usage.dropna(axis=0, how='any', inplace=True)
        container_usage.to_csv("D:\\experiment\\container_usage_2.csv",
                               index=False, header=False, mode='a')
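
    If container_usage_1.csv were ever too large to load at once, the same cleaning works chunk by chunk, mirroring the chunked read in __start__ (a minimal sketch):

    def __drop_na_chunked__():
        columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
                   'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in', 'net_out',
                   'disk_io_percent']
        for df in pd.read_csv("D:\\experiment\\container_usage_1.csv",
                              names=columns, chunksize=1000000):
            # Rows are independent, so dropna per chunk gives the same result.
            df.dropna(axis=0, how='any', inplace=True)
            df.to_csv("D:\\experiment\\container_usage_2.csv",
                      index=False, header=False, mode='a')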
    
    

    Then run Pearson, Spearman, and Kendall correlation analyses on the data:

    def __correlation_analysis__():
        columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
                   'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in', 'net_out',
                   'disk_io_percent']
        pd.set_option('display.max_columns', None)
        container_usage = pd.read_csv("D:\\experiment\\container_usage_2.csv", names=columns)
        # Correlate only the numeric columns; the two ID columns carry no order.
        numeric = container_usage.drop(columns=['container_id', 'machine_id'])
        correlation_pearson = numeric.corr(method='pearson')
        correlation_spearman = numeric.corr(method='spearman')
        correlation_kendall = numeric.corr(method='kendall')
        # Keep the index so each matrix keeps its row labels.
        correlation_pearson.to_csv("D:\\experiment\\correlation_analysis_pearson.csv")
        correlation_spearman.to_csv("D:\\experiment\\correlation_analysis_spearman.csv")
        correlation_kendall.to_csv("D:\\experiment\\correlation_analysis_kendall.csv")
    
  • Spearman correlation matrix (figure)

  • Pearson correlation matrix (figure)

  • Kendall correlation matrix (figure)
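
To see which features track CPU utilization most closely, the matrices can be read back and ranked (a minimal sketch, assuming the files were written with their row labels as above):

    pearson = pd.read_csv("D:\\experiment\\correlation_analysis_pearson.csv", index_col=0)
    # Strongest correlates of CPU utilization first (self-correlation dropped).
    print(pearson['cpu_util_percent'].drop('cpu_util_percent')
          .abs().sort_values(ascending=False))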

Next, pick out container c_10997 from the m_3044 data; a groupby would also work here (see the sketch after this block):

def __cpu_time__():
    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in', 'net_out',
               'disk_io_percent']
    pd.set_option('display.max_columns', None)
    container_usage = pd.read_csv("D:\\experiment\\container_usage_1.csv",
                                  names=columns, chunksize=100000)
    i = 1
    for df in container_usage:
        # Append the rows for container c_10997; na=False treats missing IDs
        # as non-matches instead of propagating NaN into the mask.
        df_1 = df.loc[df['container_id'].str.contains('c_10997', na=False), :]
        df_1.to_csv("D:\\experiment\\container_c_10997.csv",
                    index=False, header=False, mode='a')
        # Simple progress counter, one tick per chunk.
        print(i)
        i += 1
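
The groupby route mentioned above, as a minimal sketch (the per-container output naming is illustrative): it splits the m_3044 file into one file per container in a single pass instead of re-scanning the chunks for each ID.

def __split_by_container__():
    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in', 'net_out',
               'disk_io_percent']
    usage = pd.read_csv("D:\\experiment\\container_usage_1.csv", names=columns)
    # One output file per container, e.g. container_c_10997.csv.
    for cid, group in usage.groupby('container_id'):
        group.to_csv(f"D:\\experiment\\container_{cid}.csv",
                     index=False, header=False)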

Plot CPU utilization over time as a scatter plot:

def __cpu_plot__():
    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in', 'net_out',
               'disk_io_percent']
    pd.set_option('display.max_columns', None)
    container_usage = pd.read_csv("D:\\experiment\\container_c_10997.csv", names=columns)
    # Sort by timestamp so the x axis runs in time order.
    container_usage.sort_values(by='time_stamp', inplace=True, ascending=True)
    # Tiny markers, since there are many overlapping points.
    container_usage.plot.scatter(x='time_stamp', y='cpu_util_percent', s=0.01)
    plt.xticks(rotation=90)
    plt.show()
    # Persist the time-sorted data for the curve fitting below.
    container_usage.to_csv("D:\\experiment\\container_c_10997.csv",
                           index=False, header=False)
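
If rendering all the points is slow, a random sample is usually enough to see the shape of the series (a minimal sketch; the sample size is illustrative):

    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in', 'net_out',
               'disk_io_percent']
    df = pd.read_csv("D:\\experiment\\container_c_10997.csv", names=columns)
    # Cap the sample size at the row count so small files still work.
    sampled = df.sample(n=min(10000, len(df)), random_state=0)
    sampled.plot.scatter(x='time_stamp', y='cpu_util_percent', s=0.5)
    plt.show()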

(Figure: scatter of cpu_util_percent over time for container c_10997)

Finally, fit a function to the series:

def __get_function__():
    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in', 'net_out',
               'disk_io_percent']
    pd.set_option('display.max_columns', None)
    container_usage = pd.read_csv("D:\\experiment\\container_c_10997.csv", names=columns)
    time_stamp = container_usage['time_stamp'].values
    # How many samples go into the fit.
    print(time_stamp.size)
    cpu_util_percent = container_usage['cpu_util_percent'].values
    x = time_stamp
    y = cpu_util_percent
    # np.polyfit returns least-squares polynomial coefficients;
    # np.poly1d wraps them in a callable. Try degrees 3, 6 and 9.
    y3 = np.poly1d(np.polyfit(x, y, 3))
    y6 = np.poly1d(np.polyfit(x, y, 6))
    y9 = np.poly1d(np.polyfit(x, y, 9))
    # Raw series plus the three fitted curves.
    plt.plot(x, y)
    plt.plot(x, y3(x))
    plt.plot(x, y6(x))
    plt.plot(x, y9(x))
    plt.show()

The fit is very poor. Part of this is numerical: building a degree-9 polynomial on raw timestamps gives np.polyfit a badly conditioned Vandermonde matrix (it typically emits a RankWarning), so centering and scaling x first usually helps, as sketched below. The rest is that the utilization series is simply too noisy for a low-degree polynomial to track.

(Figure: raw data with the degree-3, 6, and 9 polynomial fits)
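
A minimal sketch of the rescaling remedy (fit_scaled is a hypothetical helper, not part of the original script): standardize the timestamps before fitting, and evaluate the polynomial through the same transform.

def fit_scaled(x, y, deg):
    x = np.asarray(x, dtype=float)
    mean, std = x.mean(), x.std()
    # Fit against standardized x so the Vandermonde matrix is well conditioned.
    poly = np.poly1d(np.polyfit((x - mean) / std, y, deg))
    # Return a callable that accepts raw timestamps.
    return lambda t: poly((np.asarray(t, dtype=float) - mean) / std)

Swapping np.poly1d(np.polyfit(x, y, 9)) for fit_scaled(x, y, 9) leaves the rest of __get_function__ unchanged.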
