参考文献:Python曲线拟合详解 - 知乎 (zhihu.com)
参考文献:【Python】np.polyfit点拟合曲线_mjiansun的专栏-CSDN博客
参考文献:python_从excel表格中读取数据并且做数据拟合_不好好学通信的人工智能爱好者-CSDN博客
参考文献:numpy.polyfit — NumPy v1.21 Manual
参考文献:python 对于任意数据和曲线进行拟合并求出函数表达式的三种方案。_changdejie的专栏-CSDN博客_python 曲线拟合
-
首先在container_usage数据表中找出机器ID为m_3044的容器数据,同时我们将之前寻找net_in和net_out最大值和最小值的代码进行了整合:
def __start__():
    """Scan container_usage.csv in chunks, tracking the global min/max of
    net_in and net_out, and append every row for machine m_3044 to
    container_usage_1.csv. Prints min_in, max_in, min_out, max_out at the end.
    """
    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in',
               'net_out', 'disk_io_percent']
    # chunksize keeps memory bounded: the file is too large to load at once.
    container_usage = pd.read_csv("D:\\experiment\\container_usage.csv",
                                  names=columns, chunksize=1000000)
    # net_in / net_out appear to be percentages, so -1 / 101 are safe
    # sentinels for "no value seen yet".
    max_in = -1
    min_in = 101
    max_out = -1  # FIX: original read `max_out = -1x`, a syntax error
    min_out = 101
    for df in container_usage:
        # inbound traffic extremes for this chunk
        max_in = max(max_in, df['net_in'].max())
        min_in = min(min_in, df['net_in'].min())
        # outbound traffic extremes for this chunk
        max_out = max(max_out, df['net_out'].max())
        min_out = min(min_out, df['net_out'].min())
        # collect rows belonging to machine m_3044; append so chunks accumulate
        df_1 = df.loc[df['machine_id'].str.contains('m_3044'), :]
        df_1.to_csv("D:\\experiment\\container_usage_1.csv",
                    index=False, header=False, mode='a')
    print(min_in)
    print(max_in)
    print(min_out)
    print(max_out)
去除中间存在空值的数据
def __drop_na__():
    """Remove every row containing a NaN from container_usage_1.csv and
    append the cleaned rows to container_usage_2.csv."""
    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in',
               'net_out', 'disk_io_percent']
    cleaned = pd.read_csv("D:\\experiment\\container_usage_1.csv",
                          names=columns)
    # drop a row as soon as any one of its columns is missing
    cleaned.dropna(axis=0, how='any', inplace=True)
    cleaned.to_csv("D:\\experiment\\container_usage_2.csv",
                   index=False, header=False, mode='a')
对数据进行pearson、spearman和kendall相关性分析:
def __correlation_analysis__():
    """Compute pearson, spearman and kendall correlation matrices over the
    cleaned container usage data and write each matrix to its own CSV.

    NOTE(review): the output file names keep the original spelling
    "conrrlation" on purpose — downstream steps read these exact paths.
    """
    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in',
               'net_out', 'disk_io_percent']
    pd.set_option('display.max_columns', None)
    usage = pd.read_csv("D:\\experiment\\container_usage_2.csv",
                        names=columns)
    targets = (
        ('pearson', "D:\\experiment\\conrrlation_analysis_pearson.csv"),
        ('spearman', "D:\\experiment\\conrrlation_analysis_spearman.csv"),
        ('kendall', "D:\\experiment\\conrrlation_analysis_kendall.csv"),
    )
    for method, out_path in targets:
        usage.corr(method=method).to_csv(out_path, index=False)
-
spearman相关系数
-
pearson相关性系数
* kendall相关性分析:
选取其中c_10997,其实这里也可以使用分组…
def __cpu_time__():
    """Extract every row for container c_10997 from container_usage_1.csv,
    processing the file chunk by chunk and appending matches to
    container_c_10997.csv. Prints the 1-based chunk number as progress."""
    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in',
               'net_out', 'disk_io_percent']
    pd.set_option('display.max_columns', None)
    reader = pd.read_csv("D:\\experiment\\container_usage_1.csv",
                         names=columns, chunksize=100000)
    for chunk_no, chunk in enumerate(reader, start=1):
        # na=False: rows whose container_id is NaN simply don't match
        matches = chunk.loc[chunk['container_id'].str.contains('c_10997',
                                                              na=False), :]
        matches.to_csv("D:\\experiment\\container_c_10997.csv",
                       index=False, header=False, mode='a')
        print(chunk_no)
画出cpu随时间变化的散点图
def __cpu_plot__():
    """Scatter-plot cpu_util_percent against time_stamp for container
    c_10997, then write the time-sorted data back over the same CSV
    (without header or index)."""
    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in',
               'net_out', 'disk_io_percent']
    pd.set_option('display.max_columns', None)
    usage = pd.read_csv("D:\\experiment\\container_c_10997.csv",
                        names=columns)
    usage.sort_values(by='time_stamp', inplace=True, ascending=True)
    # tiny marker size: there are many points and they would overlap badly
    usage.plot.scatter(y='cpu_util_percent', x='time_stamp', s=0.01)
    # timestamps are long numbers; rotate the tick labels so they stay legible
    plt.xticks(rotation=90)
    plt.show()
    # persist the sorted order for the curve-fitting step
    usage.to_csv("D:\\experiment\\container_c_10997.csv",
                 index=False, header=False)
最后来拟合函数
def __get_function__():
    """Fit polynomials of degree 3, 6 and 9 to cpu_util_percent as a
    function of time_stamp for container c_10997, then plot the raw series
    alongside each fitted curve. Prints the number of samples first."""
    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in',
               'net_out', 'disk_io_percent']
    pd.set_option('display.max_columns', None)
    usage = pd.read_csv("D:\\experiment\\container_c_10997.csv",
                        names=columns)
    x = usage['time_stamp'].values
    print(x.size)
    y = usage['cpu_util_percent'].values
    # one fitted callable per polynomial degree, in ascending order
    fitted = [poly1d(np.polyfit(x, y, degree)) for degree in (3, 6, 9)]
    plt.plot(x, y)  # the raw data, for visual comparison
    for fit in fitted:
        plt.plot(x, fit(x))
    plt.show()
拟合的效果很差很差