To make the experiments easier to run, I did some preprocessing on the container_meta and container_usage tables so that the simulation engine I have in mind is easier to write.
First, the container_usage table: the goal is to find the maximum and minimum values of net_in and net_out over roughly the first billion rows (the file is read in chunks of one million rows, stopping after about 1,000 chunks).
import pandas as pd

def __get_net__():
    columns = ['container_id', 'machine_id', 'time_stamp', 'cpu_util_percent',
               'mem_util_percent', 'cpi', 'mem_gps', 'mpki', 'net_in', 'net_out',
               'disk_io_percent']
    container_usage = pd.read_csv('D:\\0 云计算与大数据\\总结性文件\\数据集\\2018年阿里巴巴集群数据集\\container_usage.csv',
                                  names=columns, chunksize=1000000)
    max_in = -1
    min_in = 101
    max_out = -1
    min_out = 101
    i = 0
    for df in container_usage:
        temp_in = df[['net_in']]
        temp_out = df[['net_out']]
        # incoming traffic
        temp_min_in = temp_in.min()['net_in']
        temp_max_in = temp_in.max()['net_in']
        if max_in < temp_max_in:
            max_in = temp_max_in
        if min_in > temp_min_in:
            min_in = temp_min_in
        # outgoing traffic
        temp_min_out = temp_out.min()['net_out']
        temp_max_out = temp_out.max()['net_out']
        if max_out < temp_max_out:
            max_out = temp_max_out
        if min_out > temp_min_out:
            min_out = temp_min_out
        i += 1
        if i > 1000:
            break
    print(min_in)
    print(max_in)
    print(min_out)
    print(max_out)
The results obtained are:
0.0
21.49
0.0
26.43
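Since the point of getting these bounds is to normalize the trace later, here is a minimal sketch of how they could be applied to a usage chunk. The helper name and the idea of min-max scaling are my own assumptions about how the simulation engine will consume the values; the bounds are the ones printed above.
def __normalize_net__(df):
    # Hypothetical helper (my own addition): min-max scale net_in / net_out
    # into [0, 1] using the bounds found by __get_net__ above.
    net_in_min, net_in_max = 0.0, 21.49
    net_out_min, net_out_max = 0.0, 26.43
    df = df.copy()
    df['net_in'] = (df['net_in'] - net_in_min) / (net_in_max - net_in_min)
    df['net_out'] = (df['net_out'] - net_out_min) / (net_out_max - net_out_min)
    return df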
Next, some processing of the container_meta table. The first step is to handle duplicate data: rows with a repeated container_id are dropped, which keeps the data set tidy and makes it clear which containers belong to the same container group (app_du).
def __get_container_mate_new__():
    columns = ['container_id', 'machine_id', 'time_stamp', 'app_du', 'status',
               'cpu_request', 'cpu_limit', 'mem_size']
    container_mate = pd.read_csv("D:\\experiment\\container_meta\\container_meta.csv", names=columns)
    # keep only the first record of every container_id
    container_mate_new = container_mate.drop_duplicates(subset='container_id', inplace=False, keep='first')
    container_mate_new.to_csv("D:\\experiment\\container_meta\\container_meta_new.csv", index=False, header=False)
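As a quick check of how much the de-duplication actually removes, something like the following can be run on the original file first; this helper is my own addition, not part of the original workflow.
def __check_duplicates__():
    # my own addition: count rows that share a container_id with an earlier row
    columns = ['container_id', 'machine_id', 'time_stamp', 'app_du', 'status',
               'cpu_request', 'cpu_limit', 'mem_size']
    container_mate = pd.read_csv("D:\\experiment\\container_meta\\container_meta.csv", names=columns)
    print(container_mate['container_id'].duplicated().sum())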
This produces a brand-new file, container_meta_new.csv. Then, as in the first pass, we check how many containers belong to the same group:
def __get_container_num__():
    columns = ['container_id', 'machine_id', 'time_stamp', 'app_du', 'status',
               'cpu_request', 'cpu_limit', 'mem_size']
    container_mate = pd.read_csv("D:\\experiment\\container_meta\\container_meta_new.csv", names=columns)
    # number of containers in each app_du group
    group_size = container_mate.groupby(['app_du'])['container_id'].count()
    print(group_size.max())
    print(group_size.min())
The results are as follows:
629
1
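The max and min alone hide the shape of the distribution; if you also want to know how many groups there are of each size, a small addition like the one below works. It is my own helper, reading the same container_meta_new.csv with the same columns as above.
def __get_group_size_distribution__():
    # my own addition: number of app_du groups for each group size
    columns = ['container_id', 'machine_id', 'time_stamp', 'app_du', 'status',
               'cpu_request', 'cpu_limit', 'mem_size']
    container_mate = pd.read_csv("D:\\experiment\\container_meta\\container_meta_new.csv", names=columns)
    group_size = container_mate.groupby('app_du')['container_id'].count()
    print(group_size.value_counts().sort_index())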
Finally, normalize the table (cpu_request and cpu_limit are divided by 100, i.e. converted from percentages of a core into core counts) and sort the rows by the numeric part of app_du:
def __get_container_mate_new_group__():
    columns = ['container_id', 'machine_id', 'time_stamp', 'app_du', 'status',
               'cpu_request', 'cpu_limit', 'mem_size']
    container_mate = pd.read_csv("D:\\experiment\\container_meta\\container_meta_new.csv", names=columns)
    # strip the 'app_' prefix, sort numerically, then put the prefix back
    container_mate['app_du'] = container_mate['app_du'].map(lambda x: str(x)[4:])
    container_mate['app_du'] = container_mate['app_du'].astype(int)
    container_mate.sort_values(by='app_du', inplace=True, ascending=True)
    container_mate['app_du'] = 'app_' + container_mate['app_du'].astype(str)
    # cpu_request / cpu_limit are stored as percentages of a core (100 = 1 core)
    container_mate['cpu_request'] = container_mate['cpu_request'] / 100
    container_mate['cpu_limit'] = container_mate['cpu_limit'] / 100
    container_mate.to_csv("D:\\experiment\\container_meta\\container_meta_new.csv", index=False, header=False)
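To make sure the numeric sort and the division by 100 actually took effect, a quick check such as the one below can be run on the rewritten file; it is my own sanity check, not part of the original steps.
def __check_group_sort__():
    # my own sanity check: app_du should now be sorted numerically and
    # cpu_request should look like a number of cores rather than a percentage
    columns = ['container_id', 'machine_id', 'time_stamp', 'app_du', 'status',
               'cpu_request', 'cpu_limit', 'mem_size']
    container_mate = pd.read_csv("D:\\experiment\\container_meta\\container_meta_new.csv", names=columns)
    nums = container_mate['app_du'].map(lambda x: int(str(x)[4:]))
    print(nums.is_monotonic_increasing)
    print(container_mate['cpu_request'].max())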
Two more operations on top of this: drop the groups that contain only a single container, and save the modified table as container_meta_reduce.csv.
def __get_container_reduce_one__():
    columns = ['container_id', 'machine_id', 'time_stamp', 'app_du', 'status',
               'cpu_request', 'cpu_limit', 'mem_size']
    container_mate = pd.read_csv("D:\\experiment\\container_meta\\container_meta_new.csv", names=columns)
    # keep only rows whose app_du group contains more than one container
    group_size = container_mate.groupby('app_du')['container_id'].transform('count')
    container_mate = container_mate[group_size > 1]
    container_mate.to_csv("D:\\experiment\\container_meta\\container_meta_reduce.csv", index=False, header=False)
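A quick way to confirm the reduction worked is to check that the smallest remaining group has at least two containers; again, this is just my own check on top of the original steps.
def __check_reduce__():
    # my own check: the minimum group size in container_meta_reduce.csv should be >= 2
    columns = ['container_id', 'machine_id', 'time_stamp', 'app_du', 'status',
               'cpu_request', 'cpu_limit', 'mem_size']
    container_mate = pd.read_csv("D:\\experiment\\container_meta\\container_meta_reduce.csv", names=columns)
    print(container_mate.groupby('app_du')['container_id'].count().min())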
Finally, since these experiments are about container scheduling, drop the original machine_id column, set every time stamp to 0 (this step is up to you), and renumber the container IDs in order:
def __update__():
    columns = ['container_id', 'machine_id', 'time_stamp', 'app_du', 'status',
               'cpu_request', 'cpu_limit', 'mem_size']
    container_mate = pd.read_csv("D:\\experiment\\container_meta\\container_meta_reduce.csv", names=columns)
    container_mate['time_stamp'] = 0
    # the original placement is not needed for the scheduling experiments
    container_mate.drop(columns=['machine_id'], inplace=True)
    # renumber the container IDs as c_1, c_2, ... in row order
    container_mate['container_id'] = ['c_' + str(j) for j in range(1, len(container_mate) + 1)]
    container_mate.to_csv("D:\\experiment\\container_meta\\container_meta_update.csv", index=False, header=False)
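Note that the rewritten file now has only seven columns (machine_id is gone), so it has to be read with a shorter column list. The small preview helper below is my own addition for checking the result.
def __check_update__():
    # my own addition: preview container_meta_update.csv and confirm the new IDs are unique
    columns = ['container_id', 'time_stamp', 'app_du', 'status',
               'cpu_request', 'cpu_limit', 'mem_size']
    container_mate = pd.read_csv("D:\\experiment\\container_meta\\container_meta_update.csv", names=columns)
    print(container_mate.head())
    print(container_mate['container_id'].is_unique)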
Lastly, one more function to check how many distinct app_du values there are:
def __get_num__():
    columns = ['container_id', 'time_stamp', 'app_du', 'status',
               'cpu_request', 'cpu_limit', 'mem_size']
    container_mate = pd.read_csv("D:\\experiment\\container_meta\\app_3.csv", names=columns)
    print(container_mate['app_du'].nunique())
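For reference, this is the order I assume the steps above are meant to run in; each meta-processing function reads the file the previous one wrote. __get_net__ is a very long scan over container_usage, and __get_num__ reads app_3.csv, which is not produced by these steps, so both are left commented out here.
if __name__ == '__main__':
    # __get_net__()                      # long scan over container_usage for net_in / net_out bounds
    __get_container_mate_new__()         # container_meta.csv -> container_meta_new.csv (drop duplicates)
    __get_container_num__()              # report max / min group size
    __get_container_mate_new_group__()   # sort by app_du, convert cpu_request / cpu_limit to cores
    __get_container_reduce_one__()       # drop single-container groups -> container_meta_reduce.csv
    __update__()                         # drop machine_id, zero time_stamp, renumber IDs -> container_meta_update.csv
    # __get_num__()                      # counts distinct app_du in app_3.csv (separate file)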