读写csv文件,合并文件
#读取CSV文件
import os
#path1 = os.path.abspath('.')
import numpy as np
import pandas as pd
df = pd.read_csv('C:/Users/Administrator/.spyder-py3/data.csv')
#查看读入DataFrame的头部
printf(df.columns)
#合并多个读入的csv文件
df1 = pd.read_csv('C:/Users/Administrator/.spyder-py3/data1.csv')
df2 = pd.read_csv('C:/Users/Administrator/.spyder-py3/data2.csv')
df= pd.concat([df1,df2],axis=0)
# 写入csv文件
# df.to_csv('C:/Users/Administrator/.spyder-py3/output.csv')
数据清洗
df= df[df['enum_net_type'] == 4 ]
df= df[~(df['province'].isin(['澳门特别行政区']))]
df = df[df['latencey']>0]
df = df[df['latencey']>=0]
df = df[(df['latencey']>0) & (df['jitter']>0)]
#数据去重,按照day,time去重,如果两行数据,day和time都一致,去重保留第一行数据
df= df.drop_duplicates(subset=['day','time',], keep ="first",inplace=False)
#取指定列
df= df[['bytes_self_province','uint32_qos_accelarate_flag','enum_video_type','uint32_avg_video_e2e_delay','kadun', 'uint32_avg_video_decode_bit_rate','uint32_avg_video_score']]
数据处理
groupby并计算
#平均值,最大值,最小值,数量,CDF百分比对应的取值
df = df.groupby(['province','day'],as_index=False).mean()
df = df.groupby(['province','day'],as_index=False).min()
df = df.groupby(['province','day'],as_index=False).max()
df = df.groupby(['province','day'],as_index=False).count()
df = df.groupby(['province','day']).describe(percentiles=[.5,.95, .99]) #自定义CDF百分比
df = df.groupby(['province','day']).describe() #计算得到全部的描述值
groupby后将series转换为dataFrame
#将groupby之后的series变为dataframe
mean = df.groupby(['province','day'],as_index=False).mean()
mean = mean.reset_index()
groupby计算每个series的样本数
df = df.groupby(['province','day'],as_index=False).agg("count")
size = df[['province','day','latency']]
# dataFrame重新定义列名
size = size.rename(columns={'latency':'count'})
merge合并多个series
df = df.groupby(['province','day'],as_index=False).agg("count")
size1 = df[['province','day','latency']]
size1 = size.rename(columns={'latency':'count'})
size2 = size1
size_m = size1
size_m = pd.merge(size_m , size2,on=['province','bytes_self_province','day'],how = "left")