背景:
需要对很多数据进行整理,简单搜了下pandas用法,做个记录
import pandas as pd
#使用pycharm打印dataframe的时候会出现省略输出的情况
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 10)
data = { "counter":"test_counter",
"value":[{"timestamp":1231231231,"values":1.23},{"timestamp":1231231498,"values":2.23},{"timestamp":1231233249,"values":1.23},{"timestamp":1231233249,"values":1.23}],
"end_point":'k8s-test'}
df = pd.DataFrame(data)
print(df)
#把value列的字典分割成两列,列名为key名,行值为字典value值
df[['timestamp', 'values']] = df['value'].apply(pd.Series)
print(df)
换行输出了
#删除value列
df = df.drop(columns=['value'])
print(df)
df1 = pd.DataFrame({ "counter":"test_counter",
"value":[{"timestamp":1231231231,"values":1.23},{"timestamp":1231231498,"values":2.23},{"timestamp":1231233249,"values":1.23},{"timestamp":1231233249,"values":None},{"timestamp":12313433249,"values":3.23}],
"end_point":'k8s-test'})
df2 = pd.DataFrame({ "counter":"test_counter",
"value":[{"timestamp":1231231231,"values":1.24},{"timestamp":1231231498,"values":4.23},{"timestamp":1231233249,"values":3.23},{"timestamp":1231233249,"values":3.23}],
"end_point":'k8s-test'})
# 使用concat()函数合并DataFrame
# df = pd.concat([df1, df2])
df1[['timestamp', 'values']] = df1['value'].apply(pd.Series)
df2[['timestamp', 'values']] = df2['value'].apply(pd.Series)
df1 = df1.drop(columns=['value'])
df2 = df2.drop(columns=['value'])
print('-----'*50,'\n',df1)
# 移除列'A'中包含None值的行
df_filtered = df1.dropna(subset=['values'])
print('-----'*50,'\n',df_filtered)
df1 = df1.groupby(['counter', 'end_point', 'timestamp']).mean().reset_index()
df2 = df2.groupby(['counter', 'end_point', 'timestamp']).mean().reset_index()
#重命名列名
df1_renamed = df1.rename(columns={'values': 'peak'})
df2_renamed = df2.rename(columns={'values': 'avg'})
df_renamed = pd.concat([df1_renamed, df2_renamed])
#默认使用内连接
result = pd.merge(df1_renamed, df2_renamed, on=['counter','end_point','timestamp'])
#指定使用外连接,不存在的column,自动补全为None
result1 = pd.merge(df1_renamed, df2_renamed, on=['counter','end_point','timestamp'],how='outer')