import pandas as pd
from datetime import timedelta
# Raw sample records, written compactly as (userid, area_name, ctime) tuples
# and expanded below into the list-of-dicts form used to build the frame.
_sample_rows = [
    (1, 'AreaA', '2024-07-01 12:00:00'),
    (1, 'AreaA', '2024-07-01 12:02:30'),
    (1, 'AreaA', '2024-07-01 12:03:00'),
    (1, 'AreaA', '2024-07-01 12:09:00'),
    (1, 'AreaA', '2024-07-01 12:12:30'),
    (1, 'AreaB', '2024-07-01 12:30:00'),
    (2, 'AreaC', '2024-07-01 13:00:00'),
    (2, 'AreaC', '2024-07-01 13:01:00'),
    (2, 'AreaC', '2024-07-01 13:05:00'),
]
data = [
    {'userid': uid, 'area_name': area, 'ctime': pd.Timestamp(stamp)}
    for uid, area, stamp in _sample_rows
]

# Build the DataFrame; one row per record, columns in dict-key order.
df = pd.DataFrame(data)

# Normalise ctime to a datetime column (a no-op for values that are already
# Timestamps, but keeps the script working if the records carry strings).
df['ctime'] = pd.to_datetime(df['ctime'])
# Check whether two records are at most 3 minutes apart.
def is_within_3_minutes(row1, row2):
    """Return True when the two records' ``ctime`` values are within 3 minutes.

    Args:
        row1: Record supporting ``row1['ctime']`` (e.g. a dict or a pandas
            Series) whose value is a datetime-like object.
        row2: Record of the same shape as ``row1``.

    Returns:
        True if the absolute difference between the two ``ctime`` values is
        less than or equal to 3 minutes, otherwise False.
    """
    # Use the absolute difference: with a signed difference, a row2 that is
    # far *earlier* than row1 yields a negative delta, which always compares
    # <= 3 minutes and would wrongly count as "within 3 minutes".
    return abs(row2['ctime'] - row1['ctime']) <= timedelta(minutes=3)
# Merge consecutive records that belong to the same user and the same area
# and whose timestamps are close together (as decided by is_within_3_minutes).
per_user_frames = []

# Process each user independently.
for _user_id, group in df.groupby('userid'):
    # Records kept for this user; the last entry is the merge candidate.
    merged_records = []

    # Sort chronologically so "previous record" really is the earlier one;
    # a stable sort keeps the input order for identical timestamps.
    ordered_rows = group.sort_values('ctime', kind='stable').to_dict('records')

    for row in ordered_rows:
        last = merged_records[-1] if merged_records else None
        if (
            last is not None
            and last['area_name'] == row['area_name']
            and is_within_3_minutes(last, row)
        ):
            # Fold the row into the previous record: move that record's time
            # to the midpoint between its current (possibly already averaged)
            # time and the new row's time. This is a pairwise running
            # midpoint, not the arithmetic mean of every merged timestamp.
            last['ctime'] += (row['ctime'] - last['ctime']) / 2
        else:
            # First record for the user, a different area, or too far apart:
            # start a new output record.
            merged_records.append(row)

    per_user_frames.append(pd.DataFrame(merged_records))

# Concatenate once at the end instead of growing a DataFrame inside the loop,
# and renumber the rows so the result has a unique 0..n-1 index.
if per_user_frames:
    result_df = pd.concat(per_user_frames, ignore_index=True)
else:
    # No input rows at all: keep the original behaviour of an empty frame.
    result_df = pd.DataFrame()

# Show the result.
print(result_df)
# Title: merging records with nearby timestamps (合并邻近时间的数据)
# Page footer: latest recommended article published on 2024-07-10 20:35:14