import numpy as np


def reward_shaping(_obs, _extra_info, act, agent):
fp = agent.preprocess
junction_ids = fp.get_sorted_junction_ids()
j_id_to_act_idx = {j_id: idx for idx, j_id in enumerate(junction_ids)}
rewards = {j_id: {'total': 0.0, 'components': {}} for j_id in junction_ids}
    # Fetch base frame data from the observation
frame_state = _obs["framestate"]
vehicles = frame_state["vehicles"]
phases = frame_state["phases"]
fp.update_traffic_info(_obs, _extra_info)
    # Core metrics
all_junction_waiting = fp.get_all_junction_waiting_time(vehicles)
invalid_lanes = fp.get_invalid_lanes()
global_avg_queue = fp.get_all_avg_queue()
is_global_congested = global_avg_queue > 5.0
    # Scenario state
    weather_map = {0: "clear", 1: "rain", 2: "snow", 3: "fog"}
    weather = fp.get_weather()
    weather_name = weather_map.get(weather, "unknown")
is_peak = fp.is_peak_hour()
    # Weather impact coefficients: scale each reward component by severity
    weather_impact = {
        0: {"delay": 1.0, "waiting": 1.0, "queue": 1.0, "travel": 1.0, "coord": 1.0},      # clear: no adjustment
        1: {"delay": 1.15, "waiting": 1.1, "queue": 1.15, "travel": 0.85, "coord": 1.15},  # rain: mild impact
        2: {"delay": 1.35, "waiting": 1.25, "queue": 1.35, "travel": 0.75, "coord": 1.35}, # snow: moderate impact
        3: {"delay": 1.5, "waiting": 1.4, "queue": 1.5, "travel": 0.65, "coord": 1.5}      # fog/haze: severe impact
    }
weather_factors = weather_impact.get(weather, weather_impact[0])
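    # Example (values from the table above): in rain, a raw delay reward of 0.30
    # becomes 0.30 * 1.15 = 0.345, while a travel reward of 0.20 is damped to
    # 0.20 * 0.85 = 0.17 -- improvements matter more, raw throughput pays less.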
    # Cache 1: baseline metrics, seeded from junction capacity
if not hasattr(agent, 'prev_metrics'):
agent.prev_metrics = {}
for j_id in junction_ids:
capacity = fp.get_junction_capacity(j_id)
agent.prev_metrics[j_id] = {
"avg_delay": 5.0 + capacity * 0.01,
"avg_waiting": 3.0 + capacity * 0.005,
"avg_queue": 2.0 + capacity * 0.002,
"travel_reward": 0.0
}
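            # Example prior: capacity 200 -> avg_delay baseline 5 + 200*0.01 = 7.0 s,
            # so larger junctions start from a proportionally looser baseline.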
    # Cache 2: EMA-smoothed delay
if not hasattr(agent, 'ema_delay'):
agent.ema_delay = {j_id: 0.0 for j_id in junction_ids}
    # Cache 3: previous per-phase congestion level (for start-up detection)
    if not hasattr(agent, 'prev_phase_congestion'):
        agent.prev_phase_congestion = {j_id: 0.0 for j_id in junction_ids}
    # Cache 4: short trend history (last 3 steps, used for streak bonuses)
if not hasattr(agent, 'prev_metrics_trend'):
agent.prev_metrics_trend = {
j_id: {"delay": [], "queue": []} for j_id in junction_ids
}
    alpha = 0.3  # EMA coefficient for the metrics cache
    # Numerically safe sigmoid, rescaled to (-1, 1); clip the exponent itself
    # so np.exp can never overflow (clipping x alone does not guarantee that)
    def sigmoid_scale(x, sensitivity=1.0):
        z = np.clip(-sensitivity * x, -500, 500)
        return 2.0 / (1.0 + np.exp(z)) - 1.0
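    # Identity: sigmoid_scale(x, s) == tanh(s * x / 2); e.g. sigmoid_scale(0) == 0,
    # sigmoid_scale(2, 0.8) == tanh(0.8) ~= 0.664, and it saturates at +/-1.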
    # Per-junction reward computation (core logic: apply weather factors)
for j_id in junction_ids:
junction = fp.junction_dict[j_id]
signal_id = junction["signal"]
capacity = fp.get_junction_capacity(j_id)
region_id = fp.get_region(j_id)
        # 1. Delay reward: weather-adjusted (the streak bonus below is not)
        current_avg_delay = fp.get_junction_avg_delay(j_id)
        agent.ema_delay[j_id] = 0.8 * agent.ema_delay[j_id] + 0.2 * current_avg_delay
        current_avg_delay_smoothed = agent.ema_delay[j_id]
        prev_avg_delay = agent.prev_metrics[j_id]["avg_delay"]
        delay_delta = prev_avg_delay - current_avg_delay_smoothed
        # Relative improvement, normalized by the smoothed delay (consistent with
        # the delta above) and scaled to the 10 s decision window
        delay_change = delay_delta / max(1.0, current_avg_delay_smoothed) * 10
        delay_reward = 0.4 * sigmoid_scale(delay_change, sensitivity=0.8)
        delay_reward *= weather_factors["delay"]  # weather-specific weighting of delay improvements
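        # Worked example: prev 6.0 s, smoothed 5.0 s -> delta 1.0, change 2.0,
        # reward 0.4 * tanh(0.8) ~= 0.27 (clear); in rain ~= 0.27 * 1.15 ~= 0.31.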
        # Streak bonus: delay falling for 3 consecutive steps (not weather-scaled)
        agent.prev_metrics_trend[j_id]["delay"].append(current_avg_delay)
        if len(agent.prev_metrics_trend[j_id]["delay"]) > 3:
            agent.prev_metrics_trend[j_id]["delay"].pop(0)
        delay_trend = agent.prev_metrics_trend[j_id]["delay"]
        if len(delay_trend) == 3 and delay_trend[0] > delay_trend[1] > delay_trend[2]:
            delay_reward += 0.1  # extra positive incentive
rewards[j_id]['components']['delay'] = delay_reward
        # 2. Waiting-time reward: weather-adjusted
        current_avg_waiting = all_junction_waiting.get(j_id, 0.0)
        prev_avg_waiting = agent.prev_metrics[j_id]["avg_waiting"]
        waiting_delta = prev_avg_waiting - current_avg_waiting
        waiting_change = waiting_delta / max(1, capacity) * 5  # normalize by capacity
        waiting_reward = 0.3 * sigmoid_scale(waiting_change, sensitivity=0.8)
        waiting_reward *= weather_factors["waiting"]  # weather-specific weighting of waiting time
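        # Worked example: capacity 100, waiting drop of 4 s -> change 0.2,
        # reward 0.3 * tanh(0.08) ~= 0.024 -- small per step, but it compounds.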
rewards[j_id]['components']['waiting'] = waiting_reward
        # 3. Queue-length reward: improvement term plus a weather-scaled level penalty
        current_avg_queue = fp.get_junction_avg_queue(j_id)
        prev_avg_queue = agent.prev_metrics[j_id]["avg_queue"]
        queue_delta = prev_avg_queue - current_avg_queue
        normalized_queue = current_avg_queue / max(1, capacity)  # normalize by capacity
        queue_delta_normalized = queue_delta / max(1, capacity)
        # Base queue reward
        queue_reward = (
            0.2 * sigmoid_scale(queue_delta_normalized * 5.0, sensitivity=0.8)
            - 0.25 * weather_factors["queue"] * sigmoid_scale(normalized_queue, sensitivity=2.0)
        )
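        # Worked example (clear weather): capacity 100, queue 20, delta 5 ->
        # improvement 0.2 * tanh(0.1) ~= 0.020, level penalty 0.25 * tanh(0.2) ~= 0.049,
        # net ~= -0.03: shrinking a still-long queue earns less than the queue costs.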
        # Streak bonus: queue falling for 3 consecutive steps (not weather-scaled)
        agent.prev_metrics_trend[j_id]["queue"].append(current_avg_queue)
        if len(agent.prev_metrics_trend[j_id]["queue"]) > 3:
            agent.prev_metrics_trend[j_id]["queue"].pop(0)
        queue_trend = agent.prev_metrics_trend[j_id]["queue"]
        if len(queue_trend) == 3 and queue_trend[0] > queue_trend[1] > queue_trend[2]:
            queue_reward += 0.1  # extra positive incentive
rewards[j_id]['components']['queue'] = queue_reward
        # 4. Phase reward: weather-adapted
phase_remaining = fp.get_phase_remaining_time(signal_id)
current_phase = -1
for p in phases:
if p["s_id"] == signal_id:
current_phase = p["phase_id"]
break
        # Resolve the lanes served by the chosen phase
        j_idx = j_id_to_act_idx[j_id]
        current_act = act[j_idx] if len(act) > j_idx else -1
        phase_lanes = fp.get_phase_lanes(signal_id, current_act)  # newer preprocessor interface
        if not phase_lanes:
            # Fallback: this phase has no mapped lanes, use all entry lanes instead
            if hasattr(agent, 'logger'):
                agent.logger.warning(f"Junction {j_id}: phase {current_act} has no mapped lanes, falling back to entry lanes")
            phase_lanes = junction.get("cached_enter_lanes", [])
        # Base phase reward (worse weather flattens it, avoiding over-penalization)
        phase_lane_queue = sum(len(fp.lane_volume.get(lane, [])) for lane in phase_lanes)
        enter_lanes = junction.get("cached_enter_lanes", [])
        total_lane_queue = sum(len(fp.lane_volume.get(lane, [])) for lane in enter_lanes)
        demand_ratio = phase_lane_queue / (total_lane_queue + 1e-5) if total_lane_queue > 0 else 0.5
        phase_reward = 0.15 * (demand_ratio + 0.1) * max(0, 1 - phase_remaining / 5)
        phase_reward *= 1.0 / weather_factors["delay"]  # damp phase-reward swings in bad weather
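        # Worked example: demand_ratio 0.6, 2 s of phase remaining ->
        # 0.15 * 0.7 * (1 - 2/5) = 0.063 (clear); in rain / 1.15 ~= 0.055.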
        # Start-up incentive: a previously congested phase has cleared (weather-independent)
        if hasattr(fp, 'lane_congestion'):
            phase_congestion = [fp.lane_congestion.get(lane, 0) for lane in phase_lanes]
            avg_phase_congestion = np.mean(phase_congestion) if phase_congestion else 0.0
            prev_phase_congestion = agent.prev_phase_congestion[j_id]
            if avg_phase_congestion < 0.5 and prev_phase_congestion >= 0.5:
                phase_reward += 0.05  # extra incentive for getting vehicles moving
            agent.prev_phase_congestion[j_id] = avg_phase_congestion
rewards[j_id]['components']['phase'] = phase_reward
        # 5. Travel reward: weather-adjusted (damped in bad weather)
        travel_reward = calculate_travel_reward(j_id, fp, vehicles, invalid_lanes)
        travel_reward *= weather_factors["travel"]  # weather-specific weighting of throughput
        rewards[j_id]['components']['travel'] = 0.15 * travel_reward
        # 6. Regional coordination: weather-adjusted (bad weather strengthens coordination)
        region_avg_queue = fp.get_region_avg_queue(region_id)
        queue_deviation = abs(current_avg_queue - region_avg_queue)
        region_congestion = region_avg_queue / max(1, fp.get_region_capacity(region_id))
        # Worse weather -> larger coordination factor (stronger penalty/bonus)
        coordination_factor = (1.0 + 2.0 * min(1.0, region_congestion)) * weather_factors["coord"]
        # Base coordination penalty: larger deviation from the region mean costs more
        coordination_penalty = -0.1 * coordination_factor * sigmoid_scale(queue_deviation, sensitivity=0.5)
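        # Worked example (clear): region_congestion 0.5 -> factor 2.0; deviation 3
        # gives -0.1 * 2.0 * tanh(0.75) ~= -0.13, a meaningful pull toward the mean.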
        # Regional phase-synchronization bonus: closer phase ids, smaller penalty
if region_id != -1 and current_phase != -1 and hasattr(fp, 'region_dict'):
region_junctions = fp.region_dict.get(region_id, [])
if len(region_junctions) > 1:
phase_diffs = []
for j_id_near in region_junctions:
if j_id_near == j_id:
continue
signal_near = fp.junction_dict[j_id_near]["signal"]
phase_near = -1
for p in phases:
if p["s_id"] == signal_near:
phase_near = p["phase_id"]
break
if phase_near != -1:
phase_diffs.append(abs(current_phase - phase_near))
if phase_diffs:
avg_phase_diff = np.mean(phase_diffs)
coordination_penalty += 0.05 * sigmoid_scale(-avg_phase_diff, sensitivity=0.1)
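        # Sync term example: avg phase diff 4 -> 0.05 * tanh(-0.2) ~= -0.0099;
        # perfectly aligned phases (diff 0) contribute exactly 0.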
rewards[j_id]['components']['coordination'] = coordination_penalty
        # 7. Dynamic weights and total reward (normalized for stability)
        # Peak vs off-peak weight profiles
weights = {
'delay': 0.25, 'waiting': 0.20, 'queue': 0.15,
'phase': 0.15, 'travel': 0.15, 'coordination': 0.10
} if is_peak else {
'delay': 0.20, 'waiting': 0.15, 'queue': 0.15,
'phase': 0.15, 'travel': 0.20, 'coordination': 0.15
}
        # Force weight normalization (guard against component imbalance)
weight_sum = sum(weights.values())
for key in weights:
weights[key] /= weight_sum
        # Dynamic baseline: encourage exploration early, converge later
        base_reward = 0.2
        if hasattr(agent, 'train_step'):
            decay_step = min(agent.train_step, 1000)
            base_reward = 0.2 - 0.15 * (decay_step / 1000)  # decays linearly to 0.05 by step 1000
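        # e.g. step 0 -> 0.20, step 500 -> 0.125, step >= 1000 -> 0.05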
        # Total reward, clipped to [-1, 1] to prevent gradient explosion
total_reward = base_reward + sum(weights[k] * rewards[j_id]['components'][k] for k in weights)
rewards[j_id]['total'] = np.clip(total_reward, -1.0, 1.0)
        # 8. Update history cache (EMA smoothing of metrics)
agent.prev_metrics[j_id] = {
"avg_delay": alpha * current_avg_delay + (1 - alpha) * prev_avg_delay,
"avg_waiting": alpha * current_avg_waiting + (1 - alpha) * prev_avg_waiting,
"avg_queue": alpha * current_avg_queue + (1 - alpha) * prev_avg_queue,
"travel_reward": alpha * travel_reward + (1 - alpha) * agent.prev_metrics[j_id]["travel_reward"]
}
    # Periodic logging
if hasattr(agent, 'train_step') and agent.train_step % 100 == 0 and hasattr(agent, 'logger'):
for j_id in junction_ids:
comp = rewards[j_id]['components']
region_id = fp.get_region(j_id)
region_congestion = fp.get_region_avg_queue(region_id) / max(1, fp.get_region_capacity(region_id))
agent.logger.info(
f"Step {agent.train_step} | Junc {j_id} (Region {region_id}) - "
f"Delay: {comp['delay']:.2f}, Wait: {comp['waiting']:.2f}, "
f"Queue: {comp['queue']:.2f}, Phase: {comp['phase']:.2f}, "
f"Travel: {comp['travel']:.2f}, Coord: {comp['coordination']:.2f} | "
f"Congestion: {region_congestion:.2f}, Peak: {is_peak}, Weather: {weather_name} "
f"(Factors: D:{weather_factors['delay']}, W:{weather_factors['waiting']}, Q:{weather_factors['queue']}) | "
f"Total: {rewards[j_id]['total']:.2f}"
)
    # Return per-junction total rewards as a tuple
return tuple(rewards[j_id]['total'] for j_id in junction_ids)
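
# NOTE: `on_enter_lane` and `in_junction` are referenced by
# `calculate_travel_reward` below but are defined elsewhere in the project.
# The placeholder sketches here exist only so this file runs standalone: the
# "lane" field mirrors its use below, while the "in_junction" field name is a
# hypothetical assumption -- swap in the real helpers from your codebase.
def on_enter_lane(vehicle, invalid_lanes):
    # Assumed semantics: the vehicle reports a usable lane id that is not
    # flagged invalid (hypothetical check, adapt to the real schema).
    lane = vehicle.get("lane", -1)
    return lane != -1 and lane not in invalid_lanes


def in_junction(vehicle):
    # Assumed semantics: frame data exposes a flag while the vehicle is
    # inside the junction box (hypothetical field name).
    return bool(vehicle.get("in_junction", False))
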
def calculate_travel_reward(junction_id, fp, vehicles, invalid_lanes, distance_scale=100.0):
    total_distance = 0.0
    valid_count = 0
    completed_count = 0
    base_entry_reward = 5.0  # base reward for a completed passage
    rewarded_completed = set()  # vehicles already rewarded for completing
    vehicle_dict = {v["v_id"]: v for v in vehicles}
    vehicle_type_weight = {1: 1.0, 2: 1.5, 3: 1.2, 4: 0.8, 5: 0.5}  # type weights (prioritize large vehicles)
    current_v_ids = {v["v_id"] for v in vehicles}
    completed_vehicles = fp.junction_metrics[junction_id]["completed_vehicles"]
    # Iterate over stored travel distances and accumulate reward
for v_id, distance in fp.vehicle_distance_store.items():
if v_id not in current_v_ids or v_id not in vehicle_dict:
continue
vehicle = vehicle_dict[v_id]
        # 1. Moving vehicles (on entry lanes or inside the junction), excluding
        #    invalid lanes and vehicles in an abnormal state
        if (vehicle["target_junction"] == junction_id
                and (on_enter_lane(vehicle, invalid_lanes) or in_junction(vehicle))
                and v_id not in completed_vehicles):
            if vehicle["lane"] in invalid_lanes or fp.vehicle_status.get(v_id, 0) != 0:
                continue
            # Type weighting (large vehicles weigh more, encouraging their priority)
            v_type = fp.vehicle_configs[vehicle["v_config_id"]]["v_type"]
            weight = vehicle_type_weight.get(v_type, 1.0)
            total_distance += (distance / distance_scale) * weight  # normalize distance to keep magnitudes small
            valid_count += 1
        # 2. Completed vehicles (strict dedup; cap the count so this term cannot dominate)
        if v_id in completed_vehicles and v_id not in rewarded_completed and completed_count < 5:
            total_distance += base_entry_reward * 2  # double reward for clearing the junction quickly
            completed_count += 1
            rewarded_completed.add(v_id)  # mark as rewarded to avoid duplicates
    # Final travel reward, squashed into [-1, 1] with tanh for training stability
    total_count = valid_count + completed_count
    if total_count > 0:
        avg_distance = total_distance / total_count
        count_bonus = min(0.3, total_count * 0.01)  # volume bonus, capped at 0.3
        return np.tanh(avg_distance + count_bonus)
    else:
        return 0.1
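
# Minimal smoke test, assuming the placeholder helpers above. `StubPreprocess`
# is NOT the real preprocessor; it is a hypothetical stand-in carrying only the
# attributes `calculate_travel_reward` reads.
if __name__ == "__main__":
    class StubPreprocess:
        vehicle_distance_store = {"v1": 120.0, "v2": 40.0}
        vehicle_status = {"v1": 0, "v2": 0}
        vehicle_configs = {0: {"v_type": 1}}
        junction_metrics = {"J1": {"completed_vehicles": {"v2"}}}

    stub_vehicles = [
        {"v_id": "v1", "target_junction": "J1", "lane": 3, "v_config_id": 0},
        {"v_id": "v2", "target_junction": "J1", "lane": 4, "v_config_id": 0},
    ]
    # v1 contributes weighted distance, v2 the capped completion bonus;
    # expect a value close to 1.0 after the tanh squash.
    print(calculate_travel_reward("J1", StubPreprocess(), stub_vehicles, invalid_lanes=set()))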