10. Autonomous Driving Simulator: Deep Reinforcement Learning

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Create the environment.
# CarRacing-v0 has a continuous Box(3,) action space (steering, gas, brake),
# so we discretize it into a small fixed set of actions that a DQN can index.
env = gym.make('CarRacing-v0')

ACTIONS = np.array([
    [-1.0, 0.0, 0.0],  # steer left
    [ 1.0, 0.0, 0.0],  # steer right
    [ 0.0, 1.0, 0.0],  # accelerate
    [ 0.0, 0.0, 0.8],  # brake
    [ 0.0, 0.0, 0.0],  # coast
], dtype=np.float32)
num_actions = len(ACTIONS)

# Define the Q-network: a convolutional encoder over the 96x96 RGB observation,
# with one linear output per discrete action (its estimated Q-value)
def create_q_model():
    inputs = layers.Input(shape=(96, 96, 3))
    layer1 = layers.Conv2D(32, (8, 8), strides=4, activation='relu')(inputs)
    layer2 = layers.Conv2D(64, (4, 4), strides=2, activation='relu')(layer1)
    layer3 = layers.Conv2D(64, (3, 3), strides=1, activation='relu')(layer2)
    layer4 = layers.Flatten()(layer3)
    layer5 = layers.Dense(512, activation='relu')(layer4)
    action = layers.Dense(num_actions, activation='linear')(layer5)
    return tf.keras.Model(inputs=inputs, outputs=action)

# Online network (trained every step) and target network (synced periodically for stable targets)
model = create_q_model()
target_model = create_q_model()
target_model.set_weights(model.get_weights())

optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
loss_function = tf.keras.losses.Huber()

# Training hyperparameters
gamma = 0.99                   # discount factor
epsilon = 1.0                  # current exploration rate (epsilon-greedy)
epsilon_min = 0.1              # final exploration rate
epsilon_max = 1.0              # initial exploration rate
epsilon_interval = (epsilon_max - epsilon_min)
batch_size = 64                # replay minibatch size
max_memory_length = 100000     # replay buffer capacity
update_target_network = 10000  # sync the target network every N frames
memory = []                    # replay buffer

# Train the model
frame_count = 0  # global step counter, used to decide when to sync the target network
for episode in range(1000):
    state = np.array(env.reset())
    episode_reward = 0

    for timestep in range(1, 10000):
        frame_count += 1
        # Epsilon-greedy action selection
        if epsilon > np.random.rand(1)[0]:
            action = np.random.choice(num_actions)
        else:
            q_values_current = model(np.expand_dims(state, axis=0).astype(np.float32) / 255.0, training=False)
            action = np.argmax(q_values_current[0])
        # Map the discrete action index to the continuous (steering, gas, brake) controls
        next_state, reward, done, _ = env.step(ACTIONS[action])
        memory.append((state, action, reward, next_state, done))
        if len(memory) > max_memory_length:
            del memory[0]
        episode_reward += reward

        if done:
            break
        # Every 4 frames, train on a random minibatch from the replay buffer
        if timestep % 4 == 0 and len(memory) > batch_size:
            indices = np.random.choice(range(len(memory)), size=batch_size)
            # Normalize pixel observations to [0, 1] before feeding the networks
            state_sample = np.array([memory[i][0] for i in indices], dtype=np.float32) / 255.0
            action_sample = np.array([memory[i][1] for i in indices])
            reward_sample = np.array([memory[i][2] for i in indices])
            next_state_sample = np.array([memory[i][3] for i in indices], dtype=np.float32) / 255.0
            done_sample = np.array([memory[i][4] for i in indices], dtype=np.float32)
            # Bellman target: bootstrap from the target network, but not past terminal states
            future_rewards = target_model.predict(next_state_sample, verbose=0)
            updated_q_values = reward_sample + gamma * np.max(future_rewards, axis=1) * (1.0 - done_sample)
            masks = tf.one_hot(action_sample, num_actions)
            with tf.GradientTape() as tape:
                q_values = model(state_sample)
                # Q-value of the action actually taken in each sampled transition
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
        # Periodically copy the online weights into the target network
        if frame_count % update_target_network == 0:
            target_model.set_weights(model.get_weights())
        state = next_state

    if epsilon > epsilon_min:
        epsilon -= epsilon_interval / 1000

    print(f'Episode: {episode}, Reward: {episode_reward}, Epsilon: {epsilon:.2f}')

    if episode % 10 == 0:
        model.save(f'car_racing_model_{episode}.h5')
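
Once training has produced a few checkpoints, a saved model can be loaded and rolled out greedily (no epsilon exploration) to watch the learned policy drive. The snippet below is a minimal evaluation sketch under the same assumptions as the training code above (the ACTIONS table and pixel normalization); the checkpoint name car_racing_model_990.h5 is only an example, adjust it to whichever checkpoint you want to inspect.

# Minimal evaluation sketch: greedy rollout of a saved checkpoint
eval_model = tf.keras.models.load_model('car_racing_model_990.h5')  # example checkpoint name

state = np.array(env.reset())
total_reward = 0.0
done = False
while not done:
    env.render()  # visualize the agent driving
    q_values = eval_model(np.expand_dims(state, axis=0).astype(np.float32) / 255.0, training=False)
    action = np.argmax(q_values[0])  # always pick the highest-value action
    state, reward, done, _ = env.step(ACTIONS[action])
    total_reward += reward
print(f'Evaluation reward: {total_reward:.1f}')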