import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from collections import deque
# Create the environment. CarRacing-v0 has a continuous Box(3,) action
# space (steering, gas, brake), so env.action_space.n does not exist;
# a value-based agent needs a fixed set of discrete candidate actions
# to index into (one reasonable discretization -- values can be tuned).
env = gym.make('CarRacing-v0')
DISCRETE_ACTIONS = np.array([
    [-1.0, 0.0, 0.0],  # steer left
    [ 1.0, 0.0, 0.0],  # steer right
    [ 0.0, 1.0, 0.0],  # accelerate
    [ 0.0, 0.0, 0.8],  # brake
    [ 0.0, 0.0, 0.0],  # no-op
], dtype=np.float32)
num_actions = len(DISCRETE_ACTIONS)
# Define the Q-network: three convolutional layers over the 96x96 RGB
# frame, followed by a dense head with one linear Q-value per action.
def create_q_model():
    inputs = layers.Input(shape=(96, 96, 3))
    layer1 = layers.Conv2D(32, (8, 8), strides=4, activation='relu')(inputs)
    layer2 = layers.Conv2D(64, (4, 4), strides=2, activation='relu')(layer1)
    layer3 = layers.Conv2D(64, (3, 3), strides=1, activation='relu')(layer2)
    layer4 = layers.Flatten()(layer3)
    layer5 = layers.Dense(512, activation='relu')(layer4)
    action = layers.Dense(num_actions, activation='linear')(layer5)
    return tf.keras.Model(inputs=inputs, outputs=action)
model = create_q_model()
target_model = create_q_model()
target_model.set_weights(model.get_weights())
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)
loss_function = tf.keras.losses.Huber()
# Training hyperparameters.
gamma = 0.99                   # discount factor
epsilon = 1.0                  # current exploration rate
epsilon_min = 0.1
epsilon_max = 1.0
epsilon_interval = epsilon_max - epsilon_min
batch_size = 64
max_memory_length = 100000
update_target_network = 10000  # sync the target network every N env steps
memory = deque(maxlen=max_memory_length)  # replay buffer, evicts oldest entries
frame_count = 0                # global step counter across episodes
# Training loop.
for episode in range(1000):
    state = np.array(env.reset())
    episode_reward = 0
    for timestep in range(1, 10000):
        frame_count += 1
        # Epsilon-greedy action selection.
        if epsilon > np.random.rand():
            action = np.random.randint(num_actions)
        else:
            action_probs = model(np.expand_dims(state.astype(np.float32), axis=0), training=False)
            action = int(np.argmax(action_probs[0]))
        # Map the discrete index back to a continuous control vector.
        next_state, reward, done, _ = env.step(DISCRETE_ACTIONS[action])
        next_state = np.array(next_state)
        memory.append((state, action, reward, next_state, done))
        episode_reward += reward
        state = next_state
        # Train on a random minibatch every 4th environment step.
        if frame_count % 4 == 0 and len(memory) > batch_size:
            indices = np.random.choice(len(memory), size=batch_size)
            state_sample = np.array([memory[i][0] for i in indices], dtype=np.float32)
            action_sample = np.array([memory[i][1] for i in indices])
            reward_sample = np.array([memory[i][2] for i in indices])
            next_state_sample = np.array([memory[i][3] for i in indices], dtype=np.float32)
            done_sample = np.array([float(memory[i][4]) for i in indices])
            # Bellman targets from the target network; terminal transitions
            # contribute only their immediate reward.
            future_rewards = target_model(next_state_sample, training=False).numpy()
            updated_q_values = reward_sample + gamma * np.max(future_rewards, axis=1) * (1.0 - done_sample)
            masks = tf.one_hot(action_sample, num_actions)
            with tf.GradientTape() as tape:
                q_values = model(state_sample)
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)
            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))
        # Sync the target network on the global step counter; the per-episode
        # timestep never reaches update_target_network, so keying on it would
        # leave the target network permanently stale.
        if frame_count % update_target_network == 0:
            target_model.set_weights(model.get_weights())
        if done:
            break
    # Linear epsilon annealing, one decrement per episode.
    if epsilon > epsilon_min:
        epsilon -= epsilon_interval / 1000
    print(f'Episode: {episode}, Reward: {episode_reward}, Epsilon: {epsilon:.2f}')
    if episode % 10 == 0:
        model.save(f'car_racing_model_{episode}.h5')
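To check a saved checkpoint qualitatively, here is a minimal evaluation sketch: it loads one of the .h5 files written by the training loop (the episode number in the file name is illustrative) and runs a single greedy episode using the same DISCRETE_ACTIONS mapping defined above.

# Evaluation sketch: load a checkpoint and act greedily (no exploration).
# The checkpoint name is illustrative; use any file saved by the loop above.
eval_model = tf.keras.models.load_model('car_racing_model_990.h5')
state = np.array(env.reset())
total_reward, done = 0.0, False
while not done:
    q_values = eval_model(np.expand_dims(state.astype(np.float32), axis=0), training=False)
    state, reward, done, _ = env.step(DISCRETE_ACTIONS[int(np.argmax(q_values[0]))])
    state = np.array(state)
    total_reward += reward
    env.render()
print(f'Evaluation reward: {total_reward:.1f}')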