什么是机器学习
Actor-Critic(演员-评论家算法)是一种结合了策略梯度方法(Actor)和值函数方法(Critic)的强化学习算法。演员(Actor)学习策略,决定在给定状态下应该采取什么动作;评论家(Critic)学习值函数,评估当前状态的价值。
以下是一个使用 Python 和 TensorFlow/Keras 实现简单的 Actor-Critic 算法的示例。在这个例子中,我们将使用 OpenAI Gym 的 CartPole 环境。
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam
import gym
# 定义Actor-Critic Agent
class ActorCriticAgent:
    """One-step Actor-Critic agent for environments with discrete actions.

    The actor is a softmax policy network that maps a state to action
    probabilities. The critic is a value network that maps a state to a
    single scalar estimate. Both are updated after every transition using
    the critic's temporal-difference (TD) error.

    Args:
        state_size: Number of features in a flat state vector.
        action_size: Number of discrete actions.
        gamma: Discount factor applied to the bootstrapped next-state value.
        learning_rate: Learning rate used by both Adam optimizers.
    """

    def __init__(self, state_size, action_size, gamma=0.99, learning_rate=0.01):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  # discount factor
        self.learning_rate = learning_rate
        # Policy network (actor).
        self.actor = self.build_actor()
        # Value network (critic).
        self.critic = self.build_critic()

    def build_actor(self):
        """Build and compile the policy network (state -> action probabilities)."""
        state_input = Input(shape=(self.state_size,))
        hidden = Dense(24, activation='relu')(state_input)
        hidden = Dense(24, activation='relu')(hidden)
        output = Dense(self.action_size, activation='softmax')(hidden)
        model = Model(inputs=state_input, outputs=output)
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        model.compile(
            loss='categorical_crossentropy',
            optimizer=Adam(learning_rate=self.learning_rate),
        )
        return model

    def build_critic(self):
        """Build and compile the value network (state -> scalar value)."""
        state_input = Input(shape=(self.state_size,))
        hidden = Dense(24, activation='relu')(state_input)
        hidden = Dense(24, activation='relu')(hidden)
        output = Dense(1, activation='linear')(hidden)
        model = Model(inputs=state_input, outputs=output)
        model.compile(
            loss='mean_squared_error',
            optimizer=Adam(learning_rate=self.learning_rate),
        )
        return model

    def get_action(self, state):
        """Sample an action index from the actor's current policy.

        Args:
            state: Flat state with ``state_size`` elements.

        Returns:
            An integer action index in ``[0, action_size)``.
        """
        state = np.reshape(state, [1, self.state_size])
        probs = np.asarray(
            self.actor.predict(state, verbose=0)[0], dtype=np.float64
        )
        # Renormalize in float64: float32 softmax outputs may miss summing to
        # 1 by more than np.random.choice tolerates.
        probs = probs / probs.sum()
        return np.random.choice(self.action_size, p=probs)

    def train(self, state, action, reward, next_state, done):
        """Update actor and critic from a single transition.

        Args:
            state: State in which ``action`` was taken.
            action: Index of the action that was taken.
            reward: Reward received for the transition.
            next_state: State observed after the transition.
            done: Truthy if the episode ended; disables bootstrapping.
        """
        state = np.reshape(state, [1, self.state_size])
        next_state = np.reshape(next_state, [1, self.state_size])

        value = float(self.critic.predict(state, verbose=0)[0][0])
        next_value = float(self.critic.predict(next_state, verbose=0)[0][0])

        # TD target and TD error; (1 - done) drops the bootstrap term at
        # episode end.
        target = reward + self.gamma * next_value * (1 - done)
        td_error = target - value

        # Actor update. categorical_crossentropy computes
        # -sum(y_true * log(y_pred)), so placing the TD error at the chosen
        # action's index yields the policy-gradient loss
        # -td_error * log(pi(action | state)). The target itself must NOT be
        # passed through log: log(0) on a one-hot vector gives inf/nan.
        actor_target = np.zeros((1, self.action_size), dtype=np.float32)
        actor_target[0, action] = td_error
        self.actor.train_on_batch(state, actor_target)

        # Critic update: regress V(state) toward the TD target.
        critic_target = np.reshape(target, (1, 1)).astype(np.float32)
        self.critic.train_on_batch(state, critic_target)
# Create the environment and size the agent from its observation/action spaces.
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = ActorCriticAgent(state_size, action_size)

# Train the agent online: one actor/critic update per environment step.
num_episodes = 1000
max_steps = 500  # per-episode cap so an episode can never loop forever

try:
    for episode in range(num_episodes):
        # Newer Gym/Gymnasium return (observation, info) from reset();
        # older Gym returns only the observation. Accept both.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        total_reward = 0
        for step in range(max_steps):
            # Call env.render() here to visualize training.
            action = agent.get_action(state)
            # Newer Gym/Gymnasium return a 5-tuple with separate
            # terminated/truncated flags; older Gym returns a 4-tuple with a
            # single done flag. Either ends the episode here.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                next_state, reward, done, _ = step_result
            total_reward += reward
            agent.train(state, action, reward, next_state, done)
            state = next_state
            if done:
                print("Episode: {}, Total Reward: {}".format(episode + 1, total_reward))
                break
finally:
    # Release the environment even if training is interrupted or raises.
    env.close()
在这个例子中,我们定义了一个简单的 Actor-Critic Agent,包括演员(Actor)和评论家(Critic)两个神经网络。Agent 在每个时间步中通过演员网络选择动作,然后利用评论家网络计算出的 TD 误差同时更新演员网络和评论家网络。演员网络输出各个动作的概率,评论家网络输出当前状态的价值。
请注意,Actor-Critic 算法的实现可能因问题的复杂性而有所不同,可能需要更多的技术和调整,如基线(baseline)、归一化奖励、使用更复杂的神经网络结构等。