Reinforcement Learning Practice 5 || Tabular Q-learning

The Q-value update rule:

Q(s,a) = (1-α) * old_Q(s,a) + α * (r + γ * maxQ(s_,a))
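
As a quick worked example (illustrative numbers, not taken from a real run): with α = 0.2, γ = 0.9, old_Q(s,a) = 0.5, r = 1 and maxQ(s_,a) = 0.8, the update gives Q(s,a) = 0.8 × 0.5 + 0.2 × (1 + 0.9 × 0.8) = 0.4 + 0.344 = 0.744.
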
import gym
import collections
from tensorboardX import SummaryWriter

ENV_NAME = "FrozenLake8x8-v1"
GAMMA = 0.9
ALPHA = 0.2
TEST_EPISODES = 20

class Agent:
    def __init__(self):
        self.env = gym.make(ENV_NAME)   # create the environment
        self.state = self.env.reset()   # set the initial state
        self.values = collections.defaultdict(float)    # Q table; unseen (state, action) keys default to 0.0

    # Sample a random action, play a single step, and return (s, a, r, s_)
    def sample_env(self):
        action = self.env.action_space.sample()
        old_state = self.state
        new_state, reward, is_done, _ = self.env.step(action)
        self.state = self.env.reset() if is_done else new_state
        return old_state, action, reward, new_state

    # Given a state s, scan the Q table for maxQ(s,a); return that value and its action
    def best_value_and_action(self, state):
        best_value, best_action = None, None
        for action in range(self.env.action_space.n):
            action_value = self.values[(state, action)]
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_value, best_action

    # Update Q(s,a) = (1-α) * old_Q(s,a) + α * (r + γ * maxQ(s_,a))
    def value_update(self, s, a, r, next_s):    # given one transition (s,a,r,s_), update only Q(s,a)
        best_v, _ = self.best_value_and_action(next_s)
        new_v = r + GAMMA * best_v
        old_v = self.values[(s, a)]
        self.values[(s, a)] = old_v * (1-ALPHA) + new_v * ALPHA

    # Play one episode greedily with maxQ(s,a) and return its total reward
    def play_episode(self, env):
        total_reward = 0.0
        state = env.reset()
        while True:
            _, action = self.best_value_and_action(state)
            new_state, reward, is_done, _ = env.step(action)
            total_reward += reward
            if is_done:
                break
            state = new_state
        return total_reward

# After training, play one episode and render it
def playgame(agent):
    env = gym.make(ENV_NAME)
    s = env.reset()
    env.render()
    while True:
        _,action = agent.best_value_and_action(s)
        s_,r,is_done,_ = env.step(action)
        env.render()
        if is_done:
            break
        s = s_
        
if __name__ == "__main__":
    test_env = gym.make(ENV_NAME)
    agent = Agent()
    
    writer = SummaryWriter(comment="table-q-learning")

    iter_no = 0
    best_reward = 0.0
    while True:
        iter_no += 1
        
        s, a, r, next_s = agent.sample_env()    # sample one random-action step
        agent.value_update(s, a, r, next_s)     # update Q(s,a) from the transition (s,a,r,s_)

        reward = 0.0
        for _ in range(TEST_EPISODES):  # play 20 test episodes to evaluate the Q table
            reward += agent.play_episode(test_env)
            
        # average reward over the 20 test episodes
        reward /= TEST_EPISODES
        
        writer.add_scalar("reward", reward, iter_no)
        
        if reward > best_reward:
            print("Best reward updated %.3f -> %.3f" % (
                best_reward, reward))
            best_reward = reward
        if reward > 0.90:
            print("Solved in %d iterations!" % iter_no)
            break
            
    writer.close()

    playgame(agent)
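
After training, the learned table itself is easy to inspect. A minimal sketch (not part of the original program; it assumes the trained agent from the run above is still in scope) that prints the greedy action and value for every state:

# Sketch: dump the greedy policy extracted from the learned Q table.
for state in range(agent.env.observation_space.n):
    value, action = agent.best_value_and_action(state)
    print("state %2d -> action %d (Q = %.3f)" % (state, action, value))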

Drawback:

Each training iteration performs only a single Q-value update but then plays 20 full test episodes, so most of the time goes into evaluation rather than learning, and convergence is slow.

Reaching an average reward of 0.85 took 56 s.

[Figure: reward curve of the baseline version]
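
The 56 s figure is wall-clock training time. A minimal way to reproduce such measurements (a sketch under assumed usage, not part of the original code) is a small timer around the training loop:

import time
from contextlib import contextmanager

@contextmanager
def stopwatch(label):
    # Sketch: print how long the wrapped block took, for comparing versions.
    start = time.perf_counter()
    yield
    print("%s: %.1f s" % (label, time.perf_counter() - start))

# hypothetical usage: with stopwatch("table Q-learning"): <run the training loop above>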

Improvement 1:
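
Instead of sampling a single transition per iteration, the new updataQ() method plays out a full episode with random actions and applies the Bellman update at every step, so each training iteration now performs many Q-value updates instead of one.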

# remove the sample_env() function and add an updataQ() function
import gym
import collections
from tensorboardX import SummaryWriter

ENV_NAME = "FrozenLake-v1"
GAMMA = 0.9
ALPHA = 0.2
TEST_EPISODES = 20

class Agent:
    def __init__(self):
        self.env = gym.make(ENV_NAME)   # create the environment
        self.state = self.env.reset()   # set the initial state
        self.values = collections.defaultdict(float)    # Q table; unseen (state, action) keys default to 0.0

    # Play one full episode with random actions, updating Q(s,a) for every step's (s,a,r,s_)
    def updataQ(self):
        while True:
            action = self.env.action_space.sample()
            old_state = self.state
            new_state, reward, is_done, _ = self.env.step(action)
            self.value_update(old_state, action, reward, new_state)
            if is_done:
                self.state = self.env.reset()
                break
            self.state = new_state

    # Given a state s, scan the Q table for maxQ(s,a); return that value and its action
    def best_value_and_action(self, state):
        best_value, best_action = None, None
        for action in range(self.env.action_space.n):
            action_value = self.values[(state, action)]
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        return best_value, best_action

    # Update Q(s,a) = (1-α) * old_Q(s,a) + α * (r + γ * maxQ(s_,a))
    def value_update(self, s, a, r, next_s):    # given one transition (s,a,r,s_), update only Q(s,a)
        best_v, _ = self.best_value_and_action(next_s)
        new_v = r + GAMMA * best_v
        old_v = self.values[(s, a)]
        self.values[(s, a)] = old_v * (1-ALPHA) + new_v * ALPHA

    # Play one episode greedily with maxQ(s,a) and return its total reward
    def play_episode(self, env):
        total_reward = 0.0
        state = env.reset()
        while True:
            _, action = self.best_value_and_action(state)
            new_state, reward, is_done, _ = env.step(action)
            total_reward += reward
            if is_done:
                break
            state = new_state
        return total_reward

# After training, play one episode and render it
def playgame(agent: Agent):
    env = gym.make(ENV_NAME)
    s = env.reset()
    env.render()
    while True:
        _,action = agent.best_value_and_action(s)
        s_,r,is_done,_ = env.step(action)
        env.render()
        if is_done:
            break
        s = s_

if __name__ == "__main__":
    test_env = gym.make(ENV_NAME)
    agent = Agent()
    writer = SummaryWriter(comment="enhance-table-q-learning")

    iter_no = 0
    best_reward = 0.0
    while True:
        iter_no += 1

        agent.updataQ()

        reward = 0.0
        for _ in range(TEST_EPISODES):  # play 20 test episodes to evaluate the Q table
            reward += agent.play_episode(test_env)

        # average reward over the 20 test episodes
        reward /= TEST_EPISODES

        writer.add_scalar("reward", reward, iter_no)

        if reward > best_reward:
            print("Best reward updated %.3f -> %.3f" % (
                best_reward, reward))
            best_reward = reward
        if reward > 0.80:
            print("Solved in %d iterations!" % iter_no)
            break

    writer.close()

    playgame(agent)

[Figure: reward curve of Improvement 1]

The improvement boosts performance significantly! (Note, though, that this version also switches ENV_NAME from FrozenLake8x8-v1 to the smaller 4x4 FrozenLake-v1, so part of the speedup comes from the smaller state space.)

One problem remains, though: the improved code still updates Q values with purely random actions, so many redundant Q(s,a) updates are computed. We can make one more improvement and have the agent update Q values while it plays the test episodes with the greedy Q policy.

Only the code below changes; everything else stays the same as in Improvement 1.

# Play one episode greedily with maxQ(s,a), update Q(s,a) at every step,
# and return the episode's total reward
def play_episode(self, env):
    total_reward = 0.0
    state = env.reset()
    while True:
        _, action = self.best_value_and_action(state)
        new_state, reward, is_done, _ = env.step(action)
        # improvement: update the Q table while playing
        self.value_update(state, action, reward, new_state)
        total_reward += reward
        if is_done:
            break
        state = new_state
    return total_reward
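
Note that with this change the 20 "test" episodes also train the table, so the reported average reward is measured while the policy is still improving; the random-action updataQ() episodes keep providing exploration, while the greedy test episodes concentrate updates on the states the current policy actually visits.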

Result:

[Figure: reward curve of Improvement 2]

Compared with the first two versions, the gain is even more pronounced.

Even with the average-reward threshold raised to 0.95, training time stays short.

[Figure: result with the 0.95 average-reward threshold]
