Playing the Gym Frozen Lake Game with Q-Learning


Dependencies

pip install numpy
pip install pandas
pip install gym
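
Note: the code below uses the new-style Gym step/reset API, where reset() returns (observation, info) and step() returns a 5-tuple (observation, reward, terminated, truncated, info); this requires gym >= 0.26 (gymnasium also works). A quick sanity check, as a minimal sketch assuming FrozenLake-v1 is registered:

import gym

env = gym.make('FrozenLake-v1', is_slippery=False)
obs, info = env.reset()                            # new API: (observation, info)
step_result = env.step(env.action_space.sample())
print(len(step_result))                            # 5: obs, reward, terminated, truncated, info
env.close()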

Run screenshot

Code

import os
import time
from itertools import count
import numpy as np
import pandas as pd
import gym                      # requires gym >= 0.26 for the (obs, info) reset / 5-tuple step API
# import gymnasium as gym       # gymnasium is a drop-in replacement here
 
NUM_EPISODES       = 10        # number of training episodes
MAX_EPSILON        = 1         # initial exploration rate
MIN_EPSILON        = 0.05      # exploration rate floor
EPSILON_DECAY_RATE = 0.005     # exponential decay rate of epsilon per episode

class QLearning(object):
    def __init__(self, env) -> None:
        self.epsilon  = MAX_EPSILON
        self.alpha    = 0.5            # learning rate
        self.gamma    = 0.95           # discount factor
        self.episodes = NUM_EPISODES
        self.env = env
        self.q_table_csv = './q_table_{}x{}.csv'.format(self.env.observation_space.n, self.env.action_space.n)

        self.q_table = pd.DataFrame(
            np.zeros((self.env.observation_space.n, self.env.action_space.n)),
            index=range(self.env.observation_space.n),
            columns=range(self.env.action_space.n)
        )
 
        print('q_table\n', self.q_table)
 
    def select_action(self, state, greedy=False):
        # Epsilon-greedy: explore with probability epsilon, or when this
        # state's row is still all zeros (untrained); otherwise exploit.
        e = np.random.uniform()
        if (e < self.epsilon or (self.q_table.iloc[state] == 0).all()) and not greedy:
            action = self.env.action_space.sample()
        else:
            action = self.q_table.iloc[state].idxmax()

        return action
 
    def update_q_table(self, state, action, reward, next_state):
        # Q-learning update:
        # Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))
        q = self.q_table.iloc[state, action]
        q_new = q + self.alpha * (reward + self.gamma * self.q_table.iloc[next_state].max() - q)
        self.q_table.iloc[state, action] = q_new
 
    def train(self):
        print('train')

        for i in range(self.episodes):
            observation, info = self.env.reset()
 
            for t in count():
                action = self.select_action(observation)
                observation_new, reward, terminated, truncated, info = self.env.step(action)
                
                done = terminated or truncated
                # reward shaping: the env returns 0 when the episode ends
                # without reaching the goal, so penalize that outcome
                if done and reward == 0:
                    reward = -1
 
                self.update_q_table(observation, action, reward, observation_new)
                observation = observation_new
 
                if done:
                    # decay epsilon exponentially after each episode
                    self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * np.exp(-EPSILON_DECAY_RATE * i)
                    print(i, t, observation, action, observation_new, reward, terminated, truncated, self.epsilon, info)
                    print('latest q_table:\n', self.q_table)
                    break
            
        # save
        self.q_table.to_csv(self.q_table_csv, index=False)

    def play(self):
        print('play', self.q_table_csv)
        if os.path.exists(self.q_table_csv):
            # read_csv turns the column names into strings ('0', '1', ...);
            # force every action column to float64
            dtype = {str(a): 'float64' for a in range(self.env.action_space.n)}
            self.q_table = pd.read_csv(self.q_table_csv, header=0, dtype=dtype)
            print('read q_table\n', self.q_table)

            observation, info = self.env.reset()

            done = False
            while not done:
                action = self.select_action(observation, True)
                # after read_csv the column labels are strings, and idxmax()
                # returns a label, hence the int() cast before step()
                observation_new, reward, terminated, truncated, info = self.env.step(int(action))
                done = terminated or truncated
                observation = observation_new
                time.sleep(0.5)

if __name__ == '__main__':
    env = gym.make('FrozenLake-v1', desc=None, map_name='4x4', is_slippery=False, render_mode="human")
    qlearn = QLearning(env)
    qlearn.train()
    qlearn.play()
    print('latest q_table:\n', qlearn.q_table)
    env.close()
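
Once train() has written the CSV, the greedy policy can be read straight off the table. Below is a minimal sketch (not part of the script above; the file name assumes the 4x4 map, i.e. 16 states x 4 actions) that prints the learned policy as arrows, using FrozenLake's action ids 0/1/2/3 = Left/Down/Right/Up:

import numpy as np
import pandas as pd

q = pd.read_csv('./q_table_16x4.csv', header=0)   # written by train() above
arrows = np.array(['<', 'v', '>', '^'])           # 0=Left, 1=Down, 2=Right, 3=Up
policy = arrows[q.to_numpy().argmax(axis=1)].reshape(4, 4)
print(policy)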
