Q-learning Algorithm Implementation

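The code below implements tabular Q-learning on a small grid world. The agent maintains a Q table keyed by state name and improves it with the classic off-policy temporal-difference update

    Q(s, a) ← Q(s, a) + α · [ r + γ · max_a' Q(s', a') − Q(s, a) ]

where α is the learning rate and γ the discount factor; the behavior policy is ε-greedy, with ε decaying as 1/(episode + 1).
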
from random import random
from gym import Env
from gridworld import *

class Agent():
    def __init__(self, env: Env):
        self.env = env      # the agent holds a reference to the environment
        self.Q = {}         # the agent maintains a table of action values
        self.state = None   # the agent's current observation

    def performPolicy(self, state):  # choose an action according to a policy
        pass

    def act(self, a):  # execute an action in the environment
        return self.env.step(a)

    def learning(self):  # the learning process
        pass

    def _get_state_name(self, state):  # convert an observation into a dictionary key
        return str(state)

    def _is_state_in_Q(self, s):  # check whether Q values exist for state s
        return self.Q.get(s) is not None
    def _init_state_value(self, s_name, randomized=True):  # initialize the Q values of a state
        if not self._is_state_in_Q(s_name):
            self.Q[s_name] = {}
            for action in range(self.env.action_space.n):
                default_v = random() / 10 if randomized else 0.0
                self.Q[s_name][action] = default_v
    def _assert_state_in_Q(self, s, randomized=True):  # make sure Q values exist for a state
        if not self._is_state_in_Q(s):
            self._init_state_value(s, randomized)

    def _get_Q(self, s, a):  # read Q(s, a)
        self._assert_state_in_Q(s, randomized=True)
        return self.Q[s][a]

    def _set_Q(self, s, a, value):  # write Q(s, a)
        self._assert_state_in_Q(s, randomized=True)
        self.Q[s][a] = value
    def _performPolicy(self, s, episode_num, use_epsilon):
        epsilon = 1.00 / (episode_num + 1)  # epsilon decays as more episodes are played
        Q_s = self.Q[s]
        if use_epsilon and random() < epsilon:
            action = self.env.action_space.sample()  # explore: pick a random action
        else:
            action = int(max(Q_s, key=Q_s.get))      # exploit: pick the greedy action
        return action

    def _learning(self, gamma, alpha, max_episode_num):
        total_time, time_in_episode, num_episode = 0, 0, 0
        while num_episode < max_episode_num:  # termination condition
            self.state = self.env.reset()  # reset the environment
            s0 = self._get_state_name(self.state)  # the agent's name for the observation
            self._assert_state_in_Q(s0, randomized=True)
            self.env.render()  # show the UI
            a0 = self._performPolicy(s0, num_episode, use_epsilon=True)  # behavior policy: epsilon-greedy
            time_in_episode = 0
            is_done = False
            while not is_done:  # loop within one episode
                s1, r1, is_done, info = self.act(a0)  # execute the chosen action
                self.env.render()  # refresh the UI
                s1 = self._get_state_name(s1)  # the agent's name for the new state
                self._assert_state_in_Q(s1, randomized=True)
                a_target = self._performPolicy(s1, num_episode, use_epsilon=False)  # target policy: greedy
                a1 = self._performPolicy(s1, num_episode, use_epsilon=True)  # action actually executed: epsilon-greedy
                old_q = self._get_Q(s0, a0)
                q_prime = self._get_Q(s1, a_target)
                td_target = r1 + gamma * q_prime
                td_error = td_target - old_q
                new_q = old_q + alpha * td_error
                self._set_Q(s0, a0, new_q)
                if num_episode == max_episode_num - 1:  # trace the final episode
                    print("t:{0:>2}: s:{1}, a:{2:2}, s1:{3}".format(time_in_episode, s0, a0, s1))
                s0, a0 = s1, a1
                time_in_episode += 1
            print("Episode {0} takes {1} steps.".format(num_episode, time_in_episode))
            total_time += time_in_episode
            num_episode += 1
        return

def main():
    env = SimpleGridWorld()
    agent = Agent(env)
    print("learning...")
    agent._learning(gamma=0.9, alpha=0.1, max_episode_num=800)
    return

if __name__ == '__main__':
    main()
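
Note that the TD target is built from a_target, the greedy action in the successor state, while the action actually executed is the ε-greedy a1; substituting a1 into the update would turn this off-policy Q-learning into on-policy SARSA. After training, the learned greedy policy can be read directly out of the Q table. A minimal sketch (greedy_policy is a hypothetical helper; the state keys are the strings produced by _get_state_name):

def greedy_policy(agent):
    # map each visited state name to its highest-valued action
    return {s: max(actions, key=actions.get) for s, actions in agent.Q.items()}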