Reinforcement Learning Notes 1: Q-learning (version 3)

(Figure: Q-table after 500 training episodes)
After 500 episodes of play, you can see that any action that steps onto a ⚡ trap has its value pushed down. There are, however, still a lot of zeros in the table: with this behaviour policy, once the agent has found one path to the 🍗 goal, it only spends 10% of its steps exploring routes it has not walked before.
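
For reference, the learn method in the code below implements the standard Q-learning update (when the next state is terminal, the γ·max term is dropped):

$$Q(s,a) \leftarrow Q(s,a) + \alpha \left[ r + \gamma \max_{a'} Q(s',a') - Q(s,a) \right]$$

Only the state-action pair that was actually taken gets updated, and every entry starts at 0, which is why rarely explored actions keep a value of exactly 0.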
(Figure)
Next, lower the probability of choosing the best action to 50% so that the agent explores the map more.

The number of training episodes is also raised to 2000; with too few episodes the effect does not show up in the Q-table.

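In the code at the end of this post, this corresponds to passing e_greedy=0.5 when constructing the agent (the default is 0.9) and running the training loop with range(2000):

agent = Qlearning(['up', 'down', 'left', 'right'], e_greedy=0.5)  # pick the best-known action only 50% of the time
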
(Figure: Q-table after 2000 episodes with the greedy rate at 50%)
Here is another interesting comparison.

With the probability of choosing the best action set to 50%:

(Figure: results with the greedy rate at 50%)

With the probability of choosing the best action set to 90%:

(Figure: results with the greedy rate at 90%)

The higher the greedy rate, the faster the agent locks onto a solution, but the harder it becomes to discover the remaining optimal solutions. In this example, even after 2000 training episodes the fourth optimal solution was never found.

The probability of choosing the best action is controlled by the e_greedy parameter.

Code:


import torch
import pandas as pd
import numpy as np

class maze_env:
    def __init__(self, row=4, column=4):
        self.done = False
        self.row = row
        self.column = column
        self.maze = torch.zeros(self.row, self.column)  # grid; a 1 marks the agent's current cell
        self.target_x = row - 1      # the 🍗 goal sits in the bottom-right corner
        self.target_y = column - 1
        self.x = 0                   # the agent starts in the top-left corner
        self.y = 0
        self.step_cnt = 0            # moves taken in the current episode
        self.maze[self.x][self.y] = 1


    def show_maze(self):
        print(self.maze)

    def step(self, action):
        r = 0
        self.step_cnt += 1
        self.maze[self.x][self.y] = 0

        # x is the row index, y is the column index; moves that would leave the grid are ignored
        if action == 'up' and self.x >= 1:
            self.x -= 1
        if action == 'down' and self.x <= self.row - 2:
            self.x += 1
        if action == 'left' and self.y >= 1:
            self.y -= 1
        if action == 'right' and self.y <= self.column - 2:
            self.y += 1
        self.maze[self.x][self.y] = 1

        if self.x == self.target_x and self.y == self.target_y:
            # reaching the 🍗 goal ends the episode; the shortest path takes 6 moves,
            # so an optimal episode earns exactly 100
            self.done = True
            r = 100 - self.step_cnt + 6

        if (self.x == 1 and self.y == 0) or \
           (self.x == 2 and self.y == 2):
            # stepping onto a ⚡ trap ends the episode with a penalty
            self.done = True
            r -= 10

        return (self.x, self.y), r, self.done

    def reset(self):
        # start a new episode with the agent back in the top-left corner
        self.done = False
        self.maze = torch.zeros(self.row, self.column)
        self.x = 0
        self.y = 0
        self.step_cnt = 0
        self.maze[self.x][self.y] = 1
        return (0, 0)


class Qlearning:
    def __init__(self, actions, learning_rate=0.1, reward_decay=0.9, e_greedy=0.9):
        self.actions = actions      # list of action names
        self.lr = learning_rate     # learning rate alpha
        self.gamma = reward_decay   # discount factor gamma
        self.epsilon = e_greedy     # probability of choosing the best-known action
        self.q_table = pd.DataFrame(columns=self.actions, dtype=np.float64)


    def choose_action(self, observation):
        self.check_state_exist(observation)
        # with probability epsilon pick the action with the largest Q value,
        # otherwise pick a random action (exploration)
        if np.random.uniform() < self.epsilon:
            # choose best action
            state_action = self.q_table.loc[observation, :]
            # several actions may share the maximum value, so randomly choose one of them
            action = np.random.choice(state_action[state_action == np.max(state_action)].index)
        else:
            # choose random action
            action = np.random.choice(self.actions)

        return action

    def check_state_exist(self, state):
        if state not in self.q_table.index:
            # append the new state to the q table with all action values initialised to 0
            # (DataFrame.append was removed in pandas 2.0, so assign the new row via .loc)
            self.q_table.loc[state] = [0.0] * len(self.actions)

            #print(self.q_table)

    def learn(self, s, a, r, s_,done):
        self.check_state_exist(s_)
        q_predict = self.q_table.loc[s, a]
        #print('q_predict',q_predict)
        if not done:
            q_target = r + self.gamma * self.q_table.loc[s_, :].max()  # next state is not terminal
        else:
            q_target = r  # next state is terminal
        self.q_table.loc[s, a] += self.lr * (q_target - q_predict)  # update
        #print(self.q_table)


maze = maze_env()
agent = Qlearning(['up', 'down', 'left', 'right'])  # default e_greedy=0.9; pass e_greedy=0.5 for more exploration
zuiyoujie_list = []  # distinct optimal paths found so far ("zuiyoujie" means optimal solution)
for i in range(2000):
    observation = maze.reset()
    action_cnt = 0   # moves taken in this episode
    reward = 0       # total reward collected in this episode
    zuiyoujie = []   # sequence of actions taken in this episode
    while True:
        action = agent.choose_action(str(observation))
        action_cnt = action_cnt + 1
        zuiyoujie.append(action)
        observation_,r,done = maze.step(action)
        reward += r
        agent.learn(str(observation),action,r,str(observation_),done)
        observation = observation_

        if done:
            # print(agent.q_table)

            # an optimal episode reaches the goal in the minimum 6 moves and earns a total reward of 100
            if action_cnt == 6 and zuiyoujie not in zuiyoujie_list and reward == 100:
                zuiyoujie_list.append(zuiyoujie)
                print('Episode', i, ': found optimal solution No.', len(zuiyoujie_list))
                print(zuiyoujie)
            # print("total moves this episode:", action_cnt)
            # print("total reward this episode:", reward)
            break
# print(agent.q_table)
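
To reproduce the 50% vs 90% comparison, one option is to wrap the training loop above in a small helper and run it once per setting. This is only a sketch; train() is a hypothetical wrapper around the loop above, not part of the original script:

def train(e_greedy, episodes=2000):
    # hypothetical helper: same training loop as above, returns the optimal paths it found
    maze = maze_env()
    agent = Qlearning(['up', 'down', 'left', 'right'], e_greedy=e_greedy)
    zuiyoujie_list = []
    for i in range(episodes):
        observation = maze.reset()
        action_cnt = 0
        reward = 0
        zuiyoujie = []
        while True:
            action = agent.choose_action(str(observation))
            action_cnt += 1
            zuiyoujie.append(action)
            observation_, r, done = maze.step(action)
            reward += r
            agent.learn(str(observation), action, r, str(observation_), done)
            observation = observation_
            if done:
                if action_cnt == 6 and zuiyoujie not in zuiyoujie_list and reward == 100:
                    zuiyoujie_list.append(zuiyoujie)
                break
    return zuiyoujie_list

for eps in (0.5, 0.9):
    found = train(e_greedy=eps)
    print('e_greedy =', eps, '-> found', len(found), 'distinct optimal solutions')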

