A Simple Application of the Q-Learning Reinforcement Learning Algorithm in Python

Algorithm 3. The SARSA algorithm.

1:  Let S be a set of states, and A(s), s ∈ S, be the set of actions available in state s.
2:  Initialize q(s, a) arbitrarily for every nonterminal s ∈ S and every a ∈ A(s).
3:  Initialize α and γ.
4:  for each game do
5:      Initialize a nonterminal state s_0 at random
6:      Select a_0 under the policy π_0(a|s_0)
7:      t ← 0
8:      for each step t of the game, until a stopping criterion is reached or s_t is a terminal state, do
9:          Take action a_t, observe r_{t+1}, transition to s_{t+1}
10:         if s_{t+1} is a terminal state, then
11:             q(s_t, a_t) ← q(s_t, a_t) + α(r_{t+1} − q(s_t, a_t))
12:         else
13:             Select a_{t+1} under the policy π_{t+1}(a|s_{t+1})
14:             q(s_t, a_t) ← q(s_t, a_t) + α(r_{t+1} + γ·q(s_{t+1}, a_{t+1}) − q(s_t, a_t))
15:         end if
16:         t ← t + 1
17:     end for
18: end for
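
The heart of SARSA is the temporal-difference update on lines 11 and 14. As a minimal sketch (not part of the program below; q, alpha and gamma here are placeholder names), the update can be written as a standalone function:

import numpy as np

def sarsa_update(q, s, a, r, s_next, a_next, terminal, alpha, gamma):
    """One SARSA update of q[s, a], mirroring lines 11 and 14 of Algorithm 3."""
    if terminal:
        target = r                               # line 11: no bootstrap from a terminal state
    else:
        target = r + gamma * q[s_next, a_next]   # line 14: bootstrap on the action actually chosen
    q[s, a] += alpha * (target - q[s, a])

# Example: a 4-state, 2-action table updated after one transition
q = np.zeros((4, 2))
sarsa_update(q, s=0, a=1, r=1.0, s_next=2, a_next=0, terminal=False, alpha=0.9, gamma=0.9)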

# This program does the following:
#   - trains an agent with the SARSA method over 40,000 games of FrozenLake
#   - plays one game with the trained model
#   - plots the agent's movement path
epsilon = 0.1      # Epsilon parameter used in the epsilon-greedy strategy
gamma = 0.9        # Discount coefficient gamma
random_seed = 2    # Random seed
time_delay = 1     # Time delay when rendering the game after training (seconds)
lr_rate = 0.9      # Learning rate alpha
import time

# %% SARSA

import gym
import numpy as np
import time
from IPython.display import clear_output


def generate_random_map(size, p, sd):
    """Generates a random valid map (one that has a path from start to goal)
    :param size: size of each side of the grid
    :param p: probability that a tile is frozen
    """
    valid = False
    np.random.seed(sd)

    # DFS to check that it's a valid path.
    def is_valid(res):
        frontier, discovered = [], set()
        frontier.append((0, 0))
        while frontier:
            r, c = frontier.pop()
            if not (r, c) in discovered:
                discovered.add((r, c))
                directions = [(1, 0), (0, 1), (-1, 0), (0, -1)]
                for x, y in directions:
                    r_new = r + x
                    c_new = c + y
                    if r_new < 0 or r_new >= size or c_new < 0 or c_new >= size:
                        continue
                    if res[r_new][c_new] == 'G':
                        return True
                    if (res[r_new][c_new] not in '#H'):
                        frontier.append((r_new, c_new))
        return False

    while not valid:
        p = min(1, p)
        res = np.random.choice(['F', 'H'], (size, size), p=[p, 1 - p])
        res[0][0] = 'S'
        res[-1][-1] = 'G'
        valid = is_valid(res)
    return ["".join(x) for x in res]


# Map generation
random_map = generate_random_map(size=6, p=0.8, sd=random_seed)  # Create our map
env = gym.make("FrozenLake-v0", desc=random_map, is_slippery=False)  # Initialize environment
print("Your map")
env.render()  # Render the map
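
A note on library versions: the calls above follow the legacy gym API (FrozenLake-v0, env.step() returning four values). On recent gym/gymnasium releases the environment id and the reset/step signatures differ; a minimal adaptation sketch, assuming gymnasium >= 0.26 is installed (env_new is just an illustrative name):

import gymnasium

env_new = gymnasium.make("FrozenLake-v1", desc=random_map, is_slippery=False, render_mode="ansi")
state, info = env_new.reset(seed=random_seed)               # reset() now returns (obs, info)
obs, reward, terminated, truncated, info = env_new.step(0)  # step() now returns five values
done = terminated or truncated                              # recover the old single "done" flag
print(env_new.render())                                     # "ansi" mode returns the board as text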


def choose_action(state):
    # Epsilon-greedy action selection
    if np.random.uniform(0, 1) < epsilon:
        action = np.random.randint(0, env.action_space.n)  # explore: uniformly random action
    else:
        # Exploit: a greedy action, breaking ties between equally valued actions at random
        action = np.random.choice(np.argwhere(Q[state, :] == np.amax(Q[state, :])).flatten())
    return action


def learn(state, state2, reward, action, action2, done):
    # SARSA update: bootstrap on the action actually selected in the next state (action2)
    if done:
        Q[state, action] = Q[state, action] + lr_rate * (reward - Q[state, action])
    else:
        Q[state, action] = Q[state, action] + lr_rate * (reward + gamma * Q[state2, action2] - Q[state, action])



from tqdm import tqdm

# Initialization
np.random.seed(random_seed)
total_games = 40000
max_steps = 100
Q = np.zeros((env.observation_space.n, env.action_space.n))
# Main cycle
game1 = [0] * total_games
for game in tqdm(range(total_games)):
    state = env.reset()
    t = 0
    action = choose_action(state)
    while t < max_steps:
        # env.render()
        t += 1

        state2, reward, done, info = env.step(action)
        if t == max_steps:
            done = True
        action2 = choose_action(state2)
        learn(state, state2, reward, action, action2, done)

        state = state2

        action = action2
        if done:
            if reward == 1:
                game1[game] = 1
            break


def get_fg_and_v(game):
    # v  - total number of wins recorded in `game`
    # fg - number of the game that completed the first streak of 5 consecutive wins
    v = 0
    fg = 0
    i = 0
    l = []
    for g in game:
        if g == 1:
            v = v + 1
            if len(l) == 0:
                l.append(i)
            else:
                if len(l) < 5:
                    if l[-1] - i == -1:
                        l.append(i)
                    else:
                        l = [i]
                    if len(l) == 5 and fg == 0:
                        fg = i + 1

        i = i + 1
    return fg, v


fg, v = get_fg_and_v(game1)

# 1
print("The number of victories in a series of 40,000 games: ", v)
# 2
print("Five wins in a row were first achieved by game ", fg)

import time


# Greedy action selection
def choose_action_one_game(state):
    action = np.random.choice(np.array(np.argwhere(Q[state, :] == np.amax(Q[state, :])).flatten().tolist()))
    return action


states = []  # Array to save agent states during the game
t = 0
state = env.reset()
wn = 0
while (t < 100):
    # env.render()
    time.sleep(time_delay)
    clear_output(wait=True)
    action = choose_action_one_game(state)
    state2, reward, done, info = env.step(action)
    states.append(state)
    state = state2
    t += 1
    if done and reward == 1:
        wn = 1
    if done:
        break
if wn == 1:
    print("!!!WIN!!!")

import matplotlib.pyplot as plt


def make_maze_pic(maze):
    maze_pic = []
    for i in range(len(maze)):
        row = []
        for j in range(len(maze[i])):
            if maze[i][j] == 'S':
                row.append(0)
            if maze[i][j] == 'F':
                row.append(0)
            if maze[i][j] == 'H':
                row.append(1)
            if maze[i][j] == 'G':
                row.append(0)
        maze_pic.append(row)
    maze_pic = np.array(maze_pic)
    return maze_pic


# Make maze fit to plot
maze_pic = make_maze_pic(random_map)
nrows, ncols = maze_pic.shape

# Convert flat state indices to plot coordinates (the map is square, so nrows == ncols)
rw = np.remainder(states, nrows)     # column index -> x on the plot
cl = np.floor_divide(states, nrows)  # row index -> y on the plot
if wn == 1:
    rw = np.append(rw, [nrows - 1])
    cl = np.append(cl, [ncols - 1])

# Picture plotting
fig, ax1 = plt.subplots(1, 1, tight_layout=True)
ax1.clear()
ax1.set_xticks(np.arange(0.5, nrows, step=1))
ax1.set_xticklabels([])
ax1.set_yticks(np.arange(0.5, ncols, step=1))
ax1.set_yticklabels([])
ax1.grid(True)
ax1.plot([0], [0], "gs", markersize=40)  # start is a big green square
ax1.text(0, 0.2, "Start", ha="center", va="center", color="white", fontsize=12)  # Start text
ax1.plot([nrows - 1], [ncols - 1], "rs", markersize=40)  # exit is a big red square
ax1.text(nrows - 1, ncols - 1 + 0.2, "Finish", ha="center", va="center", color="white", fontsize=12)  # Exit text
ax1.plot(rw, cl, ls='-', color='blue')  # Blue lines path
ax1.plot(rw, cl, "bo")  # Blue dots visited cells
ax1.imshow(maze_pic, cmap="binary")
fig.show()
# 3

The agent's movement path is shown below:

[Figure: the agent's path overlaid on the FrozenLake map, produced by the plotting code above]

Algorithm 4. The Q-learning algorithm.

1:  Let S be a set of states, and A(s), s ∈ S, be the set of actions available in state s.
2:  Initialize q(s, a) arbitrarily for every nonterminal s ∈ S and every a ∈ A(s).
3:  Initialize α and γ.
4:  for each game do
5:      Initialize a nonterminal state s_0 at random
6:      t ← 0
7:      for each step t of the game, until a stopping criterion is reached or s_t is a terminal state, do
8:          Select a_t under the policy π_t(a|s_t)
9:          Take action a_t, observe r_{t+1}, transition to s_{t+1}
10:         if s_{t+1} is a terminal state, then
11:             q(s_t, a_t) ← q(s_t, a_t) + α(r_{t+1} − q(s_t, a_t))
12:         else
13:             q(s_t, a_t) ← q(s_t, a_t) + α(r_{t+1} + γ·max_a q(s_{t+1}, a) − q(s_t, a_t))
14:         end if
15:         t ← t + 1
16:     end for
17: end for
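
The only difference from SARSA is the target on line 13: Q-learning bootstraps on the greedy value max_a q(s_{t+1}, a) instead of the value of the action actually taken, which makes it an off-policy method. A minimal sketch of that update (placeholder names, not part of the program below):

import numpy as np

def q_learning_update(q, s, a, r, s_next, terminal, alpha, gamma):
    """One Q-learning update of q[s, a], mirroring lines 11 and 13 of Algorithm 4."""
    if terminal:
        target = r                                 # line 11: terminal transition, no bootstrap
    else:
        target = r + gamma * np.max(q[s_next, :])  # line 13: greedy (off-policy) bootstrap
    q[s, a] += alpha * (target - q[s, a])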

# This program does the following:
#   - trains an agent with the Q-learning method over 10,000 games of FrozenLake
#   - plays one game with the trained model
#   - plots the agent's movement path
epsilon = 0.1      # Epsilon parameter used in the epsilon-greedy strategy
gamma = 0.9        # Discount coefficient gamma
random_seed = 6    # Random seed
time_delay = 1     # Time delay when rendering the game after training (seconds)
lr_rate = 0.9      # Learning rate alpha
import time

# %% Q-Learning

import gym
import numpy as np
import time
from IPython.display import clear_output


def generate_random_map(size, p, sd):
    """Generates a random valid map (one that has a path from start to goal)
    :param size: size of each side of the grid
    :param p: probability that a tile is frozen
    """
    valid = False
    np.random.seed(sd)

    # DFS to check that it's a valid path.
    def is_valid(res):
        frontier, discovered = [], set()
        frontier.append((0,0))
        while frontier:
            r, c = frontier.pop()
            if not (r,c) in discovered:
                discovered.add((r,c))
                directions = [(1, 0), (0, 1), (-1, 0), (0, -1)]
                for x, y in directions:
                    r_new = r + x
                    c_new = c + y
                    if r_new < 0 or r_new >= size or c_new < 0 or c_new >= size:
                        continue
                    if res[r_new][c_new] == 'G':
                        return True
                    if (res[r_new][c_new] not in '#H'):
                        frontier.append((r_new, c_new))
        return False

    while not valid:
        p = min(1, p)
        res = np.random.choice(['F', 'H'], (size, size), p=[p, 1-p])
        res[0][0] = 'S'
        res[-1][-1] = 'G'
        valid = is_valid(res)
    return ["".join(x) for x in res]

# Map generation
random_map = generate_random_map(size=6, p=0.8, sd=random_seed)  # Create our map
env = gym.make("FrozenLake-v0", desc=random_map, is_slippery=False)  # Initialize environment
print("Your map")
env.render()  # Render the map

def choose_action(state):
    # Epsilon-greedy action selection
    if np.random.uniform(0, 1) < epsilon:
        action = np.random.randint(0, env.action_space.n)  # explore: uniformly random action
    else:
        # Exploit: a greedy action, breaking ties between equally valued actions at random
        action = np.random.choice(np.argwhere(Q[state, :] == np.amax(Q[state, :])).flatten())
    return action

def learn(state, state2, reward, action, done):
    # Q-learning update: bootstrap on the greedy (maximal) action value in the next state
    if done:
        Q[state, action] = Q[state, action] + lr_rate * (reward - Q[state, action])
    else:
        Q[state, action] = Q[state, action] + lr_rate * (reward + gamma * np.max(Q[state2, :]) - Q[state, action])


# Q-learning training
from tqdm import tqdm

# Initialization
np.random.seed(random_seed)
total_games = 10000
max_steps = 100
Q = np.zeros((env.observation_space.n, env.action_space.n))
# Main cycle
game1 = [0] * total_games
for game in tqdm(range(total_games)):
    state = env.reset()
    t = 0
    while t < max_steps:
        #env.render()
        t += 1

        action = choose_action(state)

        state2, reward, done, info = env.step(action)
        if t == max_steps:
            done = True

        learn(state, state2, reward, action, done)

        state = state2

        if done:
            if reward == 1:
                game1[game] = 1
            break


def get_fg_and_v(game):
    # v  - total number of wins recorded in `game`
    # fg - number of the game that completed the first streak of 5 consecutive wins
    v = 0
    fg = 0
    i = 0
    l = []
    for g in game:
        if g == 1:
            v = v + 1
            if len(l) == 0:
                l.append(i)
            else:
                if len(l) < 5:
                    if l[-1] - i == -1:
                        l.append(i)
                    else:
                        l = [i]
                    if len(l) == 5 and fg == 0:
                        fg = i + 1
        i = i + 1
    return fg, v


fg, v = get_fg_and_v(game1)


print("The number of victories in a series of 10,000 games: ",v)#Your code here
print("Five wins in a row were first won in the game ",fg)#Your code here



import time
#Greedy action selection
def choose_action_one_game(state):
    action = np.random.choice(np.array(np.argwhere(Q[state, :] == np.amax(Q[state, :])).flatten().tolist()))
    return action

states = []  # Array to save agent states during the game
t = 0
state = env.reset()
wn = 0
while t < 100:
    # env.render()
    time.sleep(time_delay)
    clear_output(wait=True)
    action = choose_action_one_game(state)
    state2, reward, done, info = env.step(action)
    states.append(state)
    state = state2
    t += 1
    if done and reward == 1:
        wn = 1
    if done:
        break
if wn == 1:
    print("!!!WIN!!!")

import matplotlib.pyplot as plt


def make_maze_pic(maze):
    maze_pic = []
    for i in range(len(maze)):
        row = []
        for j in range(len(maze[i])):
            if maze[i][j] == 'S':
                row.append(0)
            if maze[i][j] == 'F':
                row.append(0)
            if maze[i][j] == 'H':
                row.append(1)
            if maze[i][j] == 'G':
                row.append(0)
        maze_pic.append(row)
    maze_pic = np.array(maze_pic)
    return maze_pic


# Make maze fit to plot
maze_pic = make_maze_pic(random_map)
nrows, ncols = maze_pic.shape

# Convert flat state indices to plot coordinates (the map is square, so nrows == ncols)
rw = np.remainder(states, nrows)     # column index -> x on the plot
cl = np.floor_divide(states, nrows)  # row index -> y on the plot
if wn == 1:
    rw = np.append(rw, [nrows - 1])
    cl = np.append(cl, [ncols - 1])

# Picture plotting
fig, ax1 = plt.subplots(1, 1, tight_layout=True)
ax1.clear()
ax1.set_xticks(np.arange(0.5, nrows, step=1))
ax1.set_xticklabels([])
ax1.set_yticks(np.arange(0.5, ncols, step=1))
ax1.set_yticklabels([])
ax1.grid(True)
ax1.plot([0], [0], "gs", markersize=40)  # start is a big green square
ax1.text(0, 0.2, "Start", ha="center", va="center", color="white", fontsize=12)  # Start text
ax1.plot([nrows - 1], [ncols - 1], "rs", markersize=40)  # exit is a big red square
ax1.text(nrows - 1, ncols - 1 + 0.2, "Finish", ha="center", va="center", color="white", fontsize=12)  # Exit text
ax1.plot(rw, cl, ls='-', color='blue')  # Blue lines path
ax1.plot(rw, cl, "bo")  # Blue dots visited cells
ax1.imshow(maze_pic, cmap="binary")
fig.show()

The agent's movement path is shown below:

[Figure: the agent's path overlaid on the FrozenLake map, produced by the plotting code above]
