import numpy as np
# Reward matrix R for a 6-room shortest-path environment.
# R[s, a] = -1 marks an impossible transition, 0 a legal move with no
# reward, and 100 the reward for moving into the goal state 5.
# NOTE: np.matrix is deprecated in NumPy; a plain 2-D ndarray behaves
# identically for the scalar indexing r[state, action] used below.
r = np.array(
    [[-1, -1, -1, -1, 0, -1],
     [-1, -1, -1, 0, -1, 100],
     [-1, -1, -1, 0, -1, -1],
     [-1, 0, 0, -1, 0, -1],
     [0, -1, -1, 0, -1, 100],
     [-1, 0, -1, -1, 0, 100]])
# Q-value table, one entry per (state, action) pair, initialised to zero.
q = np.zeros((6, 6))
# Discount factor (gamma) for future rewards.
gmma = 0.8
# Exploration probability for the epsilon-greedy policy (30% random).
epsilion = 0.3
# Learning rate (alpha) used in the Q-update.
a = 0.2
# Tabular Q-learning: each episode starts in a random state and follows an
# epsilon-greedy policy until the goal state 5 is reached, updating the
# Q-table after every transition.
for episode in range(1000):
    # Start the episode in a random state 0-5 (upper bound is exclusive).
    state = np.random.randint(0, 6)
    if state == 5:
        print(state, " reach directly")
    else:
        print(state, end="")
    # Walk until the goal state 5 is reached.
    while state != 5:
        # Collect the legal actions (those with R[state, action] >= 0)
        # together with their current Q-values.  The two lists stay
        # index-aligned, so an argmax over possibleQ maps back to the
        # matching entry of possibleActions.
        possibleActions = []
        possibleQ = []
        for action in range(6):
            if r[state, action] >= 0:
                possibleActions.append(action)
                possibleQ.append(q[state, action])
        # Epsilon-greedy selection: with probability `epsilion` (0.3)
        # explore a random legal action, otherwise exploit the legal
        # action with the highest current Q-value.
        # (The original comment claimed epsilon was 0.4; the code uses 0.3.)
        if np.random.random() < epsilion:
            action = possibleActions[np.random.randint(0, len(possibleActions))]
        else:
            action = possibleActions[np.argmax(possibleQ)]
        # Q-learning update:
        #   Q(s,a) <- (1-alpha)*Q(s,a) + alpha*(R(s,a) + gamma*max_a' Q(s',a'))
        # Use the learning rate `a` instead of the hard-coded 0.2 / 0.8 of
        # the original (a == 0.2, so the numerics are unchanged).
        q[state, action] = a * (r[state, action] + gmma * q[action].max()) + (1 - a) * q[state, action]
        # The chosen action is also the next state in this environment.
        state = action
        print("-->" + str(state), end="")
        if state == 5:
            print()
    # Periodically dump the Q-table so training progress is visible.
    if episode % 10 == 0:
        print()
        print("Training episode: %d" % episode)
        print(q)
# Q-Learning code
# Latest recommended article published 2024-05-30 09:00:17