1. Defining the Algorithm
import numpy as np
from collections import defaultdict

class FirstVisitMC:
    ''' On-policy first-visit Monte Carlo control
    '''
    def __init__(self, cfg):
        self.n_actions = cfg.n_actions
        self.epsilon = cfg.epsilon  # exploration rate of the epsilon-greedy policy
        self.gamma = cfg.gamma  # discount factor
        self.Q_table = defaultdict(lambda: np.zeros(cfg.n_actions))
        self.returns_sum = defaultdict(float)  # sum of returns for each (state, action) pair
        self.returns_count = defaultdict(float)  # number of first visits to each (state, action) pair

    def sample_action(self, state):
        ''' Sample an action with the epsilon-greedy policy (used during training) '''
        state = str(state)
        if state in self.Q_table.keys():
            best_action = np.argmax(self.Q_table[state])
            # every action gets epsilon / n_actions probability; the greedy action gets the rest
            action_probs = np.ones(self.n_actions, dtype=float) * self.epsilon / self.n_actions
            action_probs[best_action] += (1.0 - self.epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        else:
            # unseen state: fall back to a uniformly random action
            action = np.random.randint(0, self.n_actions)
        return action

    def predict_action(self, state):
        ''' Act greedily with respect to the Q-table (used during testing) '''
        state = str(state)
        if state in self.Q_table.keys():
            action = np.argmax(self.Q_table[state])
        else:
            # unseen state: fall back to a uniformly random action
            action = np.random.randint(0, self.n_actions)
        return action

    def update(self, one_ep_transition):
        ''' First-visit MC update from one episode of (state, action, reward) transitions '''
        # Find all (state, action) pairs visited in this episode.
        # Each state is converted to a string so that it can be used as a dict key.
        sa_in_episode = set([(str(x[0]), x[1]) for x in one_ep_transition])
        for state, action in sa_in_episode:
            sa_pair = (state, action)
            # Find the first occurrence of the (state, action) pair in the episode
            first_occurrence_idx = next(i for i, x in enumerate(one_ep_transition)
                                        if str(x[0]) == state and x[1] == action)
            # Sum up all discounted rewards from the first occurrence onwards
            G = sum([x[2] * (self.gamma ** i) for i, x in enumerate(one_ep_transition[first_occurrence_idx:])])
            # Q(s, a) is the average of the first-visit returns over all sampled episodes
            self.returns_sum[sa_pair] += G
            self.returns_count[sa_pair] += 1.0
            self.Q_table[state][action] = self.returns_sum[sa_pair] / self.returns_count[sa_pair]
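
To see the update in action, here is a minimal sanity check; the tiny Config stub and the two-step episode are made up for illustration. With gamma = 0.9, the first-visit return of the first pair is G = -1.0 + 0.9 * 10.0 = 8.0, which becomes its Q-value after one episode.

class Config:  # hypothetical stub, just the fields FirstVisitMC reads
    n_actions = 2
    epsilon = 0.1
    gamma = 0.9

agent = FirstVisitMC(Config())
episode = [((0, 0), 1, -1.0), ((0, 1), 1, 10.0)]  # (state, action, reward) triples
agent.update(episode)
print(agent.Q_table[str((0, 0))])  # [0. 8.]   G = -1.0 + 0.9 * 10.0
print(agent.Q_table[str((0, 1))])  # [0. 10.]  G = 10.0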
2. Defining Training
def train(cfg, env, agent):
    print('Start training!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
    rewards = []  # record the reward of every episode
    for i_ep in range(cfg.train_eps):
        ep_reward = 0  # reward accumulated over one episode
        one_ep_transition = []  # (state, action, reward) transitions of this episode
        state = env.reset(seed=cfg.seed)  # reset the environment, i.e. start a new episode
        for _ in range(cfg.max_steps):
            action = agent.sample_action(state)  # sample an action with the epsilon-greedy policy
            next_state, reward, terminated, info = env.step(action)  # take one step in the environment
            one_ep_transition.append((state, action, reward))  # store the transition
            state = next_state  # update the state
            ep_reward += reward
            if terminated:
                break
        agent.update(one_ep_transition)  # MC update once the full episode has been collected
        rewards.append(ep_reward)
        print(f"Episode: {i_ep+1}/{cfg.train_eps}, Reward: {ep_reward:.1f}")
    print('Finish training!')
    return {"rewards": rewards}
def test(cfg, env, agent):
    print('Start testing!')
    print(f'Env: {cfg.env_name}, Algorithm: {cfg.algo_name}, Device: {cfg.device}')
    rewards = []  # record the reward of every episode
    for i_ep in range(cfg.test_eps):
        ep_reward = 0  # reward accumulated over one episode
        state = env.reset(seed=cfg.seed)  # reset the environment, i.e. start a new episode
        for _ in range(cfg.max_steps):
            action = agent.predict_action(state)  # choose the greedy action
            next_state, reward, terminated, info = env.step(action)  # take one step in the environment
            state = next_state  # update the state
            ep_reward += reward
            if terminated:
                break
        rewards.append(ep_reward)
        print(f"Episode: {i_ep+1}/{cfg.test_eps}, Reward: {ep_reward:.1f}")
    print('Finish testing!')
    return {"rewards": rewards}
3. Defining the Environment
import sys,os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))
import torch
import numpy as np
import random
from envs.racetrack import RacetrackEnv
def all_seed(env, seed=1):
    ''' Seed every source of randomness used in RL.
        Note: call this right after the environment is created.
    '''
    if seed == 0:
        return
    env.seed(seed)  # env config
    np.random.seed(seed)  # numpy config
    random.seed(seed)  # python random config
    torch.manual_seed(seed)  # torch config
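
Finally, a short usage sketch wiring the pieces together. The Config fields mirror the attributes referenced by the code above; the concrete values (and the assumption that RacetrackEnv takes no constructor arguments) are illustrative, not tuned settings.

class Config:  # hypothetical config; field names match what the code above reads
    env_name = 'Racetrack'
    algo_name = 'FirstVisitMC'
    device = 'cpu'
    seed = 1
    n_actions = 9  # assumption: 3 x 3 acceleration choices in RacetrackEnv
    epsilon = 0.15
    gamma = 0.9
    train_eps = 400
    test_eps = 20
    max_steps = 200

cfg = Config()
env = RacetrackEnv()
all_seed(env, seed=cfg.seed)  # seed immediately after creating the env, as noted above
agent = FirstVisitMC(cfg)
train_res = train(cfg, env, agent)
test_res = test(cfg, env, agent)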