ptan in Practice 4 || Experience Replay Buffers
PTAN provides three replay buffer classes:
- ptan.experience.ExperienceReplayBuffer: buffer with uniform sampling
- ptan.experience.PrioReplayBufferNaive: prioritized experience buffer, sampling cost O(n)
- ptan.experience.PrioritizedReplayBuffer: prioritized experience buffer that samples via an interval (segment) tree, sampling cost O(log(n))
The prioritized variants are sketched after the main example below.
Methods:
- populate(N): draw N experience entries from the source and store them in the buffer
- sample(N): fetch N experience entries from the buffer
import gym
import ptan
from typing import List, Optional, Tuple, Any

# A custom gym environment: observations cycle through 5 states,
# and episodes terminate after 10 steps
class ToyEnv(gym.Env):
    def __init__(self):
        super(ToyEnv, self).__init__()
        self.observation_space = gym.spaces.Discrete(n=5)
        self.action_space = gym.spaces.Discrete(n=3)
        self.step_index = 0

    def reset(self):
        self.step_index = 0
        return self.step_index

    def step(self, action):
        is_done = self.step_index == 10
        if is_done:
            return self.step_index % self.observation_space.n, \
                   0.0, is_done, {}
        self.step_index += 1
        return self.step_index % self.observation_space.n, \
               float(action), self.step_index == 10, {}

# A custom agent that returns the same action regardless of the observation
class DullAgent(ptan.agent.BaseAgent):
    def __init__(self, action: int):
        self.action = action

    def __call__(self, observations: List[Any],
                 state: Optional[List] = None) \
            -> Tuple[List[int], Optional[List]]:
        return [self.action for _ in observations], state

if __name__ == "__main__":
    env = ToyEnv()               # create the environment
    agent = DullAgent(action=1)  # create the agent
    # Experience source yielding (s, a, r, s') tuples with 1-step
    # (immediate) reward accumulation
    exp_source = ptan.experience.ExperienceSourceFirstLast(
        env, agent, gamma=1.0, steps_count=1)
    # Replay buffer with uniform sampling, fed by the experience source
    buffer = ptan.experience.ExperienceReplayBuffer(
        exp_source, buffer_size=100)
    for step in range(6):
        # populate(1) lets the agent interact with the environment
        # automatically, producing one entry and storing it in the buffer
        buffer.populate(1)
        if len(buffer) < 5:  # wait until the buffer holds enough entries
            continue
        # draw 4 entries; the sampling scheme was fixed at construction time
        batch = buffer.sample(4)
        print("Train time, %d batch samples:" % len(batch))
        for s in batch:
            print(s)
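Each printed entry is an ExperienceFirstLast namedtuple of the form (state, action, reward, last_state); with this agent every action is 1 and every 1-step reward is 1.0, while the specific entries vary from run to run because sampling is uniform at random.

The prioritized buffers follow the same populate()/sample() flow, but sample() additionally returns the sampled indices and importance-sampling weights, and the training loop is expected to feed updated priorities (typically per-sample |TD error|) back via update_priorities(). Below is a minimal sketch continuing from the exp_source above; the exact signatures (alpha in the constructor, beta in sample(), the (samples, indices, weights) return triple) are assumptions to verify against the installed ptan version:

# Minimal prioritized-replay sketch (continues from exp_source above).
# Assumed API, verify against your ptan version:
#   PrioritizedReplayBuffer(exp_source, buffer_size, alpha)
#   sample(batch_size, beta) -> (samples, indices, weights)
#   update_priorities(indices, priorities)
prio_buffer = ptan.experience.PrioritizedReplayBuffer(
    exp_source, buffer_size=100, alpha=0.6)  # alpha skews sampling toward high-priority entries
prio_buffer.populate(10)  # fill the buffer, exactly as with the uniform buffer
# beta controls the importance-sampling correction (usually annealed to 1.0)
samples, indices, weights = prio_buffer.sample(4, beta=0.4)
# placeholder priorities for illustration; a real training loop would pass
# the per-sample |TD error| computed on this batch
prio_buffer.update_priorities(indices, [1.0] * len(indices))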