直接上代码
emmm 就算是强化学习的HelloWorld了吧
import logging
import numpy
import random
from gym import spaces
import gym
class GridEnv():
metadata = {
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second': 2
}
def __init__(self):
# render 中
self.x=[140,220,300,380,460,140,300,460]
self.y=[250,250,250,250,250,150,150,150]
# 状态空间
self.states = [1,2,3,4,5,6,7,8]
# 动作空间
self.actions = ['n','e','s','w']
# 终止状态为字典格式
self.terminate_states = dict()
self.terminate_states[6] = 1
self.terminate_states[7] = 1
self.terminate_states[8] = 1
# 回报函数
self.rewards = dict()
self.rewards['1_s'] = -1.0
self.rewards['3_s'] = 1.0
self.rewards['5_s'] = -1.0
# 状态转移概率
self.t = dict()
self.t['1_s'] = 6
self.t['1_e'] = 2
self.t['2_w'] = 1
self.t['2_e'] = 3
self.t['3_s'] = 7
self.t['3_w'] = 2
self.t['3_e'] = 4
self.t['4_w'] = 3
self.t['4_e'] = 5
self.t['5_s'] = 8
self.t['5_w'] = 4
self.gamma = 0.8 #折扣因子
self.viewer = None
self.state = None
def _step(self, action):
# 获取当前系统状态
state = self.state
if state in self.terminate_states:
return state, 0, True, {}
key = "%d_%s"%(state, action)
#状态转移
if key in self.t:
next_state = self.t[key]
else:
next_state = state
self.state = next_state
is_terminal = False
if next_state in self.terminate_states:
is_terminal = True
if key not in self.rewards:
r = 0.0
else :
r = self.rewards[key]
return next_state,r,is_terminal,{}
# 绘图贼简单- -
def _render(self, mode='human', close=False):
if close:
if self.viewer is not None:
self.viewer.close()
self.viewer = None
return
screen_width = 600
screen_height = 400
if self.viewer is None:
from gym.envs.classic_control import rendering
self.viewer = rendering.Viewer(screen_width, screen_height)
#创建网格世界
self.line1 = rendering.Line((100,300),(500,300))
self.line2 = rendering.Line((100, 200), (500, 200))
self.line3 = rendering.Line((100, 300), (100, 100))
self.line4 = rendering.Line((180, 300), (180, 100))
self.line5 = rendering.Line((260, 300), (260, 100))
self.line6 = rendering.Line((340, 300), (340, 100))
self.line7 = rendering.Line((420, 300), (420, 100))
self.line8 = rendering.Line((500, 300), (500, 100))
self.line9 = rendering.Line((100, 100), (180, 100))
self.line10 = rendering.Line((260, 100), (340, 100))
self.line11 = rendering.Line((420, 100), (500, 100))
#创建第一个骷髅
self.kulo1 = rendering.make_circle(40)
self.circletrans = rendering.Transform(translation=(140,150))
self.kulo1.add_attr(self.circletrans)
self.kulo1.set_color(0,0,0)
#创建第二个骷髅
self.kulo2 = rendering.make_circle(40)
self.circletrans = rendering.Transform(translation=(460, 150))
self.kulo2.add_attr(self.circletrans)
self.kulo2.set_color(0, 0, 0)
#创建金条
self.gold = rendering.make_circle(40)
self.circletrans = rendering.Transform(translation=(300, 150))
self.gold.add_attr(self.circletrans)
self.gold.set_color(1, 0.9, 0)
#创建机器人
self.robot= rendering.make_circle(30)
self.robotrans = rendering.Transform()
self.robot.add_attr(self.robotrans)
self.robot.set_color(0.8, 0.6, 0.4)
self.line1.set_color(0, 0, 0)
self.line2.set_color(0, 0, 0)
self.line3.set_color(0, 0, 0)
self.line4.set_color(0, 0, 0)
self.line5.set_color(0, 0, 0)
self.line6.set_color(0, 0, 0)
self.line7.set_color(0, 0, 0)
self.line8.set_color(0, 0, 0)
self.line9.set_color(0, 0, 0)
self.line10.set_color(0, 0, 0)
self.line11.set_color(0, 0, 0)
self.viewer.add_geom(self.line1)
self.viewer.add_geom(self.line2)
self.viewer.add_geom(self.line3)
self.viewer.add_geom(self.line4)
self.viewer.add_geom(self.line5)
self.viewer.add_geom(self.line6)
self.viewer.add_geom(self.line7)
self.viewer.add_geom(self.line8)
self.viewer.add_geom(self.line9)
self.viewer.add_geom(self.line10)
self.viewer.add_geom(self.line11)
self.viewer.add_geom(self.kulo1)
self.viewer.add_geom(self.kulo2)
self.viewer.add_geom(self.gold)
self.viewer.add_geom(self.robot)
if self.state is None: return None
#self.robotrans.set_translation(self.x[self.state-1],self.y[self.state-1])
self.robotrans.set_translation(self.x[self.state-1], self.y[self.state- 1])
return self.viewer.render(return_rgb_array=mode == 'rgb_array')
def _reset(self):
self.state = self.states[int (random.random()*len(self.states))]
return self.state码片
首先代码中主要有这几部分函数
函数中的self应该不是参数,就是本身对象的意思吧(原谅我从Java来到python不太熟悉语法),除了self其他的应该就是参数了。
init(self)
初始化方法,其中定义的主要是环境的一些常量,包括环境必需的常量以及回报等等,貌似复杂的环境回报这些可能需要用函数式来表达
_step(self, action)
行为的方法,传入的参数action是在S下,执行的a,这里主要是仿真的过程,然后返回的值是
next_state,r,is_terminal,{}
下一个动作,回报,是否结束,最后这个数组这里没有写,等以后的环境遇到了再补充
_render(self, mode=‘human’, close=False)
这个函数对于强化学习应该没什么用,主要是用来显示界面的
_reset(self):
这个函数是对环境的初始化方法
总结
仔细一看,对环境的仿真首先要知道环境中的变量有哪些,然后用step函数实现在不同的动作下这些变量发生了什么样的变化。
书上的习题4有一个迷宫世界的环境,晚上尝试去实现一下