# baseline
import tensorflow as tf  # not used in this baseline; presumably needed by code further down
import numpy as np
import gym

# Random-policy baseline: play 10 CartPole episodes choosing each action
# uniformly at random, and print the total reward collected per episode.
env = gym.make('CartPole-v0')
env.reset()
random_episodes = 0
reward_sum = 0
while random_episodes < 10:
    # env.render()  # uncomment to display the environment while it runs
    # np.random.randint(0, 2) yields 0 or 1 (the upper bound is exclusive).
    observation, reward, done, _ = env.step(np.random.randint(0, 2))
    reward_sum += reward
    if done:
        # Episode finished: report its total reward, then start a new one.
        random_episodes += 1
        print('Reward for the episode was :', reward_sum)
        reward_sum = 0
        env.reset()
# Output of the random baseline (actions are random, so rewards vary per run):
# Reward for the episode was : 11.0
# Reward for the episode was : 31.0
# Reward for the episode was : 46.0
# Reward for the episode was : 18.0
# Reward for the episode was : 10.0
# Reward for the episode was : 25.0
# Reward for the episode was : 13.0
# Reward for the episode was : 25.0
# Reward for the episode was : 16.0
# Reward for the episode was : 14.0
# Implement the reinforcement-learning policy network.
# Commonly used network parameters:
D = 4  # dimensionality of an observation
H = 50  # number of nodes (presumably the hidden layer; usage not visible here)
gamma = 0.99  # discount ratio applied to rewards
learning_rate = 0.1
batch_size = 25