Policy Gradient: CartPole (Keras)

import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'  # silence TensorFlow startup logging
import gym
import numpy as np
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam

class PG_Agent:
    def __init__(self):
        self.discount_factor = 0.99          # gamma used when discounting rewards
        self.model = self.build_model()
        # per-episode buffers, cleared after every policy update
        self.states, self.actions, self.rewards = [], [], []

    def build_model(self):
        # small MLP policy: 4 state inputs -> softmax over the 2 CartPole actions
        model = Sequential()
        model.add(Dense(24, input_dim=4, activation='relu', kernel_initializer='glorot_uniform'))
        model.add(Dense(24, activation='relu', kernel_initializer='glorot_uniform'))
        model.add(Dense(2, activation='softmax', kernel_initializer='glorot_uniform'))
        # learning_rate replaces the older `lr` argument in recent Keras releases
        model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))
        return model

    def discount_rewards(self, rewards):
        # G_t = r_t + gamma * G_{t+1}, computed by scanning the episode backwards
        discounted_rewards = np.zeros_like(rewards, dtype=np.float32)
        running_add = 0.0
        for t in reversed(range(len(rewards))):
            running_add = running_add * self.discount_factor + rewards[t]
            discounted_rewards[t] = running_add
        return discounted_rewards

    def append_sample(self, state, action, reward):
        self.states.append(state)
        self.rewards.append(reward)
        self.actions.append(action)

    def train_model(self):
        episode_length = len(self.states)
        # normalize the discounted returns to reduce gradient variance
        discounted_rewards = self.discount_rewards(self.rewards)
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards) + 1e-8  # guard against a zero std
        # one-hot action targets scaled by each step's normalized return
        advantages = np.zeros((episode_length, 2))
        for i in range(episode_length):
            advantages[i][self.actions[i]] = discounted_rewards[i]
        self.model.fit(np.vstack(self.states), advantages, verbose=0)
        self.states, self.actions, self.rewards = [], [], []
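        # Note on the loss: with one-hot targets scaled by the return G_t,
        # categorical cross-entropy reduces to -sum_t G_t * log(pi(a_t | s_t)),
        # which is the REINFORCE objective, so the fit call above performs a
        # policy-gradient update.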

if __name__ == "__main__":
    env = gym.make('CartPole-v1')
    agent = PG_Agent()
    for e in range(1000):
        done = False
        score = 0
        state = env.reset()                  # old Gym API: reset() returns the observation only
        state = np.reshape(state, [1, 4])
        while not done:
            if e > 600:
                env.render()                 # only render once training is well under way
            # sample an action from the current policy's softmax output
            policy = agent.model.predict(state, verbose=0).flatten()
            action = np.random.choice(2, 1, p=policy)[0]
            # old Gym API: step() returns (next_state, reward, done, info)
            next_state, reward, done, info = env.step(action)
            agent.append_sample(state, action, reward)
            state = np.reshape(next_state, [1, 4])
            score += reward
            if done:
                agent.train_model()
                print("episode:", e, "  score:", score)
        if e % 50 == 0:
            agent.model.save_weights("./cartpole_pg.h5")
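
A minimal evaluation sketch, assuming it runs in the same module once training has finished: reload the weights saved above and pick the greedy action instead of sampling.

import gym
import numpy as np

eval_env = gym.make('CartPole-v1')
eval_agent = PG_Agent()
eval_agent.model.load_weights("./cartpole_pg.h5")     # path matches the save call above

state = np.reshape(eval_env.reset(), [1, 4])
done = False
score = 0
while not done:
    policy = eval_agent.model.predict(state, verbose=0).flatten()
    action = int(np.argmax(policy))                   # greedy action at evaluation time
    next_state, reward, done, info = eval_env.step(action)
    state = np.reshape(next_state, [1, 4])
    score += reward
print("evaluation score:", score)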
