• Size: 6KB
    File type: .py
    Coins: 2
    Downloads: 1
    Date published: 2021-06-07
  • Language: Python
  • Tags:

Resource description

A PPO algorithm implemented with TensorFlow. Dependencies: tensorflow 1.4 or above, gym.

Resource screenshot

Code snippet and file information

import tensorflow as tf
import numpy as np
import gym
import copy

class PPO:
    def __init__(self, n_features, n_actions):
        self.n_actions = n_actions
        self.n_features = n_features
        self.learning_rate = 0.0015
        self.sess = tf.Session()
        self.observe = tf.placeholder(tf.float32, [None, self.n_features])
        # current policy/value network, plus a frozen copy used for the probability ratio
        self.v, self.act_prob, self.params = self._build_net('pi', train=True)
        _, self.act_prob_old, self.params_old = self._build_net('old_pi', train=False)
        self._get_loss()
        self.sess.run(tf.global_variables_initializer())

    def _build_net(self, name, train):
        with tf.variable_scope(name):
            initer = tf.initializers.truncated_normal(0.0, 0.1)
            # critic head: state value
            hidden = tf.layers.dense(self.observe, 20, tf.nn.tanh, trainable=train)
            hidden = tf.layers.dense(hidden, 20, tf.nn.tanh, trainable=train)
            v = tf.layers.dense(hidden, 1, activation=None, trainable=train)

            # actor head: action probabilities
            hidden1 = tf.layers.dense(self.observe, 20, tf.nn.tanh, trainable=train)
            hidden1 = tf.layers.dense(hidden1, 20, tf.nn.tanh, trainable=train)
            hidden1 = tf.layers.dense(hidden1, self.n_actions, tf.nn.tanh, trainable=train)
            act_prob = tf.layers.dense(hidden1, self.n_actions, tf.nn.softmax, trainable=train)

        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return v, act_prob, params

    def _get_loss(self):
        self.adv = tf.placeholder(tf.float32, [None])
        self.v_next = tf.placeholder(tf.float32, [None])
        self.action = tf.placeholder(tf.int32, [None])
        self.reward = tf.placeholder(tf.float32, [None])

        # critic loss: one-step TD error (squeeze v from [None, 1] to [None] so the shapes match)
        td_error = self.reward + 0.95*self.v_next - tf.squeeze(self.v, axis=1)
        v_loss = tf.reduce_mean(tf.square(td_error))

        act_encode = tf.one_hot(self.action, self.n_actions)

        # probability of the taken action under the current and old policies
        prob = tf.reduce_sum(self.act_prob*act_encode, axis=1)
        prob_old = tf.reduce_sum(self.act_prob_old*act_encode, axis=1)

        # clipped surrogate objective
        ratio = tf.exp(tf.log(tf.clip_by_value(prob, 1e-10, 1.0)) - tf.log(tf.clip_by_value(prob_old, 1e-10, 1.0)))
        clip_ratio = tf.clip_by_value(ratio, 1.0-0.2, 1.0+0.2)
        clip_loss = tf.reduce_mean(tf.minimum(ratio*self.adv, clip_ratio*self.adv))

        # entropy bonus to encourage exploration
        entropy_loss = -tf.reduce_mean(tf.reduce_sum(self.act_prob*tf.log(tf.clip_by_value(self.act_prob, 1e-10, 1.0)), axis=1))

        # maximize surrogate + entropy, minimize value loss (hence the minus sign in minimize)
        self.total_loss = clip_loss - v_loss + 0.01*entropy_loss
        learning_rate = tf.train.exponential_decay(0.0015, 0, 200, 0.95)
        self.train_op = tf.train.AdamOptimizer(learning_rate).minimize(-self.total_loss)
        self.old_pi_update = [tf.assign(t, e) for t, e in zip(self.params_old, self.params)]

    def learn(self, observe, v_pred, adv, reward, act):
        # the original snippet is truncated here; the remaining feeds follow the placeholders in _get_loss()
        loss, _ = self.sess.run([self.total_loss, self.train_op],
                                feed_dict={self.observe: observe, self.v_next: v_pred,
                                           self.adv: adv, self.reward: reward, self.action: act})
        return loss
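For reference, below is a minimal usage sketch, not part of the uploaded file, showing how the PPO class above might be driven on gym's CartPole-v0. It continues from the snippet (which already imports numpy and gym); the rollout loop, action sampling via act_prob, the one-step advantage estimate, and the episode count are illustrative assumptions, with gamma = 0.95 chosen to match the discount used in _get_loss.

# Usage sketch (assumption, not the author's code): train the PPO class on CartPole-v0.
env = gym.make('CartPole-v0')
ppo = PPO(env.observation_space.shape[0], env.action_space.n)
gamma = 0.95

for episode in range(200):
    obs = env.reset()
    buf_obs, buf_act, buf_rew = [], [], []
    done = False
    while not done:
        # sample an action from the current policy (renormalize against float32 rounding)
        prob = ppo.sess.run(ppo.act_prob, feed_dict={ppo.observe: obs[None, :]})[0]
        action = np.random.choice(len(prob), p=prob / prob.sum())
        next_obs, reward, done, _ = env.step(action)
        buf_obs.append(obs)
        buf_act.append(action)
        buf_rew.append(reward)
        obs = next_obs

    obs_arr = np.asarray(buf_obs, dtype=np.float32)
    next_obs_arr = np.vstack([obs_arr[1:], obs[None, :]]).astype(np.float32)
    # state values of current and next states under the current critic
    v = ppo.sess.run(ppo.v, feed_dict={ppo.observe: obs_arr})[:, 0]
    v_next = ppo.sess.run(ppo.v, feed_dict={ppo.observe: next_obs_arr})[:, 0]
    v_next[-1] = 0.0                      # terminal state has zero value
    rewards = np.asarray(buf_rew, dtype=np.float32)
    adv = rewards + gamma * v_next - v    # one-step advantage estimate
    ppo.sess.run(ppo.old_pi_update)       # sync the old policy before the update
    ppo.learn(obs_arr, v_next, adv, rewards, np.asarray(buf_act, dtype=np.int32))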
