I'm trying to figure out how to implement DDPG in TensorFlow.js using Python examples such as this one from the Keras website. I got stuck on the training code:
with tf.GradientTape() as tape:
    target_actions = target_actor(next_state_batch, training=True)
    y = reward_batch + gamma * target_critic(
        [next_state_batch, target_actions], training=True
    )
    critic_value = critic_model([state_batch, action_batch], training=True)
    critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables)
critic_optimizer.apply_gradients(
    zip(critic_grad, critic_model.trainable_variables)
)

with tf.GradientTape() as tape:
    actions = actor_model(state_batch, training=True)
    critic_value = critic_model([state_batch, actions], training=True)
    # Used `-value` as we want to maximize the value given
    # by the critic for our actions
    actor_loss = -tf.math.reduce_mean(critic_value)

actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
actor_optimizer.apply_gradients(
    zip(actor_grad, actor_model.trainable_variables)
)
So far my TypeScript version looks like this:
const batch = this.memory.getMinibatch(this.config.replayBatchSize);
const states = this.actorService.getStateTensor(this.actor, ...batch.map(s => s.state));
const nextStates = this.actorService.getStateTensor(this.actor, ...batch.map(s => s.nextState));
const rewards = tf.tensor2d(batch.map(s => s.reward), [batch.length, 1], 'float32');
const actions = this.actorService.getActionTensor(...batch.map(s => s.action));

const criticLossFunction = () => tf.tidy(() => {
  let targetQs: tf.Tensor;
  if (this.config.discountRate === 0) {
    targetQs = rewards;
  } else {
    const targetActions = this.targetActorModel.predict(nextStates) as tf.Tensor;
    const targetCriticQs = this.targetCriticModel.predict(tf.concat([nextStates, targetActions], 1)) as tf.Tensor;
    targetQs = rewards.add(targetCriticQs.mul(this.config.discountRate));
  }
  const criticQs = this.criticModel.predict(tf.concat([states, actions], 1)) as tf.Tensor;
  const criticLoss = tf.losses.meanSquaredError(targetQs, criticQs);
  return criticLoss.asScalar();
});

const criticTrainableVars = this.criticModel.getWeights(true) as tf.Variable<tf.Rank>[];
const criticGradient = tf.variableGrads(criticLossFunction, criticTrainableVars);
// HOWTO: zip(critic_grad, critic_model.trainable_variables)
this.criticModel.optimizer.applyGradients(criticGradient.grads);
tf.dispose(criticGradient);

const actorLossFunction = () => tf.tidy(() => {
  const policyActions = this.actorModel.predict(states) as tf.Tensor;
  const criticQs = this.criticModel.predict(tf.concat([states, policyActions], 1)) as tf.Tensor;
  const actorLoss = tf.mean(criticQs.mul(-1));
  return actorLoss.asScalar();
});

const actorTrainableVars = this.actorModel.getWeights(true) as tf.Variable<tf.Rank>[];
const actorGradient = tf.variableGrads(actorLossFunction, actorTrainableVars);
// HOWTO: zip(actor_grad, actor_model.trainable_variables)
this.actorModel.optimizer.applyGradients(actorGradient.grads);
const actorLoss = actorGradient.value.dataSync()[0];
tf.dispose(actorGradient);
but my code does not work correctly (the loss is too high on a very simple task), and I suspect it's because I'm missing one major step: zipping the critic grads with the critic trainable vars before passing them to applyGradients.
False alarm: the code works correctly, and there is no need to zip anything. The high loss was because I reported actorLoss as the value instead of criticLoss (criticGradient.value.dataSync()[0]).
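To illustrate why no zipping is needed: in TensorFlow.js, tf.variableGrads(f, varList) returns an object { value, grads } where grads is a NamedTensorMap keyed by variable name, and Optimizer.applyGradients accepts that map directly, so the pairing that Python does with zip(...) happens implicitly. Below is a minimal standalone sketch (the variable name 'w' and the Adam learning rate are arbitrary choices for the demo, not part of the DDPG code above) that also shows reading the loss scalar from the value field:

import * as tf from '@tensorflow/tfjs';

// A named trainable variable and a scalar loss L = sum(w^2).
const w = tf.variable(tf.tensor1d([1, 2, 3]), true, 'w');
const lossFn = () => tf.sum(tf.square(w)).asScalar();

// variableGrads returns { value, grads }: `value` is the loss scalar,
// `grads` is a NamedTensorMap keyed by variable name ('w' here).
const { value, grads } = tf.variableGrads(lossFn, [w]);
console.log('loss =', value.dataSync()[0]);      // 14
console.log('grad keys =', Object.keys(grads));  // ['w']

// applyGradients consumes the named map directly, so no manual zipping is needed.
const optimizer = tf.train.adam(0.01);
optimizer.applyGradients(grads);

tf.dispose([value, grads]);

The same applies in the DDPG code above: criticGradient.grads already carries the variable names, and the scalar to report as the training loss is criticGradient.value.dataSync()[0].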