I have made a small script in Python to solve various Gym environments with policy gradients.
import gym, os
import numpy as np
#create environment
env = gym.make('Cartpole-v0')
env.reset()
s_size = len(env.reset())
a_size = 2
#import my neural network code
os.chdir(r'C:\---\---\---\Python Code')
import RLPolicy
policy = RLPolicy.NeuralNetwork([s_size,a_size],learning_rate=0.000001,['softmax']) #a 3layer network might be ([s_size, 5, a_size],learning_rate=1,['tanh','softmax'])
#it supports the sigmoid activation function also
print(policy.weights)
DISCOUNT = 0.95 #parameter for discounting future rewards
#first step
action = policy.feedforward(env.reset)
state,reward,done,info = env.step(action)
for t in range(3000):
done = False
states = [] #lists for recording episode
probs2 = []
rewards = []
while not done:
#env.render() #to visualize learning
probs = policy.feedforward(state)[-1] #calculate probabilities of actions
action = np.random.choice(a_size,p=probs) #choose action from probs
#record and update state
probs2.append(probs)
states.append(state)
state,reward,done,info = env.step(action)
rewards.append(reward) #should reward be before updating state?
#calculate gradients
gradients_w = []
gradients_b = []
for i in range(len((rewards))):
totalReward = sum([rewards[t]*DISCOUNT**t for t in range(len(rewards[i:]))]) #discounted reward
## !! this is the line that I need help with
gradient = policy.backpropagation(states[i],totalReward*(probs2[i])) #what should be backpropagated through the network
## !!
##record gradients
gradients_w.append(gradient[0])
gradients_b.append(gradient[1])
#combine gradients and update the weights and biases
gradients_w = np.array(gradients_w,object)
gradients_b = np.array(gradients_b,object)
policy.weights += policy.learning_rate * np.flip(np.sum(gradients_w,0),0) #np.flip because the gradients are calculated backwards
policy.biases += policy.learning_rate * np.flip(np.sum(gradients_b,0),0)
#reset and record
env.reset()
if t%100==0:
print('t'+str(t),'r',sum(rewards))
What should be passed backwards to calculate the gradients? I am using gradient ascent but I could switch it to descent. Some people have defined the reward function as totalReward*log(probabilities). Would that make the score derivative totalReward*(1/probs) or log(probs) or something else? Do you use a cost function like cross entropy?
I have tried
totalReward*np.log(probs)
totalReward*(1/probs)
totalReward*(probs**2)
totalReward*probs
probs = np.zeros(a_size)
probs[action] = 1
totalRewards*probs
and a couple others. The last one is the only one that was able to solve any of them and it only worked on Cartpole. I have tested the various loss or score functions for thousands of episodes with gradient ascent and descent on Cartpole, Pendulum, and MountainCar. Sometimes it will improve a small amount but it will never solve it. What am I doing wrong?
And here is the RLPolicy code. It is not well written or pseudo coded but I don't think it is the problem because I checked it with gradient checking several times. But it would be helpful even if I could narrow it down to a problem with the neural network or somewhere else in my code.
#Neural Network
import numpy as np
import random, math, time, os
from matplotlib import pyplot as plt
def activation(x,function):
if function=='sigmoid':
return(1/(1+math.e**(-x))) #Sigmoid
if function=='relu':
x[x<0]=0
return(x)
if function=='tanh':
return(np.tanh(x.astype(float))) #tanh
if function=='softmax':
z = np.exp(np.array((x-max(x)),float))
y = np.sum(z)
return(z/y)
def activationDerivative(x,function):
if function=='sigmoid':
return(x*(1-x))
if function=='relu':
x[x<0]==0
x[x>0]==1
return(x)
if function=='tanh':
return(1-x**2)
if function=='softmax':
s = x.reshape(-1,1)
return(np.diagflat(s) - np.dot(s, s.T))
class NeuralNetwork():
def __init__ (self,layers,learning_rate,momentum,regularization,activations):
self.learning_rate = learning_rate
if (isinstance(layers[1],list)):
h = layers[1][:]
del layers[1]
for i in h:
layers.insert(-1,i)
self.layers = layers
self.weights = [2*np.random.rand(self.layers[i]*self.layers[i+1])-1 for i in range(len(self.layers)-1)]
self.biases = [2*np.random.rand(self.layers[i+1])-1 for i in range(len(self.layers)-1)]
self.weights = np.array(self.weights,object)
self.biases = np.array(self.biases,object)
self.activations = activations
def feedforward(self, input_array):
layer = input_array
neuron_outputs = [layer]
for i in range(len(self.layers)-1):
layer = np.tile(layer,self.layers[i+1])
layer = np.reshape(layer,[self.layers[i+1],self.layers[i]])
weights = np.reshape(self.weights[i],[self.layers[i+1],self.layers[i]])
layer = weights*layer
layer = np.sum(layer,1)#,self.layers[i+1]-1)
layer = layer+self.biases[i]
layer = activation(layer,self.activations[i])
neuron_outputs.append(np.array(layer,float))
return(neuron_outputs)
def neuronErrors(self,l,neurons,layerError,n_os):
if (l==len(self.layers)-2):
return(layerError)
totalErr = [] #total error
for e in range(len(layerError)): #-layers
e = e*self.layers[l+2]
a_ws = self.weights[l+1][e:e+self.layers[l+1]]
e = int(e/self.layers[l+2])
err = layerError[e]*a_ws #error
totalErr.append(err)
return(sum(totalErr))
def backpropagation(self,state,loss):
weights_gradient = [np.zeros(self.layers[i]*self.layers[i+1]) for i in range(len(self.layers)-1)]
biases_gradient = [np.zeros(self.layers[i+1]) for i in range(len(self.layers)-1)]
neuron_outputs = self.feedforward(state)
grad = self.individualBackpropagation(loss, neuron_outputs)
return(grad)
def individualBackpropagation(self, difference, neuron_outputs): #number of output
lr = self.learning_rate
n_os = neuron_outputs[:]
w_o = self.weights[:]
b_o = self.biases[:]
w_n = self.weights[:]
b_n = self.biases[:]
gradient_w = []
gradient_b = []
error = difference[:] #error for neurons
for l in range(len(self.layers)-2,-1,-1):
p_n = np.tile(n_os[l],self.layers[l+1]) #previous neuron
neurons = np.arange(self.layers[l+1])
error = (self.neuronErrors(l,neurons,error,n_os))
if not self.activations[l]=='softmax':
error = error*activationDerivative(neuron_outputs[l+1],self.activations[l])
else:
error = error @ activationDerivative(neuron_outputs[l+1],self.activations[l]) #because softmax derivative returns different dimensions
w_grad = np.repeat(error,self.layers[l]) #weights gradient
b_grad = np.ravel(error) #biases gradient
w_grad = w_grad*p_n
b_grad = b_grad
gradient_w.append(w_grad)
gradient_b.append(b_grad)
return(gradient_w,gradient_b)
Thanks for any answers, this is my first question here.
mprouveur's answer was half correct but I felt that I needed to explain the right thing to backpropagate. The answer to my question on ai.stackexchange.com was how I came to understand this. The correct error to backpropagate is the log probability of taking the action multiplied by the goal reward. This can also be calculated as the cross entropy loss between the outputted probabilities and an array of zeros with the action that was taken being one 1. Because of the derivative of cross entropy loss, this will have the effect of pushing only the probability of the action that was taken closer to one. Then, the multiplication of the total reward makes better actions get pushed more to a higher probability. So, with the label being a one-hot encoded vector, the correct equation is label/probs * totalReward
because that is the derivative of cross entropy loss and the derivative of the log of probs. I got this working in other code, but even with this equation I think something else in my code is wrong. It probably has something to do with how I made the softmax derivative too complicated instead of calculating the usual way, by combing the cross entropy derivative and softmax derivative. I will update this answer soon with correct code and more information.