python pytorch reinforcement-learning dqn

Using DQN to solve the shortest-path problem


I'm trying to find out whether DQN can solve the shortest-path problem. I have a DataFrame with three columns: `source` (the node id where an edge starts), `end` (the node id of the edge's destination), and `weight` (the length of the edge). I converted this DataFrame into a graph, as follows:

DataFrame

    source  end weight
0   688615041   208456626   15.653688122127072
1   688615041   1799221665  10.092266065922756
2   1799221657  1799221660  8.673942902872051
3   1799221660  1799221665  15.282152665774992
4   1799221660  2003461246  25.85307821157314
5   1799221660  299832604   75.99884525624508
6   299832606   2003461227  4.510148061854331
7   299832606   2003461246  10.954119220974723
8   299832606   2364408910  4.903114362426424
9   1731824802  2003461235  6.812335798968233
10  1799221677  208456626   8.308567154008992
11  208456626   2003461246  14.56512909988425
12  208456626   1250468692  16.416527267975034
13  1011881546  1250468696  12.209773608913697
14  1011881546  2003461246  7.477102764665149
15  2364408910  1130166767  9.780352545373274
16  2364408910  2003461246  6.660771089602594
17  2364408910  2003461237  3.125301826317477
18  2364408911  2003461240  3.836966849565568
19  2364408911  2003461246  6.137847950353395
20  2364408911  2003461247  7.399469477211698
21  2364408911  2003461237  3.90876793066916
22  1250468692  1250468696  8.474825189804282
23  1250468701  2003461247  4.539111170687284
24  2003461235  2003461246  12.400601105777394
25  2003461246  2003461247  12.437602668573737

The graph is drawn like this:

# Edge weights are collected first so they can be drawn as labels on the edges.
edge_labels = nx.get_edge_attributes(g, 'weight')
# One layout is computed and shared by every drawing call so that nodes,
# node labels and edge labels line up.
pos = nx.spring_layout(g)
nx.draw(g, pos=pos, node_size=100)
nx.draw_networkx_edge_labels(g, pos, edge_labels=edge_labels, font_size=8)
nx.draw_networkx_labels(g, pos, font_size=10)
plt.title("Syntethic representation of the City")
plt.show()
print(f'Total number of Nodes: {len(g.nodes)}')

graph

I then trained a DQN on a single fixed task: node 1130166767 is the start and node 1731824802 is the goal.

This is my complete code:

class Network(nn.Module):
  """Fully connected Q-network: maps a state vector to one Q-value per action.

  Layers are input_dim -> 128 -> 64 -> 32 -> n_action with ReLU between
  them and no activation on the output.

  Args:
    input_dim: length of the input feature vector.
    n_action: number of outputs (one Q-value per action).
  """

  def __init__(self,input_dim,n_action):
    super(Network,self).__init__()
    self.f1=nn.Linear(input_dim,128)
    self.f2=nn.Linear(128,64)
    self.f3=nn.Linear(64,32)
    self.f4=nn.Linear(32,n_action)
    # The module is moved to the GPU when one is available.
    self.device=T.device('cuda' if T.cuda.is_available() else 'cpu')
    self.to(self.device)

  def forward(self,x):
    """Return the Q-values for input tensor x (last dimension = input_dim)."""
    x=F.relu(self.f1(x))
    x=F.relu(self.f2(x))
    x=F.relu(self.f3(x))
    x=self.f4(x)
    return x

  def act(self,obs):
    """Return the index (Python int) of the largest Q-value for obs.

    obs is a tensor; it is moved to the network's device before the
    forward pass. The argmax is taken over the flattened output.
    """
    state=obs.to(self.device)
    # Action selection is pure inference, so no autograd graph is built.
    with T.no_grad():
      actions=self.forward(state)
    action=T.argmax(actions).item()

    return action

# Run on the GPU when available, otherwise on the CPU.
device=T.device('cuda') if T.cuda.is_available() else T.device('cpu')
print(device)

# One action per node: an action is "move to node with this index".
num_actions = len(g.nodes)
# Multiply by k to train k different goal nodes (e.g. *10 for ten goals).
num_states = len(g.nodes)*1
print(f"Expected number of States are: {num_states}")
print(f"Expected number of action are: {num_actions}")

# The network input is two concatenated one-hot vectors (current node and
# goal node), hence num_actions*2 input features.
online=Network(num_actions*2,num_actions)
target=Network(num_actions*2,num_actions)
# The target network starts as an exact copy of the online network.
target.load_state_dict(online.state_dict())
optimizer=T.optim.Adam(online.parameters(),lr=5e-4)

# Node ids are large integers (e.g. 1130166767), so each node is mapped to a
# small consecutive index; dec_node is the inverse mapping of enc_node.
dec_node=dict(enumerate(g.nodes))
enc_node={node_id:position for position,node_id in dec_node.items()}

def wayenc(current_node,new_node,type=1):
  """Return the reward for moving from current_node to new_node in graph g.

  Args:
    current_node: node id (as stored in g) the agent is currently on.
    new_node: node id the agent tries to move to.
    type: reward scheme selector; only 1 (negative edge distance) exists.

  Returns:
    (reward, linked). When g has an edge current_node -> new_node the reward
    is minus that edge's 'weight' and linked is True; otherwise the reward
    is -5000 and linked is False.

  Raises:
    ValueError: if type is not 1.
  """
  if type!=1:
    # Fail fast: falling through would return None and break every caller
    # that unpacks the (reward, linked) pair.
    raise ValueError('unsupported reward type: %r' % (type,))
  if new_node in g[current_node]:
    rw=g[current_node][new_node]['weight']*-1
    return rw,True
  rw=-5000
  return rw,False

def rw_function(current,action):
  """Reward for taking `action` (index of the node to move to) from `current`.

  Both arguments are encoded node indices; they are translated back to node
  ids through dec_node before wayenc is asked for the distance reward.

  Returns:
    (reward, link) where link is the flag returned by wayenc.
  """
  beta=1 #weight of the distance term, between 0 and 1
  secondary_reward=0 #placeholder for a second reward term
  distance_reward,link=wayenc(dec_node[current],dec_node[action])
  blended=distance_reward*beta+(1-beta)*secondary_reward
  return blended,link

def state_enc(dst, end, n=None):
    """Pack (current node index, goal node index) into one integer state.

    The state is dst + n*end, i.e. `end` selects a block of n consecutive
    states and `dst` the position inside it.

    Args:
        dst: index of the current node.
        end: index of the goal node.
        n: number of nodes; defaults to len(g.nodes), looked up at call time
            so the encoding follows the graph if it is rebuilt.
    """
    if n is None:
        n = len(g.nodes)
    return dst + n*end

def state_dec(state, n=None):
    """Unpack a state produced by state_enc into (dst, end).

    Args:
        state: encoded state (dst + n*end); converted with int() first.
        n: number of nodes; defaults to len(g.nodes), looked up at call time.

    Returns:
        (dst, end) as Python ints: current node index and goal node index.
    """
    if n is None:
        n = len(g.nodes)
    # Integer divmod keeps the result exact; true division followed by int()
    # loses precision once the state no longer fits a float mantissa.
    end, dst = divmod(int(state), n)
    return dst, end

def step(state,action):
    """Apply `action` (index of the node to move to) to the encoded `state`.

    Returns:
        (new_state, reward, done).
        * No edge to the chosen node: the state is unchanged, the reward is
          the penalty from rw_function and done is False.
        * The chosen node is the goal: the agent moves, reward is 10000 and
          done is True.
        * Otherwise: the agent moves and receives the reward from
          rw_function; done is False.
    """
    current_node,goal=state_dec(state)
    reward,linked=rw_function(current_node,action)

    # Invalid move: stay where we are.
    if not linked:
        return state,reward,False

    moved_state=state_enc(action,goal)
    if action==goal:
        return moved_state,10000,True
    return moved_state,reward,False

def reset():
  """Return the encoded initial state: start node 1130166767, goal node 1731824802."""
  start_index=enc_node[1130166767]
  goal_index=enc_node[1731824802]
  return state_enc(start_index,goal_index)

def state_to_vector(current_node,end_node):
  """Return two concatenated one-hot lists of length len(g.nodes) each.

  The first half marks current_node, the second half marks end_node, so the
  result has 2*len(g.nodes) entries.
  """
  n=len(g.nodes)
  halves=[]
  for hot_index,hot_value in ((current_node,1),(end_node,1.)):
    one_hot=[0.]*n
    one_hot[hot_index]=hot_value
    halves.append(one_hot)
  return halves[0]+halves[1]
    

def list_of_vecotrs(new_obses_t):
  """Convert a batch of encoded states into a list of one-hot vectors.

  new_obses_t must offer .tolist(); every element is cast to int, decoded
  with state_dec and turned into a vector with state_to_vector.
  """
  encoded_states=[int(raw) for raw in new_obses_t.tolist()]
  return [state_to_vector(*state_dec(encoded)) for encoded in encoded_states]

#bookkeeping and hyper-parameters for the replay buffer
rew_buffer=[0]
penalties=[]
episode_reward=0.0
batch_size=num_actions*2
buffer_size=100000
min_replay_size=int(buffer_size*0.20)
target_update_freq=1000
flag=0
#every node index is a possible action
action_list=list(range(len(g.nodes)))
#oldest transitions are dropped once buffer_size is reached
replay_buffer=deque(maxlen=buffer_size)


#warm-up: fill the buffer with transitions from uniformly random actions
obs=reset()
for _ in tqdm(range(min_replay_size)):
  action=np.random.choice(action_list)
  new_obs,rew,done=step(obs,action)
  replay_buffer.append((obs,action,rew,done,new_obs))
  #start again from the initial state once the goal has been reached
  obs=reset() if done else new_obs

#main training loop
obs=reset()
episodes=100000 #number of loop iterations; every iteration is ONE environment step
start=1
end=0.1
decay=episodes
gamma=0.99
epsilon=0.5



gamma_list=[]
mean_reward=[]
done_location=[]
loss_list=[]
number_of_episodes=[]
stat_dict={'episodes':[],'epsilon':[],'explore_exploit':[],'time':[]}
#dec only holds references to the lists above, so it is built once
dec = {'number_of_episodes':number_of_episodes,'mean_reward':mean_reward,'gamma':gamma_list}


for i in tqdm(range(episodes)):
  #exploration rate decays exponentially with the step counter
  epsilon=np.exp(-i/(episodes/3))
  rnd_sample=random.random()

  stat_dict['episodes'].append(i)
  stat_dict['epsilon'].append(epsilon)

  #epsilon-greedy action selection
  if rnd_sample <=epsilon:
    action=np.random.choice(action_list)
    stat_dict['explore_exploit'].append('explore')

  else:
    #called goal (not end) so the schedule constant `end` is not overwritten
    source,goal=state_dec(obs)
    v_obs=state_to_vector(source,goal)
    t_obs=T.tensor(v_obs)
    action=online.act(t_obs)
    stat_dict['explore_exploit'].append('exploit')

  #take the step and store the transition
  new_obs,rew,done=step(obs,action)

  transition=(obs,action,rew,done,new_obs)
  replay_buffer.append(transition)
  obs=new_obs
  episode_reward+=rew


  if done:
    #goal reached: record the return and restart from the initial state
    obs=reset()
    rew_buffer.append(episode_reward)
    episode_reward=0.0
    done_location.append(i)


  #gradient step on a random mini-batch
  transitions=random.sample(replay_buffer,batch_size)

  obses=np.asarray([t[0] for t in transitions])
  actions=np.asarray([t[1] for t in transitions])
  rews=np.asarray([t[2] for t in transitions])
  dones=np.asarray([t[3] for t in transitions])
  new_obses=np.asarray([t[4] for t in transitions])


  #state ids stay integers: a float32 round trip is only exact up to 2**24
  obses_t=T.as_tensor(obses,dtype=T.int64).to(device)
  actions_t=T.as_tensor(actions,dtype=T.int64).to(device).unsqueeze(-1)
  rews_t=T.as_tensor(rews,dtype=T.float32).to(device)
  dones_t=T.as_tensor(dones,dtype=T.float32).to(device)
  new_obses_t=T.as_tensor(new_obses,dtype=T.int64).to(device)


  #bootstrapped target r + gamma*(1-done)*max_a Q_target(s',a); the target
  #network is not trained here, so no autograd graph is recorded for it
  with T.no_grad():
    list_new_obses_t=T.tensor(list_of_vecotrs(new_obses_t)).to(device)
    target_q_values=target(list_new_obses_t)
    max_target_q_values=target_q_values.max(dim=1,keepdim=False)[0]
    targets=rews_t+gamma*(1-dones_t)*max_target_q_values
    #targets must have the same (batch,1) shape as action_q_values; without
    #this mse_loss warns about mismatched sizes and broadcasts the operands
    targets=targets.unsqueeze(-1)


  #Q-value of the action that was actually taken in each sampled state
  list_obses_t=T.tensor(list_of_vecotrs(obses_t)).to(device)
  q_values=online(list_obses_t)
  action_q_values=T.gather(input=q_values,dim=1,index=actions_t)


  loss=nn.functional.mse_loss(action_q_values,targets)
  loss_list.append(loss.item())

  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  #statistics for plotting
  mean_reward.append(np.mean(rew_buffer))
  number_of_episodes.append(i)
  gamma_list.append(gamma)


  #periodically copy the online weights into the target network
  if i % target_update_freq==0:
    target.load_state_dict(online.state_dict())
  if i % 1000 ==0:
    print('step',i,'avg rew',round(np.mean(rew_buffer),2))

As you can see from the plots,

neither are the rewards increasing nor is the loss decreasing. I tried the following:

  1. increasing and decreasing the learning rate

  2. trying different values for target_update_freq (100, 1000)

  3. changing the state representation from a one-hot vector to a [state, end] pair

  4. changing the loss function (mse_loss, smooth_l1, etc.)

  5. increasing the number of episodes

  6. adding another layer to the neural network

  7. changing how epsilon decays (linear, exponential)

Most of these suggestions come from questions on Stack Overflow, but none of them worked for me.

How can I improve the performance? In other words, how can I increase the rewards?


Solution

  • It seems your problem only needs hyperparameter tuning:

    1. I changed your learning rate to 0.01 (`lr=1e-2` in the code below).
    2. I changed the dimensions of the state that is sent to the NN: it is now passed as a batch of one (`T.tensor([v_obs])`).
    class Network(nn.Module):
      """Fully connected Q-network: maps a state vector to one Q-value per action.

      Layers are input_dim -> 128 -> 64 -> 32 -> n_action with ReLU between
      them and no activation on the output.

      Args:
        input_dim: length of the input feature vector.
        n_action: number of outputs (one Q-value per action).
      """

      def __init__(self,input_dim,n_action):
        super(Network,self).__init__()
        self.f1=nn.Linear(input_dim,128)
        self.f2=nn.Linear(128,64)
        self.f3=nn.Linear(64,32)
        self.f4=nn.Linear(32,n_action)
        # The module is moved to the GPU when one is available.
        self.device=T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

      def forward(self,x):
        """Return the Q-values for input tensor x (last dimension = input_dim)."""
        x=F.relu(self.f1(x))
        x=F.relu(self.f2(x))
        x=F.relu(self.f3(x))
        x=self.f4(x)
        return x

      def act(self,obs):
        """Return the index (Python int) of the largest Q-value for obs.

        obs is a tensor; it is moved to the network's device before the
        forward pass. The argmax is taken over the flattened output.
        """
        state=obs.to(self.device)
        # Action selection is pure inference, so no autograd graph is built.
        with T.no_grad():
          actions=self.forward(state)
        action=T.argmax(actions).item()

        return action
    
    # Run on the GPU when available, otherwise on the CPU.
    device=T.device('cuda') if T.cuda.is_available() else T.device('cpu')
    print(device)
    # One action per node: an action is "move to node with this index".
    num_actions = len(g.nodes)
    num_states = len(g.nodes)**2
    # The network input is two concatenated one-hot vectors (current node
    # and goal node), hence num_actions*2 input features.
    online=Network(num_actions*2,num_actions)
    target=Network(num_actions*2,num_actions)
    # The target network starts as an exact copy of the online network.
    target.load_state_dict(online.state_dict())
    optimizer=T.optim.Adam(online.parameters(),lr=1e-2)
    # Each node id is mapped to a small consecutive index; dec_node is the
    # inverse mapping of enc_node.
    dec_node=dict(enumerate(g.nodes))
    enc_node={node_id:position for position,node_id in dec_node.items()}
    
    def wayenc(current_node,new_node,type=1):
      """Return the reward for moving from current_node to new_node in graph g.

      Args:
        current_node: node id (as stored in g) the agent is currently on.
        new_node: node id the agent tries to move to.
        type: reward scheme selector; only 1 (negative edge distance) exists.

      Returns:
        (reward, linked). When g has an edge current_node -> new_node the
        reward is minus that edge's 'weight' and linked is True; otherwise
        the reward is -1000 and linked is False.

      Raises:
        ValueError: if type is not 1.
      """
      if type!=1:
        # Fail fast: falling through would return None and break every
        # caller that unpacks the (reward, linked) pair.
        raise ValueError('unsupported reward type: %r' % (type,))
      if new_node in g[current_node]:
        rw=g[current_node][new_node]['weight']*-1
        return rw,True
      rw=-1000
      return rw,False
    def rw_function(current,action):
      """Reward for taking `action` (index of the node to move to) from `current`.

      Both arguments are encoded node indices; they are translated back to
      node ids through dec_node before wayenc is asked for the distance reward.

      Returns:
        (reward, link) where link is the flag returned by wayenc.
      """
      beta=1 #weight of the distance term
      secondary_reward=0 #placeholder for a second reward term
      distance_reward,link=wayenc(dec_node[current],dec_node[action])
      blended=distance_reward*beta+(1-beta)*secondary_reward
      return blended,link
    
    def state_enc(dst, end, n=None):
        """Pack (current node index, goal node index) into one integer state.

        The state is dst + n*end, i.e. `end` selects a block of n consecutive
        states and `dst` the position inside it.

        Args:
            dst: index of the current node.
            end: index of the goal node.
            n: number of nodes; defaults to len(g.nodes), looked up at call
                time so the encoding follows the graph if it is rebuilt.
        """
        if n is None:
            n = len(g.nodes)
        return dst + n*end
    
    def state_dec(state, n=None):
        """Unpack a state produced by state_enc into (dst, end).

        Args:
            state: encoded state (dst + n*end); converted with int() first.
            n: number of nodes; defaults to len(g.nodes), looked up at call time.

        Returns:
            (dst, end) as Python ints: current node index and goal node index.
        """
        if n is None:
            n = len(g.nodes)
        # Integer divmod keeps the result exact; true division followed by
        # int() loses precision once the state no longer fits a float mantissa.
        end, dst = divmod(int(state), n)
        return dst, end
    
    def step(state,action):
        """Apply `action` (index of the node to move to) to the encoded `state`.

        Returns:
            (new_state, reward, done).
            * No edge to the chosen node: the state is unchanged, the reward
              is the penalty from rw_function and done is False.
            * The chosen node is the goal: the agent moves, reward is 10000
              and done is True.
            * Otherwise: the agent moves and receives the reward from
              rw_function; done is False.
        """
        current_node,goal=state_dec(state)
        reward,linked=rw_function(current_node,action)

        # Invalid move: stay where we are.
        if not linked:
            return state,reward,False

        moved_state=state_enc(action,goal)
        if action==goal:
            return moved_state,10000,True
        return moved_state,reward,False
    
    def reset():
      """Return the encoded initial state: start node 1130166767, goal node 1731824802."""
      start_index=enc_node[1130166767]
      goal_index=enc_node[1731824802]
      return state_enc(start_index,goal_index)
    
    def state_to_vector(current_node,end_node):
      """Return two concatenated one-hot lists of length len(g.nodes) each.

      The first half marks current_node, the second half marks end_node, so
      the result has 2*len(g.nodes) entries.
      """
      n=len(g.nodes)
      halves=[]
      for hot_index,hot_value in ((current_node,1),(end_node,1.)):
        one_hot=[0.]*n
        one_hot[hot_index]=hot_value
        halves.append(one_hot)
      return halves[0]+halves[1]
    
    
        
    
    def list_of_vecotrs(new_obses_t):
      """Convert a batch of encoded states into a list of one-hot vectors.

      new_obses_t must offer .tolist(); every element is cast to int, decoded
      with state_dec and turned into a vector with state_to_vector.
      """
      encoded_states=[int(raw) for raw in new_obses_t.tolist()]
      return [state_to_vector(*state_dec(encoded)) for encoded in encoded_states]
    
      #replay_buffer=[]
    #bookkeeping and hyper-parameters for the replay buffer
    rew_buffer=[0]
    penalties=[]
    episode_reward=0.0
    batch_size=32
    buffer_size=50000 
    min_replay_size=int(buffer_size*0.25)

    target_update_freq=1000
    flag=0
    #every node index is a possible action
    action_list=list(range(len(g.nodes)))
    #NOTE(review): the buffer is capped at min_replay_size, so buffer_size is
    #never used as the capacity - confirm whether maxlen=buffer_size was meant
    replay_buffer=deque(maxlen=min_replay_size)


    #warm-up: fill the buffer with transitions from uniformly random actions
    obs=reset()
    for _ in tqdm(range(min_replay_size)):
      action=np.random.choice(action_list)
      new_obs,rew,done=step(obs,action)
      replay_buffer.append((obs,action,rew,done,new_obs))
      #start again from the initial state once the goal has been reached
      obs=reset() if done else new_obs
    
    #main training loop
    obs=reset()
    episodes=70000 #number of loop iterations; every iteration is ONE environment step
    start=1
    end=0.1
    decay=episodes
    gamma=0.99
    epsilon=0.5
    #mini-batch size, set once here instead of on every iteration
    batch_size=32



    gamma_list=[]
    mean_reward=[]
    done_location=[]
    loss_list=[]
    number_of_episodes=[]
    stat_dict={'episodes':[],'epsilon':[],'explore_exploit':[],'time':[]}
    #dec only holds references to the lists above, so it is built once
    dec = {'number_of_episodes':number_of_episodes,'mean_reward':mean_reward,'gamma':gamma_list}


    for i in tqdm(range(episodes)):
      #exploration rate decays exponentially with the step counter
      epsilon=np.exp(-i/(episodes/2))
      rnd_sample=random.random()

      stat_dict['episodes'].append(i)
      stat_dict['epsilon'].append(epsilon)

      #epsilon-greedy action selection
      if rnd_sample <=epsilon:
        action=np.random.choice(action_list)
        stat_dict['explore_exploit'].append('explore')

      else:
        #called goal (not end) so the schedule constant `end` is not overwritten
        source,goal=state_dec(obs)
        v_obs=state_to_vector(source,goal)
        #the state is passed as a batch containing a single vector
        t_obs=T.tensor([v_obs])
        action=online.act(t_obs)
        stat_dict['explore_exploit'].append('exploit')


      #take the step and store the transition
      new_obs,rew,done=step(obs,action)

      transition=(obs,action,rew,done,new_obs)
      replay_buffer.append(transition)
      obs=new_obs
      episode_reward+=rew


      if done:
        #goal reached: record the return and restart from the initial state
        obs=reset()
        rew_buffer.append(episode_reward)
        episode_reward=0.0
        done_location.append(i)


      #gradient step on a random mini-batch
      transitions=random.sample(replay_buffer,batch_size)

      obses=np.asarray([t[0] for t in transitions])
      actions=np.asarray([t[1] for t in transitions])
      rews=np.asarray([t[2] for t in transitions])
      dones=np.asarray([t[3] for t in transitions])
      new_obses=np.asarray([t[4] for t in transitions])


      #state ids stay integers: a float32 round trip is only exact up to 2**24
      obses_t=T.as_tensor(obses,dtype=T.int64).to(device)
      actions_t=T.as_tensor(actions,dtype=T.int64).to(device).unsqueeze(-1)
      rews_t=T.as_tensor(rews,dtype=T.float32).to(device)
      dones_t=T.as_tensor(dones,dtype=T.float32).to(device)
      new_obses_t=T.as_tensor(new_obses,dtype=T.int64).to(device)


      #bootstrapped target r + gamma*(1-done)*max_a Q_target(s',a); the target
      #network is not trained here, so no autograd graph is recorded for it
      with T.no_grad():
        list_new_obses_t=T.tensor(list_of_vecotrs(new_obses_t)).to(device)
        target_q_values=target(list_new_obses_t)
        max_target_q_values=target_q_values.max(dim=1,keepdim=False)[0]
        targets=rews_t+gamma*(1-dones_t)*max_target_q_values
        #same (batch,1) shape as action_q_values so mse_loss does not broadcast
        targets=targets.unsqueeze(-1)

      #Q-value of the action that was actually taken in each sampled state
      list_obses_t=T.tensor(list_of_vecotrs(obses_t)).to(device)
      q_values=online(list_obses_t)
      action_q_values=T.gather(input=q_values,dim=1,index=actions_t)


      loss=nn.functional.mse_loss(action_q_values,targets)
      loss_list.append(loss.item())

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      #statistics for plotting
      mean_reward.append(np.mean(rew_buffer))
      number_of_episodes.append(i)
      gamma_list.append(gamma)


      #periodically copy the online weights into the target network
      if i % target_update_freq==0:
        target.load_state_dict(online.state_dict())
      if i % 1000 ==0:
        print('step',i,'avg rew',round(np.mean(rew_buffer),2))
    
    

    I ran this script and it gave good performance; changing the learning rate helped a lot.

    enter image description here