python pytorch reinforcement-learning dqn

DQN model either doesn't work or is extremely slow to train


I'm trying to build a DQN model as part of my PhD work, and before I apply it to the actual data, I want to test it on dummy data.

The same approach was effective with simple Q-learning, but once I transitioned it to DQN to make it more advanced and adaptive, I started facing issues in the training phase. I also implemented GPU acceleration, but it doesn't help at all. I wonder whether it's because of the size of the dummy dataset, or something else that I can't figure out.

Any help or guidance is appreciated.

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import pandas as pd
from collections import deque

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on device:", device)

# Dummy data setup
data = {
    'message_size': np.random.randint(1000, 70000, size=1000),
    'cpu_usage': np.random.uniform(40, 100, size=1000),
    'submission_time': np.random.uniform(0, 300, size=1000)
}
dummy_data = pd.DataFrame(data)

# Parameters
MAX_BLOCK_SIZE = 32768
ALPHA = 0.1
GAMMA = 0.9
EPSILON = 1.0
EPSILON_MIN = 0.01
EPSILON_DECAY = 0.99
BATCH_SIZE = 32
EPISODES = 1000

# DQN model
class DQN(nn.Module):
    def __init__(self, input_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

# Initialize models and optimizer
dqn = DQN(input_dim=2).to(device)
target_model = DQN(input_dim=2).to(device)
target_model.load_state_dict(dqn.state_dict())
optimizer = optim.Adam(dqn.parameters(), lr=ALPHA)
memory = deque(maxlen=2000)

# Block choice function
def block_choice(state):
    if random.random() < EPSILON:
        return random.randint(1, int(state[0] // MAX_BLOCK_SIZE) + 1)
    else:
        state_tensor = torch.FloatTensor(state).to(device)
        return torch.argmax(dqn(state_tensor)).item() + 1

# Reward function based on utility
def utility_function_rewarding(total_latency, cpu_per_block, max_latency=300, max_cpu=100):
    latency_reward = max(0, 1 - (total_latency / max_latency))
    cpu_reward = max(0, 1 - (cpu_per_block / max_cpu))
    return latency_reward + cpu_reward

# Training function
def dqn_training(batch_size):
    if len(memory) < batch_size:
        return
    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)
    
    # Move data to device
    states = torch.FloatTensor(states).to(device)
    rewards = torch.FloatTensor(rewards).to(device)
    next_states = torch.FloatTensor(next_states).to(device)
    dones = torch.FloatTensor(dones).to(device)

    state_action_values = dqn(states)
    next_state_values = target_model(next_states).max(1)[0]
    expected_values = rewards + (GAMMA * next_state_values * (1 - dones))

    loss = nn.functional.mse_loss(state_action_values, expected_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Store transitions in memory
def store_transition(state, action, reward, next_state, done):
    memory.append((state, action, reward, next_state, done))

# Main training loop
for episode in range(EPISODES):
    print(f"Starting Episode {episode + 1}/{EPISODES}")
    row = dummy_data.sample().iloc[0]
    state = [row['submission_time'], row['cpu_usage']]
    total_reward = 0
    done = False

    while not done:
        action = block_choice(state)
        next_row = dummy_data.sample().iloc[0]
        next_latency = next_row['submission_time']
        next_cpu = next_row['cpu_usage'] / action
        next_state = [next_latency, next_cpu]

        reward = utility_function_rewarding(next_latency, next_cpu)
        total_reward += reward
        done = episode == EPISODES - 1
        store_transition(state, action, reward, next_state, done)
        
        state = next_state
        dqn_training(BATCH_SIZE)
    
    # Update epsilon for exploration-exploitation balance
    if EPSILON > EPSILON_MIN:
        EPSILON *= EPSILON_DECAY

    print(f"Episode {episode + 1}/{EPISODES} - Total Reward: {total_reward}")

Solution

  • Firstly, this code produces a tensor shape mismatch warning:

    UserWarning: Using a target size (torch.Size([32])) that is different to the input size (torch.Size([32, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
      loss = nn.functional.mse_loss(state_action_values, expected_values)
    

    Solution:

    Flatten state_action_values: convert it from [32, 1] to [32],

    or

    expand expected_values: convert it from [32] to [32, 1], so that both tensors passed to mse_loss have the same shape, as sketched below.
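
    A minimal sketch of both options inside dqn_training, keeping the rest of the function as posted (pick one, not both):

    # Option 1: flatten the network output so both tensors are shaped [batch_size]
    state_action_values = dqn(states).squeeze(1)        # [32, 1] -> [32]
    loss = nn.functional.mse_loss(state_action_values, expected_values)

    # Option 2: keep the output as [32, 1] and expand the target instead
    state_action_values = dqn(states)                   # [32, 1]
    expected_values = expected_values.unsqueeze(1)      # [32] -> [32, 1]
    loss = nn.functional.mse_loss(state_action_values, expected_values)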

  • Secondly, there is an infinite loop inside each episode.

    In your main training loop, the done flag is set based on the condition episode == EPISODES - 1. This means that for all episodes except the last one, done remains False, causing the inner while not done loop to run indefinitely.

    Solution:

    Implement a per-episode step limit, something like MAX_STEPS_PER_EPISODE, and use it to end the inner loop, as in the sketch below.

    MAX_STEPS_PER_EPISODE = 100  # This can be a hyperparameter to tune.
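
    Using that constant, here is a sketch of your main loop with only the termination logic changed; everything else is your code as posted:

    for episode in range(EPISODES):
        row = dummy_data.sample().iloc[0]
        state = [row['submission_time'], row['cpu_usage']]
        total_reward = 0

        for step in range(MAX_STEPS_PER_EPISODE):
            action = block_choice(state)
            next_row = dummy_data.sample().iloc[0]
            next_latency = next_row['submission_time']
            next_cpu = next_row['cpu_usage'] / action
            next_state = [next_latency, next_cpu]

            reward = utility_function_rewarding(next_latency, next_cpu)
            total_reward += reward
            # Mark the transition as terminal when the step cap is reached,
            # instead of tying "done" to the episode counter.
            done = step == MAX_STEPS_PER_EPISODE - 1
            store_transition(state, action, reward, next_state, done)

            state = next_state
            dqn_training(BATCH_SIZE)

        # Update epsilon for exploration-exploitation balance
        if EPSILON > EPSILON_MIN:
            EPSILON *= EPSILON_DECAY

        print(f"Episode {episode + 1}/{EPISODES} - Total Reward: {total_reward}")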