I'm trying to build a DQN model as part of my PhD work, and before I run it on the actual real data, I want to test it on dummy data.
The same process worked well with simple Q-learning, but once I transitioned it to DQN to make it more advanced and adaptive, I started facing issues in the training phase. I also implemented GPU acceleration, but it doesn't help at all. I wonder if it's because of the size of the dummy dataset, or something else that I can't figure out.
Any help or guidance is appreciated.
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import pandas as pd
from collections import deque
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on device:", device)
# Dummy data setup
data = {
    'message_size': np.random.randint(1000, 70000, size=1000),
    'cpu_usage': np.random.uniform(40, 100, size=1000),
    'submission_time': np.random.uniform(0, 300, size=1000)
}
dummy_data = pd.DataFrame(data)
# Parameters
MAX_BLOCK_SIZE = 32768
ALPHA = 0.1
GAMMA = 0.9
EPSILON = 1.0
EPSILON_MIN = 0.01
EPSILON_DECAY = 0.99
BATCH_SIZE = 32
EPISODES = 1000
# DQN model
class DQN(nn.Module):
    def __init__(self, input_dim):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.fc3(x)
# Initialize models and optimizer
dqn = DQN(input_dim=2).to(device)
target_model = DQN(input_dim=2).to(device)
target_model.load_state_dict(dqn.state_dict())
optimizer = optim.Adam(dqn.parameters(), lr=ALPHA)
memory = deque(maxlen=2000)
# Block choice function
def block_choice(state):
    if random.random() < EPSILON:
        return random.randint(1, int(state[0] // MAX_BLOCK_SIZE) + 1)
    else:
        state_tensor = torch.FloatTensor(state).to(device)
        return torch.argmax(dqn(state_tensor)).item() + 1
# Reward function based on utility
def utility_function_rewarding(total_latency, cpu_per_block, max_latency=300, max_cpu=100):
    latency_reward = max(0, 1 - (total_latency / max_latency))
    cpu_reward = max(0, 1 - (cpu_per_block / max_cpu))
    return latency_reward + cpu_reward
# Training function
def dqn_training(batch_size):
    if len(memory) < batch_size:
        return
    batch = random.sample(memory, batch_size)
    states, actions, rewards, next_states, dones = zip(*batch)
    # Move data to device
    states = torch.FloatTensor(states).to(device)
    rewards = torch.FloatTensor(rewards).to(device)
    next_states = torch.FloatTensor(next_states).to(device)
    dones = torch.FloatTensor(dones).to(device)
    state_action_values = dqn(states)
    next_state_values = target_model(next_states).max(1)[0]
    expected_values = rewards + (GAMMA * next_state_values * (1 - dones))
    loss = nn.functional.mse_loss(state_action_values, expected_values)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
# Store transitions in memory
def store_transition(state, action, reward, next_state, done):
    memory.append((state, action, reward, next_state, done))
# Main training loop
for episode in range(EPISODES):
print(f"Starting Episode {episode + 1}/{EPISODES}")
row = dummy_data.sample().iloc[0]
state = [row['submission_time'], row['cpu_usage']]
total_reward = 0
done = False
while not done:
action = block_choice(state)
next_row = dummy_data.sample().iloc[0]
next_latency = next_row['submission_time']
next_cpu = next_row['cpu_usage'] / action
next_state = [next_latency, next_cpu]
reward = utility_function_rewarding(next_latency, next_cpu)
total_reward += reward
done = episode == EPISODES - 1
store_transition(state, action, reward, next_state, done)
state = next_state
dqn_training(BATCH_SIZE)
# Update epsilon for exploration-exploitation balance
if EPSILON > EPSILON_MIN:
EPSILON *= EPSILON_DECAY
print(f"Episode {episode + 1}/{EPISODES} - Total Reward: {total_reward}")
Firstly, this code produces a tensor shape mismatch warning:
UserWarning: Using a target size (torch.Size([32])) that is different to the input size (torch.Size([32, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
loss = nn.functional.mse_loss(state_action_values, expected_values)
Solution: either flatten state_action_values (convert it from [32, 1] to [32]), or expand expected_values (convert it from [32] to [32, 1]), so that both tensors have the same shape before the MSE loss is computed.
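For example, applying the first option inside dqn_training could look like this (a minimal sketch; only the loss computation changes, the rest of the function stays as in your code):

# Inside dqn_training, after building the batch tensors:
state_action_values = dqn(states).squeeze(1)                            # [32, 1] -> [32]
next_state_values = target_model(next_states).max(1)[0]                 # [32]
expected_values = rewards + (GAMMA * next_state_values * (1 - dones))   # [32]
loss = nn.functional.mse_loss(state_action_values, expected_values)     # shapes now match

Alternatively, for the second option, expected_values.unsqueeze(1) gives you the [32, 1] shape that matches the network output directly.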
Secondly, there is an infinite loop in the episodes. In your main training loop, the done flag is set based on the condition episode == EPISODES - 1. This means that for all episodes except the last one, done remains False, causing the inner while not done loop to run indefinitely.
Solution: implement a per-episode step limit such as MAX_STEPS_PER_EPISODE and end the episode once it is reached.
MAX_STEPS_PER_EPISODE = 100  # This can be a hyperparameter to tune.
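A sketch of how the inner loop could use it, reusing your existing loop body (step is a new name introduced here for illustration):

    for step in range(MAX_STEPS_PER_EPISODE):
        action = block_choice(state)
        next_row = dummy_data.sample().iloc[0]
        next_latency = next_row['submission_time']
        next_cpu = next_row['cpu_usage'] / action
        next_state = [next_latency, next_cpu]
        reward = utility_function_rewarding(next_latency, next_cpu)
        total_reward += reward
        done = step == MAX_STEPS_PER_EPISODE - 1  # episode ends after a fixed number of steps
        store_transition(state, action, reward, next_state, done)
        state = next_state
        dqn_training(BATCH_SIZE)

With this change each episode performs a bounded number of environment steps, so the outer loop can actually progress through all EPISODES.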