I want to build a model (an RNN) on image sequences to predict a binary variable. I have seen many tutorials on multi-input classification, but I could not find anywhere how to import my data. I have a folder containing one sub-folder per image sequence, like this:
-- film_1
-------------film_1_image1
-------------film_1_image2
-------------film_1_image3
-------------film_1_image4
-- film_2
-------------film_2_image1
-------------film_2_image2
-------------film_2_image3
-------------film_2_image4
I know that I need data with the shape (batch, time, width, height), but I don't know how to build it. I searched for a solution in Keras
and PyTorch,
but I didn't find anything.
I would like to import my films, split the dataset in train/validation and train a model.
Does anyone know how to import data like that?
I tried to use a DataLoader,
but it is quite difficult to understand how it works. If anyone can help, that would be great.
Here is a PyTorch example of training an RNN on image sequences. First, the imports and a dataset class that loads one sequence per folder:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
class ImageSequenceDataset(Dataset):
    """Dataset that yields one image sequence (one sub-folder) per item.

    Expected layout: ``root_dir`` contains one sub-directory per sequence,
    and each sub-directory contains the frames of that sequence as image
    files. Entries are ordered naturally, so ``image2`` comes before
    ``image10`` and the temporal order of the frames is preserved.

    Args:
        root_dir: Directory holding one sub-directory per sequence.
        transform: Optional callable applied to every frame (a PIL image in
            RGB mode). It has to return a tensor, because the frames of a
            sequence are combined with ``torch.stack``.

    Note:
        ``torch.stack`` requires every frame of a sequence to have the same
        shape after ``transform``. Batching with the default ``DataLoader``
        collation additionally needs all sequences to have the same number
        of frames.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # Only real, non-hidden sub-directories are sequences; stray files
        # such as ".DS_Store" or a README must not become dataset items.
        self.sequences = self._list_entries(root_dir, os.path.isdir)

    @staticmethod
    def _natural_key(name):
        """Return a sort key that orders embedded numbers numerically."""
        import re

        # re.split with a capturing group alternates text / digits / text...,
        # so odd positions are always digit runs and types line up when two
        # keys are compared element by element.
        parts = re.split(r"(\d+)", name)
        numeric_aware = [int(p) if i % 2 else p for i, p in enumerate(parts)]
        # The raw name breaks ties such as "img01" vs "img1" deterministically.
        return numeric_aware, name

    @classmethod
    def _list_entries(cls, directory, predicate):
        """List non-hidden entries of *directory* accepted by *predicate*, naturally sorted."""
        names = [
            name
            for name in os.listdir(directory)
            if not name.startswith(".")
            and predicate(os.path.join(directory, name))
        ]
        return sorted(names, key=cls._natural_key)

    def __len__(self):
        """Return the number of sequences (sub-directories) found."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Load sequence *idx* and return ``(image_sequence, label)``.

        Returns:
            A tuple of the stacked frames (first dimension is time) and the
            label produced by :meth:`get_label`.

        Raises:
            ValueError: If the sequence directory contains no frame files.
        """
        sequence_dir = os.path.join(self.root_dir, self.sequences[idx])
        frame_names = self._list_entries(sequence_dir, os.path.isfile)
        if not frame_names:
            # Fail with a clear message instead of an obscure torch.stack error.
            raise ValueError(
                f"No image files found in sequence directory: {sequence_dir}"
            )

        frames = []
        for frame_name in frame_names:
            frame_path = os.path.join(sequence_dir, frame_name)
            # convert() returns a fully loaded copy, so the file handle can
            # be released right away instead of lingering until GC.
            with Image.open(frame_path) as raw_image:
                frame = raw_image.convert('RGB')
            if self.transform:
                frame = self.transform(frame)
            frames.append(frame)

        # Convert the list of frames into a single tensor (time first).
        image_sequence = torch.stack(frames)
        # Example: replace with your own method to get the label.
        label = self.get_label(sequence_dir)
        return image_sequence, label

    def get_label(self, sequence_dir):
        """Return the binary label (0 or 1) for a sequence directory.

        This is a placeholder: replace it with your real label logic (for
        example a lookup in a CSV file). It returns 1 when the sequence
        folder's own name contains "positive", otherwise 0. Only the last
        path component is inspected so that a parent directory that happens
        to contain "positive" does not label every sequence as 1.
        """
        folder_name = os.path.basename(os.path.normpath(sequence_dir))
        return 1 if "positive" in folder_name else 0
Then define the transforms that are applied to every frame:
# Per-frame preprocessing: resize, convert to a tensor, then normalise each
# of the three channels with mean 0.5 and std 0.5.
frame_size = (128, 128)
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)

preprocessing_steps = [
    transforms.Resize(frame_size),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
]
transform = transforms.Compose(preprocessing_steps)
Create a dataset and a data loader:
# Assuming the data is located in "path/to/your/data": one sub-folder per
# sequence, each holding the frames of that sequence.
dataset = ImageSequenceDataset(
    root_dir="path/to/your/data",
    transform=transform,
)

# Shuffled mini-batches of 8 sequences, loaded by 4 worker processes.
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    num_workers=4,
)
Now you can set up your RNN model, for example:
class RNNModel(nn.Module):
    """CNN + LSTM binary classifier for image sequences.

    Every frame is encoded by a small three-stage CNN, the per-frame
    features are fed to an LSTM, and the LSTM output at the last time step
    is mapped to a single probability with a linear layer and a sigmoid.

    Args:
        image_size: Frame size the model will be used with, either an int
            (square frames) or a ``(height, width)`` pair. It determines the
            LSTM input size. Defaults to 128, i.e. 128x128 frames.
        hidden_size: Number of LSTM hidden units. Defaults to 128.
        num_layers: Number of stacked LSTM layers. Defaults to 1.

    Raises:
        ValueError: If ``image_size`` is too small to survive the three
            2x2 poolings (smaller than 8 pixels in either dimension).
    """

    def __init__(self, image_size=128, hidden_size=128, num_layers=1):
        super().__init__()
        if isinstance(image_size, int):
            height = width = image_size
        else:
            height, width = image_size

        # Example CNN to process each frame.
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # The convolutions keep the spatial size (3x3, padding 1); each of
        # the three poolings halves it (rounding down), hence the // 8.
        feature_height, feature_width = height // 8, width // 8
        if feature_height < 1 or feature_width < 1:
            raise ValueError(
                f"image_size {image_size!r} is too small: at least 8x8 is required"
            )

        self.rnn = nn.LSTM(
            input_size=64 * feature_height * feature_width,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one probability per sequence.

        Args:
            x: Tensor of shape ``(batch, seq_len, channels, height, width)``
                whose frame size matches the ``image_size`` the model was
                built with.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.

        Raises:
            ValueError: If ``x`` is not 5-dimensional or has no time steps.
        """
        if x.dim() != 5:
            raise ValueError(
                "expected input of shape (batch, seq_len, channels, height, "
                f"width), got {tuple(x.shape)}"
            )
        batch_size, seq_len, c, h, w = x.size()
        if seq_len == 0:
            raise ValueError("input sequence must contain at least one frame")

        # Fold time into the batch dimension so the CNN runs once over all
        # frames instead of once per time step in a Python loop.
        frames = x.reshape(batch_size * seq_len, c, h, w)
        # Flatten each frame's feature map and restore (batch, seq_len, -1).
        features = self.cnn(frames).reshape(batch_size, seq_len, -1)

        rnn_out, _ = self.rnn(features)
        # Use the RNN output at the last time step.
        out = self.fc(rnn_out[:, -1, :])
        return self.sigmoid(out)
Initialise the model and define the loss function and optimizer:
# Network to train.
model = RNNModel()

# Binary cross-entropy loss for the training objective.
criterion = nn.BCELoss()

# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
Then start training your model:
num_epochs = 10

for epoch in range(num_epochs):
    # Put the model in training mode at the start of every epoch.
    model.train()
    running_loss = 0.0

    for batch_frames, batch_labels in dataloader:
        # Cast the labels to float and give them a trailing dimension of 1
        # so they can be compared with the model output by the criterion.
        targets = batch_labels.float().view(-1, 1)

        # Standard optimisation step: reset gradients, forward, backward, update.
        optimizer.zero_grad()
        predictions = model(batch_frames)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the mean batch loss of the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(dataloader)}")
I hope these example code snippets are helpful. Cheers!