python keras pytorch classification recurrent-neural-network

How to import dataset for image sequence classification?


I want to build a model (RNN) on image sequences to predict a binary variable. I saw a lot of tutorials for multi-input classification, but I couldn't find anywhere how to import my data. I have a folder containing folders of image sequences, like this:

-- film_1

-------------film_1_image1

-------------film_1_image2

-------------film_1_image3

-------------film_1_image4

-- film_2

-------------film_2_image1

-------------film_2_image2

-------------film_2_image3

-------------film_2_image4

I know that I need data with the shape (batch, time, width, height), but I don't know how to do that. I searched for a solution in Keras and PyTorch but didn't find anything. I would like to import my films, split the dataset into train/validation sets, and train a model. Does anyone know how to import data like that?

I tried to use a DataLoader but it's quite difficult to understand how it works. So if anyone can help, it would be great.


Solution

  • Here is a PyTorch example for training an RNN on image sequences:

    import os
    import re

    from PIL import Image
    import torch
    from torch.utils.data import Dataset, DataLoader
    import torchvision.transforms as transforms
    import torch.nn as nn
    import torch.optim as optim
    
    class ImageSequenceDataset(Dataset):
        """Dataset that yields one image sequence (one sub-folder) per item.

        Expected layout: ``root_dir/<sequence folder>/<frame image files>``.
        Every visible sub-directory of ``root_dir`` is one sample, and the
        visible files inside it are its frames. Both levels are ordered with
        a natural (number-aware) sort so that ``image2`` precedes ``image10``
        and frames keep their temporal order.

        Args:
            root_dir: Folder containing one sub-folder per sequence.
            transform: Optional callable applied to every PIL frame. It has
                to return tensors of identical shape (for example a resize
                followed by a to-tensor conversion), otherwise the frames
                cannot be stacked into one tensor.

        Note:
            NOTE(review): batching with the default ``DataLoader`` collate
            function presumably requires every sequence to contain the same
            number of frames; pad/trim or pass a custom ``collate_fn`` if
            your sequences differ in length.
        """

        def __init__(self, root_dir, transform=None):
            self.root_dir = root_dir
            self.transform = transform
            # Only directories are sequences; stray files next to them
            # (label CSVs, .DS_Store, ...) must not become samples.
            self.sequences = self._list_sorted(root_dir, os.path.isdir)

        @staticmethod
        def _natural_key(name):
            """Return a sort key that compares embedded digit runs by value."""
            # re.split with a capturing group alternates text / digit runs,
            # so odd positions are always digits and keys stay comparable.
            tokens = re.split(r"(\d+)", name)
            numeric_aware = [
                int(token) if position % 2 else token
                for position, token in enumerate(tokens)
            ]
            # The raw name breaks ties such as "img_01" vs "img_1".
            return numeric_aware, name

        @staticmethod
        def _list_sorted(directory, keep):
            """List visible entries of *directory* accepted by *keep*, naturally sorted."""
            names = (
                name
                for name in os.listdir(directory)
                if not name.startswith(".") and keep(os.path.join(directory, name))
            )
            return sorted(names, key=ImageSequenceDataset._natural_key)

        def __len__(self):
            """Return the number of sequences (sub-folders) found."""
            return len(self.sequences)

        def __getitem__(self, idx):
            """Load sequence ``idx`` and return ``(frames, label)``.

            Returns:
                A tuple of the stacked frames (frame index first) and the
                integer label produced by :meth:`get_label`.

            Raises:
                RuntimeError: If the sequence folder contains no frames.
            """
            sequence_dir = os.path.join(self.root_dir, self.sequences[idx])
            frame_names = self._list_sorted(sequence_dir, os.path.isfile)
            if not frame_names:
                # torch.stack would fail anyway, but with an opaque message.
                raise RuntimeError(f"No frames found in sequence folder: {sequence_dir}")

            image_sequence = []
            for frame_name in frame_names:
                frame_path = os.path.join(sequence_dir, frame_name)
                # The context manager releases the file handle as soon as
                # convert() has produced an independent RGB copy.
                with Image.open(frame_path) as raw_image:
                    image = raw_image.convert('RGB')
                if self.transform:
                    image = self.transform(image)
                image_sequence.append(image)

            # Convert the list of frames into a single tensor.
            image_sequence = torch.stack(image_sequence)
            # Example only: replace with your own way of obtaining the label.
            label = self.get_label(sequence_dir)
            return image_sequence, label

        def get_label(self, sequence_dir):
            """Return the binary label for a sequence folder (placeholder logic).

            This dummy implementation returns 1 when the sequence folder's
            own name contains "positive" and 0 otherwise; replace it with
            your real label lookup.
            """
            # Look at the folder name only, so that a dataset stored under a
            # path that happens to contain "positive" is not labelled 1 everywhere.
            folder_name = os.path.basename(os.path.normpath(sequence_dir))
            return 1 if "positive" in folder_name else 0
    

    then you must define your transforms,

    # Per-frame preprocessing, applied in order: resize to 128x128, convert
    # the PIL image to a tensor, then normalise each of the three channels
    # with mean 0.5 and standard deviation 0.5.
    transform = transforms.Compose(
        [
            transforms.Resize(size=(128, 128)),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5,) * 3, std=(0.5,) * 3),
        ]
    )
    

    create a dataset and loader,

    # The data is assumed to live in "path/to/your/data"; every item of the
    # dataset is one (image sequence, label) pair.
    dataset = ImageSequenceDataset(
        root_dir="path/to/your/data",
        transform=transform,
    )
    # Shuffled mini-batches of 8 sequences, loaded by 4 worker processes.
    dataloader = DataLoader(
        dataset,
        num_workers=4,
        shuffle=True,
        batch_size=8,
    )
    

    now you can set up your RNN model, something like,

    class RNNModel(nn.Module):
        """CNN + LSTM binary classifier for image sequences.

        Every frame is encoded by a small CNN (three conv / ReLU / max-pool
        stages, each pooling halving height and width). The flattened
        per-frame features are fed to a single-layer LSTM, and the LSTM
        output at the last time step is mapped to one sigmoid probability.

        Args:
            image_size: ``(height, width)`` of the input frames. The default
                ``(128, 128)`` gives the 64*16*16 feature size.
            hidden_size: Size of the LSTM hidden state (default 128).

        Raises:
            ValueError: If ``image_size`` is too small to survive the three
                pooling stages (either side below 8 pixels).
        """

        def __init__(self, image_size=(128, 128), hidden_size=128):
            super().__init__()
            # Example CNN used to encode each frame independently.
            self.cnn = nn.Sequential(
                nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2),
                nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2),
                nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2)
            )
            height, width = image_size
            # The padded 3x3 convolutions keep the spatial size; the three
            # stride-2 poolings floor-divide each side by 8 overall.
            pooled_height, pooled_width = height // 8, width // 8
            if pooled_height < 1 or pooled_width < 1:
                raise ValueError(
                    f"image_size {image_size!r} is too small; each side must be at least 8"
                )
            feature_size = 64 * pooled_height * pooled_width
            self.rnn = nn.LSTM(input_size=feature_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
            self.fc = nn.Linear(hidden_size, 1)
            self.sigmoid = nn.Sigmoid()

        def forward(self, x):
            """Return one probability per sequence.

            Args:
                x: Tensor of shape ``(batch, seq_len, channels, height, width)``.

            Returns:
                Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
            """
            batch_size, seq_len, c, h, w = x.size()
            # Fold time into the batch dimension so the CNN encodes all
            # frames in one pass instead of one Python-level call per step.
            frames = x.reshape(batch_size * seq_len, c, h, w)
            features = self.cnn(frames).reshape(batch_size, seq_len, -1)
            rnn_out, _ = self.rnn(features)
            # Classify from the LSTM output at the last time step.
            out = self.fc(rnn_out[:, -1, :])
            return self.sigmoid(out)
    

    model init and loss function definition,

    # Binary cross-entropy on probabilities: the model's forward pass already
    # ends with a sigmoid, so its outputs can be fed to BCELoss directly.
    criterion = nn.BCELoss()
    model = RNNModel()
    # Adam over all trainable parameters of the model.
    optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
    

    then start training your model,

    num_epochs = 10

    for epoch in range(num_epochs):
        model.train()
        # Sum of the per-batch losses seen during this epoch.
        running_loss = 0.0

        for inputs, labels in dataloader:
            # The loss needs float targets shaped like the model output,
            # i.e. one column per sample.
            labels = labels.float().view(-1, 1)

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Back-propagate and update the weights.
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Report the mean batch loss for the epoch.
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(dataloader)}")
    

    Hope these example code snippets are helpful. Cheers!