pythondeep-learningpytorchrecommendation-enginepytorch-lightning

Deep movie recommendation system with pytorch/pytorch-lightining


I am trying to build a recommender system using movielens 1m dataset and pytorch/pytorch-lightining frameworks and i get an out of bounds error saying the following:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-24-5371fdf1850d> in <cell line: 5>()
      3 max_movie_id = data_features['movieId'].max()
      4 n_movies = max_movie_id + 1  # Add 1 to include the maximum movie ID
----> 5 model = LSTM(n_users, n_movies, embed_dim, hidden_dim, train_u_i_dict)
      6 trainer = pl.Trainer(gpus=1, max_epochs=2, progress_bar_refresh_rate=20, check_val_every_n_epoch=1)
      7 trainer.fit(model, train_dataloaders=trainLoader, val_dataloaders=valLoader)

<ipython-input-22-2862a796cf85> in __init__(self, n_users, n_movies, embed_dim, hidden_size, u_i_dict, k, lr)
     26         self.mask = torch.ones(n_users, n_movies)
     27         for k, v in u_i_dict.items():
---> 28             self.mask[k][v] = 0
     29 
     30 

IndexError: index 6040 is out of bounds for dimension 0 with size 6040


I tried to make self mask tensor match the dimensions properly, and change the k,v counters without any result. The full modelling code:

def train_val_test_split(dataset, sequence_length):

    dataset, n_users, n_movies, user_interactions = preprocess_dataset(dataset, sequence_length)

    dataset_grouped = dataset.groupby('userId').cumcount(ascending=False)

    train = dataset[dataset_grouped > 1]
    val = dataset[dataset_grouped == 1]
    test = dataset[dataset_grouped == 0]
    return train, val, test, n_users, n_movies, user_interactions


def preprocess_dataset(dataset, sequence_length):
    n_users = dataset['userId'].nunique()
    n_movies = dataset['movieId'].nunique()
    
    def generate_sequences(x):
        movies_list = x.values
        result = []
        for i in range(len(movies_list) - 1):
            sequence = np.zeros(sequence_length, dtype=int) + n_movies
            start_index = max(0, i - sequence_length)
            end_index = i + 1
            for idx, movie in enumerate(reversed(movies_list[start_index:end_index])):
                sequence[sequence_length - 1 - idx] = int(movie)
            result.append((sequence, movies_list[i+1]))
        return result

    user_interactions = dataset.groupby('userId')['movieId'].apply(list).to_dict()
    
    
    dataset = dataset.groupby('userId')['movieId'].apply(generate_sequences).reset_index()
    dataset = dataset.explode('movieId')
    dataset[['features','label']] =  pd.DataFrame(dataset['movieId'].tolist(), index=dataset.index)
    
    return dataset[['userId','features','label']], n_users, n_movies, user_interactions
train, val, test, n_users, n_movies, train_u_i_dict = train_val_test_split(data_features, 4)
for user, interactions in train_u_i_dict.items():
    print("User:", user)
    print("Interactions:", interactions)
    print()
train
from numpy import double
from torch.utils.data import Dataset

class SequentialDataset(Dataset):
    def __init__(self, dataset, train):
        self.dataset = dataset
        if not train:
            self.dataset.sort_values(by='userId',inplace=True)
    
    def __len__(self):
        return self.dataset.shape[0]
        
    def __getitem__(self, index):
        row = self.dataset.iloc[index]
        user = int(row['userId'])
        sequence = row['features']
        target = int(row['label'])
        return {'users':np.tile(user, len(sequence)), 'seqs': sequence, 'targets':target}
trainLoader = DataLoader(SequentialDataset(train, True), batch_size=2, shuffle=True)
valLoader = DataLoader(SequentialDataset(val, False), batch_size=val.shape[0])
testLoader = DataLoader(SequentialDataset(test, False), batch_size=test.shape[0])
for i, batch in enumerate(trainLoader):
  print (batch)
  break
#Pytorch-Lightning template
class MyModel(pl.LightningModule):

  def __init__(self):
    super(MyModel, self).__init__()

  def training_step(self, batch, batch_idx):
    pass

  def validation_step(self, batch, batch_idx):
    pass

  def test_step(self, batch, batch_idx):
    pass

  def configure_optimizers(self):
    pass
class LSTM(pl.LightningModule):
    def __init__(self, n_users, n_movies, embed_dim, hidden_size, u_i_dict, k=5, lr=1e-3):
        super(LSTM, self).__init__()

        self.user_embedding = nn.Embedding(n_users, embed_dim)
        self.item_embedding = nn.Embedding(n_movies + 1, embed_dim, padding_idx=n_movies)

        self.lstm = nn.LSTM(
            input_size=2 * embed_dim,
            hidden_size=hidden_size,
            num_layers=1,
            bias=True,
            batch_first=True,
            dropout=0.,
            bidirectional=False,
            proj_size=0
        )

        self.l1 = nn.Linear(in_features=hidden_size, out_features=hidden_size * 2)
        self.dropout = nn.Dropout(0.1)
        self.l2 = nn.Linear(in_features=hidden_size * 2, out_features=n_movies)
        self.softmax = nn.Softmax(dim=2)
        self.loss = nn.CrossEntropyLoss()
        self.lr = lr
        self.k = k
        self.mask = torch.ones(n_users, n_movies)
        for k, v in u_i_dict.items():
            self.mask[k][v] = 0


    def predict(self, batch):
        users= self.user_embedding(batch['users'])
        movies = self.item_embedding(batch['seqs'])
        
        input = torch.cat((users, movies), dim=2)
        output, _ = self.lstm(input)
        output = self.l1(output)
        output = self.dropout(output)
        output = self.l2(output)
        output = self.softmax(output)
        return output
    
    def training_step(self, batch, batch_idx):
        targets = batch['targets']
        output = self.predict(batch)
        loss = self.loss(output[:,-1,:], targets)
        self.log('train_loss', loss, on_epoch=True)
        return loss
    
    def evaluation(self, outputs, target):
        top_k_items = torch.argsort(outputs, dim=1, descending=True)[:,:self.k]
        print(top_k_items.size())
        ndcg_k = ndcg(top_k_items, target)
        hr_k = hr(top_k_items, target)
        return ndcg_k, hr_k
    
    def validation_step(self, batch, batch_idx):
        targets = batch['targets']
        output = self.predict(batch)
        print(output.size())
        print(output[:,-1,:].size())
        print(targets.size())
        
        loss = self.loss(output[:,-1,:], targets)
        
        ndcg_k, hr_k = self.evaluation(output[:,-1,:], targets)
        self.log('val_loss', loss)
        self.log('val_ndcg', ndcg_k)
        self.log('val_hr', hr_k)
        return loss
    
    def test_step(self, batch, batch_idx):
        targets = batch['targets']
        output = self.predict(batch)
        loss = self.loss(output[:,-1,:], targets)
        
        ndcg_k, hr_k = self.evaluation(output, targets)
        self.log('test_loss', loss)
        self.log('test_ndcg', ndcg_k)
        self.log('test_hr', hr_k)
        return loss
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer
import math
def ndcg(ratings, target):

    ndcg = []
    for i,v in enumerate(ratings):
        condition = v == target[i]
        indices = condition.nonzero()
        if indices.size()[0] > 0:
            ranking = indices[0][0]
            ndcg.append(1. / math.log(ranking.item() + 2,2))
        else:
            ndcg.append(0.)
    return sum(ndcg) / float(len(ndcg))

def hr(ratings, target):
    hr = []
    for i,v in enumerate(ratings):
        condition = v == target[i]
        indices = condition.nonzero()
        if indices.size()[0] > 0:
            hr.append(1.)
        else:
            hr.append(0.)
    return sum(hr) / float(len(hr))
embed_dim = 128
hidden_dim= 256
max_movie_id = data_features['movieId'].max()
n_movies = max_movie_id + 1  # Add 1 to include the maximum movie ID
model = LSTM(n_users, n_movies, embed_dim, hidden_dim, train_u_i_dict)
trainer = pl.Trainer(gpus=1, max_epochs=2, progress_bar_refresh_rate=20, check_val_every_n_epoch=1)
trainer.fit(model, train_dataloaders=trainLoader, val_dataloaders=valLoader)
test = trainer.test(dataloaders = testLoader, verbose=True)

Does anyone have any idea on where is the exact problem and how to fix it. Thanks in advance


Solution

  • This line

    self.mask[k][v] = 0
    

    is going out of array bounds (by one unit) for an array that was sized

    self.mask = torch.ones(n_users, n_movies)
    

    It's not saying if it's k or v that is too large. Looking at how you are passing n_movies I think you need to try incrementing n_users by 1 (cannot see that part of your code).