I am trying to build a recommender system on the MovieLens 1M dataset using the PyTorch and PyTorch Lightning frameworks, and I get an out-of-bounds error:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-24-5371fdf1850d> in <cell line: 5>()
3 max_movie_id = data_features['movieId'].max()
4 n_movies = max_movie_id + 1 # Add 1 to include the maximum movie ID
----> 5 model = LSTM(n_users, n_movies, embed_dim, hidden_dim, train_u_i_dict)
6 trainer = pl.Trainer(gpus=1, max_epochs=2, progress_bar_refresh_rate=20, check_val_every_n_epoch=1)
7 trainer.fit(model, train_dataloaders=trainLoader, val_dataloaders=valLoader)
<ipython-input-22-2862a796cf85> in __init__(self, n_users, n_movies, embed_dim, hidden_size, u_i_dict, k, lr)
26 self.mask = torch.ones(n_users, n_movies)
27 for k, v in u_i_dict.items():
---> 28 self.mask[k][v] = 0
29
30
IndexError: index 6040 is out of bounds for dimension 0 with size 6040
I tried to make the self.mask tensor match the dimensions properly and changed the k, v loop variables, without any result. Here is a quick check on the raw frame comparing the largest userId against the number of distinct users (outputs noted as comments, consistent with the traceback above):
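# quick check: largest raw userId vs. number of distinct users
print(data_features['userId'].max())      # 6040 -- the IDs are 1-based
print(data_features['userId'].nunique())  # 6040, matching the mask's dimension 0

The full modelling code: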
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
import pytorch_lightning as pl

def train_val_test_split(dataset, sequence_length):
    dataset, n_users, n_movies, user_interactions = preprocess_dataset(dataset, sequence_length)
    # per-user reverse cumulative count: 0 = last interaction, 1 = second to last, ...
    dataset_grouped = dataset.groupby('userId').cumcount(ascending=False)
    train = dataset[dataset_grouped > 1]
    val = dataset[dataset_grouped == 1]
    test = dataset[dataset_grouped == 0]
    return train, val, test, n_users, n_movies, user_interactions
def preprocess_dataset(dataset, sequence_length):
    n_users = dataset['userId'].nunique()
    n_movies = dataset['movieId'].nunique()

    def generate_sequences(x):
        movies_list = x.values
        result = []
        for i in range(len(movies_list) - 1):
            # pad with n_movies, which the item embedding later uses as padding_idx
            sequence = np.zeros(sequence_length, dtype=int) + n_movies
            start_index = max(0, i - sequence_length)
            end_index = i + 1
            for idx, movie in enumerate(reversed(movies_list[start_index:end_index])):
                sequence[sequence_length - 1 - idx] = int(movie)
            result.append((sequence, movies_list[i + 1]))
        return result

    user_interactions = dataset.groupby('userId')['movieId'].apply(list).to_dict()
    dataset = dataset.groupby('userId')['movieId'].apply(generate_sequences).reset_index()
    dataset = dataset.explode('movieId')
    dataset[['features', 'label']] = pd.DataFrame(dataset['movieId'].tolist(), index=dataset.index)
    return dataset[['userId', 'features', 'label']], n_users, n_movies, user_interactions
train, val, test, n_users, n_movies, train_u_i_dict = train_val_test_split(data_features, 4)

for user, interactions in train_u_i_dict.items():
    print("User:", user)
    print("Interactions:", interactions)
    print()

train  # display the training frame (notebook cell output)
from numpy import double
from torch.utils.data import Dataset

class SequentialDataset(Dataset):
    def __init__(self, dataset, train):
        self.dataset = dataset
        if not train:
            self.dataset.sort_values(by='userId', inplace=True)

    def __len__(self):
        return self.dataset.shape[0]

    def __getitem__(self, index):
        row = self.dataset.iloc[index]
        user = int(row['userId'])
        sequence = row['features']
        target = int(row['label'])
        # repeat the user id once per timestep so it can be embedded alongside each movie
        return {'users': np.tile(user, len(sequence)), 'seqs': sequence, 'targets': target}

trainLoader = DataLoader(SequentialDataset(train, True), batch_size=2, shuffle=True)
valLoader = DataLoader(SequentialDataset(val, False), batch_size=val.shape[0])
testLoader = DataLoader(SequentialDataset(test, False), batch_size=test.shape[0])

for i, batch in enumerate(trainLoader):
    print(batch)
    break
# PyTorch Lightning template
class MyModel(pl.LightningModule):
    def __init__(self):
        super(MyModel, self).__init__()

    def training_step(self, batch, batch_idx):
        pass

    def validation_step(self, batch, batch_idx):
        pass

    def test_step(self, batch, batch_idx):
        pass

    def configure_optimizers(self):
        pass
class LSTM(pl.LightningModule):
    def __init__(self, n_users, n_movies, embed_dim, hidden_size, u_i_dict, k=5, lr=1e-3):
        super(LSTM, self).__init__()
        self.user_embedding = nn.Embedding(n_users, embed_dim)
        self.item_embedding = nn.Embedding(n_movies + 1, embed_dim, padding_idx=n_movies)
        self.lstm = nn.LSTM(
            input_size=2 * embed_dim,
            hidden_size=hidden_size,
            num_layers=1,
            bias=True,
            batch_first=True,
            dropout=0.,
            bidirectional=False,
            proj_size=0
        )
        self.l1 = nn.Linear(in_features=hidden_size, out_features=hidden_size * 2)
        self.dropout = nn.Dropout(0.1)
        self.l2 = nn.Linear(in_features=hidden_size * 2, out_features=n_movies)
        self.softmax = nn.Softmax(dim=2)
        self.loss = nn.CrossEntropyLoss()
        self.lr = lr
        self.k = k
        # mask out the items each user has already interacted with
        self.mask = torch.ones(n_users, n_movies)
        for k, v in u_i_dict.items():
            self.mask[k][v] = 0
    def predict(self, batch):
        users = self.user_embedding(batch['users'])
        movies = self.item_embedding(batch['seqs'])
        input = torch.cat((users, movies), dim=2)
        output, _ = self.lstm(input)
        output = self.l1(output)
        output = self.dropout(output)
        output = self.l2(output)
        output = self.softmax(output)
        return output
    def training_step(self, batch, batch_idx):
        targets = batch['targets']
        output = self.predict(batch)
        loss = self.loss(output[:, -1, :], targets)
        self.log('train_loss', loss, on_epoch=True)
        return loss

    def evaluation(self, outputs, target):
        top_k_items = torch.argsort(outputs, dim=1, descending=True)[:, :self.k]
        print(top_k_items.size())
        ndcg_k = ndcg(top_k_items, target)
        hr_k = hr(top_k_items, target)
        return ndcg_k, hr_k

    def validation_step(self, batch, batch_idx):
        targets = batch['targets']
        output = self.predict(batch)
        print(output.size())
        print(output[:, -1, :].size())
        print(targets.size())
        loss = self.loss(output[:, -1, :], targets)
        ndcg_k, hr_k = self.evaluation(output[:, -1, :], targets)
        self.log('val_loss', loss)
        self.log('val_ndcg', ndcg_k)
        self.log('val_hr', hr_k)
        return loss

    def test_step(self, batch, batch_idx):
        targets = batch['targets']
        output = self.predict(batch)
        loss = self.loss(output[:, -1, :], targets)
        ndcg_k, hr_k = self.evaluation(output, targets)
        self.log('test_loss', loss)
        self.log('test_ndcg', ndcg_k)
        self.log('test_hr', hr_k)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer
import math

def ndcg(ratings, target):
    ndcg = []
    for i, v in enumerate(ratings):
        condition = v == target[i]
        indices = condition.nonzero()
        if indices.size()[0] > 0:
            ranking = indices[0][0]
            ndcg.append(1. / math.log(ranking.item() + 2, 2))
        else:
            ndcg.append(0.)
    return sum(ndcg) / float(len(ndcg))

def hr(ratings, target):
    hr = []
    for i, v in enumerate(ratings):
        condition = v == target[i]
        indices = condition.nonzero()
        if indices.size()[0] > 0:
            hr.append(1.)
        else:
            hr.append(0.)
    return sum(hr) / float(len(hr))
embed_dim = 128
hidden_dim = 256
max_movie_id = data_features['movieId'].max()
n_movies = max_movie_id + 1  # add 1 to include the maximum movie ID
model = LSTM(n_users, n_movies, embed_dim, hidden_dim, train_u_i_dict)
trainer = pl.Trainer(gpus=1, max_epochs=2, progress_bar_refresh_rate=20, check_val_every_n_epoch=1)
trainer.fit(model, train_dataloaders=trainLoader, val_dataloaders=valLoader)
test = trainer.test(dataloaders=testLoader, verbose=True)
Does anyone have any idea where exactly the problem is and how to fix it? Thanks in advance.
This line

self.mask[k][v] = 0

is going out of bounds (by one) for a tensor that was sized

self.mask = torch.ones(n_users, n_movies)

The error message names dimension 0 with size 6040, so it is k (the user index) that is too large. You already pass n_movies as the maximum movie ID plus one, so the largest movie ID is a valid index; n_users, however, comes out of preprocess_dataset as dataset['userId'].nunique(), which equals the maximum userId only when the IDs are 0-based. MovieLens 1M user IDs start at 1, so I think you need to increment n_users by 1 in the same way.
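A minimal sketch of that fix, assuming userId is 1-based as in MovieLens 1M (it simply mirrors what you already do for movieId):

# user IDs run 1..6040 in MovieLens 1M, so nunique() == 6040,
# but indexing self.mask[6040] needs dimension 0 to have size 6041
max_user_id = data_features['userId'].max()
n_users = max_user_id + 1  # +1 so the largest userId is a valid row index

model = LSTM(n_users, n_movies, embed_dim, hidden_dim, train_u_i_dict)

Note that nn.Embedding(n_users, embed_dim) is also indexed by the raw userId, so the larger n_users keeps the user embedding lookup in range as well.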