I tried to fine-tune a model on my personal information so that I can create a chatbot where people can learn about me through a ChatGPT-style chat.
However, I got this error:
RuntimeError: stack expects each tensor to be equal size, but got [47] at entry 0 and [36] at entry 1
I believe this is because my inputs have different lengths.
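From what I understand, the default DataLoader collation calls torch.stack on the per-sample tensors, and stacking fails in exactly this way when the lengths differ. A minimal reproduction:

import torch

# Two 1-D tensors with different lengths, like my two encoded samples
torch.stack([torch.zeros(47), torch.zeros(36)])
# RuntimeError: stack expects each tensor to be equal size, but got [47] at entry 0 and [36] at entry 1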
Here are two of my sample inputs:
What is the website of ABC company ? -> https://abcdef.org/
Do you know the website of ABC company ? -> It is https://abcdef.org/
Here is what I have tried so far:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import Dataset, DataLoader
class QADataset(Dataset):
    def __init__(self, questions, answers, tokenizer, max_length):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Add a padding token to the tokenizer
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, index):
        question = self.questions[index]
        answer = self.answers[index]
        input_text = f"Q: {question} A: {answer}"
        input_ids = self.tokenizer.encode(input_text, add_special_tokens=True, max_length=self.max_length, padding=True, truncation=True)
        if input_ids is None:
            return None
        input_ids = torch.tensor(input_ids, dtype=torch.long)
        print(f"Input ids size: {input_ids.size()}")
        return input_ids
# Set up the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Load the question and answer data
questions = ["What is the webisite of ABC company ?", "Do you know the website of ABC company ?"]
answers = ["https://abcdef.org/", "It is https://abcdef.org/"]
# Create the dataset and data loader
max_length = 64
dataset = QADataset(questions, answers, tokenizer, max_length=max_length)
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)
# Fine-tune the model on the QA dataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()
for epoch in range(3):
    running_loss = 0.0
    for batch in data_loader:
        batch = batch.to(device)
        outputs = model(batch, labels=batch)
        loss, _ = outputs[:2]
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch + 1} loss: {running_loss / len(data_loader)}")
# Save the fine-tuned model
model.save_pretrained("qa_finetuned_gpt2")
I don't have a solid background in AI; I'm mostly reading references and trying to implement things.
Yes, it seems like you didn't pad your inputs. The model expects every input in a batch to be the same size: if a text is too short, it must be padded, and if it is too long, it must be truncated.
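For example, with padding="max_length" every encoding comes out at the same fixed length. A quick sketch using the GPT-2 tokenizer (GPT-2 ships without a pad token, so here its EOS token is reused as padding):

from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

a = tokenizer.encode("Q: What is the website of ABC company ? A: https://abcdef.org/",
                     truncation=True, max_length=64, padding="max_length")
b = tokenizer.encode("Q: Do you know the website of ABC company ? A: It is https://abcdef.org/",
                     truncation=True, max_length=64, padding="max_length")
print(len(a), len(b))  # 64 64 -- equal lengths, so batching works

Note that padding=True (what your __getitem__ uses) only pads to the longest sequence in a batch, so it does nothing when you encode one string at a time; padding="max_length" is what forces every sample to the same fixed length.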
Try changing how the tokenizer processes the inputs:
import torch
from torch.utils.data import Dataset

# Define the data loading class
class MyDataset(Dataset):
    def __init__(self, data_path, tokenizer):
        self.data_path = data_path
        self.tokenizer = tokenizer
        # GPT-2 has no pad token by default; padding would fail without one
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        with open(self.data_path, 'r') as f:
            self.data = f.read().split('\n')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data[index]
        # padding="max_length" pads every sample to exactly max_length tokens
        inputs = self.tokenizer.encode(text, add_special_tokens=True,
                                       truncation=True, max_length=80,
                                       padding="max_length")
        return torch.tensor(inputs)
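And a quick end-to-end check (a sketch: the file name qa_data.txt is made up, assume one "Q: ... A: ..." line per row). The labels/attention_mask part is an optional extra so the loss ignores the padding; positions labelled -100 are skipped by the model's built-in loss:

import torch
from torch.utils.data import DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
dataset = MyDataset('qa_data.txt', tokenizer)  # hypothetical file, one "Q: ... A: ..." per line
loader = DataLoader(dataset, batch_size=8, shuffle=True)

batch = next(iter(loader))
print(batch.shape)  # (rows_in_batch, 80): equal lengths, so the default collate can stack them

model = GPT2LMHeadModel.from_pretrained('gpt2')
labels = batch.clone()
labels[batch == tokenizer.pad_token_id] = -100   # don't compute the LM loss on padding
attention_mask = (batch != tokenizer.pad_token_id).long()
outputs = model(batch, attention_mask=attention_mask, labels=labels)
print(outputs.loss)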