I've been attempting to "transform" a multiclass classification system that uses ViTForImageClassification into a multilabel one. However, I've been running into some problems.
(The original multiclass system which I'm attempting to convert can be found here.)
The folder structure for the dataset is as follows:
/dataset
./class1
./class2
./class3
./class1-class2
./class2-class3
The code I have so far to prepare the dataset is as follows:
from pathlib import Path

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import ClassLabel, Dataset, Image
from sklearn.preprocessing import MultiLabelBinarizer
from torchvision.transforms import (Compose, Normalize, RandomAdjustSharpness, RandomRotation, Resize, ToTensor)
from transformers import Trainer, TrainingArguments, ViTForImageClassification, ViTImageProcessor

file_names = []
labels = []
all_labels = []

# A folder name like "class1-class2" means the image belongs to both classes
for file in sorted(Path('/content/dataset').glob('*/*.*')):
    folder = str(file).split('/')[-2].split('.')[0]
    label = folder.split('-')
    for l in label:
        if l + '.class' not in all_labels:
            all_labels.append(l + '.class')
    labels.append([x + '.class' for x in label])
    file_names.append(str(file))

print(len(file_names), len(labels))

df = pd.DataFrame.from_dict({"image": file_names, "label": labels})

# Multi-hot encode the label lists: one 0/1 column per class
mlb = MultiLabelBinarizer()
mlb_result = mlb.fit_transform([df.loc[i, 'label'] for i in range(len(df))])
df_final = pd.concat([df['image'], pd.DataFrame(mlb_result, columns=list(mlb.classes_))], axis=1)
dataset = Dataset.from_pandas(df_final).cast_column("image", Image())
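# Sanity check (illustrative only; assumes the three example classes above):
# df_final should contain one multi-hot 0/1 column per class, and an image from
# /dataset/class1-class2 should get a 1 in both the class1 and class2 columns.
print(df_final.columns.tolist())
# ['image', 'class1.class', 'class2.class', 'class3.class']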
labels_list = list(set(all_labels))

label2id, id2label = dict(), dict()
for i, label in enumerate(labels_list):
    label2id[label] = i
    id2label[i] = label

ClassLabels = ClassLabel(num_classes=len(labels_list), names=labels_list)
print(ClassLabels)
dataset = dataset.train_test_split(test_size=0.8, shuffle=True)
train_data = dataset['train']
test_data = dataset['test']
model_str = 'google/vit-base-patch16-224-in21k'
processor = ViTImageProcessor.from_pretrained(model_str)
image_mean, image_std = processor.image_mean, processor.image_std
size = processor.size["height"]
normalize = Normalize(mean=image_mean, std=image_std)
_train_transforms = Compose(
    [
        Resize((size, size)),
        RandomRotation(10),
        RandomAdjustSharpness(2),
        ToTensor(),
        normalize
    ]
)

_val_transforms = Compose(
    [
        Resize((size, size)),
        ToTensor(),
        normalize
    ]
)
def train_transforms(examples):
    examples['pixel_values'] = [_train_transforms(image.convert("RGB")) for image in examples['image']]
    return examples

def val_transforms(examples):
    examples['pixel_values'] = [_val_transforms(image.convert("RGB")) for image in examples['image']]
    return examples

train_data.set_transform(train_transforms)
test_data.set_transform(val_transforms)
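As an illustrative check (not required, and assuming the three example classes above), a single transformed example should yield a 3x224x224 tensor plus the multi-hot label columns:
sample = train_data[0]
print(sample['pixel_values'].shape)                   # torch.Size([3, 224, 224]) for this 224x224 checkpoint
print([k for k in sample if k.endswith('.class')])    # ['class1.class', 'class2.class', 'class3.class']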
The code that I have to prepare the model is:
model = ViTForImageClassification.from_pretrained(model_str, num_labels=len(labels_list), problem_type="multi_label_classification")
model.config.id2label = id2label
model.config.label2id = label2id
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions = eval_pred.predictions
    label_ids = eval_pred.label_ids
    predicted_labels = predictions.argmax(axis=1)
    acc_score = accuracy.compute(predictions=predicted_labels, references=label_ids)['accuracy']
    return {
        "accuracy": acc_score
    }
metric_name = "accuracy"
model_name = "multilabel-classifier"
num_train_epochs = 30
args = TrainingArguments(
    output_dir=model_name,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    num_train_epochs=num_train_epochs,
    weight_decay=0.02,
    warmup_steps=50,
    remove_unused_columns=False,
    save_strategy='epoch',
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="mlflow"
)
# Attempting to shape it: pixel_values of shape (batch_size, num_channels, height, width)
# and labels of shape (batch_size, num_labels)
def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    temp = []
    for example in examples:
        temp2 = []
        for label in example:
            if label != 'image' and label != 'pixel_values':
                temp2.append(example[label])
        temp.append(temp2)
    print(temp)
    labels = torch.tensor(temp)
    print(labels)
    return {"pixel_values": pixel_values, "labels": labels}
trainer = Trainer(
    model,
    args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)
The code I have to train the model is:
trainer.evaluate()
trainer.train()
The current error I'm running into is:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-13-bceb9305a605> in <cell line: 5>()
3 # to assess how well the model is performing on unseen data.
4
----> 5 trainer.evaluate()
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in binary_cross_entropy_with_logits(input, target, weight, size_average, reduce, reduction, pos_weight)
3193 raise ValueError(f"Target size ({target.size()}) must be the same as input size ({input.size()})")
3194
-> 3195 return torch.binary_cross_entropy_with_logits(input, target, weight, pos_weight, reduction_enum)
3196
3197
RuntimeError: result type Float can't be cast to the desired output type Long
I believe I'm preparing the dataset wrong, as I can infer from here; however, I'm not sure how to continue or how to fix what I currently have.
The code in the question is already very close to working; it just needs a bit of tweaking.
First of all, we add an ID-mapping step before creating the train and test split. This allows the model to report predictions with the class's name instead of just its numeric ID.
ClassLabels = ClassLabel(num_classes=len(labels_list), names=labels_list)

def map_label2id(example):
    for label in all_labels:
        example[label] = ClassLabels.str2int(example[label])
    return example

dataset = dataset.map(map_label2id, batched=True)

for label in labels_list:
    dataset = dataset.cast_column(label, ClassLabels)
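As a quick sanity check (purely illustrative), the label columns should now show up as ClassLabel features in the dataset's schema:
print(dataset.features)
# expected to show something like {'image': Image(...), 'class1.class': ClassLabel(...), 'class2.class': ClassLabel(...), ...}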
The creation of the model has been changed to:
model = ViTForImageClassification.from_pretrained(
    model_str,
    num_labels=len(labels_list),
    problem_type="multi_label_classification"
)
model.config.labels = labels_list
model.config.label2id = label2id
model.config.id2label = id2label
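Optionally, the same mappings can be passed directly to from_pretrained, which is equivalent to setting them on model.config afterwards and keeps the model creation in one place:
model = ViTForImageClassification.from_pretrained(
    model_str,
    num_labels=len(labels_list),
    problem_type="multi_label_classification",
    id2label=id2label,
    label2id=label2id,
)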
Then I changed the collate_fn function (which solves the specific error described above) by changing its return statement to:
return {"pixel_values": pixel_values, "labels": labels.float()}
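For context on why .float() fixes it: with problem_type="multi_label_classification", ViTForImageClassification computes BCEWithLogitsLoss, which requires the targets to have the same floating-point dtype as the logits, whereas torch.tensor(temp) built from 0/1 integers is a Long tensor. A minimal standalone reproduction (illustrative only):
import torch
import torch.nn.functional as F

logits = torch.randn(2, 3)                     # float model outputs
targets = torch.randint(0, 2, (2, 3))          # 0/1 labels collated as torch.long
# F.binary_cross_entropy_with_logits(logits, targets)        # raises: result type Float can't be cast to the desired output type Long
loss = F.binary_cross_entropy_with_logits(logits, targets.float())  # works once the targets are float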
The last function to change is compute_metrics. This is important, as the metrics you want to compute for a multilabel model differ from those for a multiclass one.
def compute_metrics(eval_pred):
    predictions = eval_pred.predictions
    label_ids = eval_pred.label_ids

    # A logit > 0 is equivalent to sigmoid(logit) > 0.5
    predicted_labels = (predictions > 0).astype(float)

    # Example-based accuracy (Jaccard index averaged over samples):
    # |true AND predicted| / |true OR predicted| per sample
    temp = 0
    for i in range(label_ids.shape[0]):
        temp += sum(np.logical_and(label_ids[i], predicted_labels[i])) / sum(np.logical_or(label_ids[i], predicted_labels[i]))
    acc_score = temp / label_ids.shape[0]

    # Hamming loss: fraction of individual label predictions that are wrong
    temp = 0
    for i in range(label_ids.shape[0]):
        temp += np.size(label_ids[i] == predicted_labels[i]) - np.count_nonzero(label_ids[i] == predicted_labels[i])
    hamming_loss = temp / (label_ids.shape[0] * label_ids.shape[1])

    return {
        "accuracy": acc_score,
        "hamming_loss": hamming_loss
    }
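If scikit-learn is available, roughly the same metrics can be cross-checked with its built-ins (a sketch, not part of the training code above): jaccard_score with average="samples" corresponds to the per-example accuracy, and hamming_loss to the second metric.
from sklearn.metrics import hamming_loss, jaccard_score

def compute_metrics_sklearn(eval_pred):
    preds = (eval_pred.predictions > 0).astype(int)   # same thresholding: logit > 0 <=> sigmoid > 0.5
    refs = eval_pred.label_ids.astype(int)
    return {
        "accuracy": jaccard_score(refs, preds, average="samples"),
        "hamming_loss": hamming_loss(refs, preds),
    }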