python pandas machine-learning huggingface-transformers multilabel-classification

Prepare a dataset for multilabel ViTForImageClassification


I've been attempting to convert a multiclass classification system built on ViTForImageClassification into a multilabel one. However, I've been running into some problems.
(The original multiclass system which I'm attempting to convert can be found here.)

The folder structure for the dataset is as follows:

/dataset
./class1
./class2
./class3
./class1-class2
./class2-class3

The code I have so far to prepare the dataset is as follows:

# Parallel lists: file_names[i] is an image path and labels[i] is the list of
# label names attached to it. all_labels collects every distinct label name
# in first-seen order.
file_names = []
labels = []
all_labels = []

# Every image sits in a folder named after its label(s); several labels are
# joined with '-' (e.g. "class1-class2"). sorted() keeps the scan order
# reproducible between runs.
for file in sorted(Path('/content/dataset').glob('*/*.*')):
    folder = file.parent.name.split('.')[0]
    label = folder.split('-')
    for l in label:
        tagged = l + '.class'
        # Register the label currently being inspected (not the first one of
        # the folder), so labels that only occur in a later position of a
        # combined folder name are recorded too, and nothing is added twice.
        if tagged not in all_labels:
            all_labels.append(tagged)
    labels.append([x + '.class' for x in label])
    file_names.append(str(file))

print(len(file_names), len(labels))

df = pd.DataFrame.from_dict({"image": file_names, "label": labels})

# Expand the per-image label lists into one 0/1 indicator column per label.
mlb = MultiLabelBinarizer()
mlb_result = mlb.fit_transform(df['label'].tolist())
df_final = pd.concat(
    [df['image'], pd.DataFrame(mlb_result, columns=list(mlb.classes_))],
    axis=1,
)
dataset = Dataset.from_pandas(df_final).cast_column("image", Image())

# Take the label order from the binarizer rather than from an unordered set:
# the indicator columns above are laid out in mlb.classes_ order, so index i
# of label2id/id2label must refer to that same column for the model's output
# positions to be named correctly.
labels_list = [str(name) for name in mlb.classes_]

label2id, id2label = dict(), dict()
for i, label in enumerate(labels_list):
    label2id[label] = i
    id2label[i] = label

ClassLabels = ClassLabel(num_classes=len(labels_list), names=labels_list)
print(ClassLabels)

# NOTE(review): test_size=0.8 keeps only 20% of the images for training;
# confirm this is intended and not a swapped 0.2.
dataset = dataset.train_test_split(test_size=0.8, shuffle=True)

train_data = dataset['train']

test_data = dataset['test']
# Checkpoint shared by the image processor and the model.
model_str = 'google/vit-base-patch16-224-in21k'
processor = ViTImageProcessor.from_pretrained(model_str)

# Reuse the normalisation statistics and input resolution stored with the
# checkpoint's processor.
image_mean = processor.image_mean
image_std = processor.image_std
size = processor.size["height"]

normalize = Normalize(mean=image_mean, std=image_std)

# Training pipeline: resize, light random augmentation, tensor conversion,
# then normalisation.
_train_steps = [
    Resize((size, size)),
    RandomRotation(10),
    RandomAdjustSharpness(2),
    ToTensor(),
    normalize,
]
_train_transforms = Compose(_train_steps)

# Evaluation pipeline: the same preprocessing without the random steps.
_val_steps = [
    Resize((size, size)),
    ToTensor(),
    normalize,
]
_val_transforms = Compose(_val_steps)

def train_transforms(examples):
    """Add a 'pixel_values' entry built with the training pipeline.

    Every image in examples['image'] is converted to RGB and passed through
    _train_transforms; the resulting list is stored under 'pixel_values' and
    the same mapping is returned.
    """
    converted = []
    for image in examples['image']:
        converted.append(_train_transforms(image.convert("RGB")))
    examples['pixel_values'] = converted
    return examples

def val_transforms(examples):
    """Add a 'pixel_values' entry built with the evaluation pipeline.

    Every image in examples['image'] is converted to RGB and passed through
    _val_transforms; the resulting list is stored under 'pixel_values' and
    the same mapping is returned.
    """
    converted = []
    for image in examples['image']:
        converted.append(_val_transforms(image.convert("RGB")))
    examples['pixel_values'] = converted
    return examples

# The training split gets the pipeline that includes the random augmentation
# steps.
train_data.set_transform(train_transforms)

# The held-out split gets the pipeline without random steps.
test_data.set_transform(val_transforms)

The code that I have to prepare the model is:

# Load the checkpoint with one output per known label and ask for the
# multi-label problem type.
model = ViTForImageClassification.from_pretrained(
    model_str,
    problem_type="multi_label_classification",
    num_labels=len(labels_list),
)

# Store the name <-> index mappings on the model configuration.
model.config.label2id = label2id
model.config.id2label = id2label

# Metric object used by compute_metrics.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Return {"accuracy": ...} for one evaluation pass.

    The highest-scoring column of eval_pred.predictions is taken per row
    and compared against eval_pred.label_ids with the loaded accuracy
    metric.
    """
    # NOTE(review): argmax keeps a single class index per sample, which is a
    # single-label metric; with multi-hot label_ids this comparison does not
    # measure multilabel quality.
    top_class = eval_pred.predictions.argmax(axis=1)
    result = accuracy.compute(
        predictions=top_class,
        references=eval_pred.label_ids,
    )
    return {"accuracy": result['accuracy']}

metric_name = "accuracy"

model_name = "multilabel-classifier"
num_train_epochs = 30

# Training configuration. Evaluation and checkpointing both run once per
# epoch, which load_best_model_at_end relies on to pick a checkpoint.
args = TrainingArguments(
    output_dir=model_name,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    num_train_epochs=num_train_epochs,
    weight_decay=0.02,
    warmup_steps=50,
    # Keep the label columns and 'pixel_values' so collate_fn can read them.
    remove_unused_columns=False,
    save_strategy='epoch',
    load_best_model_at_end=True,
    save_total_limit=1,
    report_to="mlflow",
)

# Target layout: pixel_values of shape (batch_size, num_channels, height, width)
# and labels of shape (batch_size, num_labels).
def collate_fn(examples):
    """Stack a list of examples into one batch dictionary.

    Returns {"pixel_values": stacked image tensors, "labels": tensor built
    from every other field of each example}. Both the intermediate label
    rows and the label tensor are printed for debugging.
    """
    pixel_values = torch.stack([example["pixel_values"] for example in examples])

    # Every key that is neither the raw image nor the processed tensor is
    # treated as a label column, in the example's own key order.
    non_label_keys = ('image', 'pixel_values')
    label_rows = [
        [example[key] for key in example if key not in non_label_keys]
        for example in examples
    ]

    print(label_rows)

    # NOTE(review): integer indicator values produce a Long tensor here; the
    # multi-label loss rejects that ("result type Float can't be cast to the
    # desired output type Long").
    labels = torch.tensor(label_rows)
    print(labels)

    return {"pixel_values": pixel_values, "labels": labels}

# Wire the model, the training configuration, both splits, the batching
# function and the metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=processor,
    compute_metrics=compute_metrics,
)

The code I have to train the model is:

# Evaluate once before any training step (this is the call that raises in the
# traceback), then run the training loop.
trainer.evaluate()
trainer.train()

The current error I'm running into is:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-13-bceb9305a605> in <cell line: 5>()
      3 # to assess how well the model is performing on unseen data.
      4 
----> 5 trainer.evaluate()

/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in binary_cross_entropy_with_logits(input, target, weight, size_average, reduce, reduction, pos_weight)
   3193         raise ValueError(f"Target size ({target.size()}) must be the same as input size ({input.size()})")
   3194 
-> 3195     return torch.binary_cross_entropy_with_logits(input, target, weight, pos_weight, reduction_enum)
   3196 
   3197 

RuntimeError: result type Float can't be cast to the desired output type Long

I believe I'm preparing the dataset incorrectly, as I can infer from here; however, I'm not sure how to continue or how to fix what I currently have.


Solution

  • The question above is already very close to the answer, however it requires a little bit of tweaking.

    First of all, we add an ID-mapping step before creating the train and test split. This allows the system to report each label by its class name instead of only by its ID.

    ClassLabels = ClassLabel(num_classes=len(labels_list), names=labels_list)

    def map_label2id(example):
      """Convert every label column of the batch to integer class ids.

      Called through dataset.map(..., batched=True), so each example[label]
      is expected to hold the column's values for a whole batch -- confirm
      against the datasets documentation.
      """
      # Walk the de-duplicated label list, the same one the cast loop uses,
      # so each column is converted exactly once.
      for label in labels_list:
        example[label] = ClassLabels.str2int(example[label])
      return example

    dataset = dataset.map(map_label2id, batched=True)

    # Declare each label column as a ClassLabel feature.
    for label in labels_list:
      dataset = dataset.cast_column(label, ClassLabels)
    

    The creation of the model has been changed to:

    # Load the checkpoint with one output per label and the multi-label
    # problem type.
    model = ViTForImageClassification.from_pretrained(
        model_str,
        problem_type="multi_label_classification",
        num_labels=len(labels_list),
    )

    # Record the label names and both lookup directions on the configuration.
    model.config.id2label = id2label
    model.config.label2id = label2id
    model.config.labels = labels_list
    

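    Then I changed the collate_fn function (which solves the specific bug described above) by changing the return statement so that the labels are cast to float, because the multi-label loss (binary_cross_entropy_with_logits) needs float targets while torch.tensor on integer lists yields a Long tensor: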

    return {"pixel_values": pixel_values, "labels": labels.float()}
    

    The last function changed was compute_metrics. This is important, because the metrics you want to calculate for a multilabel model differ from those for a multiclass model.

    def compute_metrics(eval_pred):
        """Compute multilabel metrics from raw logits.

        Args:
            eval_pred: object exposing ``predictions`` (logits, one column
                per label) and ``label_ids`` (0/1 indicators of the same
                2-D shape).

        Returns:
            dict with
            - "accuracy": mean per-sample intersection-over-union between
              the true and the predicted label sets;
            - "hamming_loss": fraction of individual label slots that were
              predicted wrongly.
        """
        logits = np.asarray(eval_pred.predictions)
        targets = np.asarray(eval_pred.label_ids).astype(bool)

        # A label counts as predicted when its logit is positive.
        predicted = logits > 0

        n_samples, n_labels = targets.shape

        intersection = np.logical_and(targets, predicted).sum(axis=1)
        union = np.logical_or(targets, predicted).sum(axis=1)
        # When a sample has no true and no predicted label the union is
        # empty; prediction and truth agree completely, so score it 1.0
        # instead of letting 0/0 turn the whole metric into NaN.
        per_sample = np.where(
            union == 0, 1.0, intersection / np.maximum(union, 1)
        )
        acc_score = float(per_sample.sum()) / n_samples

        mismatches = int(np.count_nonzero(targets != predicted))
        hamming_loss = mismatches / (n_samples * n_labels)

        return {
            "accuracy": acc_score,
            "hamming_loss": hamming_loss
        }