I am using the OpenFL framework for doing Federated Learning experiments. I run their tutorial notebooks without problems, so for example I am able to run classification on MNIST and everything is ok. Now I am using 2 clients with 2 different datasets. However, my accuracy is around 0% for a binary classification problem. So, I have 2 classes, "neg" and "pos" for both datasets. Images of the first dataset are 3000x2951 while images of the second are 4892x4020. I resize both to 256x256. My network is a ResNet9 without any sigmoid at the end, because I am using BCEWithLogitsLoss(). Here a bit of code, to check if everything is ok:
optimizer_adam = optim.Adam(params_to_update, lr=1e-4)
def cross_entropy(output, target):
"""Binary cross-entropy metric
"""
target = target.unsqueeze(1)
criterion = nn.BCEWithLogitsLoss()
loss = criterion(output, target.float())
return loss
def train(net_model, train_loader, optimizer, device, loss_fn=cross_entropy, some_parameter=None):
torch.manual_seed(0)
device='cpu'
function_defined_in_notebook(some_parameter)
train_loader = tqdm.tqdm(train_loader, desc="train")
net_model.train()
net_model.to(device)
losses = []
for data, target in train_loader:
data, target = torch.tensor(data).to(device), torch.tensor(
target).to(device, dtype=torch.int64)
optimizer.zero_grad()
#data = data.type(torch.LongTensor)
#target = target.type(torch.LongTensor)
output = net_model(data)
loss = loss_fn(output=output, target=target)
loss.backward()
optimizer.step()
losses.append(loss.detach().cpu().numpy())
return {'train_loss': np.mean(losses),}
@task_interface.register_fl_task(model='net_model', data_loader='val_loader', device='device')
def validate(net_model, val_loader, device):
torch.manual_seed(0)
device = torch.device('cpu')
net_model.eval()
net_model.to(device)
val_loader = tqdm.tqdm(val_loader, desc="validate")
val_score = 0
total_samples = 0
with torch.no_grad():
for data, target in val_loader:
samples = target.shape[0]
total_samples += samples
data, target = torch.tensor(data).to(device), \
torch.tensor(target).to(device, dtype=torch.int64)
output = net_model(data)
pred = (output >= 0.5).long() # Binarize predictions to 0 and 1
val_score = (pred == target).sum().cpu().item()/data.size(0)
#val_score += pred.eq(target).sum().cpu().numpy()
return {'acc': val_score / total_samples,}
I think that all this is correct. So the only part that can be wrong is when I import the data because in this federated learning framework is a bit tricky. Basically my datasets are organized both in this way: /Dataset1(2)/Train(Test)/neg(pos)/images.png. I want to extract x_train, y_train, x_test and y_test because I am following exactly the structure of a tutorial that works. So this is my proposed solution:
def download_data(self):
"""Download prepared dataset."""
image_list_train = []
image_list_test = []
x_train = []
y_train = []
x_test = []
y_test = []
base_dir_train = 'Montgomery_real_splitted/TRAIN/'
base_dir_test = 'Montgomery_real_splitted/TEST/'
for f in sorted(os.listdir(base_dir_train)):
if os.path.isdir(base_dir_train+f):
print(f"{f} is a target class")
for i in sorted(os.listdir(base_dir_train+f)):
y_train.append(f)
im = Image.open(base_dir_train+f+'/'+i)
x_train.append(im)
for f in sorted(os.listdir(base_dir_test)):
if os.path.isdir(base_dir_test+f):
print(f"{f} is a target class")
for i in sorted(os.listdir(base_dir_test+f)):
y_test.append(f)
imt=Image.open(base_dir_test+f+'/'+i)
x_test.append(imt)
y_train = np.array(y_train)
y_test = np.array(y_test)
for i in range(len(y_train)):
if y_train[i]=="neg":
y_train[i]=0
else:
y_train[i]=1
y_train = y_train.astype(np.uint8)
for i in range(len(y_test)):
if y_test[i]=="neg":
y_test[i]=0
else:
y_test[i]=1
y_test = y_test.astype(np.uint8)
print('Mont-china data was loaded!')
return (x_train, y_train), (x_test, y_test)
This code above is in a python script needed to load the data. Then, inside the Jupyter notebook I have these cells in order to import the dataset:
normalize = T.Normalize(
mean=[0.1307],
std=[0.3081]
)
augmentation = T.RandomApply(
[T.RandomHorizontalFlip(),
T.RandomRotation(10)],
p=.8
)
training_transform = T.Compose(
[T.Resize((256,256)),
augmentation,
T.ToTensor()]
)
valid_transform = T.Compose(
[T.Resize((256,256)),
T.ToTensor()]
)
class TransformedDataset(Dataset):
def __init__(self, dataset, transform=None, target_transform=None):
"""Initialize Dataset."""
self.dataset = dataset
self.transform = transform
self.target_transform = target_transform
def __len__(self):
"""Length of dataset."""
return len(self.dataset)
def __getitem__(self, index):
img, label = self.dataset[index]
label = self.target_transform(label) if self.target_transform else label
img = self.transform(img) if self.transform else img
return img, label
class MontChinaDataset(DataInterface):
def __init__(self, **kwargs):
self.kwargs = kwargs
@property
def shard_descriptor(self):
return self._shard_descriptor
@shard_descriptor.setter
def shard_descriptor(self, shard_descriptor):
"""
Describe per-collaborator procedures or sharding.
This method will be called during a collaborator initialization.
Local shard_descriptor will be set by Envoy.
"""
self._shard_descriptor = shard_descriptor
self.train_set = TransformedDataset(
self._shard_descriptor.get_dataset('train'),
transform=training_transform
)
self.valid_set = TransformedDataset(
self._shard_descriptor.get_dataset('val'),
transform=valid_transform
)
def get_train_loader(self, **kwargs):
"""
Output of this method will be provided to tasks with optimizer in contract
"""
generator=torch.Generator()
generator.manual_seed(0)
return DataLoader(
self.train_set, batch_size=self.kwargs['train_bs'], shuffle=True, generator=generator
)
def get_valid_loader(self, **kwargs):
"""
Output of this method will be provided to tasks without optimizer in contract
"""
return DataLoader(self.valid_set, batch_size=self.kwargs['valid_bs'])
def get_train_data_size(self):
"""
Information for aggregation
"""
return len(self.train_set)
def get_valid_data_size(self):
"""
Information for aggregation
"""
return len(self.valid_set)
fed_dataset = MontChinaDataset(train_bs=16, valid_bs=16)
The strange thing is that the loss decreases, while the accuracy remains 0 or around 0.
[12:29:44] METRIC Round 0, collaborator env_one train result train_loss: 0.673127 experiment.py:116
[12:29:53] METRIC Round 0, collaborator env_one locally_tuned_model_validate result acc: 0.000000 experiment.py:116
[12:29:56] METRIC Round 0, collaborator env_one aggregated_model_validate result acc: 0.000000 experiment.py:116
[12:30:49] METRIC Round 0, collaborator env_two train result train_loss: 0.562856 experiment.py:116
[12:31:14] METRIC Round 0, collaborator env_two locally_tuned_model_validate result acc: 0.000000 experiment.py:116
[12:31:19] METRIC Round 0, collaborator env_two aggregated_model_validate result acc: 0.000000 experiment.py:116
[12:31:21] METRIC Round 0, collaborator Aggregator train result train_loss: 0.581464 experiment.py:116
METRIC Round 0, collaborator Aggregator locally_tuned_model_validate result acc: 0.000000 experiment.py:116
[12:31:22] METRIC Round 0, collaborator Aggregator aggregated_model_validate result acc: 0.000000 experiment.py:116
[12:31:39] METRIC Round 1, collaborator env_one train result train_loss: 0.637785 experiment.py:116
[12:31:41] METRIC Round 1, collaborator env_one locally_tuned_model_validate result acc: 0.000000 experiment.py:116
[12:31:44] METRIC Round 1, collaborator env_one aggregated_model_validate result acc: 0.000000 experiment.py:116
[12:31:55] METRIC Round 1, collaborator env_two train result train_loss: 0.432979 experiment.py:116
[12:32:00] METRIC Round 1, collaborator env_two locally_tuned_model_validate result acc: 0.000000 experiment.py:116
[12:32:05] METRIC Round 1, collaborator env_two aggregated_model_validate result acc: 0.000000 experiment.py:116
[12:32:08] METRIC Round 1, collaborator Aggregator train result train_loss: 0.467540 experiment.py:116
METRIC Round 1, collaborator Aggregator locally_tuned_model_validate result acc: 0.000000 experiment.py:116
METRIC Round 1, collaborator Aggregator aggregated_model_validate result acc: 0.000000
And this goes on for several rounds
I'm not sure if this will solve your problem, but your validation code has some bugs (two new lines annotated below):
@task_interface.register_fl_task(model='net_model', data_loader='val_loader', device='device')
def validate(net_model, val_loader, device):
torch.manual_seed(0)
device = torch.device('cpu')
net_model.eval()
net_model.to(device)
val_loader = tqdm.tqdm(val_loader, desc="validate")
val_score = 0
total_samples = 0
with torch.no_grad():
for data, target in val_loader:
samples = target.shape[0]
total_samples += samples
data, target = torch.tensor(data).to(device), \
torch.tensor(target).to(device, dtype=torch.int64)
output = net_model(data)
##new line vvv
output = torch.sigmoid(output) #compress output into prob distribution
pred = (output >= 0.5).long() # Binarize predictions to 0 and 1
##changed line below
val_score += (pred == target).sum().cpu().item() ###/data.size(0)
#val_score += pred.eq(target).sum().cpu().numpy()
return {'acc': val_score / total_samples,}
Essentially there are two issues:
val_score
by both data.size(0)
(batch size?) and then also total_samples
which is NOT the number of batches, but the count of all of your data.val_score
every iteration, you were resetting it. If you have a lot of batches this would explain why it was 0 or close to 0.Hopefully these fixes should get you closer to your goal!