I am getting the following error from this function definition. What is wrong?
`convert_to_client_data()` is a function in a federated-learning pipeline in which I am trying to convert a dataset into a federated dataset.
Here is the declaration of the class `Distribute`, which is used by the function that raises the error:
#Declaration of Class Distribute
def partition_list(list_in, n):
    """Shuffle ``list_in`` in place and deal it out into ``n`` interleaved parts.

    Part ``k`` is the extended slice ``list_in[k::n]`` of the shuffled
    sequence, so part sizes differ by at most one element.

    Note that the caller's ``list_in`` is reordered by this call, because
    ``random.shuffle`` works in place on the module-level random generator.

    Args:
        list_in: Mutable sequence to shuffle and partition.
        n: Number of parts to produce.

    Returns:
        A list holding ``n`` slices of the shuffled ``list_in``.
    """
    random.shuffle(list_in)
    parts = []
    for offset in range(n):
        # Every n-th element starting at ``offset`` goes to the same part.
        parts.append(list_in[offset::n])
    return parts
class Distribute:
    """Split a dataset into per-client groups and into train/test subsets.

    ``distribute_data`` produces the per-client groups, either with the IID
    strategy (seeded shuffle followed by equally sized contiguous groups) or
    with the non-IID strategy (``np.array_split`` for a fixed number of
    clients, ``partition_list`` for a randomly drawn number of clients).
    ``split_data`` produces train/test subsets, either per sample or per
    user.

    The class relies on names provided by the surrounding module:
    ``random``, ``time``, ``np``, ``train_test_split`` and ``partition_list``.
    """

    def __init__(self, data, data_type):
        """Store the dataset and set the default configuration.

        Args:
            data: The dataset to distribute. It is indexed, sliced and
                measured with ``len`` by the methods of this class.
            data_type: Name of the data type; stored lower-cased.
        """
        self.data = data
        self.data_type = data_type.lower()
        self.selected_feature = -1
        self.type = 'iid'  # default distribution strategy for distribute_data
        self.client_no = 10  # number of users considered by the user split
        self.data_sample_fraction = 0.1
        self.min_user_number = 10  # bounds for the randomly drawn client count
        self.max_user_number = 20
        self.train_data_fraction = 0.9
        self.random_sampling_seed = 4  # seed of the IID shuffle
        self.random_split_seed = 1  # seed of the user split
        self.split_type = 'sample'  # default strategy for split_data

    def __shuffle(self, data, label=None):
        """Shuffle ``data`` in place using ``self.random_sampling_seed``.

        ``label`` is never used. It is optional because every call in this
        class passes only ``data``; a required ``label`` made those calls
        fail with ``TypeError``.
        """
        random.Random(self.random_sampling_seed).shuffle(data)

    def _split_evenly(self, number_of_groups):
        """Shuffle ``self.data`` and cut it into equally sized groups.

        Each group holds ``len(self.data) // number_of_groups`` consecutive
        items of the shuffled data; leftover items at the end are not
        assigned to any group.
        """
        self.__shuffle(self.data)
        group_size = len(self.data) // number_of_groups
        return [
            self.data[group_size * i: group_size * (i + 1)]
            for i in range(number_of_groups)
        ]

    def _iid_no_clint(self):
        """IID split into a random number of groups in ``[2, len(data))``.

        Raises:
            ValueError: If the dataset has fewer than three items, in which
                case no group count can be drawn from that range.
        """
        if len(self.data) < 3:
            raise ValueError(
                f'At least 3 data items are required to draw a random number of clients, '
                f'got {len(self.data)}.')
        size = random.randrange(2, len(self.data))
        return self._split_evenly(size)

    def _iid_clint(self, number_of_clients):
        """IID split into exactly ``number_of_clients`` groups."""
        return self._split_evenly(number_of_clients)

    def _iid(self, **kwargs):
        """Dispatch the IID split on the optional ``number_of_clients`` kwarg."""
        number_of_clients = kwargs.get('number_of_clients')
        if number_of_clients:
            return self._iid_clint(number_of_clients)
        return self._iid_no_clint()

    def _niid(self, **kwargs):
        """Non-IID split for ``'image'``, ``'text'`` or ``'csv'`` data.

        Keyword Args:
            data_type: Data type name; defaults to ``self.data_type`` and is
                matched case-insensitively.
            number_of_clients: If truthy, split into exactly this many groups.
            min_user_number, max_user_number: Bounds for the randomly drawn
                group count used when ``number_of_clients`` is not given;
                default to the instance attributes of the same name.

        Returns:
            The list of per-client groups.

        Raises:
            ValueError: If the data type is not supported, or if
                ``number_of_clients`` exceeds the number of data items.
        """
        min_user_number = kwargs.get('min_user_number', self.min_user_number)
        max_user_number = kwargs.get('max_user_number', self.max_user_number)
        number_of_clients = kwargs.get('number_of_clients')
        data_type = kwargs.get('data_type')
        if data_type is None:
            # Fall back to the type given at construction time.
            data_type = self.data_type
        if isinstance(data_type, str):
            data_type = data_type.lower()

        # (fixed-client splitter, random-client splitter) per data type.
        splitters = {
            'image': (self.__select_feature_image_client,
                      self.__select_feature_image_no_client),
            'text': (self.__select_feature_text_client,
                     self.__select_feature_text_no_client),
            'csv': (self.__select_feature_csv_client,
                    self.__select_feature_csv_no_client),
        }
        if not isinstance(data_type, str) or data_type not in splitters:
            raise ValueError(
                f'Given data type: "{data_type}" is not correct, '
                f'choose between options "text", "image" or "csv".')
        with_clients, without_clients = splitters[data_type]

        if not number_of_clients:
            return without_clients(min_user_number, max_user_number)
        if number_of_clients > len(self.data):
            raise ValueError(
                f'Total number of data: {len(self.data)} is less than '
                f'total number of clients specified: {number_of_clients}')
        return with_clients(number_of_clients)

    def distribute_data(self, **kwargs):
        """Split ``self.data`` into per-client groups.

        The ``dist_type`` kwarg (default ``self.type``) selects the strategy:
        ``'iid'`` uses the IID split, any other value the non-IID split.
        All kwargs are forwarded to the selected strategy.
        """
        if kwargs.get('dist_type', self.type) == 'iid':
            return self._iid(**kwargs)
        return self._niid(**kwargs)

    def _random_partition(self, min_user_number, max_user_number):
        """Partition the data into a random number of groups within the bounds."""
        client_size = random.randint(min_user_number, max_user_number)
        return partition_list(self.data, client_size)

    def _fixed_partition(self, number_of_clients):
        """Split the data into ``number_of_clients`` parts with ``np.array_split``."""
        return np.array_split(self.data, number_of_clients)

    def __select_feature_image_no_client(self, min_user_number, max_user_number):
        """Image data, random number of clients."""
        return self._random_partition(min_user_number, max_user_number)

    def __select_feature_image_client(self, number_of_clients):
        """Image data, fixed number of clients."""
        return self._fixed_partition(number_of_clients)

    def __select_feature_text_no_client(self, min_user_number, max_user_number):
        """Text data, random number of clients."""
        return self._random_partition(min_user_number, max_user_number)

    def __select_feature_text_client(self, number_of_clients):
        """Text data, fixed number of clients."""
        return self._fixed_partition(number_of_clients)

    def __select_feature_csv_no_client(self, min_user_number, max_user_number):
        """CSV data, random number of clients."""
        return self._random_partition(min_user_number, max_user_number)

    def __select_feature_csv_client(self, number_of_clients):
        """CSV data, fixed number of clients."""
        return self._fixed_partition(number_of_clients)

    def split_data(self, x, y, **kwargs):
        """Split samples ``x`` and labels ``y`` into train and test parts.

        Keyword Args:
            train_data_fraction: Fraction used for training; defaults to
                ``self.train_data_fraction``.
            type: ``'sample'`` for a per-sample split; any other value for a
                per-user split. Defaults to ``self.split_type``. (The
                distribution strategy ``self.type`` was used here before,
                which never equals ``'sample'`` by default.)

        Returns:
            A 4-tuple: for the sample split ``(x_train, x_test, y_train,
            y_test)``, for the user split ``(train_data, test_data,
            train_labels, test_labels)``.
        """
        train_data_fraction = kwargs.get('train_data_fraction', self.train_data_fraction)
        if kwargs.get('type', self.split_type) == 'sample':
            return self._sample_split(x, y, train_data_fraction)
        return self._user_split(train_data_fraction, x, y)

    def _user_split(self, train_data_fraction, x=None, y=None):
        """Assign whole users to either the training or the test set.

        The first ``self.client_no`` entries of the data are treated as
        users; ``int(train_data_fraction * self.client_no)`` of them are
        sampled for training with a generator seeded by
        ``self.random_split_seed`` (or by the current time when that seed is
        ``None`` or negative).

        Args:
            train_data_fraction: Fraction of users put into the training set.
            x: Per-user data; defaults to ``self.data``.
            y: Per-user labels; defaults to a ``label`` attribute on the
                instance if one has been set.

        Returns:
            ``(train_user_files, test_user_files, train_labels, test_labels)``.

        Raises:
            ValueError: If no labels are available, or if fewer than
                ``self.client_no`` users are present in the data or labels.
        """
        seed = self.random_split_seed
        rng_seed = seed if (seed is not None and seed >= 0) else int(time.time())
        rng = random.Random(rng_seed)

        data = self.data if x is None else x
        labels = getattr(self, 'label', None) if y is None else y
        if labels is None:
            raise ValueError('No labels available for the user split; pass them as "y".')

        num_users = self.client_no
        if num_users > min(len(data), len(labels)):
            raise ValueError(
                f'User split needs {num_users} users, but only '
                f'{min(len(data), len(labels))} data/label entries are available.')

        # Randomly pick the users that form the training set.
        num_train_users = int(train_data_fraction * num_users)
        train_indices = set(rng.sample(list(range(num_users)), num_train_users))

        train_user_files, test_user_files = [], []
        train_labels, test_labels = [], []
        for i in range(num_users):
            if i in train_indices:
                train_user_files.append(data[i])
                train_labels.append(labels[i])
            else:
                test_user_files.append(data[i])
                test_labels.append(labels[i])
        return train_user_files, test_user_files, train_labels, test_labels

    def _sample_split(self, x, y, train_data_fraction):
        """Per-sample split delegated to ``train_test_split``."""
        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=train_data_fraction)
        return x_train, x_test, y_train, y_test
# Notebook setup: select the data type and the location of the input data.
data_type = 'text'
input_path = '/content/drive/MyDrive/Divya-Yasaman/v2/data/text/topics_sample' # accepts either folder or csv file
# Reader is not defined in this snippet; it is constructed from the data type and the input path.
obj = Reader(data_type, input_path)
%%time
# The %%time line above is an IPython cell magic (it reports the run time of the cell), not plain Python.
# Read the dataset through the reader object created above.
data = obj.read_data()
#function DEFINITION which gives the error
def convert_to_client_data(data, data_type, **kwargs):
    """Distribute ``data`` over clients and wrap the result as TFF client data.

    Args:
        data: The dataset to distribute; passed to ``Distribute``.
        data_type: Data type name, passed both to ``Distribute`` and to
            ``Distribute.distribute_data``.
        **kwargs: Further options forwarded to ``Distribute.distribute_data``.

    Returns:
        The object returned by ``tff.simulation.datasets.TestClientData`` for
        a mapping of ``"client_<i>"`` to an ``OrderedDict`` whose single key
        ``'data'`` holds the i-th distributed group.
    """
    distributor_obj = Distribute(data, data_type)
    distributed_data = distributor_obj.distribute_data(data_type=data_type, **kwargs)

    client_train_dataset = collections.OrderedDict()
    for i, client_data in enumerate(distributed_data):
        # OrderedDict accepts one iterable of (key, value) pairs; passing the
        # key and the value as two positional arguments raises TypeError.
        client_train_dataset[f"client_{i}"] = collections.OrderedDict([('data', client_data)])

    print(f'Converting data to {len(distributed_data)} client data...')
    train_dataset = tff.simulation.datasets.TestClientData(client_train_dataset)
    print(f'Data successfully converted to {len(distributed_data)} client data.')
    return train_dataset
ERROR STATEMENT for the function definition
<decorator-gen-53> in time(self, line, cell, local_ns)
<timed exec> in <module>()
<ipython-input-60-7b390d37230c> in convert_to_client_data(data, data_type, **kwargs)
13 for i in range(len(distributed_data)):
14 client_name = "client_" + str(i)
---> 15 data = collections.OrderedDict('data', distributed_data[i])
16 # data = collections.OrderedDict( distributed_data[i])
17 client_train_dataset[client_name] = data
TypeError: expected at most 1 arguments, got 2
`collections.OrderedDict()` takes the same arguments as `dict()`: a mapping or a sequence of key/value pairs to put in the dictionary. It doesn't take the key and the value as separate positional arguments.
If `'data'` is supposed to be the key, don't pass it as a separate argument; pass a key/value pair instead:
data = collections.OrderedDict([('data', distributed_data[i])])
Also, as of Python 3.6, regular dictionaries retain their insertion order (an implementation detail of CPython 3.6 that became a language guarantee in Python 3.7), so you may not need to use `OrderedDict`. Just write:
data = {'data': distributed_data[i]}