I'm trying to build a 3D CNN for binary classification of greyscale MRI data. I'm new to this, so don't pull any punches, I'm here to learn! I have a subsample of 20 3D files with dimensions (189, 233, 197). I add a dimension to act as the channel using np.reshape, giving (189, 233, 197, 1). I use tf.shape to get the shape of the dataset, which is
<tf.Tensor: shape=(5,), dtype=int32, numpy=array([ 20, 189, 233, 197, 1], dtype=int32)>
and the same on the label data which is
<tf.Tensor: shape=(1,), dtype=int32, numpy=array([20], dtype=int32)>
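For clarity, here is a minimal sketch of that channel step on a single volume (random data standing in for a real scan):
import numpy as np
import tensorflow as tf

volume = np.random.random((189, 233, 197)).astype(np.float32)  # stand-in for one MRI volume
volume = np.reshape(volume, (189, 233, 197, 1))                # add a trailing channel axis
# equivalently: volume = volume[..., np.newaxis]
print(tf.shape(volume))  # should print tf.Tensor([189 233 197   1], shape=(4,), dtype=int32)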
Below is the full code that I'm using:
import numpy as np
import glob
import os
import tensorflow as tf
import pandas as pd
import SimpleITK as sitk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from google.colab import drive
drive.mount('/content/gdrive')
datapath = '/content/gdrive/My Drive/DirectoryTest/All Data/'
patients = os.listdir(datapath)
labels_df = pd.read_csv('/content/Data_Index.csv', index_col = 0 )
FullDataSet = []
for patient in patients:
    a = sitk.ReadImage(datapath + patient)   # read one MRI volume
    b = sitk.GetArrayFromImage(a)            # convert it to a NumPy array
    c = np.reshape(b, (189, 233, 197))       # enforce the expected dimensions
    FullDataSet.append(c)
labelset = []
for i in patients:
    label = labels_df.loc[i, 'Group']
    if label == 'AD':   # use `==` instead of `is` to compare strings
        labelset.append(0.)
    elif label == 'CN':
        labelset.append(1.)
    else:
        raise ValueError(f"Oops, unknown label: {label}")  # raising a bare string is invalid in Python 3
labelset = np.array(labelset)
x_train, x_valid, y_train, y_valid = train_test_split(FullDataSet, labelset, train_size=0.75)
## 3D CNN
CNN_model = tf.keras.Sequential([
    #tf.keras.layers.Reshape([189, 233, 197, 1], input_shape=[189, 233, 197]),
    tf.keras.layers.Input(shape=[189, 233, 197, 1]),
    tf.keras.layers.Conv3D(kernel_size=(7, 7, 7), filters=32, activation='relu',
                           padding='same', strides=(3, 3, 3)),
    #tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPool3D(pool_size=(3, 3, 3), padding='same'),
    tf.keras.layers.Dropout(0.20),
    tf.keras.layers.Conv3D(kernel_size=(5, 5, 5), filters=64, activation='relu',
                           padding='same', strides=(3, 3, 3)),
    #tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPool3D(pool_size=(2, 2, 2), padding='same'),
    tf.keras.layers.Dropout(0.20),
    tf.keras.layers.Conv3D(kernel_size=(3, 3, 3), filters=128, activation='relu',
                           padding='same', strides=(1, 1, 1)),
    #tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPool3D(pool_size=(2, 2, 2), padding='same'),
    tf.keras.layers.Dropout(0.20),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.20),
    # sigmoid for a single-unit binary output; softmax would be for a multi-class output
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# Compile the model
CNN_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001), loss='binary_crossentropy', metrics=['accuracy'])  # `lr` is a deprecated alias for `learning_rate`
# print model layers
CNN_model.summary()
CNN_history = CNN_model.fit(x_train, y_train, epochs=10, validation_data=[x_valid, y_valid], batch_size=1)
When I attempt to fit the model, the dimensionality doesn't seem to line up and I get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-48-c698c45a4d36> in <module>()
1 #running of the model
2 #CNN_history = CNN_model.fit(dataset_train, epochs=100, validation_data =dataset_test, validation_steps=1)
----> 3 CNN_history = CNN_model.fit(x_train, y_train, epochs=10, validation_data=[x_valid, y_valid], batch_size = 1)
4
5
3 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside `run_distribute_coordinator` already.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1061 use_multiprocessing=use_multiprocessing,
1062 model=self,
-> 1063 steps_per_execution=self._steps_per_execution)
1064
1065 # Container that configures and calls `tf.keras.Callback`s.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weight, batch_size, steps_per_epoch, initial_epoch, epochs, shuffle, class_weight, max_queue_size, workers, use_multiprocessing, model, steps_per_execution)
1115 use_multiprocessing=use_multiprocessing,
1116 distribution_strategy=ds_context.get_strategy(),
-> 1117 model=model)
1118
1119 strategy = ds_context.get_strategy()
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/data_adapter.py in __init__(self, x, y, sample_weights, sample_weight_modes, batch_size, epochs, steps, shuffle, **kwargs)
280 label, ", ".join(str(i.shape[0]) for i in nest.flatten(data)))
281 msg += "Please provide data which shares the same first dimension."
--> 282 raise ValueError(msg)
283 num_samples = num_samples.pop()
284
ValueError: Data cardinality is ambiguous:
x sizes: 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189
y sizes: 15
Please provide data which shares the same first dimension.
The training split is set to 0.75, so 15 of the 20 samples go to training. I'm confused about why this isn't working and can't figure out why this is the input the model is receiving. I have had some help previously, and using the following code to create a dummy set results in a model that will run:
train_size = 20
val_size = 5
X_train = np.random.random([train_size, 189, 233, 197]).astype(np.float32)
X_valid = np.random.random([val_size, 189, 233, 197]).astype(np.float32)
y_train = np.random.randint(2, size=train_size).astype(np.float32)
y_valid = np.random.randint(2, size=val_size).astype(np.float32)
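In case it helps, these are the sanity checks I plan to run right before the failing fit() call (just type and shape inspection):
# x_train and y_train come from train_test_split on FullDataSet (a Python list) above
print(type(x_train))               # list or np.ndarray?
print(np.asarray(x_train).shape)   # expecting (15, 189, 233, 197) or (15, 189, 233, 197, 1)
print(np.asarray(y_train).shape)   # expecting (15,)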
I've been hitting my head against the wall for a while on this one. Any help would be greatly appreciated.
I do not have commenting privileges at this time, otherwise I would comment, as this might not be a full answer. That said:
When I try creating a toy 4-dimensional dataset and then appending it to a list (to add a channel, which is what I believe you have done?), the shape I get is not (dim1, dim2, dim3, dim4, channel) but (channel, dim1, dim2, dim3, dim4). I've included a worked example below:
import numpy as np

arr = np.arange(0, 625).reshape(5, 5, 5, 5)
print(arr.shape)              # returns (5, 5, 5, 5)

lst = []
lst.append(arr)
print(np.asarray(lst).shape)  # returns (1, 5, 5, 5, 5) - the new axis is first, not last
Based on this, could it be that the shape of your data is actually (1, 189, 233, 197) rather than (189, 233, 197, 1) as you intended?
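If a trailing channel axis is what you want, one option (a sketch with random stand-in data, not your actual pipeline) is to stack the list into a single array and then expand the last axis:
import numpy as np

volumes = [np.random.random((189, 233, 197)) for _ in range(20)]  # stand-ins for the MRI arrays
data = np.stack(volumes)                # (20, 189, 233, 197) - sample axis first
data = np.expand_dims(data, axis=-1)    # (20, 189, 233, 197, 1) - channel axis last
print(data.shape)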
In addition, the error message seems to imply that you are not passing the same number of samples for X and y:
ValueError: Data cardinality is ambiguous:
x sizes: 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189, 189
y sizes: 15
Please provide data which shares the same first dimension.
Typically the inputs to a network have identical first dimensions (borrowing your own toy dataset as an example and running):
print(X_train.shape, y_train.shape, X_valid.shape, y_valid.shape)
# returns: (20, 189, 233, 197) (20,) (5, 189, 233, 197) (5,)
They match because each sample then corresponds to exactly one label, and vice versa. The error message is saying that the first dimensions it sees are 189 for your X inputs and 15 for your y. Could you double-check the shapes immediately prior to inputting the data to the network?
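As a concrete way to do that check (a sketch, assuming the variable names from your posted code), I would convert the lists to arrays first, since Keras can interpret a plain Python list of arrays as multiple separate inputs rather than one batch of samples:
import numpy as np

x_train = np.asarray(x_train)   # list of per-patient arrays -> one array with samples on axis 0
x_valid = np.asarray(x_valid)
print(x_train.shape, y_train.shape)   # expect (15, 189, 233, 197) and (15,)
print(x_valid.shape, y_valid.shape)   # expect (5, 189, 233, 197) and (5,)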