python tensorflow keras confusion-matrix

How to get the correct confusion_matrix data from a custom data generator


I'm building a confusion matrix, but y_true always comes back with the wrong number of samples.

I think my labels are correct; I have 62 validation samples.

I don't know where y_true should be declared, or where I should read it from.

ValueError
Found input variables with inconsistent numbers of samples: [63, 62]
  File "C:\Labbb\inceptionResnetV2\InceptionResnetV2_1.py", line 213, in <module>
    sns.heatmap(confusion_matrix(y_true, y_pred),
ValueError: Found input variables with inconsistent numbers of samples: [63, 62]

I tried appending to self.y_true in get_data, returning it from a get_y_true method ("return self.y_true"), resetting it with "self.y_true = []" in on_epoch_end, and setting shuffle=False.

Here is CustomDataGenerator.

Where should I declare "self.y_true = []"?

# Root directories of the training and validation splits.
train_dir = r'C:\Labbb\mergeimage_npy\512512\npy\train'
valid_dir = r'C:\Labbb\mergeimage_npy\512512\npy\val'
# Sub-folder names are built from their numeric suffix; suffix 5 is not used.
image_folders = [f'image{k}' for k in (0, 1, 2, 3, 4, 6, 7)]
label_folders = [f'label{k}' for k in (0, 1, 2, 3, 4, 6, 7)]
class CustomDataGenerator(Sequence):
    """`Sequence` subclass that serves (image, one-hot label) batches from disk.

    File paths are collected from the given image/label sub-folders of `dir`;
    each batch loads its images with `np.load` and parses one integer label
    per label file. Every label that is read is also appended to `self.y_true`.
    """
    def __init__(self, image_folders, label_folders, dir, dim=(512,512),  batch_size=1,n_classes=7,n_channels=8,shuffle=True):
        """Store the configuration and build the initial path lists.

        The `...` below is elided in the original post; presumably it stores
        the remaining arguments (label_folders, dir, dim, batch_size,
        n_classes, n_channels, shuffle) on `self`, since the other methods
        read them from there -- TODO confirm.
        """
        self.image_folders = image_folders
        ...
        self.image_paths = []
        self.label_paths = []
        # Populates image_paths/label_paths and creates self.y_true.
        self.on_epoch_end()
    def __len__(self):
        """Return the number of batches: ceil(number of image paths / batch_size)."""
        return int(np.ceil(len(self.image_paths) / self.batch_size))  
    def __getitem__(self, index):
        """Return the `(X, y)` arrays for batch number `index`."""
        # Take the same slice from both lists and pair the entries by position.
        batch_image_paths = self.image_paths[index * self.batch_size: (index + 1) * self.batch_size]
        batch_label_paths = self.label_paths[index * self.batch_size: (index + 1) * self.batch_size]
        batch = zip(batch_image_paths, batch_label_paths)
        return self.get_data(batch)
    def on_epoch_end(self):
        """Rebuild both path lists from disk, reset `y_true`, optionally shuffle."""
        self.image_paths = []
        self.label_paths = []
        
        # The collected labels are discarded every time this method runs.
        self.y_true = []
        for folder in self.image_folders:
            image_folder_path = os.path.join(self.dir, folder)
            image_files = os.listdir(image_folder_path)
            for file_name in image_files:
                self.image_paths.append(os.path.join(image_folder_path, file_name))
        for folder in self.label_folders:
            # Elided in the original post; presumably fills self.label_paths
            # the same way the loop above fills self.image_paths -- TODO confirm.
            ...
                
        if self.shuffle:
            # NOTE(review): the two lists are shuffled by separate calls, so
            # entry i of image_paths and entry i of label_paths are no longer
            # guaranteed to belong together afterwards.
            np.random.shuffle(self.image_paths)
            np.random.shuffle(self.label_paths)
    def get_data(self, batch):
        """Load every (image_path, label_path) pair in `batch` into arrays.

        Returns `(X, y)`: `X` holds the loaded images, `y` the one-hot labels.
        Side effect: each parsed integer label is appended to `self.y_true`.
        """
        # Both arrays always get batch_size rows. NOTE(review): if `batch`
        # yields fewer pairs, the remaining rows keep np.empty's
        # uninitialised contents.
        X = np.empty((self.batch_size, *self.dim, self.n_channels))
        y = np.empty((self.batch_size, self.n_classes))
        
        for i, (image_path, label_path) in enumerate(batch):
            image = np.load(image_path)
            with open(label_path, 'r') as f:
                # Only the first line is read; the label is the integer after
                # the last space on that line.
                line = f.readline().strip()
                filepath, label = line.rsplit(' ', 1)
                label = int(label)
                # NOTE(review): this runs on every call, so a batch that is
                # requested more than once contributes its labels more than once.
                self.y_true.append(label)
            label_one_hot = to_categorical(label, num_classes=self.n_classes)

            X[i,] = image
            y[i,] = label_one_hot
            
        return X, y
    
    def get_y_true(self):
        """Return the list of labels collected so far (the list itself, not a copy)."""
        return self.y_true

Here is how I get y_true and y_pred and build the confusion matrix.

Here, should the line "y_true = val_datagen.get_y_true()" be placed before or after the line "Y_pred = model.predict"?

# `params` is defined outside this snippet; it supplies the remaining
# CustomDataGenerator keyword arguments.
train_datagen = CustomDataGenerator(image_folders, label_folders, train_dir, **params, shuffle = True)
val_datagen = CustomDataGenerator(image_folders, label_folders, valid_dir, **params, shuffle = False)

# get_y_true() hands back the generator's own list object, not a copy.
# NOTE(review): presumably predict() pulls its batches through the generator,
# which appends to that same list -- so y_true keeps growing after this line.
# The 63-vs-62 error suggests one batch was requested twice -- TODO confirm.
y_true = val_datagen.get_y_true()
Y_pred = model.predict(val_datagen)
# Predicted class = index of the largest score in each row.
y_pred = np.argmax(Y_pred, axis=1) 
fig, ax = plt.subplots(figsize=(12,6))  
# confusion_matrix needs y_true and y_pred to have the same number of samples.
sns.heatmap(confusion_matrix(y_true, y_pred),annot=True, fmt="d", cmap='Greens',ax = ax)

Solution

  • I can't test it, but it should work now. I moved the self.y_true = [] from on_epoch_end() to __getitem__(), where it is reset only when the first batch (index 0) is requested. It would be even better to do this in a callback at epoch start. This only works after the dataset has been iterated at least once, because the images and labels are loaded one batch after another in get_data().
    But I'm also not sure how many images are loaded with a batch size of 1. It seems one batch gets an image folder path; are there multiple images in that folder?

    # Root directories of the training and validation splits.
    train_dir = r'C:\Labbb\mergeimage_npy\512512\npy\train'
    valid_dir = r'C:\Labbb\mergeimage_npy\512512\npy\val'
    # Sub-folder names are built from their numeric suffix; suffix 5 is not used.
    image_folders = [f'image{k}' for k in (0, 1, 2, 3, 4, 6, 7)]
    label_folders = [f'label{k}' for k in (0, 1, 2, 3, 4, 6, 7)]
    
    class CustomDataGenerator(Sequence):
        """`Sequence` subclass that serves (image, one-hot label) batches from disk.

        Image and label file paths are collected once, at construction time.
        The two path lists are assumed to be index-aligned (entry i of
        `label_paths` labels entry i of `image_paths`) and are only ever
        reordered together, so that pairing survives shuffling.

        `y_true` collects the integer label of every sample served since batch 0
        was last requested. It lines up with the model's predictions only if the
        batches are then requested in order, once each, and the generator is not
        shuffled in between (use `shuffle=False` for evaluation).
        """

        def __init__(self, image_folders, label_folders, dir, dim=(512,512),  batch_size=1,n_classes=7,n_channels=8,shuffle=True):
            """Store the configuration and collect the file paths.

            The `...` below is elided in the original post; presumably it
            stores the remaining arguments on `self`, since the other methods
            read them from there -- TODO confirm.
            """
            self.image_folders = image_folders
            ...
            self.image_paths = []
            self.label_paths = []
            # Created up front so get_y_true() and get_data() never hit a
            # missing attribute when batch 0 has not been requested yet.
            self.y_true = []
            self.init_paths()

        def __len__(self):
            """Return the number of batches: ceil(number of image paths / batch_size)."""
            return int(np.ceil(len(self.image_paths) / self.batch_size))

        def __getitem__(self, index):
            """Return the `(X, y)` arrays for batch number `index`."""
            if index == 0:
                # A request for batch 0 marks the start of a new pass over the
                # data: drop the labels of any earlier (or repeated) pass so
                # y_true does not end up longer than the dataset.
                self.y_true = []
            start = index * self.batch_size
            stop = start + self.batch_size
            batch = zip(self.image_paths[start:stop], self.label_paths[start:stop])
            return self.get_data(batch)

        def init_paths(self):
            """Append every file found in the configured folders to the path lists."""
            for folder in self.image_folders:
                image_folder_path = os.path.join(self.dir, folder)
                for file_name in os.listdir(image_folder_path):
                    self.image_paths.append(os.path.join(image_folder_path, file_name))
            for folder in self.label_folders:
                # Elided in the original post; presumably fills
                # self.label_paths the same way -- TODO confirm.
                ...
            if self.shuffle:
                self._shuffle_paths()

        def on_epoch_end(self):
            """Reshuffle the sample order between epochs when shuffling is enabled."""
            if self.shuffle:
                self._shuffle_paths()

        def _shuffle_paths(self):
            """Reorder image and label paths with one shared random permutation.

            Shuffling the two lists separately would assign images to the
            wrong labels, so both are reindexed with the same permutation.

            Raises:
                ValueError: if the two lists differ in length and therefore
                    cannot be paired one-to-one.
            """
            if len(self.image_paths) != len(self.label_paths):
                raise ValueError(
                    "image_paths and label_paths differ in length "
                    f"({len(self.image_paths)} vs {len(self.label_paths)}); "
                    "cannot keep images and labels paired while shuffling"
                )
            order = np.random.permutation(len(self.image_paths))
            self.image_paths = [self.image_paths[i] for i in order]
            self.label_paths = [self.label_paths[i] for i in order]

        @staticmethod
        def _read_label(label_path):
            """Return the integer after the last space on the file's first line."""
            with open(label_path, 'r') as f:
                line = f.readline().strip()
            _, label = line.rsplit(' ', 1)
            return int(label)

        def get_data(self, batch):
            """Load every (image_path, label_path) pair in `batch` into arrays.

            Returns `(X, y)`: `X` holds the loaded images, `y` the one-hot
            labels. Both have exactly one row per pair, so a short final batch
            carries no uninitialised padding rows.
            Side effect: each parsed integer label is appended to `self.y_true`.
            """
            # Materialise first: the caller passes a one-shot zip and the
            # arrays must be sized to the real number of samples.
            pairs = list(batch)
            X = np.empty((len(pairs), *self.dim, self.n_channels))
            y = np.empty((len(pairs), self.n_classes))

            for i, (image_path, label_path) in enumerate(pairs):
                image = np.load(image_path)
                label = self._read_label(label_path)
                self.y_true.append(label)

                X[i,] = image
                y[i,] = to_categorical(label, num_classes=self.n_classes)

            return X, y

        def get_y_true(self):
            """Return the labels collected since batch 0 was last requested.

            The list itself is returned, not a copy. Call this after the pass
            over the data (e.g. after `model.predict`) has finished.
            """
            return self.y_true