Is Caffe's reported accuracy reliable?

I recently tried to get the confusion matrix for one of my trained models, to see how precise it is. I downloaded this script and fed my model. To my astonishment, the accuracy calculated by the script, is very different than the one, Caffe reports.

I have used this script to calculate the confusion matrix, this however, reports the accuracy as well, the problem is the accuracy reported by this script is way different that the one reported by Caffe! For example Caffe reports the accuracy lets say for CIFAR10, as 92.34%, while, when the model is fed to the script to calculate confusion matrix and its accuracy, it results in for example something like 86.5%!

Which one of these accuracies are the correct one, and can be reported in papers or compared with the results of other papers such as those here ?

I also saw something weird again, I trained two identical models, with only one difference, that being one used Xavier, and the other used msra for initialization. The first one reports an accuracy of 94.25 and the other reports 94.26 in Caffe. when these models are fed to the script I linked above, for confusion matrix computations. their accuracies were 89.2% and 87.4% respectively!

Is this normal? what is the cause for this? msra?

Are the accuracies reported by Caffe true and reliable?

PS: The accuracy in the script is calculated as (complete script):

  for i, image, label in reader:
        image_caffe = image.reshape(1, *image.shape)
        out = net.forward_all(data=np.asarray([ image_caffe ]))
        plabel = int(out['prob'][0].argmax(axis=0))

        count += 1
        iscorrect = label == plabel
        correct += (1 if iscorrect else 0)
        matrix[(label, plabel)] += 1
        labels_set.update([label, plabel])

        if not iscorrect:
            print("\rError: i=%s, expected %i but predicted %i" \
                    % (i, label, plabel))

        sys.stdout.write("\rAccuracy: %.1f%%" % (100.*correct/count))
        sys.stdout.flush()

    print(", %i/%i corrects" % (correct, count))

Which imho is OK and correct. the number of correct predictions, divided by the total number of instances in the dataset.

Solution

I found the reason. The reason for the mismatch between Caffe generated accuracy and the accuract generated by the script in question, was solely because of mean-subtraction, which was done in caffe, and not in the script. This is the modified version of script which takes this into account and hopefully everything is just fine.

# Author: Axel Angel, copyright 2015, license GPLv3.
# added mean subtraction so that, the accuracy can be reported accurately just like caffe when doing a mean subtraction
# Seyyed Hossein Hasan Pour
# Coderx7@Gmail.com
# 7/3/2016 

import sys
import caffe
import numpy as np
import lmdb
import argparse
from collections import defaultdict

def flat_shape(x):
    "Returns x without singleton dimension, eg: (1,28,28) -> (28,28)"
    return x.reshape(filter(lambda s: s > 1, x.shape))

def lmdb_reader(fpath):
    import lmdb
    lmdb_env = lmdb.open(fpath)
    lmdb_txn = lmdb_env.begin()
    lmdb_cursor = lmdb_txn.cursor()

    for key, value in lmdb_cursor:
        datum = caffe.proto.caffe_pb2.Datum()
        datum.ParseFromString(value)
        label = int(datum.label)
        image = caffe.io.datum_to_array(datum).astype(np.uint8)
        yield (key, flat_shape(image), label)

def leveldb_reader(fpath):
    import leveldb
    db = leveldb.LevelDB(fpath)

    for key, value in db.RangeIter():
        datum = caffe.proto.caffe_pb2.Datum()
        datum.ParseFromString(value)
        label = int(datum.label)
        image = caffe.io.datum_to_array(datum).astype(np.uint8)
        yield (key, flat_shape(image), label)

def npz_reader(fpath):
    npz = np.load(fpath)

    xs = npz['arr_0']
    ls = npz['arr_1']

    for i, (x, l) in enumerate(np.array([ xs, ls ]).T):
        yield (i, x, l)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--proto', type=str, required=True)
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--mean', type=str, required=True)
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--lmdb', type=str, default=None)
    group.add_argument('--leveldb', type=str, default=None)
    group.add_argument('--npz', type=str, default=None)
    args = parser.parse_args()

# Extract mean from the mean image file
    mean_blobproto_new = caffe.proto.caffe_pb2.BlobProto()
    f = open(args.mean, 'rb')
    mean_blobproto_new.ParseFromString(f.read())
    mean_image = caffe.io.blobproto_to_array(mean_blobproto_new)
    f.close()

    count = 0
    correct = 0
    matrix = defaultdict(int) # (real,pred) -> int
    labels_set = set()

   # CNN reconstruction and loading the trained weights 
    net = caffe.Net(args.proto, args.model, caffe.TEST)
    caffe.set_mode_cpu()
    print "args", vars(args)
    if args.lmdb != None:
        reader = lmdb_reader(args.lmdb)
    if args.leveldb != None:
        reader = leveldb_reader(args.leveldb)
    if args.npz != None:
        reader = npz_reader(args.npz)

    for i, image, label in reader:
        image_caffe = image.reshape(1, *image.shape)
        out = net.forward_all(data=np.asarray([ image_caffe ])- mean_image)
        plabel = int(out['prob'][0].argmax(axis=0))

        count += 1
        iscorrect = label == plabel
        correct += (1 if iscorrect else 0)
        matrix[(label, plabel)] += 1
        labels_set.update([label, plabel])

        if not iscorrect:
            print("\rError: i=%s, expected %i but predicted %i" \
                    % (i, label, plabel))

        sys.stdout.write("\rAccuracy: %.1f%%" % (100.*correct/count))
        sys.stdout.flush()

    print(", %i/%i corrects" % (correct, count))

    print ""
    print "Confusion matrix:"
    print "(r , p) | count"
    for l in labels_set:
        for pl in labels_set:
            print "(%i , %i) | %i" % (l, pl, matrix[(l,pl)])