python, tensorflow, tensorflow-slim, vgg-net

Tensorflow object detection API: Custom VGG 16 model


I am in the process of creating a custom VGG 16 model as the feature extractor of a Faster RCNN model in the Tensorflow object detection API. As mentioned in the document https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/defining_your_own_model.md, the feature extractor code consists of extract_proposal_features and extract_classifier_features. I am using TF slim code to create the convolution layers (since the Tensorflow team uses it). As a reference, please find the model structure of VGG 16 returned by TF slim:

      OrderedDict([('vgg_16/conv1/conv1_1',
         <tf.Tensor 'vgg_16/vgg_16/conv1/conv1_1/Relu:0' shape=(?, 224, 224, 64) dtype=float32>),
        ('vgg_16/conv1/conv1_2',
         <tf.Tensor 'vgg_16/vgg_16/conv1/conv1_2/Relu:0' shape=(?, 224, 224, 64) dtype=float32>),
        ('vgg_16/vgg_16/pool1',
         <tf.Tensor 'vgg_16/vgg_16/pool1/MaxPool:0' shape=(?, 112, 112, 64) dtype=float32>),
        ('vgg_16/conv2/conv2_1',
         <tf.Tensor 'vgg_16/vgg_16/conv2/conv2_1/Relu:0' shape=(?, 112, 112, 128) dtype=float32>),
        ('vgg_16/conv2/conv2_2',
         <tf.Tensor 'vgg_16/vgg_16/conv2/conv2_2/Relu:0' shape=(?, 112, 112, 128) dtype=float32>),
        ('vgg_16/vgg_16/pool2',
         <tf.Tensor 'vgg_16/vgg_16/pool2/MaxPool:0' shape=(?, 56, 56, 128) dtype=float32>),
        ('vgg_16/conv3/conv3_1',
         <tf.Tensor 'vgg_16/vgg_16/conv3/conv3_1/Relu:0' shape=(?, 56, 56, 256) dtype=float32>),
        ('vgg_16/conv3/conv3_2',
         <tf.Tensor 'vgg_16/vgg_16/conv3/conv3_2/Relu:0' shape=(?, 56, 56, 256) dtype=float32>),
        ('vgg_16/conv3/conv3_3',
         <tf.Tensor 'vgg_16/vgg_16/conv3/conv3_3/Relu:0' shape=(?, 56, 56, 256) dtype=float32>),
        ('vgg_16/vgg_16/pool3',
         <tf.Tensor 'vgg_16/vgg_16/pool3/MaxPool:0' shape=(?, 28, 28, 256) dtype=float32>),
        ('vgg_16/conv4/conv4_1',
         <tf.Tensor 'vgg_16/vgg_16/conv4/conv4_1/Relu:0' shape=(?, 28, 28, 512) dtype=float32>),
        ('vgg_16/conv4/conv4_2',
         <tf.Tensor 'vgg_16/vgg_16/conv4/conv4_2/Relu:0' shape=(?, 28, 28, 512) dtype=float32>),
        ('vgg_16/conv4/conv4_3',
         <tf.Tensor 'vgg_16/vgg_16/conv4/conv4_3/Relu:0' shape=(?, 28, 28, 512) dtype=float32>),
        ('vgg_16/vgg_16/pool4',
         <tf.Tensor 'vgg_16/vgg_16/pool4/MaxPool:0' shape=(?, 14, 14, 512) dtype=float32>),
        ('vgg_16/conv5/conv5_1',
         <tf.Tensor 'vgg_16/vgg_16/conv5/conv5_1/Relu:0' shape=(?, 14, 14, 512) dtype=float32>),
        ('vgg_16/conv5/conv5_2',
         <tf.Tensor 'vgg_16/vgg_16/conv5/conv5_2/Relu:0' shape=(?, 14, 14, 512) dtype=float32>),
        ('vgg_16/conv5/conv5_3',
         <tf.Tensor 'vgg_16/vgg_16/conv5/conv5_3/Relu:0' shape=(?, 14, 14, 512) dtype=float32>),
        ('vgg_16/vgg_16/pool5',
         <tf.Tensor 'vgg_16/vgg_16/pool5/MaxPool:0' shape=(?, 7, 7, 512) dtype=float32>),
        ('vgg_16/fc6',
         <tf.Tensor 'vgg_16/vgg_16/fc6/Relu:0' shape=(?, 1, 1, 4096) dtype=float32>),
        ('vgg_16/fc7',
         <tf.Tensor 'vgg_16/vgg_16/fc7/Relu:0' shape=(?, 1, 1, 4096) dtype=float32>)])
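
For context, here is a minimal sketch of the kind of call that can produce a listing like the one above (assuming TF 1.x with a slim version whose vgg_16 accepts num_classes=None, as in the docstring quoted in the answer below; the extra outer vgg_16 variable scope is only a guess to match the doubled vgg_16/vgg_16 tensor names):

      import tensorflow as tf
      from tensorflow.contrib.slim.nets import vgg

      inputs = tf.placeholder(tf.float32, [None, 224, 224, 3])
      # Building under an extra 'vgg_16' scope reproduces the doubled
      # 'vgg_16/vgg_16/...' tensor names shown above.
      with tf.variable_scope('vgg_16'):
          # num_classes=None omits the fc8 logits layer, so the
          # end_points listing stops at fc7 as above.
          _, end_points = vgg.vgg_16(inputs, num_classes=None)
      for name, tensor in end_points.items():
          print(name, tensor)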

My question is: which convolution layers need to be included and returned in the extract_proposal_features method, and which in the extract_classifier_features method? Please let me know.


Solution

  • I've changed the slim VGG code to get the right tensor.

    import tensorflow as tf

    slim = tf.contrib.slim


    def vgg_16(inputs,
               num_classes=1000,
               is_training=True,
               dropout_keep_prob=0.5,
               spatial_squeeze=True,
               scope='vgg_16',
               fc_conv_padding='VALID',
               global_pool=False):
        """Oxford Net VGG 16-Layers version D Example.

        Note: All the fully_connected layers have been transformed to conv2d
              layers. To use in classification mode, resize input to 224x224.

        Args:
          inputs: a tensor of size [batch_size, height, width, channels].
          num_classes: number of predicted classes. If 0 or None, the logits
            layer is omitted and the input features to the logits layer are
            returned instead.
          is_training: whether or not the model is being trained.
          dropout_keep_prob: the probability that activations are kept in the
            dropout layers during training.
          spatial_squeeze: whether or not should squeeze the spatial dimensions
            of the outputs. Useful to remove unnecessary dimensions for
            classification.
          scope: Optional scope for the variables.
          fc_conv_padding: the type of padding to use for the fully connected
            layer that is implemented as a convolutional layer. Use 'SAME'
            padding if you are applying the network in a fully convolutional
            manner and want to get a prediction map downsampled by a factor of
            32 as an output. Otherwise, the output prediction map will be
            (input / 32) - 6 in case of 'VALID' padding.
          global_pool: Optional boolean flag. If True, the input to the
            classification layer is avgpooled to size 1x1, for any input size.
            (This is not part of the original VGG architecture.)

        Returns:
          net: the output of the logits layer (if num_classes is a non-zero
            integer), or the input to the logits layer (if num_classes is 0 or
            None).
          end_points: a dict of tensors with intermediate activations.
        """
        with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc:
            end_points_collection = sc.original_name_scope + '_end_points'
            # Collect outputs for conv2d, fully_connected and max_pool2d.
            with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d],
                                outputs_collections=end_points_collection):
                net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
                net = slim.max_pool2d(net, [2, 2], scope='pool1')
                net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
                net = slim.max_pool2d(net, [2, 2], scope='pool2')
                net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
                net = slim.max_pool2d(net, [2, 2], scope='pool3')
                net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
                net = slim.max_pool2d(net, [2, 2], scope='pool4')
                net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
                net = slim.max_pool2d(net, [2, 2], scope='pool5')

                # Convert end_points_collection into an end_points dict.
                end_points = slim.utils.convert_collection_to_dict(end_points_collection)

                # Expose the pool5 output so _extract_proposal_features
                # can pick it up below.
                end_points['head'] = net

                # Use conv2d instead of fully_connected layers.
                net = slim.conv2d(net, 4096, [7, 7], padding=fc_conv_padding, scope='fc6')
                net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                                   scope='dropout6')
                net = slim.conv2d(net, 4096, [1, 1], scope='fc7')

                if global_pool:
                    net = tf.reduce_mean(net, [1, 2], keep_dims=True, name='global_pool')
                    end_points['global_pool'] = net
                if num_classes:
                    net = slim.dropout(net, dropout_keep_prob, is_training=is_training,
                                       scope='dropout7')
                    net = slim.conv2d(net, num_classes, [1, 1],
                                      activation_fn=None,
                                      normalizer_fn=None,
                                      scope='fc8')
                    if spatial_squeeze and num_classes is not None:
                        net = tf.squeeze(net, [1, 2], name='fc8/squeezed')
                    end_points[sc.name + '/fc8'] = net
                return net, end_points
    

    end_points['head'] = net is the tensor used by _extract_proposal_features.
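
    As a quick sanity check of the modified function (my own sketch, not from the original answer):

    # Hypothetical shape check; assumes a 224x224 input and the
    # modified vgg_16 above in scope.
    inputs = tf.random_uniform([1, 224, 224, 3])
    _, end_points = vgg_16(inputs, num_classes=None)
    print(end_points['head'])  # pool5 output, shape (1, 7, 7, 512)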

    def _extract_proposal_features(self, preprocessed_inputs, scope):
        """Extracts first stage RPN features.
    
        Args:
          preprocessed_inputs: A [batch, height, width, channels] float32 tensor
            representing a batch of images.
          scope: A scope name.
    
        Returns:
          rpn_feature_map: A tensor with shape [batch, height, width, depth]
        Raises:
          InvalidArgumentError: If the spatial size of `preprocessed_inputs`
            (height or width) is less than 33.
          ValueError: If the created network is missing the required activation.
        """
    
        preprocessed_inputs.get_shape().assert_has_rank(4)
        shape_assert = tf.Assert(
            tf.logical_and(tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
                           tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
            ['image size must at least be 33 in both height and width.'])
    
        with tf.control_dependencies([shape_assert]):
            with tf.variable_scope('vgg_16', 'vgg_16', reuse=self._reuse_weights):
                _, activations = vgg.vgg_16(
                    preprocessed_inputs,
                    scope=scope)
    
        return activations['head']
    

    and

    def _extract_box_classifier_features(self, proposal_feature_maps, scope):
        """Extracts second stage box classifier features.
    
        Args:
          proposal_feature_maps: A 4-D float tensor with shape
            [batch_size * self.max_num_proposals, crop_height, crop_width, depth]
            representing the feature map cropped to each proposal.
          scope: A scope name (unused).
    
        Returns:
          proposal_classifier_features: A 4-D float tensor with shape
            [batch_size * self.max_num_proposals, height, width, depth]
            representing box classifier features for each proposal.
        """
        net = proposal_feature_maps
    
        with tf.variable_scope('vgg_16', reuse=self._reuse_weights):
            with slim.arg_scope(
                    [slim.conv2d],
                    stride=1,
                    padding='VALID'):
                # Use conv2d instead of fully_connected layers.
                fc6 = slim.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6')
                if self._is_training:
                    fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True,
                                       scope='dropout6')
                fc7 = slim.conv2d(fc6, 4096, [1, 1], scope='fc7')
                if self._is_training:
                    fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True,
                                       scope='dropout7')
        proposal_classifier_features = fc7
    
        return proposal_classifier_features
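
    Because fc6 and fc7 here reuse the same variable scopes as slim's vgg_16, weights from a pretrained VGG checkpoint can be restored into both stages. A minimal restore sketch (my addition; the 'vgg_16.ckpt' path is a placeholder):

    # Sketch: restore pretrained VGG weights into the extractor.
    # 'vgg_16.ckpt' is a placeholder path, not from the original post.
    variables_to_restore = slim.get_variables_to_restore(
        include=['vgg_16'], exclude=['vgg_16/fc8'])
    init_fn = slim.assign_from_checkpoint_fn('vgg_16.ckpt',
                                             variables_to_restore)
    # Later, inside a session: init_fn(sess)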
    

    I did it like this; I don't know if it's the correct way :)

    This is my test code.

    import numpy as np
    import tensorflow as tf
    
    from models import faster_rcnn_vgg_16_feature_extractor as faster_rcnn_vgg_16
    
    
    class FasterRcnnVgg16FeatureExtractorTest(tf.test.TestCase):

        def _build_feature_extractor(self, first_stage_features_stride):
            return faster_rcnn_vgg_16.FasterRCNNVgg16FeatureExtractor(
                is_training=False,
                first_stage_features_stride=first_stage_features_stride,
                weight_decay=0.0005)

        def test_extract_proposal_features_returns_expected_size(self):
            feature_extractor = self._build_feature_extractor(
                first_stage_features_stride=16)
            preprocessed_inputs = tf.random_uniform(
                [4, 224, 224, 3], maxval=255, dtype=tf.float32)
            rpn_feature_map = feature_extractor.extract_proposal_features(
                preprocessed_inputs, scope='TestScope')
            features_shape = tf.shape(rpn_feature_map)

            init_op = tf.global_variables_initializer()
            with self.test_session() as sess:
                sess.run(init_op)
                features_shape_out = sess.run(features_shape)
                self.assertAllEqual(features_shape_out, [4, 7, 7, 512])

        def test_extract_proposal_features_stride_eight(self):
            feature_extractor = self._build_feature_extractor(
                first_stage_features_stride=8)
            preprocessed_inputs = tf.random_uniform(
                [4, 224, 224, 3], maxval=255, dtype=tf.float32)
            rpn_feature_map = feature_extractor.extract_proposal_features(
                preprocessed_inputs, scope='TestScope')
            features_shape = tf.shape(rpn_feature_map)

            init_op = tf.global_variables_initializer()
            with self.test_session() as sess:
                sess.run(init_op)
                features_shape_out = sess.run(features_shape)
                self.assertAllEqual(features_shape_out, [4, 7, 7, 512])

        def test_extract_proposal_features_half_size_input(self):
            feature_extractor = self._build_feature_extractor(
                first_stage_features_stride=16)
            preprocessed_inputs = tf.random_uniform(
                [1, 112, 112, 3], maxval=255, dtype=tf.float32)
            rpn_feature_map = feature_extractor.extract_proposal_features(
                preprocessed_inputs, scope='TestScope')
            features_shape = tf.shape(rpn_feature_map)

            init_op = tf.global_variables_initializer()
            with self.test_session() as sess:
                sess.run(init_op)
                features_shape_out = sess.run(features_shape)
                self.assertAllEqual(features_shape_out, [1, 4, 4, 512])

        def test_extract_proposal_features_dies_on_invalid_stride(self):
            with self.assertRaises(ValueError):
                self._build_feature_extractor(first_stage_features_stride=99)

        def test_extract_proposal_features_dies_on_very_small_images(self):
            feature_extractor = self._build_feature_extractor(
                first_stage_features_stride=16)
            preprocessed_inputs = tf.placeholder(tf.float32, (4, None, None, 3))
            rpn_feature_map = feature_extractor.extract_proposal_features(
                preprocessed_inputs, scope='TestScope')
            features_shape = tf.shape(rpn_feature_map)

            init_op = tf.global_variables_initializer()
            with self.test_session() as sess:
                sess.run(init_op)
                with self.assertRaises(tf.errors.InvalidArgumentError):
                    sess.run(
                        features_shape,
                        feed_dict={preprocessed_inputs: np.random.rand(4, 32, 32, 3)})

        def test_extract_proposal_features_dies_with_incorrect_rank_inputs(self):
            feature_extractor = self._build_feature_extractor(
                first_stage_features_stride=16)
            preprocessed_inputs = tf.random_uniform(
                [224, 224, 3], maxval=255, dtype=tf.float32)
            with self.assertRaises(ValueError):
                feature_extractor.extract_proposal_features(
                    preprocessed_inputs, scope='TestScope')

        def test_extract_box_classifier_features_returns_expected_size(self):
            feature_extractor = self._build_feature_extractor(
                first_stage_features_stride=16)
            proposal_feature_maps = tf.random_uniform(
                [3, 7, 7, 512], maxval=255, dtype=tf.float32)
            proposal_classifier_features = (
                feature_extractor.extract_box_classifier_features(
                    proposal_feature_maps, scope='TestScope'))
            features_shape = tf.shape(proposal_classifier_features)

            init_op = tf.global_variables_initializer()
            with self.test_session() as sess:
                sess.run(init_op)
                features_shape_out = sess.run(features_shape)
                self.assertAllEqual(features_shape_out, [3, 1, 1, 4096])


    if __name__ == '__main__':
        tf.test.main()
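
    Finally, to actually train with this extractor through the object detection API, it also has to be registered in object_detection/builders/model_builder.py. A sketch of that step (FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP does exist in the API's model builder; the module and class names below just follow my test import and are otherwise assumptions):

    from models import faster_rcnn_vgg_16_feature_extractor as frcnn_vgg

    FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP = {
        # ... keep the existing resnet/inception entries as they are ...
        'faster_rcnn_vgg_16': frcnn_vgg.FasterRCNNVgg16FeatureExtractor,
    }

    The feature_extractor.type field in the training config would then be set to 'faster_rcnn_vgg_16'.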