python tensorflow tensorflow2.0 tflearn migrating
Migrating code to TF2 fails with ModuleNotFoundError: No module named 'tensorflow.contrib.framework'

I have this code here from https://github.com/skokec/DAU-ConvNet, which is based on this paper https://arxiv.org/abs/1902.07474 and replaces the standard grid-based filters in a convolutional block with an adaptive filter version. When I try to import the package I get this error:
ModuleNotFoundError: No module named 'tensorflow.contrib.framework'.
I know that "tensorflow.contrib" is apparently being removed in version 2.0 and that I would need to revert to version <= 1.14 to make it work. However, I wanted to see if someone can make this code work in the newer TF 2.x versions. This is a very interesting paper with very good results, and migrating this code to TF2 would encourage other people to try it out and experiment with this layer in different architectures/setups. Below is the source code:
import os
import numpy as np
import tensorflow as tf

from tensorflow.python.layers import base
from tensorflow.python.layers import utils

from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape

from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import nn
from tensorflow.python.ops import init_ops

class DAUConv2dTF(base.Layer):
def __init__(self, filters,
             dau_units,
             max_kernel_size,
             strides=1,
             data_format='channels_first',
             activation=None,
             use_bias=True,
             weight_initializer=init_ops.random_normal_initializer(stddev=0.1),
             mu1_initializer=None,
             mu2_initializer=None,
             sigma_initializer=None,
             bias_initializer=init_ops.zeros_initializer(),
             weight_regularizer=None,
             mu1_regularizer=None,
             mu2_regularizer=None,
             sigma_regularizer=None,
             bias_regularizer=None,
             activity_regularizer=None,
             weight_constraint=None,
             mu1_constraint=None,
             mu2_constraint=None,
             sigma_constraint=None,
             bias_constraint=None,
             trainable=True,
             mu_learning_rate_factor=500,
             dau_unit_border_bound=0.01,
             dau_sigma_trainable=False,
             name=None,
             **kwargs):
    """Record the configuration of a 2-D DAU convolution layer.

    Only settings are stored here; the variables themselves are created
    later by ``build()``.

    Args:
        filters: number of output channels.
        dau_units: int or pair of ints, normalized to a 2-tuple; the product
            of the pair is the number of DAU units per variable.
        max_kernel_size: side length of the square kernel that ``call()``
            synthesizes from the DAU parameters.
        strides: single stride value, used for both spatial dimensions.
        data_format: 'channels_first' or 'channels_last'.
        activation: optional callable applied to the layer output.
        use_bias: whether ``build()`` creates a bias variable.
        weight_initializer, mu1_initializer, mu2_initializer,
        sigma_initializer, bias_initializer: initializers for the
            corresponding variables. ``mu1_initializer`` and
            ``mu2_initializer`` are mandatory; ``sigma_initializer`` falls
            back to a constant 0.5.
        weight_regularizer, mu1_regularizer, mu2_regularizer,
        sigma_regularizer, bias_regularizer: optional regularizers for the
            corresponding variables.
        activity_regularizer: forwarded to the base layer.
        weight_constraint, mu1_constraint, mu2_constraint,
        sigma_constraint, bias_constraint: optional constraints for the
            corresponding variables.
        trainable: forwarded to the base layer.
        mu_learning_rate_factor: gradient multiplier applied to mu1/mu2
            inside ``call()``.
        dau_unit_border_bound: margin subtracted from ``max_kernel_size``
            when the mu1/mu2 offsets are clamped.
        dau_sigma_trainable: whether the sigma variable is trainable.
        name: forwarded to the base layer.
        **kwargs: forwarded to the base layer.

    Raises:
        Exception: if ``mu1_initializer`` or ``mu2_initializer`` is None.
    """
    super(DAUConv2dTF, self).__init__(
        trainable=trainable,
        name=name,
        activity_regularizer=activity_regularizer,
        **kwargs)

    # Layer geometry.
    self.rank = 2
    self.filters = filters
    self.dau_units = utils.normalize_tuple(dau_units, self.rank, 'dau_components')
    self.max_kernel_size = max_kernel_size
    self.padding = np.floor(self.max_kernel_size/2.0)
    self.strides = strides
    self.data_format = utils.normalize_data_format(data_format)
    self.activation = activation
    self.use_bias = use_bias

    # Per-variable (initializer, regularizer, constraint) triples.
    (self.bias_initializer,
     self.bias_regularizer,
     self.bias_constraint) = (bias_initializer, bias_regularizer, bias_constraint)
    (self.weight_initializer,
     self.weight_regularizer,
     self.weight_constraint) = (weight_initializer, weight_regularizer, weight_constraint)
    (self.mu1_initializer,
     self.mu1_regularizer,
     self.mu1_constraint) = (mu1_initializer, mu1_regularizer, mu1_constraint)
    (self.mu2_initializer,
     self.mu2_regularizer,
     self.mu2_constraint) = (mu2_initializer, mu2_regularizer, mu2_constraint)
    self.sigma_regularizer = sigma_regularizer
    self.sigma_constraint = sigma_constraint

    # The unit offsets have no sensible default, so the caller must supply them.
    if mu1_initializer is None:
        raise Exception("Must initialize MU1")
    if mu2_initializer is None:
        raise Exception("Must initialize MU2")

    # Sigma, in contrast, defaults to a constant.
    self.sigma_initializer = (init_ops.constant_initializer(0.5)
                              if sigma_initializer is None
                              else sigma_initializer)

    self.mu_learning_rate_factor = mu_learning_rate_factor

    self.input_spec = base.InputSpec(ndim=self.rank + 2)

    self.dau_unit_border_bound = dau_unit_border_bound
    self.num_dau_units_all = np.int32(np.prod(self.dau_units))

    # Placeholders; filled by set_dau_variables_manually() or build().
    self.dau_weights = None
    self.dau_mu1 = None
    self.dau_mu2 = None
    self.dau_sigma = None

    self.dau_sigma_trainable = dau_sigma_trainable

def set_dau_variables_manually(self, w=None, mu1=None, mu2=None, sigma=None):
    """Install caller-provided tensors for w, mu1, mu2 and/or sigma.

    Must be called before build() or __call__(). Arguments left as None keep
    whatever is currently stored. Each supplied tensor has to match the shape
    returned by get_dau_variable_shape(input_shape); build() raises otherwise.
    """
    overrides = (
        ('dau_weights', w),
        ('dau_mu1', mu1),
        ('dau_mu2', mu2),
        ('dau_sigma', sigma),
    )
    for attribute, tensor in overrides:
        if tensor is not None:
            setattr(self, attribute, tensor)

def _get_input_channel_axis(self):
    """Return the index of the channel axis: 1 for 'channels_first', otherwise -1."""
    return 1 if self.data_format == 'channels_first' else -1

def _get_input_channels(self, input_shape):
    """Return the static number of input channels.

    Args:
        input_shape: indexable shape of the layer input (for example a
            ``TensorShape``).

    Returns:
        The size of the channel dimension.

    Raises:
        ValueError: if the channel dimension is unknown (None).
    """
    channel_dim = input_shape[self._get_input_channel_axis()]

    # Indexing a TensorShape yields a Dimension object (with `.value`) under
    # TF1 behavior but a plain int/None under TF2 behavior; accept both.
    num_channels = getattr(channel_dim, 'value', channel_dim)

    if num_channels is None:
        raise ValueError('The channel dimension of the inputs '
                         'should be defined. Found `None`.')

    return num_channels

def get_dau_variable_shape(self, input_shape):
    """Return the shape shared by the weight, mu1, mu2 and sigma variables.

    Args:
        input_shape: shape of the layer input; only its channel dimension
            is used.

    Returns:
        The tuple ``(1, input_channels, num_dau_units_all, filters)``.

    Raises:
        ValueError: if the channel dimension of ``input_shape`` is undefined.
    """
    num_input_channels = self._get_input_channels(input_shape)

    # All DAU units are flattened into one axis (num_dau_units_all).
    return (1, num_input_channels, self.num_dau_units_all, self.filters)

def add_dau_weights_var(self, input_shape):
    """Create and return the trainable DAU `weights` variable.

    Args:
        input_shape: shape of the layer input, used to derive the variable
            shape via ``get_dau_variable_shape``.

    Returns:
        The newly created variable.
    """
    # `add_weight` replaces the deprecated `add_variable` alias.
    return self.add_weight(name='weights',
                           shape=self.get_dau_variable_shape(input_shape),
                           initializer=self.weight_initializer,
                           regularizer=self.weight_regularizer,
                           constraint=self.weight_constraint,
                           trainable=True,
                           dtype=self.dtype)

def add_dau_mu1_var(self, input_shape):
    """Create the trainable `mu1` offset variable and return it clamped.

    Args:
        input_shape: shape of the layer input, used to derive the variable
            shape via ``get_dau_variable_shape``.

    Returns:
        A tensor (not the raw variable): the `mu1` variable limited
        element-wise to ``[-b, b]`` with
        ``b = max_kernel_size - dau_unit_border_bound``.
    """
    # `add_weight` replaces the deprecated `add_variable` alias.
    mu1_var = self.add_weight(name='mu1',
                              shape=self.get_dau_variable_shape(input_shape),
                              initializer=self.mu1_initializer,
                              regularizer=self.mu1_regularizer,
                              constraint=self.mu1_constraint,
                              trainable=True,
                              dtype=self.dtype)

    # limit max offset based on self.dau_unit_border_bound and kernel size
    max_offset = self.max_kernel_size - self.dau_unit_border_bound
    return tf.minimum(tf.maximum(mu1_var, -max_offset), max_offset)



def add_dau_mu2_var(self, input_shape):
    """Create the trainable `mu2` offset variable and return it clamped.

    Args:
        input_shape: shape of the layer input, used to derive the variable
            shape via ``get_dau_variable_shape``.

    Returns:
        A tensor (not the raw variable): the `mu2` variable limited
        element-wise to ``[-b, b]`` with
        ``b = max_kernel_size - dau_unit_border_bound``.
    """
    # `add_weight` replaces the deprecated `add_variable` alias.
    mu2_var = self.add_weight(name='mu2',
                              shape=self.get_dau_variable_shape(input_shape),
                              initializer=self.mu2_initializer,
                              regularizer=self.mu2_regularizer,
                              constraint=self.mu2_constraint,
                              trainable=True,
                              dtype=self.dtype)

    # limit max offset based on self.dau_unit_border_bound and kernel size
    max_offset = self.max_kernel_size - self.dau_unit_border_bound
    return tf.minimum(tf.maximum(mu2_var, -max_offset), max_offset)
def add_dau_sigma_var(self, input_shape, trainable=False):
    """Create and return the DAU `sigma` variable.

    Args:
        input_shape: shape of the layer input, used to derive the variable
            shape via ``get_dau_variable_shape``.
        trainable: kept for interface compatibility but NOT consulted;
            trainability is taken from ``self.dau_sigma_trainable``.

    Returns:
        The newly created variable, with the same shape as the other DAU
        variables.
    """
    # `add_weight` replaces the deprecated `add_variable` alias.
    # NOTE(review): no sharing across channels happens here; the variable
    # has the full per-channel shape.
    return self.add_weight(name='sigma',
                           shape=self.get_dau_variable_shape(input_shape),
                           initializer=self.sigma_initializer,
                           regularizer=self.sigma_regularizer,
                           constraint=self.sigma_constraint,
                           trainable=self.dau_sigma_trainable,
                           dtype=self.dtype)


def add_bias_var(self):
    """Create and return the trainable `bias` variable of shape ``(filters,)``."""
    # `add_weight` replaces the deprecated `add_variable` alias.
    return self.add_weight(name='bias',
                           shape=(self.filters,),
                           initializer=self.bias_initializer,
                           regularizer=self.bias_regularizer,
                           constraint=self.bias_constraint,
                           trainable=True,
                           dtype=self.dtype)

def build(self, input_shape):
    """Create the layer's variables and its convolution op.

    For each of weights, mu1, mu2 and sigma, a variable is created unless a
    tensor was supplied beforehand (see set_dau_variables_manually), in which
    case only its shape is checked.

    Args:
        input_shape: shape of the layer input; its channel dimension must be
            defined.

    Raises:
        ValueError: if a manually supplied tensor does not have the shape
            returned by get_dau_variable_shape, or if the channel dimension
            of the input is undefined.
    """
    input_shape = tensor_shape.TensorShape(input_shape)

    # Expected shape of every DAU variable for this input.
    dau_params_shape = self.get_dau_variable_shape(input_shape)
    if self.dau_weights is None:
        self.dau_weights = self.add_dau_weights_var(input_shape)
    elif np.any(self.dau_weights.shape != dau_params_shape):
        raise ValueError('Shape mismatch for variable `dau_weights`')
    if self.dau_mu1 is None:
        self.dau_mu1 = self.add_dau_mu1_var(input_shape)
    elif np.any(self.dau_mu1.shape != dau_params_shape):
        raise ValueError('Shape mismatch for variable `dau_mu1`')

    if self.dau_mu2 is None:
        self.dau_mu2 = self.add_dau_mu2_var(input_shape)
    elif np.any(self.dau_mu2.shape != dau_params_shape):
        raise ValueError('Shape mismatch for variable `dau_mu2`')
    if self.dau_sigma is None:
        self.dau_sigma = self.add_dau_sigma_var(input_shape, trainable=self.dau_sigma_trainable)
    elif np.any(self.dau_sigma.shape != dau_params_shape):
        raise ValueError('Shape mismatch for variable `dau_sigma`')

    if self.use_bias:
        self.bias = self.add_bias_var()
    else:
        self.bias = None

    input_channel_axis = self._get_input_channel_axis()
    num_input_channels = self._get_input_channels(input_shape)

    # From now on, additionally pin the number of channels of the input.
    self.input_spec = base.InputSpec(ndim=self.rank + 2,
                                     axes={input_channel_axis: num_input_channels})

    # Shape of the dense kernel that call() generates from the DAU parameters.
    kernel_shape = tf.TensorShape((self.max_kernel_size, self.max_kernel_size, num_input_channels, self.filters))

    # Convolution with SAME padding, no dilation, and the same stride in both
    # spatial dimensions.
    self._convolution_op = nn_ops.Convolution(
        input_shape,
        filter_shape=kernel_shape,
        dilation_rate=(1,1),
        strides=(self.strides,self.strides),
        padding="SAME",
        data_format=utils.convert_data_format(self.data_format,
                                              self.rank + 2))
    self.built = True

def call(self, inputs):
    """Convolve `inputs` with a kernel synthesized from the DAU parameters.

    A dense ``[K, K, in_channels, filters]`` kernel (``K = max_kernel_size``)
    is generated from ``dau_weights``, ``dau_mu1``, ``dau_mu2`` and
    ``dau_sigma``, passed to the convolution op created in ``build()``, and
    followed by the optional bias and activation.

    Args:
        inputs: input tensor laid out according to ``self.data_format``.

    Returns:
        The convolution output, after bias and activation when configured.
    """

    def get_kernel_fn(dau_w, dau_mu1, dau_mu2, dau_sigma, max_kernel_size, mu_learning_rate_factor=1):
        """Render the weighted, normalized Gaussians of all units into one kernel."""

        # add mu1/mu2 gradient multiplier: the forward value stays equal to
        # mu, while the gradient is scaled by mu_learning_rate_factor
        if mu_learning_rate_factor != 1:
            dau_mu1 = mu_learning_rate_factor * dau_mu1 + (1 - mu_learning_rate_factor) * tf.stop_gradient(dau_mu1)
            dau_mu2 = mu_learning_rate_factor * dau_mu2 + (1 - mu_learning_rate_factor) * tf.stop_gradient(dau_mu2)

        # Pixel coordinates of the kernel, centered on the middle element and
        # flattened to [K*K, 1, 1, 1] so they broadcast against the parameters.
        num_pixels = max_kernel_size * max_kernel_size
        center = int(max_kernel_size / 2)
        [X, Y] = np.meshgrid(np.arange(max_kernel_size), np.arange(max_kernel_size))

        X = (np.reshape(X, (num_pixels, 1, 1, 1)) - center).astype(np.float32)
        Y = (np.reshape(Y, (num_pixels, 1, 1, 1)) - center).astype(np.float32)

        # Gaussian kernel
        X = tf.convert_to_tensor(X, name='X', dtype=tf.float32)
        Y = tf.convert_to_tensor(Y, name='Y', dtype=tf.float32)

        gauss_kernel = tf.exp(-1 * (tf.pow(X - dau_mu1, 2.0) + tf.pow(Y - dau_mu2, 2.0)) / (2.0 * tf.pow(dau_sigma, 2.0)), name='gauss_kernel')

        # `keepdims` is the spelling accepted by TF 2.x; the legacy
        # `keep_dims` argument was removed there.
        gauss_kernel_sum = tf.reduce_sum(gauss_kernel, axis=0, keepdims=True, name='guass_kernel_sum')

        gauss_kernel_norm = tf.divide(gauss_kernel, gauss_kernel_sum, name='gauss_kernel_norm')

        # normalize to sum of 1 and add weight
        gauss_kernel_norm = tf.multiply(dau_w, gauss_kernel_norm, name='gauss_kernel_weight')

        # sum over Gaussian units
        gauss_kernel_norm = tf.reduce_sum(gauss_kernel_norm, axis=2, keepdims=True, name='gauss_kernel_sum_units')

        # convert to [Kw,Kh,S,F] shape
        gauss_kernel_norm = tf.reshape(gauss_kernel_norm, (max_kernel_size, max_kernel_size, gauss_kernel_norm.shape[1], gauss_kernel_norm.shape[3]), name='gauss_kernel_reshape')

        return gauss_kernel_norm

    kernel_args = (self.dau_weights, self.dau_mu1, self.dau_mu2, self.dau_sigma,
                   self.max_kernel_size, self.mu_learning_rate_factor)

    try:
        # try with XLA if exists
        from tensorflow.contrib.compiler import xla

        gauss_kernel_norm = xla.compile(computation=get_kernel_fn, inputs=kernel_args)[0]

    except Exception:
        # otherwise revert to direct method call. This is a deliberate
        # best-effort fallback (missing contrib module or failing XLA
        # compilation); unlike a bare `except:` it does not swallow
        # KeyboardInterrupt/SystemExit.
        gauss_kernel_norm = get_kernel_fn(*kernel_args)

    outputs = self._convolution_op(inputs, gauss_kernel_norm)

    if self.use_bias:
        if self.data_format == 'channels_first':
            if self.rank == 1:
                # nn.bias_add does not accept a 1D input tensor.
                bias = array_ops.reshape(self.bias, (1, self.filters, 1))
                outputs += bias
            elif self.rank == 2:
                outputs = nn.bias_add(outputs, self.bias, data_format='NCHW')
            elif self.rank == 3:
                # As of Mar 2017, direct addition is significantly slower than
                # bias_add when computing gradients. To use bias_add, we collapse Z
                # and Y into a single dimension to obtain a 4D input tensor.
                outputs_shape = outputs.shape.as_list()
                if outputs_shape[0] is None:
                    outputs_shape[0] = -1
                outputs_4d = array_ops.reshape(outputs,
                                               [outputs_shape[0], outputs_shape[1],
                                                outputs_shape[2] * outputs_shape[3],
                                                outputs_shape[4]])
                outputs_4d = nn.bias_add(outputs_4d, self.bias, data_format='NCHW')
                outputs = array_ops.reshape(outputs_4d, outputs_shape)
        else:
            outputs = nn.bias_add(outputs, self.bias, data_format='NHWC')

    if self.activation is not None:
        return self.activation(outputs)
    return outputs

def compute_output_shape(self, input_shape):
    """Return the static output shape for a given input shape.

    The spatial sizes follow the convolution configured in ``build()``:
    'SAME' padding, no dilation, and the single ``self.strides`` value used
    for every spatial dimension.

    Args:
        input_shape: shape of the layer input, laid out according to
            ``self.data_format``.

    Returns:
        A ``TensorShape`` with the channel dimension replaced by
        ``self.filters`` and the spatial dimensions adjusted for the stride.
    """
    input_shape = tensor_shape.TensorShape(input_shape).as_list()
    channels_last = self.data_format == 'channels_last'
    space = input_shape[1:-1] if channels_last else input_shape[2:]

    # `max_kernel_size` and `strides` are scalars shared by all spatial
    # dimensions, and the padding mode is the string form of the "SAME"
    # padding used by the convolution op.
    new_space = [
        utils.conv_output_length(
            dim,
            self.max_kernel_size,
            padding='same',
            stride=self.strides,
            dilation=1)
        for dim in space
    ]

    if channels_last:
        return tensor_shape.TensorShape([input_shape[0]] + new_space +
                                        [self.filters])
    return tensor_shape.TensorShape([input_shape[0], self.filters] +
                                    new_space)


from tensorflow.contrib.framework.python.ops import add_arg_scope
from tensorflow.python.ops import variable_scope
from tensorflow.contrib.layers.python.layers import layers as layers_contrib
from tensorflow.contrib.layers.python.layers import utils as utils_contrib

@add_arg_scope
def dau_conv2d_tf(inputs,
             filters,
             dau_units,
             max_kernel_size,
             stride=1,
             mu_learning_rate_factor=500,
             data_format=None,
             activation_fn=nn.relu,
             normalizer_fn=None,
             normalizer_params=None,
             weights_initializer=init_ops.random_normal_initializer(stddev=0.1), #init_ops.glorot_uniform_initializer(),
             weights_regularizer=None,
             weights_constraint=None,
             mu1_initializer=None,
             mu1_regularizer=None,
             mu1_constraint=None,
             mu2_initializer=None,
             mu2_regularizer=None,
             mu2_constraint=None,
             sigma_initializer=None,
             sigma_regularizer=None,
             sigma_constraint=None,
             biases_initializer=init_ops.zeros_initializer(),
             biases_regularizer=None,
             biases_constraint=None,
             dau_unit_border_bound=0.01,
             dau_sigma_trainable=False,
             reuse=None,
             variables_collections=None,
             outputs_collections=None,
             trainable=True,
             scope=None):
    """Functional wrapper that builds a `DAUConv2dTF` layer and applies it.

    NOTE: this wrapper relies on `tensorflow.contrib` helpers (arg scope,
    variable getter, collections), which are not available in TensorFlow 2.x.

    Args:
        inputs: rank-4 input tensor (or a value convertible to one).
        filters, dau_units, max_kernel_size: forwarded to `DAUConv2dTF`.
        stride: forwarded to the layer as `strides`.
        mu_learning_rate_factor: forwarded to the layer.
        data_format: None or one of 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC',
            'NCDHW'; values starting with 'NC' select 'channels_first',
            anything else (including None) selects 'channels_last'.
        activation_fn: optional callable applied last; the layer itself is
            created without an activation.
        normalizer_fn: optional callable applied to the layer output before
            the activation; when given, the layer is created without bias.
        normalizer_params: optional keyword arguments for `normalizer_fn`.
        weights_*, mu1_*, mu2_*, sigma_*, biases_*: initializer, regularizer
            and constraint for the corresponding layer variable.
        dau_unit_border_bound, dau_sigma_trainable: forwarded to the layer.
        reuse: forwarded to the variable scope and to the layer.
        variables_collections: collections the layer variables are added to.
        outputs_collections: collections the output is added to.
        trainable: forwarded to the layer.
        scope: optional variable scope; defaults to 'DAUConv'.

    Returns:
        The result of `collect_named_outputs` for the final output tensor.

    Raises:
        ValueError: if `data_format` is not one of the accepted values or
            `inputs` is not rank 4.
    """
    if data_format not in [None, 'NWC', 'NCW', 'NHWC', 'NCHW', 'NDHWC', 'NCDHW']:
        raise ValueError('Invalid data_format: %r' % (data_format,))

    # Rename the layer's variable names to the plural names on the right.
    layer_variable_getter = layers_contrib._build_variable_getter({
        'bias': 'biases',
        'weight': 'weights',
        'mu1': 'mu1',
        'mu2': 'mu2',
        'sigma': 'sigma'
    })

    with variable_scope.variable_scope(
            scope, 'DAUConv', [inputs], reuse=reuse,
            custom_getter=layer_variable_getter) as sc:
        inputs = ops.convert_to_tensor(inputs)
        input_rank = inputs.get_shape().ndims

        if input_rank != 4:
            raise ValueError('DAU convolution not supported for input with rank',
                             input_rank)

        df = ('channels_first'
              if data_format and data_format.startswith('NC') else 'channels_last')

        layer = DAUConv2dTF(filters,
                            dau_units,
                            max_kernel_size,
                            strides=stride,
                            data_format=df,
                            activation=None,
                            # No bias when a normalizer follows or when no
                            # bias initializer is given.
                            use_bias=not normalizer_fn and biases_initializer,
                            mu_learning_rate_factor=mu_learning_rate_factor,
                            weight_initializer=weights_initializer,
                            mu1_initializer=mu1_initializer,
                            mu2_initializer=mu2_initializer,
                            sigma_initializer=sigma_initializer,
                            bias_initializer=biases_initializer,
                            weight_regularizer=weights_regularizer,
                            mu1_regularizer=mu1_regularizer,
                            mu2_regularizer=mu2_regularizer,
                            sigma_regularizer=sigma_regularizer,
                            bias_regularizer=biases_regularizer,
                            activity_regularizer=None,
                            weight_constraint=weights_constraint,
                            mu1_constraint=mu1_constraint,
                            mu2_constraint=mu2_constraint,
                            sigma_constraint=sigma_constraint,
                            bias_constraint=biases_constraint,
                            dau_unit_border_bound=dau_unit_border_bound,
                            dau_sigma_trainable=dau_sigma_trainable,
                            trainable=trainable,
                            name=sc.name,
                            _scope=sc,
                            _reuse=reuse)

        outputs = layer.apply(inputs)

        # Add variables to collections.
        layers_contrib._add_variable_to_collections(layer.dau_weights, variables_collections, 'weights')
        layers_contrib._add_variable_to_collections(layer.dau_mu1, variables_collections, 'mu1')
        layers_contrib._add_variable_to_collections(layer.dau_mu2, variables_collections, 'mu2')
        layers_contrib._add_variable_to_collections(layer.dau_sigma, variables_collections, 'sigma')

        if layer.use_bias:
            layers_contrib._add_variable_to_collections(layer.bias, variables_collections, 'biases')

        if normalizer_fn is not None:
            normalizer_params = normalizer_params or {}
            outputs = normalizer_fn(outputs, **normalizer_params)

        if activation_fn is not None:
            outputs = activation_fn(outputs)
        return utils_contrib.collect_named_outputs(outputs_collections, sc.name, outputs)
Any help would be highly appreciated. Cheers, H
Solution
In TensorFlow 2.x, the recommended method of creating reusable neural network layers is to create a new tf.keras.layers.Layer subclass. TensorFlow provides a great tutorial on this. You can reuse the vast majority of the code in your posted example in a tf.keras layer class. You might also be able to inherit from tensorflow.python.keras.layers.convolutional.Conv to reduce the amount of boilerplate code.
As for the modules that cannot be found, you should use the public aliases that TensorFlow exposes. Here is an incomplete list:
array_ops.reshape -> tf.reshape
init_ops.constant_initializer -> tf.initializers.constant
tensor_shape -> tf.TensorShape
nn_ops.Convolution -> tf.nn.convolution or tf.keras.layers.Conv?D