python · c++ · numpy · tensorflow

Tensorflow Create Protobuf for a Tensor


I want to have a Python script that converts Numpy arrays to TensorFlow Tensors in Protobuf Binary so later in C++ I can reload them. This should be possible with a compute graph.

I found the following functions and features in the TensorFlow Python API.

C++ has a corresponding load operation.

Can you give me an example of serializing a TF tensor to Protobuf binary and back?


Solution

  • I'll post the answer as I figure it out, so perhaps someone can pitch in with the rest of the solution.

    Python

    Tensor -> Protobuf Binary

    >>> import tensorflow as tf
    >>> with tf.Graph().as_default():
    ...     s = tf.constant([1.2, 3.4, 5.6, 7.8])._op.node_def.attr['value'].SerializeToString()
    ...
    >>> s
    'B\x1a\x08\x01\x12\x04\x12\x02\x08\x04"\x10\x9a\x99\x99?\x9a\x99Y@33\xb3@\x9a\x99\xf9@'
    

    Protobuf Binary -> Tensor

    >>> import tensorflow as tf
    >>> s = 'B\x1a\x08\x01\x12\x04\x12\x02\x08\x04"\x10\x9a\x99\x99?\x9a\x99Y@33\xb3@\x9a\x99\xf9@'
    >>> with tf.Graph().as_default():
    ...     c = tf.constant(1)
    ...     c._op.node_def.attr['value'].ParseFromString(s)
    ...     c._op.node_def.attr['dtype'].type = c._op.node_def.attr['value'].tensor.dtype
    ...     print c.eval(session=tf.Session())
    ... 
    28
    [ 1.20000005  3.4000001   5.5999999   7.80000019]
    

    Benchmarks

       Array Elements  from_string [us]  to_string [us]
    0              10         10.273593        2.308139
    1             100         10.450414        2.291126
    2            1000         10.540897        2.359392
    3           10000         12.175265        2.734819
    4          100000         31.460438        7.349958
    

    Benchmark Log-Log Plot

    Benchmark script

    import tensorflow as tf
    import pandas as pd
    import numpy as np
    import timeit
    import matplotlib.pyplot as plt
    
    def to_string(shape):
        """Serialize a constant tensor of the given shape to TensorProto binary.

        Builds a throwaway graph, creates a tf.constant from an
        uninitialized array of `shape`, and serializes the constant op's
        'value' attribute (a TensorProto) to its protobuf wire format.
        NOTE(review): relies on the private `_op` attribute of TF 1.x.
        """
        with tf.Graph().as_default():
            const = tf.constant(np.empty(shape))
            serialized = const._op.node_def.attr['value'].SerializeToString()
        return serialized
    
    
    def from_string(s):
        """Parse a TensorProto binary string back into a tensor and evaluate it.

        Creates a placeholder constant, overwrites its 'value' attribute by
        parsing `s`, patches the 'dtype' attribute to match the parsed
        tensor, then evaluates it in a fresh session. Returns None (used
        only for timing). NOTE(review): mutating node_def after graph
        construction is a TF 1.x internals hack.
        """
        with tf.Graph().as_default():
            holder = tf.constant(1)
            value_attr = holder._op.node_def.attr['value']
            value_attr.ParseFromString(s)
            holder._op.node_def.attr['dtype'].type = value_attr.tensor.dtype
            holder.eval(session=tf.Session())
    
    NUM_RUNS = 10000
    MAX_POW = 6
    
    print "Collecting to_string stats"
    to_string_results = np.array([[N, timeit.timeit('to_string((%d,))' % N,
                                                    setup="from __main__ import to_string",
                                                    number=NUM_RUNS)]
                                  for N in 10**np.arange(1, MAX_POW)]).T
    
    print "Collecting from_string stats"
    strings = {N:to_string((N,)) for N in 10**np.arange(1, MAX_POW)}
    from_string_results = np.array([[N, timeit.timeit('from_string(strings[%d])' % N,
                                                      setup="from __main__ import from_string, strings",
                                                      number=NUM_RUNS)]
                                    for N in 10**np.arange(1, MAX_POW)]).T
    
    df = pd.DataFrame.from_dict({"Array Elements": to_string_results[0],
                                 "to_string [us]": to_string_results[1],
                                 "from_string [us]": from_string_results[1]})
    
    print df
    df.to_csv('benchmark.csv')
    
    plt.subplot(2, 1, 1)
    plt.loglog(to_string_results[0], to_string_results[1])
    plt.title('to_string')
    plt.ylabel('microseconds')
    plt.xlabel('Array size')
    plt.subplot(2, 1, 2)
    plt.loglog(from_string_results[0], from_string_results[1])
    plt.title('from_string')
    plt.ylabel('microseconds')
    plt.xlabel('Array size')
    plt.show()
    

    C++

    Working on it