python-3.x · tensorflow · tfrecord

Tensorflow: read variable length data, via Dataset (tfrecord)


Best

I would like to read some TFRecord data.
The following works, but only for fixed-length data; now I would like to do the same thing with variable-length data (`VarLenFeature`).

def load_tfrecord_fixed(serialized_example):
    """Parse one serialized SequenceExample with fixed-length 'values'.

    Args:
        serialized_example: scalar string tensor holding one serialized
            tf.train.SequenceExample.

    Returns:
        (context_parsed, sequence_parsed): two dicts of tensors — the
        context features ('length', 'type') and the sequence feature
        ('values', a fixed-length int64 sequence).
    """
    ctx, seq = tf.parse_single_sequence_example(
        serialized=serialized_example,
        context_features={
            'length': tf.FixedLenFeature([], dtype=tf.int64),
            'type': tf.FixedLenFeature([], dtype=tf.string),
        },
        sequence_features={
            'values': tf.FixedLenSequenceFeature([], dtype=tf.int64),
        },
    )
    return ctx, seq

and

   tf.reset_default_graph()



    with tf.Session() as sess:

        filenames = [fp.name]

        dataset = tf.data.TFRecordDataset(filenames)
        dataset = dataset.map(load_tfrecord_fixed)
        dataset = dataset.repeat()
        dataset = dataset.batch(2)

        iterator = dataset.make_initializable_iterator()
        next_element = iterator.get_next()

        a = sess.run(iterator.initializer)

        for i in range(3):
            a = sess.run(next_element)
            print(a)

result:

({'length': array([3, 3], dtype=int64), 'type': array([b'FIXED_length', b'FIXED_length'], dtype=object)}, {'values': array([[82,  2,  2],
       [42,  5,  1]], dtype=int64)}) ({'length': array([3, 3], dtype=int64), 'type': array([b'FIXED_length', b'FIXED_length'], dtype=object)}, {'values': array([[2, 3, 1],
       [1, 2, 3]], dtype=int64)}) ({'length': array([3, 3], dtype=int64), 'type': array([b'FIXED_length', b'FIXED_length'], dtype=object)}, {'values': array([[  1, 100, 200],
       [123,  12,  12]], dtype=int64)})

Here is the map function I'm trying to use, but it gives me errors:

def load_tfrecord_variable(serialized_example):
    """Parse one serialized SequenceExample with variable-length 'values'.

    `tf.VarLenFeature` yields a SparseTensor; it is densified here so the
    caller receives ordinary dense tensors.

    Args:
        serialized_example: scalar string tensor holding one serialized
            tf.train.SequenceExample.

    Returns:
        (context_parsed, dense_values): the dict of context tensors
        ('length', 'batch_size', 'type') and a dense 1-D int64 tensor
        with this example's values.
    """
    context_features = {
        'length': tf.FixedLenFeature([], dtype=tf.int64),
        'batch_size': tf.FixedLenFeature([], dtype=tf.int64),
        'type': tf.FixedLenFeature([], dtype=tf.string),
    }

    sequence_features = {
        "values": tf.VarLenFeature(tf.int64),
    }

    context_parsed, sequence_parsed = tf.parse_single_sequence_example(
        serialized=serialized_example,
        context_features=context_features,
        sequence_features=sequence_features,
    )

    # BUG FIX: the original called tf.train.batch() here. That op belongs to
    # the old queue-runner input pipeline and must not be used inside a
    # Dataset.map function — mixing the two APIs leaves the dataset empty,
    # which is exactly the reported "Attempted to repeat an empty dataset"
    # OutOfRangeError. Batching (with padding) belongs downstream, e.g.
    # dataset.padded_batch(...).
    dense_values = tf.sparse_tensor_to_dense(sequence_parsed['values'])

    return context_parsed, dense_values

error:

OutOfRangeError: Attempted to repeat an empty dataset infinitely.
     [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[[], [], [], [?,?,?]], output_types=[DT_INT64, DT_INT64, DT_STRING, DT_INT64], _device="/job:localhost/replica:0/task:0/device:CPU:0"](Iterator)]]

During handling of the above exception, another exception occurred:

So, can someone help me? Also, I'm using TensorFlow nightly. I don't think I'm missing much...


Solution

  • def load_tfrecord_variable(serialized_example):
    
        context_features = {
            'length':tf.FixedLenFeature([],dtype=tf.int64),
            'batch_size':tf.FixedLenFeature([],dtype=tf.int64),
            'type':tf.FixedLenFeature([],dtype=tf.string)
        }
    
        sequence_features = {
            "values":tf.VarLenFeature(tf.int64)
        }
    
        context_parsed, sequence_parsed = tf.parse_single_sequence_example(
            serialized=serialized_example,
            context_features=context_features,
            sequence_features=sequence_features
        )
        
        length = context_parsed['length']
        batch_size = context_parsed['batch_size']
        type = context_parsed['type']
        
        values = sequence_parsed['values'].values
        
        return tf.tuple([length, batch_size, type, values])
        
    # Build the variable-length input pipeline.
    filenames = [fp.name]

    dataset = tf.data.TFRecordDataset(filenames)
    # BUG FIX: must map the variable-length parser here, not
    # load_tfrecord_fixed — the padded_shapes 4-tuple below describes the
    # (length, batch_size, type, values) output of load_tfrecord_variable.
    dataset = dataset.map(load_tfrecord_variable)
    dataset = dataset.repeat()
    # BUG FIX: batch_size must be a Python value known at graph construction;
    # it was previously an undefined name at this point in the script.
    batch_size = 2
    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=(
            tf.TensorShape([]),      # length: scalar, no padding
            tf.TensorShape([]),      # batch_size: scalar, no padding
            tf.TensorShape([]),      # type: scalar string, no padding
            tf.TensorShape([None])   # values: pad to the longest sequence in
                                     # the batch; if you reshape 'values' in
                                     # load_tfrecord_variable, append the extra
                                     # dims after None, e.g. [None, 3]
            ),
        padding_values=(
            tf.constant(0, dtype=tf.int64),
            tf.constant(0, dtype=tf.int64),
            tf.constant(""),
            tf.constant(0, dtype=tf.int64)  # short sequences padded with 0
            )
        )

    iterator = dataset.make_initializable_iterator()
    next_element = iterator.get_next()

    with tf.Session() as sess:
        sess.run(iterator.initializer)
        for i in range(3):
            [length_vals, batch_size_vals, type_vals, values_vals] = sess.run(next_element)