apache-arrow

Arrow Dictionary over list arrays


I have a list array of strings that I would like to dictionary-encode using Arrow in Go and Java:

["a", ["b", "c"], "b", "c", "d"]

Ideally, the dictionary encoding would be applied to the individual string values.

I tried to define a type using

arrow.DictionaryType{
   IndexType: arrow.PrimitiveTypes.Uint32,
   ValueType: arrow.ListOf(arrow.BinaryTypes.String),
}

but that yields the error

panic: arrow/array: unsupported builder for value type of *arrow.DictionaryType

Any idea what my options are?

I am thinking of writing an extension type and building the dictionary and listOf(String) myself, but if there is a standard way to do this, even better.

Thanks


Solution

  • Not super confident this is correct, or exactly what you are asking for. But if there's someone here who knows more, please chime in, since I'd also like to do this!

    Suppose you have a nested list of strings that needs to be stored, for example:

    [
     ["a"], 
     ["b", "c"],
     ["b"],
     ["c"],
     ["d"],
    ]
    

    I think this can be stored in an array by setting the type to:

    // A list type whose elements are dictionary-encoded strings: the
    // dictionary wraps the list's element type rather than the list itself.
    arrow.ListOf(&arrow.DictionaryType{
            IndexType: &arrow.Int64Type{},
            ValueType: &arrow.StringType{},
            Ordered:   false,
        })
    

    Full code example:

    package main
    
    import (
        "fmt"
    
        "github.com/apache/arrow/go/v15/arrow"
        "github.com/apache/arrow/go/v15/arrow/array"
        "github.com/apache/arrow/go/v15/arrow/memory"
    )
    
    // main builds a one-column record whose column "foo" is a list of
    // dictionary-encoded strings, wraps the record in a table, and prints it.
    func main() {
        // The dictionary wraps the list's element type, not the list itself.
        listType := arrow.ListOf(&arrow.DictionaryType{
            IndexType: &arrow.Int64Type{},
            ValueType: &arrow.StringType{},
            Ordered:   false,
        })

        schema := arrow.NewSchema([]arrow.Field{{
            Name:     "foo",
            Type:     listType,
            Nullable: false,
        }}, nil)

        // Arrow objects are reference counted and the constructor already hands
        // the caller one reference, so exactly one Release is needed. An extra
        // Retain without a matching Release would keep the memory alive forever.
        builder := array.NewRecordBuilder(memory.DefaultAllocator, schema)
        defer builder.Release()

        vals := [][]string{
            {"a", "b", "c"},
            {"x", "b"},
            {"x", "y", "z"},
        }

        // The list builder tracks list boundaries; its child builder receives the
        // individual strings and dictionary-encodes them.
        lb := builder.Field(0).(array.VarLenListLikeBuilder)
        db := lb.ValueBuilder().(*array.BinaryDictionaryBuilder)

        for _, row := range vals {
            // Open a new valid list entry, then append its elements to the child.
            lb.AppendWithSize(true, len(row))
            for _, s := range row {
                if err := db.AppendString(s); err != nil {
                    panic(err)
                }
            }
        }

        rec := builder.NewRecord()
        defer rec.Release()

        // Register the table's Release immediately so it is not skipped if code
        // added between creation and the end of main returns or panics.
        t := array.NewTableFromRecords(schema, []arrow.Record{rec})
        defer t.Release()

        fmt.Println(t.String())
    }