julia

How to flatten a dict type column in a DF


i have a df with a dict type column named measures like below:

enter image description here

How can I flatten this column as new columns in the same DF?


Solution

  • I recently had the same problem, wanting to extract and flatten data from a JSON, it might be overkill for your issue and a bit obscure but here it is:

    This expects Dicts and ignores missing or malformed data

    function extract_flatten(data::AbstractDict, extract::AbstractDict; cmdchar::AbstractChar='%')
        res = Dict()
        for (key, val) in extract
            temp = Any[data]
            keys = [key]
            for v in val
                if v isa AbstractString
                    if v[1] == cmdchar
                        v = split(v[2:end], ':')
                        if v[1] == "all"
                            temp2 = []
                            keys2 = String[]
                            for (t,k) in zip(temp, keys)
                                for (kt,vt) in pairs(t)
                                    push!(keys2, join([k; v[2:end]; kt], '_'))
                                    push!(temp2, vt)
                                end
                            end
                            temp = temp2
                            keys = keys2
                        elseif v[1] == "name"
                            keys .*= '_' * join(v[2:end], '_')
                        else
                            error("$(repr(v)) is not a valid command")
                        end
                    else
                        temp .= getdefnothing.(temp, Ref(v))
                    end
                elseif v isa Integer
                    temp .= getdefnothing.(temp, Ref(v))
                else
                    error("$(repr(v)) is not a valid key")
                end
                nothings = isnothing.(temp)
                deleteat!(temp, nothings)
                deleteat!(keys, nothings)
                isempty(temp) && break
            end
            push!.(Ref(res), keys .=> temp)
        end
        return res
    end
    
    getdefnothing(x, y) = nothing
    getdefnothing(x::AbstractDict, y) = get(x, y, nothing)
    getdefnothing(x::AbstractArray, y) = get(x, y, nothing)
    

    example use:

    using Test
    const d = Dict
    
    schema = d(
        "a" => ["b", "c", "d"],
        "b" => ["e"],
        "c" => ["f", "%all:z", "g"]
    )
    
    a = d("z" => 3)
    @test extract_flatten(a, schema) == d()
    
    b = d("e" => 0.123)
    @test extract_flatten(b, schema) == d("b" => 0.123)
    
    c = d("e" => true, "b" => d("c" => d("d" => "ABC")))
    @test extract_flatten(c, schema) == d("b" => true, "a" => "ABC")
    
    e = d("f" => d(
        "a" => d("g" => "A"),
        "b" => d("g" => "B")
    ))
    @test extract_flatten(e, schema) == d("c_z_a" => "A", "c_z_b" => "B")
    
    f = d("f" => [
        d("g" => "A"),
        d("g" => "B")
    ])
    @test extract_flatten(f, schema) == d("c_z_1" => "A", "c_z_2" => "B")
    
    g = d("e" => nothing, "f" => [1,2,3])
    @test extract_flatten(g, schema) == d()