pythonjsonpandasdataframe

Converting a nested json three levels deep to dataframe


I have a json that is three levels deep. I want to flatten it into a dataframe that has five columns.

id name code level parent_id

So: The part I struggle with is that I can extract each nested item, but I can't find an elegant way to keep the "parent_id". There's got to be more elegant ways of doing this. Any pointers appreciated.

source = response.json()
print ('there are ' + str(len(source)) + ' records')    

df_L1 = df
df_L2 = json_normalize(df_L1['subLevelCategories'][0])   
df_L3 = json_normalize(df_L2['subLevelCategories'][0])   #store for later   !!!!!

df_L2_wrapper = df_L2['id']
df_L3_wrapper = df_L3['id']
df_L2_wrapper.name = 'parent_id'
df_L3_wrapper.name = 'parent_id'
    
df_L1 = df_L1.head(5)
df_L2 = df_L2.head(5)
df_L3 = df_L3.head(5)
df_L3_wrapper = df_L3_wrapper.head(5)
df_L2_wrapper = df_L2_wrapper.head(5)

# Build of df_L1
df_L1 = df_L1.drop(['subLevelCategories'], axis=1) 
df_L1['parentid']=0

# Build of df_L2
df_L2 = df_L2.drop(['name','code','level'], axis=1) 
 


# Rename the Series
df_L2 = json_normalize(df_L2['subLevelCategories'][0])
# Concatenate the DataFrame and the renamed Series
df_L2 = pd.concat([df_L2, df_L2_wrapper], axis=1)
df_L2 = df_L2.drop(['subLevelCategories'], axis=1) 
# //////    L2 is built. 

# Build of df_L3
df_L3 = df_L3.drop(['subLevelCategories'], axis=1) 
df_L3 = pd.concat([df_L3, df_L3_wrapper], axis=1)

df_combined = pd.concat([df_L1, df_L2, df_L3], ignore_index=True)

EDIT: The sample has been corrected by enclosing it with the '[' and ']'

source originates from

response = requests.get(url, headers=headers)
source = response.json()

The sample JSON is as follows:

[
{
  "id": 3372,
  "name": "Archive",
  "code": null,
  "level": 1,
  "subLevelCategories": [
    {
      "id": 16708,
      "name": ".....",
      "code": null,
      "level": 2,
      "subLevelCategories": [
        {
          "id": 16727,
          "name": ".........",
          "code": null,
          "level": 3,
          "subLevelCategories": null
        },
        {
          "id": 16726,
          "name": "........",
          "code": null,
          "level": 3,
          "subLevelCategories": null
        }
      ]
    },
    {
      "id": 16701,
      "name": ".......",
      "code": null,
      "level": 2,
      "subLevelCategories": [
        {
          "id": 16782,
          "name": "......",
          "code": null,
          "level": 3,
          "subLevelCategories": null
        },
        {
          "id": 16785,
          "name": "......",
          "code": null,
          "level": 3,
          "subLevelCategories": null
        }
      ]
    }
  ]
}
]

Solution

  • Used recursive function to handle the nested Json data,it stores the current id, name, code, level, and parent_id. The parent_id is passed down as a parameter so that we can track the parent-child relationship as we go deeper into the hierarchy.

    EDIT : After Sample Json is added with '[]'

    import pandas as pd
    data = [
        {
            "id": 3372,
            "name": "Archive",
            "code": None,
            "level": 1,
            "subLevelCategories": [
                {
                    "id": 16708,
                    "name": ".....",
                    "code": None,
                    "level": 2,
                    "subLevelCategories": [
                        {
                            "id": 16727,
                            "name": ".........",
                            "code": None,
                            "level": 3,
                            "subLevelCategories": None
                        },
                        {
                            "id": 16726,
                            "name": "........",
                            "code": None,
                            "level": 3,
                            "subLevelCategories": None
                        }
                    ]
                },
                {
                    "id": 16701,
                    "name": ".......",
                    "code": None,
                    "level": 2,
                    "subLevelCategories": [
                        {
                            "id": 16782,
                            "name": "......",
                            "code": None,
                            "level": 3,
                            "subLevelCategories": None
                        },
                        {
                            "id": 16785,
                            "name": "......",
                            "code": None,
                            "level": 3,
                            "subLevelCategories": None
                        }
                    ]
                }
            ]
        }
    ]
    
    def flatten_categories(data, parent_id=0):  # default parent_id for top-level categories
        flattened_data = []
        for item in data:
            flat_item = {
                'id': item['id'],
                'name': item['name'],
                'code': item['code'],
                'level': item['level'],
                'parent_id': parent_id
            }
            flattened_data.append(flat_item)
            if 'subLevelCategories' in item and item['subLevelCategories']:
                flattened_data.extend(flatten_categories(item['subLevelCategories'], parent_id=item['id']))
        
        return flattened_data
    
    flattened_data = flatten_categories(data)
    df = pd.DataFrame(flattened_data)
    print(df)
    

    Output

          id       name  code  level  parent_id
    0   3372    Archive  None      1          0
    1  16708      .....  None      2       3372
    2  16727  .........  None      3      16708
    3  16726   ........  None      3      16708
    4  16701    .......  None      2       3372
    5  16782     ......  None      3      16701
    6  16785     ......  None      3      16701