pythonjsonpandasgroup-by

Python DataFrame groupby convert NaN to None to generate a valid JSON


I get several JSONs which I have to transform and join, what I do with pandas and afterwards I have to generate a JSON as well. The structure of the final JSON is fixed. sometimes some fields in the JSONs are missing (which is correct) but I have to keep the fields in the joined object which also works fine. I have to convert all NaN values to None to get afterwards a valid json with null values but after the groupby operation it converts some None values back to NaN.

See the example attached:

import pandas as pd
import json
dict1 =  {
     "items": [
    {
      "name": "Project1",
      "projectId": "1",
    },
    {
      "name": "Project2",
      "projectId": "2",
    },
    {
      "name": "Project3",    
      "projectId": "3",
    }
    ]
}

dict2 = {
     "items": [
    {
      "attr1": "ABC",
      "attr2": "DEF1",
      "attr3": "GHI1",
      "projectId": "1",
      "services":[
          {
              "sname": "Service1",
          },
          {
              "sname": "Service2",
          }
      ]
    },
    {
      "attr1": "ABC",
      "attr2": "DEF2",
      "attr3": "GHI2",
      "projectId": "2",
      "services":[
          {
              "sname": "Service1",
          },
          {
              "sname": "Service2",
          }
      ]
    }
  ]
}

dict_head = {
    "id":"some-guid",
    "name":"some name",
    "content" :[
    ]
}

df1 = pd.DataFrame(dict1["items"])
# df2 = pd.DataFrame(dict2["items"])
df2 = pd.json_normalize(
        data = dict2['items'],
        record_path = ['services'], 
        meta = [
            'projectId', 
            'attr1',
            'attr2',
            'attr3'
        ]
    )


df_joined = df1.set_index("projectId").join(df2.set_index("projectId"))
print("df_joined_1")
print(df_joined)

#convert all NaN vales to None whichs works well
df_joined= df_joined.where(pd.notnull(df_joined), None)
print("df_joined_2")
print(df_joined)

df_grouped = df_joined.groupby(['projectId','name','attr1','attr2','attr3'], dropna=False)['sname'].apply(list).reset_index().to_dict(orient='records')
#suddenly the None values of the grouped fields are conveted back to NaN???
print("df_grouped:")
print(df_grouped)

df_grouped = [{'type': 'Project', 'id': d['projectId'], 'data': d} for d in df_grouped.to_dict(orient='records')]

dict_head["content"] = df_grouped
print("dict_head:")
print(dict_head)


print("dict_head as json:")
print(json.dumps(dict_head, indent=3))

output of Project 3 where you see both NaN an null, I would expect that all NaN are null values

 {
      "type": "Project",
      "id": "3",
      "data": {
         "projectId": "3",
         "name": "Project3",
         "attr1": NaN,
         "attr2": NaN,
         "attr3": NaN,
         "sname": [
            null
         ]
      }
   }

Solution

  • You don't need to convert NaN to None or convert the dataframe to a dict explicitly. Pandas will do that for you if you use pandas.to_json.

    You can use that for the content, then load the converted json again as a list to place it in your desired structure:

    df_joined = df1.set_index("projectId").join(df2.set_index("projectId"))
    
    json_str = (
        df_joined.groupby(["projectId", "name", "attr1", "attr2", "attr3"], dropna=False)[
            "sname"
        ]
        .apply(list)
        .reset_index()
        .to_json(orient="records")
    )
    
    json_list = json.loads(json_str)
    
    json_content = [{"type": "Project", "id": d["projectId"], "data": d} for d in json_list]
    
    dict_head["content"] = json_content
    
    json.dumps(dict_head, indent=3)
    
    {
       "id": "some-guid",
       "name": "some name",
       "content": [
          {
             "type": "Project",
             "id": "1",
             "data": {
                "projectId": "1",
                "name": "Project1",
                "attr1": "ABC",
                "attr2": "DEF1",
                "attr3": "GHI1",
                "sname": [
                   "Service1",
                   "Service2"
                ]
             }
          },
          {
             "type": "Project",
             "id": "2",
             "data": {
                "projectId": "2",
                "name": "Project2",
                "attr1": "ABC",
                "attr2": "DEF2",
                "attr3": "GHI2",
                "sname": [
                   "Service1",
                   "Service2"
                ]
             }
          },
          {
             "type": "Project",
             "id": "3",
             "data": {
                "projectId": "3",
                "name": "Project3",
                "attr1": null,
                "attr2": null,
                "attr3": null,
                "sname": [
                   null
                ]
             }
          }
       ]
    }