pythonjsondataframeexplodejson-normalize

Python Converting Nested array of objects in JSON to flattened excel


Python Converting Nested array of objects in JSON to flattened excel

Current Code

def read_json(filename): jsonData = {} try: with open(filename, "r", encoding="utf-8") as f: jsonData = json.loads(f.read()) except: raise Exception(f"Reading {filename} file encountered an error") return jsonData json_data = read_json(filename=".json")

df = pd.json_normalize(json_data,meta=['Documents']).explode('Documents').reset_index(drop=True)
df = df.join(pd.json_normalize(df.pop('Documents')))
df = df.join(pd.json_normalize(df.pop("lossInfo").apply(pd.Series).stack().reset_index(drop=True)))

Expected:

  metaData.form metaData.userNm report.date policy.effdate policy.efftime year.current year.previous
0             1               1           1             11             11           11            11
1             1               1           1             12             12           12            12
3             2               2           2             21             21           21            21
4             2               2           2             22             22           22            22

How to flatten below JSON file. Collection of 2 or more array of objects with nested data in a single json:

{
  "Documents": [
    {
      "metaData": {
        "form": "1",
        "userNm": "1"
      },
      "report": {
        "date": "1"
      },
      "lossInfo": [
        {
          "policy": {
            "effdate": "11",
            "efftime": "11"
          },
          "year": {
            "current": "11",
            "previous": "11"
          }
        },
        {
          "policy": {
            "effdate": "12",
            "efftime": "12"
          },
          "year": {
            "current": "12",
            "previous": "12"
          }
        }
      ]
    },
    {
      "metaData": {
        "form": "2",
        "userNm": "2"
      },
      "report": {
        "date": "2"
      },
      "lossInfo": [
        {
          "policy": {
            "effdate": "21",
            "efftime": "21"
          },
          "year": {
            "current": "22",
            "previous": "22"
          }
        },
        {
          "policy": {
            "effdate": "21",
            "efftime": "21"
          },
          "year": {
            "current": "22",
            "previous": "22"
          }
        }
      ]
    }
  ]
}

Solution

  • @Chetan

    For your specific case, this should do the trick.

    import pandas as pd
    
    data = {'Documents':[{'metaData':{'form':'1','userNm':'1'},'report':{'date':'1'},'lossInfo':[{'policy':{'effdate':'11','efftime':'11'},'year':{'current':'11','previous':'11'}},{'policy':{'effdate':'12','efftime':'12'},'year':{'current':'12','previous':'12'}}]},{'metaData':{'form':'2','userNm':'2'},'report':{'date':'2'},'lossInfo':[{'policy':{'effdate':'21','efftime':'21'},'year':{'current':'22','previous':'22'}},{'policy':{'effdate':'21','efftime':'21'},'year':{'current':'22','previous':'22'}}]}]}
    
    def flattened_dataframe(data: dict) -> pd.DataFrame:
        metadata_cols = list(pd.json_normalize(data["Documents"]).columns)[1:]
    
        df = pd.json_normalize(
            data["Documents"], record_path="lossInfo", meta=["metaData", "report"]
        )
    
        data_pending_right_cols = pd.concat(
            [df, df["metaData"].apply(pd.Series), df["report"].apply(pd.Series)], axis=1
        ).drop(columns=["metaData", "report"])
    
        lossinfo_cols = list(data_pending_right_cols.columns)[:-3]
    
        return data_pending_right_cols.set_axis(
            lossinfo_cols + metadata_cols, axis="columns"
        )
    
    flattened_dataframe(data)
    

    Result sought