pythonjsonpandasjson-normalize

How to use pd.json_normalize to retrieve the data I need from 2 parts (I'm almost there already, only need last piece of data)


I have this JSON list in Python:

[{'id': 'TD3$-FFA',
  'shortCode': 'TD3$-FFA',
  'dataSet': {'id': 'TD3C',
   'shortCode': 'TD3C',
   'shortDescription': 'Dirty Middle East Gulf to China',
   'displayGroup': 'BDTI',
   'datumUnit': 'Worldscale',
   'datumPrecision': 2,
   'data': [{'value': 56.67, 'date': '2023-06-30'}],
   'apiIdentifier': 'RDSSYGSJBFEV9P2FLSCXGQC3510G2EGE'},
  'datumUnit': '$/mt',
  'datumPrecision': 3,
  'projectionStartOn': '2010-05-10T00:00:00',
  'projectionEndOn': '2023-06-30T00:00:00',
  'groupings': [{'date': '2023-06-30T00:00:00',
    'groups': [{'periodType': 'm',
      'projections': [{'identifier': 'TD3$BALMO',
        'period': 'Jul 23',
        'value': 14.4,
        'validFrom': '2023-07-01',
        'validTo': '2023-07-31',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$CURMON',
        'period': 'Jul 23',
        'value': 14.4,
        'validFrom': '2023-07-01',
        'validTo': '2023-07-31',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+1_M',
        'period': 'Aug 23',
        'value': 13.662,
        'validFrom': '2023-08-01',
        'validTo': '2023-08-31',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+2_M',
        'period': 'Sep 23',
        'value': 13.716,
        'validFrom': '2023-09-01',
        'validTo': '2023-09-29',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+3_M',
        'period': 'Oct 23',
        'value': 13.83,
        'validFrom': '2023-10-01',
        'validTo': '2023-10-31',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+4_M',
        'period': 'Nov 23',
        'value': 14.619,
        'validFrom': '2023-11-01',
        'validTo': '2023-11-30',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+5_M',
        'period': 'Dec 23',
        'value': 16.389,
        'validFrom': '2023-12-01',
        'validTo': '2023-12-22',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'}]},
     {'periodType': 'q',
      'projections': [{'identifier': 'TD3$CURQ',
        'period': 'Q3 23',
        'value': 13.926,
        'validFrom': '2023-07-01',
        'validTo': '2023-09-29',
        'nextRolloverDate': '2023-09-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+1Q',
        'period': 'Q4 23',
        'value': 14.946,
        'validFrom': '2023-10-01',
        'validTo': '2023-12-22',
        'nextRolloverDate': '2023-09-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+2Q',
        'period': 'Q1 24',
        'value': 13.056,
        'validFrom': '2024-01-01',
        'validTo': '2024-03-29',
        'nextRolloverDate': '2023-09-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+3Q',
        'period': 'Q2 24',
        'value': 11.818,
        'validFrom': '2024-04-01',
        'validTo': '2024-06-28',
        'nextRolloverDate': '2023-09-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+4Q',
        'period': 'Q3 24',
        'value': 11.407,
        'validFrom': '2024-07-01',
        'validTo': '2024-09-30',
        'nextRolloverDate': '2023-09-29',
        'archiveDate': '2023-06-30'}]},
     {'periodType': 'y',
      'projections': [{'identifier': 'TD3$+1CAL',
        'period': 'Cal 24',
        'value': 12.693,
        'validFrom': '2024-01-01',
        'validTo': '2024-12-24',
        'nextRolloverDate': '2023-12-22',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+2CAL',
        'period': 'Cal 25',
        'value': 12.057,
        'validFrom': '2025-01-01',
        'validTo': '2025-12-24',
        'nextRolloverDate': '2023-12-22',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+3CAL',
        'period': 'Cal 26',
        'value': 11.756,
        'validFrom': '2026-01-01',
        'validTo': '2026-12-24',
        'nextRolloverDate': '2023-12-22',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD3$+4CAL',
        'period': 'Cal 27',
        'value': 11.683,
        'validFrom': '2027-01-01',
        'validTo': '2027-12-24',
        'nextRolloverDate': '2023-12-22',
        'archiveDate': '2023-06-30'}]}]}],
  'apiIdentifier': 'RPSVJJTJBXBCAF2FAG2PQAVYN4UGQ9LN'},
 {'id': 'TD20$-FFA',
  'shortCode': 'TD20$-FFA',
  'dataSet': {'id': 'TD20',
   'shortCode': 'TD20',
   'shortDescription': 'Dirty West Africa to UK-Continent',
   'displayGroup': 'BDTI',
   'datumUnit': 'Worldscale',
   'datumPrecision': 2,
   'data': [{'value': 101.14, 'date': '2023-06-30'}],
   'apiIdentifier': 'RDSU23QH0OX6DZZDC5BYZTQIZ9TXHUQR'},
  'datumUnit': '$/mt',
  'datumPrecision': 3,
  'projectionStartOn': '2014-08-01T00:00:00',
  'projectionEndOn': '2023-06-30T00:00:00',
  'groupings': [{'date': '2023-06-30T00:00:00',
    'groups': [{'periodType': 'm',
      'projections': [{'identifier': 'TD20$BALMO',
        'period': 'Jul 23',
        'value': 19.093,
        'validFrom': '2023-07-01',
        'validTo': '2023-07-31',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$CURMON',
        'period': 'Jul 23',
        'value': 19.093,
        'validFrom': '2023-07-01',
        'validTo': '2023-07-31',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$+1_M',
        'period': 'Aug 23',
        'value': 17.896,
        'validFrom': '2023-08-01',
        'validTo': '2023-08-31',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$+2_M',
        'period': 'Sep 23',
        'value': 17.832,
        'validFrom': '2023-09-01',
        'validTo': '2023-09-29',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$+3_M',
        'period': 'Oct 23',
        'value': 18.61,
        'validFrom': '2023-10-01',
        'validTo': '2023-10-31',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$+4_M',
        'period': 'Nov 23',
        'value': 19.417,
        'validFrom': '2023-11-01',
        'validTo': '2023-11-30',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$+5_M',
        'period': 'Dec 23',
        'value': 20.272,
        'validFrom': '2023-12-01',
        'validTo': '2023-12-22',
        'nextRolloverDate': '2023-07-29',
        'archiveDate': '2023-06-30'}]},
     {'periodType': 'q',
      'projections': [{'identifier': 'TD20$CURQ',
        'period': 'Q3 23',
        'value': 18.274,
        'validFrom': '2023-07-01',
        'validTo': '2023-09-29',
        'nextRolloverDate': '2023-09-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$+1Q',
        'period': 'Q4 23',
        'value': 19.433,
        'validFrom': '2023-10-01',
        'validTo': '2023-12-22',
        'nextRolloverDate': '2023-09-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$+2Q',
        'period': 'Q1 24',
        'value': 17.142,
        'validFrom': '2024-01-01',
        'validTo': '2024-03-29',
        'nextRolloverDate': '2023-09-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$+3Q',
        'period': 'Q2 24',
        'value': 14.091,
        'validFrom': '2024-04-01',
        'validTo': '2024-06-28',
        'nextRolloverDate': '2023-09-29',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$+4Q',
        'period': 'Q3 24',
        'value': 12.478,
        'validFrom': '2024-07-01',
        'validTo': '2024-09-30',
        'nextRolloverDate': '2023-09-29',
        'archiveDate': '2023-06-30'}]},
     {'periodType': 'y',
      'projections': [{'identifier': 'TD20$+1CAL',
        'period': 'Cal 24',
        'value': 14.904,
        'validFrom': '2024-01-01',
        'validTo': '2024-12-24',
        'nextRolloverDate': '2023-12-22',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$+2CAL',
        'period': 'Cal 25',
        'value': 14.184,
        'validFrom': '2025-01-01',
        'validTo': '2025-12-24',
        'nextRolloverDate': '2023-12-22',
        'archiveDate': '2023-06-30'},
       {'identifier': 'TD20$+3CAL',
        'period': 'Cal 26',
        'value': 13.831,
        'validFrom': '2026-01-01',
        'validTo': '2026-12-24',
        'nextRolloverDate': '2023-12-22',
        'archiveDate': '2023-06-30'}]}]}],
  'apiIdentifier': 'RPSRTIFJYJVDT9TFWIYQMLXN2ZN7RRK1'}]

Now I use df_usd_mt = pd.json_normalize(response_usd_mt, record_path=['groupings', 'groups', 'projections'], meta=['shortCode', 'datumUnit']) to ALMOST get everything I need.

my current dataframe looks like this

    identifier  period   value  ... archiveDate  shortCode datumUnit
0     TD3$BALMO  Jul 23  14.400  ...  2023-06-30   TD3$-FFA      $/mt
1    TD3$CURMON  Jul 23  14.400  ...  2023-06-30   TD3$-FFA      $/mt
2      TD3$+1_M  Aug 23  13.662  ...  2023-06-30   TD3$-FFA      $/mt
3      TD3$+2_M  Sep 23  13.716  ...  2023-06-30   TD3$-FFA      $/mt
4      TD3$+3_M  Oct 23  13.830  ...  2023-06-30   TD3$-FFA      $/mt
5      TD3$+4_M  Nov 23  14.619  ...  2023-06-30   TD3$-FFA      $/mt
6      TD3$+5_M  Dec 23  16.389  ...  2023-06-30   TD3$-FFA      $/mt
7      TD3$CURQ   Q3 23  13.926  ...  2023-06-30   TD3$-FFA      $/mt
8       TD3$+1Q   Q4 23  14.946  ...  2023-06-30   TD3$-FFA      $/mt
9       TD3$+2Q   Q1 24  13.056  ...  2023-06-30   TD3$-FFA      $/mt
10      TD3$+3Q   Q2 24  11.818  ...  2023-06-30   TD3$-FFA      $/mt
11      TD3$+4Q   Q3 24  11.407  ...  2023-06-30   TD3$-FFA      $/mt
12    TD3$+1CAL  Cal 24  12.693  ...  2023-06-30   TD3$-FFA      $/mt
13    TD3$+2CAL  Cal 25  12.057  ...  2023-06-30   TD3$-FFA      $/mt
14    TD3$+3CAL  Cal 26  11.756  ...  2023-06-30   TD3$-FFA      $/mt
15    TD3$+4CAL  Cal 27  11.683  ...  2023-06-30   TD3$-FFA      $/mt
16   TD20$BALMO  Jul 23  19.093  ...  2023-06-30  TD20$-FFA      $/mt
17  TD20$CURMON  Jul 23  19.093  ...  2023-06-30  TD20$-FFA      $/mt
18    TD20$+1_M  Aug 23  17.896  ...  2023-06-30  TD20$-FFA      $/mt
19    TD20$+2_M  Sep 23  17.832  ...  2023-06-30  TD20$-FFA      $/mt
20    TD20$+3_M  Oct 23  18.610  ...  2023-06-30  TD20$-FFA      $/mt
21    TD20$+4_M  Nov 23  19.417  ...  2023-06-30  TD20$-FFA      $/mt
22    TD20$+5_M  Dec 23  20.272  ...  2023-06-30  TD20$-FFA      $/mt
23    TD20$CURQ   Q3 23  18.274  ...  2023-06-30  TD20$-FFA      $/mt
24     TD20$+1Q   Q4 23  19.433  ...  2023-06-30  TD20$-FFA      $/mt
25     TD20$+2Q   Q1 24  17.142  ...  2023-06-30  TD20$-FFA      $/mt
26     TD20$+3Q   Q2 24  14.091  ...  2023-06-30  TD20$-FFA      $/mt
27     TD20$+4Q   Q3 24  12.478  ...  2023-06-30  TD20$-FFA      $/mt
28   TD20$+1CAL  Cal 24  14.904  ...  2023-06-30  TD20$-FFA      $/mt
29   TD20$+2CAL  Cal 25  14.184  ...  2023-06-30  TD20$-FFA      $/mt
30   TD20$+3CAL  Cal 26  13.831  ...  2023-06-30  TD20$-FFA      $/mt

I only wish to have one additional column with the information under 'dataSet'>>'id' for example for the first 16 rows, I need 'TD3C' as values of this additional column, this info could be seen at the top of the Json list. to be accurate, the 3rd row of the data, for the following rows I need value "TD20"

I really can't figure it out and asked Chatgpt, it gave me this code which looks correct but just generate Traceback

df_usd_mt = pd.json_normalize(response_usd_mt, 
                               record_path=['groupings', 'groups', 'projections'], 
                              meta=['shortCode', 'datumUnit', ['dataSet', 'id']])

Solution

  • You can add dataSet to meta and then parse out the dict. You will get all the columns from dataSet keys but can filter out only the columns you want.

    df = pd.json_normalize(
        data=data,
        meta_prefix="meta.",
        meta=["dataSet", "shortCode", "datumUnit"],
        record_path=["groupings", "groups", "projections"]
    )
    df = df.join(pd.DataFrame(df.pop("meta.dataSet").tolist()))
    df = df[["identifier", "period", "value", "validFrom", "validTo", "nextRolloverDate",
             "archiveDate", "meta.shortCode", "meta.datumUnit", "id"]]
    df.columns = df.columns.str.split(".").str[-1]
    print(df)
    

    Output:

         identifier  period  value   validFrom     validTo nextRolloverDate archiveDate  shortCode datumUnit    id
    0     TD3$BALMO  Jul 23   14.4  2023-07-01  2023-07-31       2023-07-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    1    TD3$CURMON  Jul 23   14.4  2023-07-01  2023-07-31       2023-07-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    2      TD3$+1_M  Aug 23 13.662  2023-08-01  2023-08-31       2023-07-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    3      TD3$+2_M  Sep 23 13.716  2023-09-01  2023-09-29       2023-07-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    4      TD3$+3_M  Oct 23  13.83  2023-10-01  2023-10-31       2023-07-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    5      TD3$+4_M  Nov 23 14.619  2023-11-01  2023-11-30       2023-07-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    6      TD3$+5_M  Dec 23 16.389  2023-12-01  2023-12-22       2023-07-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    7      TD3$CURQ   Q3 23 13.926  2023-07-01  2023-09-29       2023-09-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    8       TD3$+1Q   Q4 23 14.946  2023-10-01  2023-12-22       2023-09-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    9       TD3$+2Q   Q1 24 13.056  2024-01-01  2024-03-29       2023-09-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    10      TD3$+3Q   Q2 24 11.818  2024-04-01  2024-06-28       2023-09-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    11      TD3$+4Q   Q3 24 11.407  2024-07-01  2024-09-30       2023-09-29  2023-06-30   TD3$-FFA      $/mt  TD3C
    12    TD3$+1CAL  Cal 24 12.693  2024-01-01  2024-12-24       2023-12-22  2023-06-30   TD3$-FFA      $/mt  TD3C
    13    TD3$+2CAL  Cal 25 12.057  2025-01-01  2025-12-24       2023-12-22  2023-06-30   TD3$-FFA      $/mt  TD3C
    14    TD3$+3CAL  Cal 26 11.756  2026-01-01  2026-12-24       2023-12-22  2023-06-30   TD3$-FFA      $/mt  TD3C
    15    TD3$+4CAL  Cal 27 11.683  2027-01-01  2027-12-24       2023-12-22  2023-06-30   TD3$-FFA      $/mt  TD3C
    16   TD20$BALMO  Jul 23 19.093  2023-07-01  2023-07-31       2023-07-29  2023-06-30  TD20$-FFA      $/mt  TD20
    17  TD20$CURMON  Jul 23 19.093  2023-07-01  2023-07-31       2023-07-29  2023-06-30  TD20$-FFA      $/mt  TD20
    18    TD20$+1_M  Aug 23 17.896  2023-08-01  2023-08-31       2023-07-29  2023-06-30  TD20$-FFA      $/mt  TD20
    19    TD20$+2_M  Sep 23 17.832  2023-09-01  2023-09-29       2023-07-29  2023-06-30  TD20$-FFA      $/mt  TD20
    20    TD20$+3_M  Oct 23  18.61  2023-10-01  2023-10-31       2023-07-29  2023-06-30  TD20$-FFA      $/mt  TD20
    21    TD20$+4_M  Nov 23 19.417  2023-11-01  2023-11-30       2023-07-29  2023-06-30  TD20$-FFA      $/mt  TD20
    22    TD20$+5_M  Dec 23 20.272  2023-12-01  2023-12-22       2023-07-29  2023-06-30  TD20$-FFA      $/mt  TD20
    23    TD20$CURQ   Q3 23 18.274  2023-07-01  2023-09-29       2023-09-29  2023-06-30  TD20$-FFA      $/mt  TD20
    24     TD20$+1Q   Q4 23 19.433  2023-10-01  2023-12-22       2023-09-29  2023-06-30  TD20$-FFA      $/mt  TD20
    25     TD20$+2Q   Q1 24 17.142  2024-01-01  2024-03-29       2023-09-29  2023-06-30  TD20$-FFA      $/mt  TD20
    26     TD20$+3Q   Q2 24 14.091  2024-04-01  2024-06-28       2023-09-29  2023-06-30  TD20$-FFA      $/mt  TD20
    27     TD20$+4Q   Q3 24 12.478  2024-07-01  2024-09-30       2023-09-29  2023-06-30  TD20$-FFA      $/mt  TD20
    28   TD20$+1CAL  Cal 24 14.904  2024-01-01  2024-12-24       2023-12-22  2023-06-30  TD20$-FFA      $/mt  TD20
    29   TD20$+2CAL  Cal 25 14.184  2025-01-01  2025-12-24       2023-12-22  2023-06-30  TD20$-FFA      $/mt  TD20
    30   TD20$+3CAL  Cal 26 13.831  2026-01-01  2026-12-24       2023-12-22  2023-06-30  TD20$-FFA      $/mt  TD20