pythonjsonpandaspysparkjsonlines

Unable to load jsonl nested file into a flattened dataframe


I have been trying this for 2 days now, to load this jsonl nested file into a dataframe but unable to succeed. I want to load this into a flattened data frame as I want to perform some join and aggregations after inserting into a database.

pasting first few lines of the file

{"metadata": {"timestamp": "2022-02-19T02:55:54", "collection_id": "a8b7c401-fafd-4e4c-924a-5935526722fd", "session_id": "452eb9e8-e090-4a08-b836-d23d05b1f400", "profile_id": "21636369-8b52-4b4a-97b7-50923ceb3ffd"}, "behaviour": {"mobile": {"swipe": [{"timestamp": 0, "x": 0.4230440650862826, "y": -1.1079966897549942}, {"timestamp": 13, "x": -0.6336283415951725, "y": -0.5487586450523316}, {"timestamp": 24, "x": -0.7667096573026922, "y": -0.17038917226708414}, {"timestamp": 27, "x": -0.2492178616850527, "y": -0.07701361360512053}, {"timestamp": 36, "x": -0.4191304177115883, "y": -0.3397303353601901}, {"timestamp": 30, "x": 0.2320275902893194, "y": 0.4445459453058296}, {"timestamp": 48, "x": 1.4065844499882725, "y": -0.6494710011790842}, {"timestamp": 84, "x": 0.6409924815827414, "y": 1.084704106728884}, {"timestamp": 112, "x": -0.12560819944452944, "y": 0.3951936918404698}, {"timestamp": 72, "x": -1.2564732537980146, "y": 0.5162185447218673}, {"timestamp": 140, "x": -0.11225893044809712, "y": -1.0759589427931768}, {"timestamp": 99, "x": 0.4055014359919277, "y": -0.8092838008674887}, {"timestamp": 108, "x": 0.5696685733474918, "y": -0.026973393894283243}, {"timestamp": 104, "x": -0.721861152315641, "y": 0.9688713713582879}, {"timestamp": 112, "x": -0.517704263457985, "y": 0.11542994879900434}, {"timestamp": 75, "x": -1.605496286221394, "y": -0.14112093019514352}, {"timestamp": 128, "x": -0.6291370576878567, "y": -1.6601324818355463}, {"timestamp": 255, "x": 0.40843440157409044, "y": 0.12849855301903662}, {"timestamp": 126, "x": -0.41855530399404134, "y": 0.9280432396602861}, {"timestamp": 190, "x": 0.8272200887537559, "y": -2.8837656534871665}, {"timestamp": 220, "x": -0.46418914345820755, "y": 1.461671361359334}, {"timestamp": 315, "x": 1.4982587078714356, "y": -0.02798356032708405}, {"timestamp": 154, "x": 0.8495531080382587, "y": -0.21588769612222933}, {"timestamp": 230, "x": -0.7817274589055877, "y": 0.10459280883004704}, {"timestamp": 288, "x": -0.6199621973185518, "y": 0.005292257129691979}, {"timestamp": 250, "x": 0.30746861940044845, "y": 1.0241480367845581}, {"timestamp": 312, "x": -0.06819093424015772, "y": 0.5494263659551994}, {"timestamp": 162, "x": 2.0174773870865987, "y": -0.37867315639172483}, {"timestamp": 392, "x": -0.6091712707845026, "y": 0.9772389197482354}, {"timestamp": 348, "x": -1.2571532898911881, "y": 0.5790041974234325}, {"timestamp": 210, "x": -0.10627207941819672, "y": 0.5625704328169008}, {"timestamp": 434, "x": 0.7250857141705704, "y": 0.04705329465119688}, {"timestamp": 448, "x": -2.778612588025202, "y": -0.5212241988490687}, {"timestamp": 330, "x": 0.7224408299657998, "y": 1.7002044390937856}, {"timestamp": 170, "x": -1.016443134380015, "y": 0.08434308734071132}, {"timestamp": 385, "x": 0.0740135245013153, "y": -0.6937385355902548}, {"timestamp": 288, "x": 0.6873913431881746, "y": 2.0759013166848317}, {"timestamp": 481, "x": 0.561396110432751, "y": 0.49573900401513177}, {"timestamp": 456, "x": -0.1773451988572836, "y": -1.759401045101997}, {"timestamp": 195, "x": -0.3695919636514994, "y": -0.9902143983832665}, {"timestamp": 560, "x": -0.04585535029678639, "y": -1.7036174960872914}, {"timestamp": 205, "x": 0.5111813501633438, "y": 0.47367897298849226}, {"timestamp": 546, "x": -1.9309270698843783, "y": 0.3806555477944176}, {"timestamp": 559, "x": 0.4629684313263635, "y": 0.9452816436897316}, {"timestamp": 616, "x": 0.8893217401839395, "y": 0.4248136684216903}, {"timestamp": 495, "x": -0.316562829967066, "y": 0.6125479349850482}, {"timestamp": 414, "x": 0.8445187708427919, "y": 0.9843145760818928}, {"timestamp": 564, "x": 0.691084533796371, "y": 0.7790856500722466}, {"timestamp": 336, "x": -1.4869830646379347, "y": -1.3299596471768933}, {"timestamp": 686, "x": 0.175807248613561, "y": 0.8389891374495156}, {"timestamp": 700, "x": -0.05181686823382315, "y": -0.6072622860567705}, {"timestamp": 408, "x": 0.9032138095285404, "y": -0.7372126870214335}, {"timestamp": 572, "x": 0.9090357577884669, "y": -1.0933486637672398}, {"timestamp": 371, "x": -0.21968857950449133, "y": 1.2297936396937472}, {"timestamp": 594, "x": -0.903683811769815, "y": -0.962349247335407}, {"timestamp": 770, "x": 1.363974698178015, "y": -2.45398664951111}]}}}
{"metadata": {"timestamp": "2022-01-20T11:58:31", "collection_id": "b29d1647-684e-4c5f-856a-87fbabdfcd7e", "session_id": "43dbf234-6207-4ba8-a32f-64bccb8948be", "profile_id": "21636369-8b52-4b4a-97b7-50923ceb3ffd"}, "behaviour": {"mobile": {"pin": [{"timestamp": 0, "x": -1.635364533608917, "y": -0.9233169601939333}, {"timestamp": 6, "x": -0.6138268672129017, "y": -0.7333714325660339}, {"timestamp": 18, "x": 1.5371807147417926, "y": -0.1772820460807428}, {"timestamp": 21, "x": 0.9848563399999479, "y": -0.47868567228881614}]}}}
{"metadata": {"timestamp": "2022-01-04T02:15:37", "collection_id": "781aa808-074f-4f1f-af27-667a490a55ea", "session_id": "de8877cb-3e8e-4713-8403-e4fea7cd0a38", "profile_id": "6018366c-f658-47a7-9ed3-4fe53a096533"}, "behaviour": {"mobile": {"keystrokes": [{"timestamp": 0, "key_hash": -1.2626154136500727}, {"timestamp": 8, "key_hash": 0.9900211973859506}, {"timestamp": 30, "key_hash": -1.5075769024975958}, {"timestamp": 45, "key_hash": -1.0967796122154305}, {"timestamp": 24, "key_hash": -0.3627313749623099}, {"timestamp": 40, "key_hash": 1.0158181665717492}, {"timestamp": 84, "key_hash": 0.4517722807573615}, {"timestamp": 63, "key_hash": 0.47838264462494967}, {"timestamp": 96, "key_hash": 0.15833055712400226}, {"timestamp": 63, "key_hash": 0.09400350314454811}, {"timestamp": 60, "key_hash": -0.3262757188314575}, {"timestamp": 66, "key_hash": -0.1975736686989061}, {"timestamp": 168, "key_hash": 2.0745969522246765}, {"timestamp": 182, "key_hash": 0.6457012666514507}]}}}

I have tried the following code snippet to parse the json using json_normalize()

    collections=(f'../test/input/collections.jsonl')

    collections_data = [json.loads(line) for line in open(collections, 'r')]
    collections_df = pd.json_normalize(collections_data)
    print(collections_df)

but it only normalizes the 'metadata' part of the json, not 'behaviour'. resulting df is like this

     metadata.timestamp                metadata.collection_id  ...                               behaviour.mobile.pin                        behaviour.mobile.keystrokes
0   2022-02-19T02:55:54  a8b7c401-fafd-4e4c-924a-5935526722fd  ...                                                NaN                                                NaN
1   2022-01-20T11:58:31  b29d1647-684e-4c5f-856a-87fbabdfcd7e  ...  [{'timestamp': 0, 'x': -1.635364533608917, 'y'...                                                NaN
2   2022-01-04T02:15:37  781aa808-074f-4f1f-af27-667a490a55ea  ...                                                NaN  [{'timestamp': 0, 'key_hash': -1.2626154136500...
3   2022-04-14T15:07:29  ba103592-f361-4cc8-86ed-4f03bde3702a  ...                                                NaN  [{'timestamp': 0, 'key_hash': 0.05626071817640...

My output schema should be like

['metadata.timestamp','metadata.collection_id','metadata.session_id','metadata.profile_id','behaviour.mobile.swipe.timestamp','behaviour.mobile.swipe.x','behaviour.mobile.swipe.y','behaviour.mobile.pin.timestamp','behaviour.mobile.pin.x','behaviour.mobile.pin.y','behaviour.mobile.keystrokes.timestamp','behaviour.mobile.keystrokes.key_hash']

I have also tried to use the 'record_path' argument while calling json_normalize() but cant figure out what 'key' to pass.

Any help will be a appreciated a lot


Solution

  • import pandas as pd
    import json
    
    
    collections=(f'collections.jsonl')
    collections_data = [json.loads(line) for line in open(collections, 'r')]
    collections_df = pd.json_normalize(collections_data)
    
    def flatten_pandas(df_):
        #The same as flatten but for pandas
    
        have_list = df_.columns[df_.applymap(lambda x: isinstance(x, list)).any()].tolist()
        have_dict = df_.columns[df_.applymap(lambda x: isinstance(x, dict)).any()].tolist()
        have_nested = len(have_list) + len(have_dict)
        
        while have_nested!=0:
            if len(have_list)!=0:
                for _ in have_list:
                    df_ = df_.explode(_)
                    
            elif have_dict !=0:
                df_ = pd.json_normalize(json.loads(df_.to_json(force_ascii=False, orient="records")), sep=".")
            
            have_list = df_.columns[df_.applymap(lambda x: isinstance(x, list)).any()].tolist()
            have_dict = df_.columns[df_.applymap(lambda x: isinstance(x, dict)).any()].tolist()
            have_nested = len(have_list) + len(have_dict)
            
        return df_
    
    
    flattend = flatten_pandas(collections_df)
    
    flattend.info()
    
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 74 entries, 0 to 73
    Data columns (total 15 columns):
     #   Column                                 Non-Null Count  Dtype  
    ---  ------                                 --------------  -----  
     0   metadata.timestamp                     74 non-null     object 
     1   metadata.collection_id                 74 non-null     object 
     2   metadata.session_id                    74 non-null     object 
     3   metadata.profile_id                    74 non-null     object 
     4   behaviour.mobile.pin                   0 non-null      float64
     5   behaviour.mobile.keystrokes            0 non-null      float64
     6   behaviour.mobile.swipe.timestamp       56 non-null     float64
     7   behaviour.mobile.swipe.x               56 non-null     float64
     8   behaviour.mobile.swipe.y               56 non-null     float64
     9   behaviour.mobile.swipe                 0 non-null      float64
     10  behaviour.mobile.pin.timestamp         4 non-null      float64
     11  behaviour.mobile.pin.x                 4 non-null      float64
     12  behaviour.mobile.pin.y                 4 non-null      float64
     13  behaviour.mobile.keystrokes.timestamp  14 non-null     float64
     14  behaviour.mobile.keystrokes.key_hash   14 non-null     float64
    dtypes: float64(11), object(4)
    memory usage: 8.8+ KB