I have been trying for two days now to load this nested JSONL file into a dataframe, but I have been unable to succeed. I want to load it into a flattened data frame because I want to perform some joins and aggregations after inserting it into a database.
pasting first few lines of the file
{"metadata": {"timestamp": "2022-02-19T02:55:54", "collection_id": "a8b7c401-fafd-4e4c-924a-5935526722fd", "session_id": "452eb9e8-e090-4a08-b836-d23d05b1f400", "profile_id": "21636369-8b52-4b4a-97b7-50923ceb3ffd"}, "behaviour": {"mobile": {"swipe": [{"timestamp": 0, "x": 0.4230440650862826, "y": -1.1079966897549942}, {"timestamp": 13, "x": -0.6336283415951725, "y": -0.5487586450523316}, {"timestamp": 24, "x": -0.7667096573026922, "y": -0.17038917226708414}, {"timestamp": 27, "x": -0.2492178616850527, "y": -0.07701361360512053}, {"timestamp": 36, "x": -0.4191304177115883, "y": -0.3397303353601901}, {"timestamp": 30, "x": 0.2320275902893194, "y": 0.4445459453058296}, {"timestamp": 48, "x": 1.4065844499882725, "y": -0.6494710011790842}, {"timestamp": 84, "x": 0.6409924815827414, "y": 1.084704106728884}, {"timestamp": 112, "x": -0.12560819944452944, "y": 0.3951936918404698}, {"timestamp": 72, "x": -1.2564732537980146, "y": 0.5162185447218673}, {"timestamp": 140, "x": -0.11225893044809712, "y": -1.0759589427931768}, {"timestamp": 99, "x": 0.4055014359919277, "y": -0.8092838008674887}, {"timestamp": 108, "x": 0.5696685733474918, "y": -0.026973393894283243}, {"timestamp": 104, "x": -0.721861152315641, "y": 0.9688713713582879}, {"timestamp": 112, "x": -0.517704263457985, "y": 0.11542994879900434}, {"timestamp": 75, "x": -1.605496286221394, "y": -0.14112093019514352}, {"timestamp": 128, "x": -0.6291370576878567, "y": -1.6601324818355463}, {"timestamp": 255, "x": 0.40843440157409044, "y": 0.12849855301903662}, {"timestamp": 126, "x": -0.41855530399404134, "y": 0.9280432396602861}, {"timestamp": 190, "x": 0.8272200887537559, "y": -2.8837656534871665}, {"timestamp": 220, "x": -0.46418914345820755, "y": 1.461671361359334}, {"timestamp": 315, "x": 1.4982587078714356, "y": -0.02798356032708405}, {"timestamp": 154, "x": 0.8495531080382587, "y": -0.21588769612222933}, {"timestamp": 230, "x": -0.7817274589055877, "y": 0.10459280883004704}, {"timestamp": 288, "x": 
-0.6199621973185518, "y": 0.005292257129691979}, {"timestamp": 250, "x": 0.30746861940044845, "y": 1.0241480367845581}, {"timestamp": 312, "x": -0.06819093424015772, "y": 0.5494263659551994}, {"timestamp": 162, "x": 2.0174773870865987, "y": -0.37867315639172483}, {"timestamp": 392, "x": -0.6091712707845026, "y": 0.9772389197482354}, {"timestamp": 348, "x": -1.2571532898911881, "y": 0.5790041974234325}, {"timestamp": 210, "x": -0.10627207941819672, "y": 0.5625704328169008}, {"timestamp": 434, "x": 0.7250857141705704, "y": 0.04705329465119688}, {"timestamp": 448, "x": -2.778612588025202, "y": -0.5212241988490687}, {"timestamp": 330, "x": 0.7224408299657998, "y": 1.7002044390937856}, {"timestamp": 170, "x": -1.016443134380015, "y": 0.08434308734071132}, {"timestamp": 385, "x": 0.0740135245013153, "y": -0.6937385355902548}, {"timestamp": 288, "x": 0.6873913431881746, "y": 2.0759013166848317}, {"timestamp": 481, "x": 0.561396110432751, "y": 0.49573900401513177}, {"timestamp": 456, "x": -0.1773451988572836, "y": -1.759401045101997}, {"timestamp": 195, "x": -0.3695919636514994, "y": -0.9902143983832665}, {"timestamp": 560, "x": -0.04585535029678639, "y": -1.7036174960872914}, {"timestamp": 205, "x": 0.5111813501633438, "y": 0.47367897298849226}, {"timestamp": 546, "x": -1.9309270698843783, "y": 0.3806555477944176}, {"timestamp": 559, "x": 0.4629684313263635, "y": 0.9452816436897316}, {"timestamp": 616, "x": 0.8893217401839395, "y": 0.4248136684216903}, {"timestamp": 495, "x": -0.316562829967066, "y": 0.6125479349850482}, {"timestamp": 414, "x": 0.8445187708427919, "y": 0.9843145760818928}, {"timestamp": 564, "x": 0.691084533796371, "y": 0.7790856500722466}, {"timestamp": 336, "x": -1.4869830646379347, "y": -1.3299596471768933}, {"timestamp": 686, "x": 0.175807248613561, "y": 0.8389891374495156}, {"timestamp": 700, "x": -0.05181686823382315, "y": -0.6072622860567705}, {"timestamp": 408, "x": 0.9032138095285404, "y": -0.7372126870214335}, {"timestamp": 572, "x": 
0.9090357577884669, "y": -1.0933486637672398}, {"timestamp": 371, "x": -0.21968857950449133, "y": 1.2297936396937472}, {"timestamp": 594, "x": -0.903683811769815, "y": -0.962349247335407}, {"timestamp": 770, "x": 1.363974698178015, "y": -2.45398664951111}]}}}
{"metadata": {"timestamp": "2022-01-20T11:58:31", "collection_id": "b29d1647-684e-4c5f-856a-87fbabdfcd7e", "session_id": "43dbf234-6207-4ba8-a32f-64bccb8948be", "profile_id": "21636369-8b52-4b4a-97b7-50923ceb3ffd"}, "behaviour": {"mobile": {"pin": [{"timestamp": 0, "x": -1.635364533608917, "y": -0.9233169601939333}, {"timestamp": 6, "x": -0.6138268672129017, "y": -0.7333714325660339}, {"timestamp": 18, "x": 1.5371807147417926, "y": -0.1772820460807428}, {"timestamp": 21, "x": 0.9848563399999479, "y": -0.47868567228881614}]}}}
{"metadata": {"timestamp": "2022-01-04T02:15:37", "collection_id": "781aa808-074f-4f1f-af27-667a490a55ea", "session_id": "de8877cb-3e8e-4713-8403-e4fea7cd0a38", "profile_id": "6018366c-f658-47a7-9ed3-4fe53a096533"}, "behaviour": {"mobile": {"keystrokes": [{"timestamp": 0, "key_hash": -1.2626154136500727}, {"timestamp": 8, "key_hash": 0.9900211973859506}, {"timestamp": 30, "key_hash": -1.5075769024975958}, {"timestamp": 45, "key_hash": -1.0967796122154305}, {"timestamp": 24, "key_hash": -0.3627313749623099}, {"timestamp": 40, "key_hash": 1.0158181665717492}, {"timestamp": 84, "key_hash": 0.4517722807573615}, {"timestamp": 63, "key_hash": 0.47838264462494967}, {"timestamp": 96, "key_hash": 0.15833055712400226}, {"timestamp": 63, "key_hash": 0.09400350314454811}, {"timestamp": 60, "key_hash": -0.3262757188314575}, {"timestamp": 66, "key_hash": -0.1975736686989061}, {"timestamp": 168, "key_hash": 2.0745969522246765}, {"timestamp": 182, "key_hash": 0.6457012666514507}]}}}
I have tried the following code snippet to parse the json using json_normalize()
# Path to the JSON-Lines input: one self-contained JSON document per line.
collections = '../test/input/collections.jsonl'
# Use a context manager so the file handle is closed deterministically;
# the original passed an open() iterator inline and leaked the handle to the GC.
with open(collections, 'r', encoding='utf-8') as fh:
    collections_data = [json.loads(line) for line in fh]
# First-level flattening only: nested dicts become dotted columns,
# but list-valued cells (swipe/pin/keystrokes) are left as Python lists.
collections_df = pd.json_normalize(collections_data)
print(collections_df)
but it only normalizes the 'metadata' part of the JSON, not 'behaviour'. The resulting df looks like this:
metadata.timestamp metadata.collection_id ... behaviour.mobile.pin behaviour.mobile.keystrokes
0 2022-02-19T02:55:54 a8b7c401-fafd-4e4c-924a-5935526722fd ... NaN NaN
1 2022-01-20T11:58:31 b29d1647-684e-4c5f-856a-87fbabdfcd7e ... [{'timestamp': 0, 'x': -1.635364533608917, 'y'... NaN
2 2022-01-04T02:15:37 781aa808-074f-4f1f-af27-667a490a55ea ... NaN [{'timestamp': 0, 'key_hash': -1.2626154136500...
3 2022-04-14T15:07:29 ba103592-f361-4cc8-86ed-4f03bde3702a ... NaN [{'timestamp': 0, 'key_hash': 0.05626071817640...
My output schema should be like
['metadata.timestamp','metadata.collection_id','metadata.session_id','metadata.profile_id','behaviour.mobile.swipe.timestamp','behaviour.mobile.swipe.x','behaviour.mobile.swipe.y','behaviour.mobile.pin.timestamp','behaviour.mobile.pin.x','behaviour.mobile.pin.y','behaviour.mobile.keystrokes.timestamp','behaviour.mobile.keystrokes.key_hash']
I have also tried to use the 'record_path' argument when calling json_normalize(), but I can't figure out what 'key' to pass.
Any help will be appreciated a lot.
import pandas as pd
import json
# Path to the JSON-Lines input: one self-contained JSON document per line.
collections = 'collections.jsonl'
# Use a context manager so the file handle is closed deterministically;
# the original passed an open() iterator inline and leaked the handle to the GC.
with open(collections, 'r', encoding='utf-8') as fh:
    collections_data = [json.loads(line) for line in fh]
# First-level flattening only: nested dicts become dotted columns,
# but list-valued cells are left as Python lists for flatten_pandas to explode.
collections_df = pd.json_normalize(collections_data)
def flatten_pandas(df_):
    """Fully flatten a DataFrame whose cells may hold lists or dicts.

    List-valued cells are exploded into one row per element; dict-valued
    cells are flattened into dot-separated columns via a JSON round-trip
    through ``pd.json_normalize``.  The two steps repeat until no nested
    values remain, so arbitrarily deep list/dict nesting is handled.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame that may contain list or dict objects in its cells.

    Returns
    -------
    pandas.DataFrame
        A frame with only scalar cells and dotted column names.
    """

    def _nested_cols(df, typ):
        # Columns where at least one cell is an instance of `typ`.
        # Series.map per column avoids the deprecated DataFrame.applymap.
        mask = df.apply(lambda s: s.map(lambda x: isinstance(x, typ)))
        return df.columns[mask.any()].tolist()

    have_list = _nested_cols(df_, list)
    have_dict = _nested_cols(df_, dict)
    while have_list or have_dict:
        if have_list:
            # Explode lists first; their elements are often dicts that the
            # next pass will normalize into columns.
            for col in have_list:
                df_ = df_.explode(col)
        elif have_dict:
            # BUG FIX: the original wrote `elif have_dict != 0:`, comparing a
            # list to an int (always True); it only worked because the loop
            # guard guaranteed have_dict was non-empty on this path.
            df_ = pd.json_normalize(
                json.loads(df_.to_json(force_ascii=False, orient="records")),
                sep=".",
            )
        have_list = _nested_cols(df_, list)
        have_dict = _nested_cols(df_, dict)
    return df_
# Flatten every nested list/dict column, then print the resulting schema.
flattened_df = flatten_pandas(collections_df)
flattened_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74 entries, 0 to 73
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 metadata.timestamp 74 non-null object
1 metadata.collection_id 74 non-null object
2 metadata.session_id 74 non-null object
3 metadata.profile_id 74 non-null object
4 behaviour.mobile.pin 0 non-null float64
5 behaviour.mobile.keystrokes 0 non-null float64
6 behaviour.mobile.swipe.timestamp 56 non-null float64
7 behaviour.mobile.swipe.x 56 non-null float64
8 behaviour.mobile.swipe.y 56 non-null float64
9 behaviour.mobile.swipe 0 non-null float64
10 behaviour.mobile.pin.timestamp 4 non-null float64
11 behaviour.mobile.pin.x 4 non-null float64
12 behaviour.mobile.pin.y 4 non-null float64
13 behaviour.mobile.keystrokes.timestamp 14 non-null float64
14 behaviour.mobile.keystrokes.key_hash 14 non-null float64
dtypes: float64(11), object(4)
memory usage: 8.8+ KB