I have a complicated JSON file that takes me a lot of time to convert to Excel format. I did my best, but I think I'm missing something, so can someone help please?
The JSON file is below, and my script does half of the work:
json file : {"logFormatVersion": "log_security_v3", "data": [ { "logAlertUid": "a2fee3b7e2824c", "request": { "body": "", "cookies": [ { "key": "info_1", "value": "info_2" }, { "key": "info_3", "value": "info_4" }, { "key": "info_5", "value": "info_6" } ], "headers": [ { "key": "Host", "value": "ip_address" }, { "key": "Accept-Charset", "value": "iso-8859-1,utf-8;q=0.9,*;q=0.1" }, { "key": "Accept-Language", "value": "info_7" }, { "key": "Connection", "value": "Keep-Alive" }, { "key": "Referer", "value": "info_8" } ], "hostname": "FQDN", "ipDst": "Y.Y.Y.Y", "ipSrc": "X.X.X.X", "method": "GET", "path": "/xampp/cgi.cgi", "portDst": 443, "protocol": "HTTP/1.1", "query": "", "requestUid": "info_9" }, "websocket": [], "context": { "tags": "", "geoipCode": "", "geoipName": "", "applianceName": "name_device", "applianceUid": "18539", "backendHost": "ip_address", "backendPort": 80, "reverseProxyName": "FQDN", "reverseProxyUid": "info_10", "tunnelName": "info_11", "tunnelUid": "6d531c", "workflowName": "name-workflow", "workflowUid": "77802" }, "events": [ { "eventUid": "e62d8b", "tokens": { "date": "time", "matchingParts": [ { "part": "info_17", "partKey": "info_18", "partKeyOperator": "info_19", "partKeyPattern": "info_20", "partKeyMatch": "info_21", "partValue": "info_21", "partValueOperator": "info_22", "partValuePatternUid": "info_23", "partValuePatternName": "info_24", "partValuePatternVersion": "00614", "partValueMatch": "info_25", "attackFamily": "info_26", "riskLevel": 80, "riskLevelOWASP": 8, "cwe": "CWE-name" } ], "reason": "info_27", "securityExceptionConfigurationUids": [ "info_28" ], "securityExceptionMatchedRuleUids": [ "info_24" ] } }, { "eventUid": "e62d8b", "tokens": { "date": "time", "matchingParts": [ { "part": "info_17", "partKey": "info_18", "partKeyOperator": "info_19", "partKeyPattern": "info_20", "partKeyMatch": "info_21", "partValue": "info_21", "partValueOperator": "info_22", "partValuePatternUid": "info_23", "partValuePatternName": "info_24", "partValuePatternVersion": "00614", "partValueMatch": "info_25", "attackFamily": "info_26", "riskLevel": 80, "riskLevelOWASP": 8, "cwe": "CWE-name" } ], "reason": "info_27", "securityExceptionConfigurationUids": [ "info_28" ], "securityExceptionMatchedRuleUids": [ "info_24" ] } }
],
"timestampImport": null,
"timestamp": "info30",
"uid": "AYYm"
} ]}
Cordially,
My script looks like this:
import json
import pandas as pd
with open("eventlogs.json", "r") as f:
    objectfile = json.load(f)

data = objectfile["data"]
df = pd.DataFrame(data)
df_request = pd.json_normalize(df["request"])
df_headers = pd.DataFrame(df_request["headers"])
df_headers = df_headers.explode("headers")
df_headers[["header-key", "header-value"]] = df_headers["headers"].apply(pd.Series)
df_headers.drop("headers", axis=1, inplace=True)
df = df.drop("request", axis=1)
new_df = pd.concat([df, df_request, df_headers], axis=1)
new_df.to_excel("all-data.xlsx", index=False)
If you execute the script, it gives a result like this: result without the "events" columns flattened and with "cookies" spread over a lot of lines.
As I said, the result is not really nice, especially since I would like to get a result like this: Dream - File.
I will be very thankful if you can help with an idea for flattening this JSON to Excel with multi-indexed columns.
You could use further json_normalize() calls on the events and cookies columns to "flatten" them.
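For instance, here is a minimal sketch of that idea using the record_path/meta arguments of json_normalize() (the eventlogs.json file name is taken from the question; the exact columns depend on your real data):

import json
import pandas as pd

with open("eventlogs.json", "r") as f:
    data = json.load(f)["data"]

# One row per event; keep the parent alert's uid so rows can be matched back.
df_events = pd.json_normalize(data, record_path=["events"], meta=["logAlertUid"])

# One row per cookie; the cookie list lives under request.cookies.
df_cookies = pd.json_normalize(data, record_path=["request", "cookies"], meta=["logAlertUid"])

print(df_events)
print(df_cookies)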
As an alternative, you could try something like this, which attempts to flatten everything "automatically":
import json
import pandas as pd
from pathlib import Path
def _json_flatten(data):
    # Used as an object_pairs_hook: called bottom-up on every JSON object with
    # its (key, value) pairs. Nested dicts and uniform lists of dicts are
    # lifted into the parent under tuple keys such as ("request", "headers", "key").
    rows = []
    parents = set()
    for parent, children in data:
        if isinstance(children, dict):
            # Lift each child of a nested dict up one level.
            row = {}
            for name, child in children.items():
                key = (parent,) + (name if isinstance(name, tuple) else (name,))
                row.setdefault(key, []).append(child)
            rows.append(row)
            parents.add(parent)
        if isinstance(children, list) and len(children) > 0:
            if all(isinstance(child, dict) for child in children):
                if set(children[0]) == set().union(*children):
                    # A list of dicts that all share the same keys: collect the
                    # values per key (kept as a list when there is more than one).
                    row = {}
                    for name in children[0]:
                        key = (parent,) + (name if isinstance(name, tuple) else (name,))
                        for child in children:
                            if len(children) == 1:
                                row[key] = child[name]
                            else:
                                row.setdefault(key, []).append(child[name])
                    rows.append(row)
                    parents.add(parent)
    # Replace the flattened parents with the lifted rows.
    data = dict(data)
    for parent in parents:
        del data[parent]
    for row in rows:
        data.update(row)
    # Make sure every remaining key is a tuple so the final columns nest cleanly.
    return {
        (key,) if not isinstance(key, tuple) else key: value
        for key, value in data.items()
    }
# The dumps/loads round trip applies _json_flatten to every object from the
# innermost level outwards via object_pairs_hook.
rows = json.loads(
    json.dumps(json.loads(Path("json/eventlog.json").read_bytes())["data"]),
    object_pairs_hook=_json_flatten,
)
if isinstance(rows, dict):
    rows = [rows]
# Unwrap single-element lists left over from the flattening step.
rows = [
    {
        key: value[0] if (isinstance(value, list) and len(value) == 1) else value
        for key, value in row.items()
    }
    for row in rows
]
df = pd.DataFrame.from_records(rows)
print(df)
Exploding the request headers:
columns = [("request", "headers", name) for name in ("key", "value")]
df.explode(columns)[columns]
(request, headers, key) (request, headers, value)
0 Host ip_address
0 Accept-Charset iso-8859-1,utf-8;q=0.9,*;q=0.1
0 Accept-Language info_7
0 Connection Keep-Alive
0 Referer info_8
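To finish with the Excel file the question asks for, one possible last step (a sketch that assumes the flattened df built above and reuses the all-data.xlsx name from the question) is to turn the tuple column labels into a real MultiIndex before exporting:

out = df.copy()

# Excel cells cannot hold Python lists, so join any remaining list values into strings.
out = out.applymap(lambda v: ", ".join(map(str, v)) if isinstance(v, list) else v)

# Pad every tuple label to the same depth, then build the MultiIndex.
depth = max(len(key) for key in out.columns)
out.columns = pd.MultiIndex.from_tuples(
    [key + ("",) * (depth - len(key)) for key in out.columns]
)

# to_excel() does not accept index=False when the columns are a MultiIndex,
# so the default row index is kept in the output file.
out.to_excel("all-data.xlsx")

Opening all-data.xlsx should then show the nested keys as stacked header rows, one header row per level of the tuples.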