I have MongoDB collections forms and submissions where forms define dynamic UI components (textfield, checkbox, radio, selectboxes, columns, tables, datagrids) and submissions contain the user data in a flattened structure.
I have loaded it into a data frame, with a column for the components of a form joined with all the submissions of the form in the column 'data' which is a json key-value pair where key is the component key and value is the user input.
Im trying to transform all data of submissions into a CSV, track the index if its inside an array and get information such as label from the component.
Example form with all possible components:
[
{
"key": "name",
"label": "Name",
"type": "textfield"
},
{
"label": "Age",
"key": "age",
"type": "number"
},
{
"key": "adult",
"label": "18 Plus",
"type": "checkbox"
},
{
"label": "Gender",
"type": "radio",
"key": "gender",
"values": [
{
"label": "Male",
"value": "male"
},
{
"label": "Female",
"value": "female"
},
{
"label": "Other",
"value": "other"
}
]
},
{
"label": "Countries Visited",
"type": "selectboxes",
"key": "countriesVisited",
"values": [
{
"label": "France",
"value": "fr"
},
{
"label": "India",
"value": "in"
}
]
},
{
"label": "Columns",
"type": "columns",
"key": "columns",
"columns": [
{
"components": [
{
"label": "Column Field 1",
"key": "columnField1",
"type": "textfield"
}
]
},
{
"components": [
{
"label": "Column Field 2",
"type": "number",
"key": "columnField2"
}
]
}
]
},
{
"type": "table",
"rows": [
[
{
"components": [
{
"label": "Table Field 1",
"key": "textField",
"type": "textfield"
}
]
},
{
"components": [
{
"key": "number",
"type": "number",
"label": "Number"
}
]
}
],
[
{
"components": [
{
"label": "Table Field 3",
"key": "tableField3",
"type": "number"
}
]
},
{
"components": [
{
"label": "Table Field 4",
"key": "tableField4",
"type": "number"
}
]
}
]
]
},
{
"label": "Data Grid",
"key": "dataGrid",
"type": "datagrid",
"components": [
{
"key": "gridField1",
"label": "Grid Field 1",
"type": "textfield"
},
{
"label": "Gird Field 2",
"key": "girdField2",
"type": "number"
},
{
"label": "Grid Field 4",
"key": "gridField4",
"type": "checkbox"
}
]
}
]
Example submission for above:
{
"_id": "69283e5d3703daffa931277c",
"data": {
"name": "John",
"age": 22,
"adult": true,
"gender": "male",
"countriesVisited": {
"fr": true,
"in": false
},
"columnField1": "ewvew",
"columnField2": 1234,
"textField": "sdv",
"number": 12341,
"tableField3": 123421,
"tableField4": 2323,
"dataGrid": [
{
"gridField1": "Data 1",
"girdField2": 234,
"gridField4": true
},
{
"gridField1": "Data 23",
"gridField4": false
}
]
},
}
Expected output for the example shown above:
key,value,label,index
name,John,Name,0
age,22,Age,0
adult,true,18 Plus,0
gender,male,Gender,0
fr,true,France,0
in,false,India,0
columnField1,ewvew,Column Field 1,0
columnField2,1234,Column Field 2,0
textField,sdv,Table Field 1,0
number,12341,Number,0
tableField3,123421,Table Field 3,0
tableField4,2323,Table Field 4,0
gridField1,Data 1,Grid Field 1,0
girdField2,234,Gird Field 2,0
gridField4,true,Grid Field 4,0
gridField1,Data 23,Grid Field 1,1
gridField4,false,Grid Field 4,1
Used a UDF to build a component lookup dict by key, recursively parsed submissions into indexed arrays, then explode for proper nested rows.
def parse_if_str(value):
'''
Util to load string as json
'''
if isinstance(value, str):
return json.loads(value)
else:
return value
def build_component_lookup(component_list, result):
'''
Build a lookup dictionary for components to retrieve metadata by key.
'''
for component in parse_if_str(component_list):
component = parse_if_str(component)
result[component['key']] = {
'label': component.get('label', key),
}
if 'components' in component:
build_component_lookup(component['components'], result)
def build_key_value_index_list(value, key, index, result):
'''
Recursively build a list of (key, value, index) tuples from the nested data structure.
'''
if isinstance(value, dict):
for new_key, new_value in value.items():
build_key_value_index_list(new_value, new_key, index, result)
elif isinstance(value, list):
for new_index, item in enumerate(value):
build_key_value_index_list(item, key, new_index, result)
else:
result.append((key, value, index))
@udf(returnType=ArrayType(
StructType([
StructField("key", StringType(), True),
StructField("value", StringType(), True),
StructField("index", IntegerType(), True),
StructField("label", StringType(), True),
])
))
def flatten(data, components):
'''
Flatten the nested data structure based on component definitions.
'''
component_lookup = {}
build_component_lookup(json.loads(components), component_lookup)
key_value_index_list = []
build_key_value_index_list(json.loads(data), None, 0, key_value_index_list)
result = []
for (key, value, index) in key_value_index_list:
result.append({
"key": key,
"value": value,
"index": index,
"label": component_lookup['key']
})
return result
df_flatten = df \
.withColumn("flatten", explode(flatten(to_json(col("data")), to_json(col("components"))) \
.withColumn('key', col('flatten.key')) \
.withColumn('value', col('flatten.value')) \
.withColumn('index', col('flatten.index')) \
.withColumn('label', col('flatten.label')) \