I want to create a nested JSON file in PySpark from the following data.
I want to convert it into a nested JSON file with the following structure.
{ "NewData" : [ {"id":"1","number":"smith","name":"uber","age":12}, {"id":"2","number":"jon","name":"lunch","age":13}, {"id":"3","number":"jocelyn","name":"rental","age":15}, {"id":"4","number":"megan","name":"sds","age":15}
] }
How can I write the correct output to a JSON file?
Can you help me achieve this?
# Build a small DataFrame and collect every row into a single array column.
# Assumes `spark` is an already-created SparkSession; it is not defined in
# this snippet.
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Sample rows laid out as (id, age, number, name), matching `schema` below.
data = [
    (1, 12, "smith", "uber"),
    (2, 13, "jon", "lunch"),
    (3, 15, "jocelyn", "rental"),
    (4, 15, "megan", "sds"),
]
schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('age', IntegerType(), True),
    StructField('number', StringType(), True),
    StructField('name', StringType(), True),
])
df = spark.createDataFrame(data, schema)
df.show(truncate=False)

# Constant column: every row gets the same value, so the groupBy below
# produces exactly one group containing all rows.
df = df.withColumn("NewData", F.lit("NewData"))

# NOTE: F.to_json turns each struct into a JSON *string*, so 'values' is an
# array of strings, not an array of nested objects.
# The original statement had one closing parenthesis too many, which made the
# whole snippet a SyntaxError; the call is now balanced.
df2 = df.groupBy('NewData').agg(
    F.collect_list(
        F.to_json(F.struct('id', 'number', 'name', 'age'))
    ).alias('values')
)
df2.show(truncate=False)
You don't have to use the `to_json` function, because it turns each struct into a JSON string instead of a nested JSON object.
groupBy -> on a constant value
agg() -> with the alias name NewData
Example:
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Sample rows laid out as (id, age, number, name), matching `schema` below.
data = [
    (1, 12, "smith", "uber"),
    (2, 13, "jon", "lunch"),
    (3, 15, "jocelyn", "rental"),
    (4, 15, "megan", "sds"),
]
schema = StructType(
    [
        StructField('id', IntegerType(), True),
        StructField('age', IntegerType(), True),
        StructField('number', StringType(), True),
        StructField('name', StringType(), True),
    ]
)
# Assumes `spark` is an already-created SparkSession; it is not defined here.
df = spark.createDataFrame(data, schema)
df.show(truncate=False)

# Each row becomes one struct; the field order here is the key order in the
# JSON output shown at the bottom.
record = struct('id', 'number', 'name', 'age')
# Grouping on a literal puts every row into the same single group.
single_group = df.groupBy(lit(1))
collected = single_group.agg(collect_list(record).alias('NewData'))
# The literal grouping key comes back as a column named "1"; remove it so
# that only NewData is written out.
df2 = collected.drop("1")

json_writer = df2.write.mode("overwrite").format("json")
json_writer.save("<directory_path>")
# `dbutils` is not defined in this snippet; presumably the Databricks notebook
# utility object, so this line needs that runtime.
print(dbutils.fs.head("<file_path>"))
#{"NewData":[{"id":1,"number":"smith","name":"uber","age":12},{"id":2,"number":"jon","name":"lunch","age":13},{"id":3,"number":"jocelyn","name":"rental","age":15},{"id":4,"number":"megan","name":"sds","age":15}]}