apache-spark, apache-spark-sql, apache-spark-xml

Add extra column for child data frame from parent data frame in nested XML in Spark


I am creating a DataFrame after loading many XML files. Each XML file has one unique field fun:DataPartitionId. I am creating many rows from one XML file.

Now I want to add this fun:DataPartitionId for each row in the resulting rows from the XML.

For example, suppose the 1st XML file produces 100 rows; then all 100 of those rows will have the same fun:DataPartitionId value.

So fun:DataPartitionId acts as a header field in each XML file.

This is what I am doing .

val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    import sqlContext.implicits._
    import org.apache.spark.{ SparkConf, SparkContext }
    import java.sql.{Date, Timestamp}
    import org.apache.spark.sql.Row
    import org.apache.spark.sql.types._
    import org.apache.spark.sql.functions.udf


// UDF: translate a DataPartition code (read as a String here) into a
// human-readable partition name; any unrecognised code falls back to
// "ThirdPartyPrivate".
val getDataPartition = udf { (DataPartition: String) =>
    DataPartition match {
        case "1" => "SelfSourcedPublic"
        case "2" => "Japan"
        case "3" => "SelfSourcedPrivate"
        case _   => "ThirdPartyPrivate"
    }
}

// UDF for parent rows. NOTE(review): both "Insert" and "Overwrite" map to
// "I|!|" here, unlike the child variant below where "Overwrite" yields
// "O|!|" — presumably intentional for parent records; confirm with the
// downstream consumer. Any other action is treated as a delete.
val getFFActionParent = udf { (FFAction: String) =>
    FFAction match {
        case "Insert" | "Overwrite" => "I|!|"
        case _                      => "D|!|"
    }
}

// UDF for child rows: "Insert" -> "I|!|", "Overwrite" -> "O|!|",
// everything else is a delete marker "D|!|".
val getFFActionChild = udf { (FFAction: String) =>
    FFAction match {
        case "Insert"    => "I|!|"
        case "Overwrite" => "O|!|"
        case _           => "D|!|"
    }
}

// Read all XML files under the S3 prefix, one row per <env:ContentEnvelope>.
val dfContentEnvelope = sqlContext.read.format("com.databricks.spark.xml").option("rowTag", "env:ContentEnvelope").load("s3://trfsmallfffile/XML")
// Column expression mapping the envelope header's partition id to its name.
// NOTE(review): this Column is derived from dfContentEnvelope, so it can only
// be used on that DataFrame (or one that still carries the header column).
val dfDataPartition=getDataPartition(dfContentEnvelope("env:Header.fun:DataPartitionId"))


// Explode each envelope's ContentItem array into one row per item, then keep
// only the item's own fields — the header (and its DataPartitionId) is
// dropped by this select, which is the problem the Solution below addresses.
val dfContentItem = dfContentEnvelope.withColumn("column1", explode(dfContentEnvelope("env:Body.env:ContentItem"))).select("column1.*")
val df =dfContentItem.withColumn("DataPartition",dfDataPartition)
df.show()

Solution

  • When you read your xml file using

    val dfContentEnvelope = sqlContext.read.format("com.databricks.spark.xml").option("rowTag", "env:ContentEnvelope").load("s3://trfsmallfffile/XML")
    

the fun:DataPartitionId column is read as Long

    fun:DataPartitionId: long (nullable = true)
    

    so you should change the udf function as

    // Same UDF, but typed Long to match the inferred XML schema; compare
    // against numeric literals rather than strings.
    val getDataPartition = udf { (DataPartition: Long) =>
      DataPartition match {
        case 1L => "SelfSourcedPublic"
        case 2L => "Japan"
        case 3L => "SelfSourcedPrivate"
        case _  => "ThirdPartyPrivate"
      }
    }
    

    If possible, you should use the `when` function instead of a udf to improve processing speed and memory usage.

    Now I want to add this fun:DataPartitionId for each row in the resulting rows from the XML.

    Your mistake is that you forgot to select that particular column, so the following code

    val dfContentItem = dfContentEnvelope.withColumn("column1", explode(dfContentEnvelope("env:Body.env:ContentItem"))).select("column1.*")
    

    should be

    val dfContentItem = dfContentEnvelope.withColumn("column1", explode(dfContentEnvelope("env:Body.env:ContentItem"))).select($"env:Header.fun:DataPartitionId".as("DataPartitionId"),$"column1.*")
    

    Then you can apply the udf function

    val df = dfContentItem.select(getDataPartition($"DataPartitionId"), $"env:Data.sr:Source.*", $"_action".as("FFAction|!|"))
    

    So working code as a whole should be

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    
        import sqlContext.implicits._
        import org.apache.spark.{ SparkConf, SparkContext }
        import java.sql.{Date, Timestamp}
        import org.apache.spark.sql.Row
        import org.apache.spark.sql.types._
        import org.apache.spark.sql.functions.udf
    
    
    // UDF mapping the numeric DataPartitionId (inferred as Long by spark-xml)
    // to a partition name; unknown ids fall back to "ThirdPartyPrivate".
    // BUG FIX: the parameter is a Long, but it was compared with String
    // literals (DataPartition=="1"). In Scala, equality between a Long and a
    // String is always false, so every row silently got "ThirdPartyPrivate".
    // Compare with Long literals instead, matching the corrected udf shown
    // earlier in this answer.
    val getDataPartition =  udf { (DataPartition: Long) => 
        if (DataPartition == 1L) "SelfSourcedPublic"
        else  if (DataPartition == 2L) "Japan"
        else  if (DataPartition == 3L) "SelfSourcedPrivate"
        else "ThirdPartyPrivate"
    }
    
    // Read all XML files under the S3 prefix, one row per <env:ContentEnvelope>.
    val dfContentEnvelope = sqlContext.read.format("com.databricks.spark.xml").option("rowTag", "env:ContentEnvelope").load("s3://trfsmallfffile/XML")
    
    // Explode the ContentItem array AND carry the header's DataPartitionId
    // along in the same select — this is the fix: the header column is
    // selected alongside the exploded item fields instead of being dropped.
    val dfContentItem = dfContentEnvelope.withColumn("column1", explode(dfContentEnvelope("env:Body.env:ContentItem"))).select($"env:Header.fun:DataPartitionId".as("DataPartitionId"),$"column1.*")
    // Apply the udf to the carried-along id and pick out the payload fields.
    val df = dfContentItem.select(getDataPartition($"DataPartitionId"), $"env:Data.sr:Source.*", $"_action".as("FFAction|!|"))
    df.show(false)
    

    And you can proceed with the rest of the code.