apache-spark, apache-spark-sql, spark-excel

How to read a whole directory of XLSX files with Apache Spark in Scala?


I have to read a whole directory of xlsx files, and I need to load all the directory with Apache Spark using Scala.

Currently I'm using this dependency: "com.crealytics" %% "spark-excel" % "0.12.3", but I don't know how to load all of the files at once.


Solution

  • There doesn't seem to be a shortcut for passing a whole directory through the option method. So I created a workaround as below (assuming each Excel file has the same number of columns). I wrote a method that gets the paths of every file in the source directory, then looped over those file paths, creating a new DataFrame for each and appending it to the previous one.

    import java.io.File
    import org.apache.spark.sql.Row
    import org.apache.spark.sql.types._
    
    /** Lists the regular files directly inside `dir`.
      *
      * @param dir path of the directory to scan
      * @return the files in `dir` (non-recursive, directories excluded),
      *         or an empty list if `dir` does not exist, is not a
      *         directory, or cannot be read
      */
    def getListOfFiles(dir : String) : List[File] = {
      val d = new File(dir)
      if (d.exists && d.isDirectory){
        // listFiles() returns null on an I/O error even for an existing
        // directory; Option(...) guards against the resulting NPE.
        Option(d.listFiles()).map(_.filter(_.isFile).toList).getOrElse(List[File]())
      } else {
          List[File]()
      }
    }
    
    // Source directory to scan (the original had a stray leading space
    // in the literal, which would make the lookup fail).
    val path = "\\directory path"

    // shows list of files with fully qualified paths
    println(getListOfFiles(path))

    // Shared schema: every workbook must expose these columns in this
    // order, because DataFrame.union matches columns positionally.
    val schema = StructType(
        StructField("id", IntegerType, true) ::
        StructField("name", StringType, false) ::
        StructField("age", IntegerType, false) :: Nil)

    // Reads a single Excel workbook into a DataFrame.
    // NOTE(review): spark-excel 0.12.x expects the file path as the
    // argument to load(), not via a "location" option — verify against
    // the library README for the exact version in use.
    def readExcel(file: File) =
      spark.read.format("com.crealytics.spark.excel")
        .option("useHeader", "true")
        .option("treatEmptyValuesAsNulls", "true")
        .option("inferSchema", "true")
        .option("addColorColumns", "False")
        .load(file.toString)

    // Fold every per-file DataFrame into one. Starting from an empty
    // frame with the expected schema means an empty directory still
    // yields a valid (empty) result, and no mutable state is needed.
    val emptyDF = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
    val data = getListOfFiles(path).map(readExcel).foldLeft(emptyDF)(_ union _)

    data.show()