With Spark I can do, for example:
spark.read.orc("/path/to/file").printSchema
But I would like to get something like the output of `SHOW CREATE TABLE`
in Hive. Is that possible?
This should handle most common cases (tailor it to your specific schema if needed):
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{ArrayType, BinaryType, BooleanType, ByteType, DateType, DecimalType, DoubleType, FloatType, IntegerType, LongType, MapType, ShortType, StringType, StructField, StructType, TimestampType}
/** Reads an ORC file's schema with Spark and prints an equivalent
  * Hive `CREATE EXTERNAL TABLE` DDL statement for it.
  */
object Main {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    val schema = spark.read.orc("/path/to/orc/orc_file.orc").schema
    // Join with mkString so the last column does NOT get a trailing
    // comma (the original per-field println produced invalid HiveQL).
    val columns = schema.fields
      .map(f => s"  ${f.name.toLowerCase} ${getType(f.dataType)}")
      .mkString(",\n")
    println(s"CREATE EXTERNAL TABLE name (\n$columns\n)")
  }

  /** Maps a Spark SQL type (or a [[StructField]] inside a struct) to its
    * Hive type-name spelling.
    *
    * @param typ a Spark `DataType` or `StructField`
    * @return the Hive DDL type string, e.g. `array<struct<a:int,b:string>>`
    * @throws IllegalArgumentException for types with no Hive mapping here
    */
  def getType(typ: Any): String = {
    typ match {
      case StringType    => "string"
      case IntegerType   => "int"
      case DoubleType    => "double"
      case FloatType     => "float"
      case LongType      => "bigint"
      case ShortType     => "smallint"
      case ByteType      => "tinyint"
      case BooleanType   => "boolean"
      case BinaryType    => "binary"
      case DateType      => "date"
      case TimestampType => "timestamp"
      // Hive decimals carry precision and scale.
      case d: DecimalType => s"decimal(${d.precision},${d.scale})"
      case ArrayType(elementType, _) => s"array<${getType(elementType)}>"
      case MapType(keyType, valueType, _) =>
        s"map<${getType(keyType)},${getType(valueType)}>"
      // Match StructType directly instead of the type-erased (unchecked)
      // `case seq: Seq[StructField]` the original relied on.
      case StructType(fields) =>
        s"struct<${fields.map(getType).mkString(",")}>"
      // A struct member renders as name:type inside struct<...>.
      case StructField(name, dataType, _, _) =>
        s"${name.toLowerCase}:${getType(dataType)}"
      // Fail loudly with context rather than an opaque MatchError.
      case other =>
        throw new IllegalArgumentException(s"Unsupported type: $other")
    }
  }
}