scala, apache-spark, hive, apache-iceberg

Configure Apache Iceberg with Apache Spark


I'm trying to configure Apache Spark with Apache Iceberg, but I get this error:

Exception in thread "main" java.lang.NoSuchMethodError:

'org.apache.hadoop.hive.metastore.IMetaStoreClient org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(org.apache.hadoop.hive.conf.HiveConf, org.apache.hadoop.hive.metastore.HiveMetaHookLoader, java.util.concurrent.ConcurrentHashMap, java.lang.String, boolean)'

I tried different versions of the "hive-metastore" dependency, from 3.0.0 to 3.1.3.

With 3.1.3 I got:

Exception in thread "main" java.lang.UnsupportedOperationException: Unsupported Hive Metastore version (3.1.3). Please set spark.sql.hive.metastore.version with a valid version.
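Both errors point to a version conflict on the classpath: Spark 3.1's Hive integration is compiled against Hive 2.3.x, and it only recognizes spark.sql.hive.metastore.version values up to 3.1.2, which is why 3.1.3 is rejected outright. To see which hive-metastore version actually ends up on the classpath, the dependency tree helps — a sketch, assuming sbt 1.4+ where the dependencyTree task ships out of the box:

      // in the sbt shell
      > evicted         // shows conflicting versions and which one sbt kept
      > dependencyTree  // full resolved tree; search the output for hive-metastore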

Here is my build.sbt:

ThisBuild / version := "0.1.0-SNAPSHOT"

ThisBuild / scalaVersion := "2.12.15"

lazy val root = (project in file("."))
  .settings(
    name := "iceberg"
  )

val sparkVersion = "3.1.1"


libraryDependencies ++= Seq(
  "com.github.pureconfig" %% "pureconfig" % "0.14.0",

  "org.apache.spark" %% "spark-core" % sparkVersion % Compile,
  "org.apache.spark" %% "spark-sql" % sparkVersion % Compile,
  "org.apache.hadoop" % "hadoop-aws" % sparkVersion % Compile,

  "com.github.housepower" % "clickhouse-integration-spark_2.12" % "2.7.1",
  "com.github.housepower" % "clickhouse-native-jdbc" % "2.7.1",

  "org.apache.iceberg" %% "iceberg-spark-runtime-3.1" % "1.3.0" % Compile,
  "org.apache.spark" %% "spark-hive" % sparkVersion % Compile,
  "org.apache.hive" % "hive-metastore" % "3.1.0",
  "org.apache.iceberg" % "iceberg-hive-metastore" % "1.3.1"

)

dependencyOverrides ++= Seq(
  "com.fasterxml.jackson.module" % "jackson-module-scala_2.12" % "2.14.2"
)

And my Scala code:


import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.types._
import org.apache.log4j.{Level, Logger}
import ru.gpmdata.datafactory.dooh.config.AppConfig


object MyJob extends App {

  val appConfig = AppConfig()
  val s3Config = appConfig.s3
  val chConfig = appConfig.clickhouse

  val spark = SparkSession.builder()
    .appName("MyJob")
    .master("local")

    // Enabling Iceberg Metastore
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "rest")
    .config("spark.sql.catalog.spark_catalog.uri", "https://iceberg.foo.org")

    // S3A endpoint and credentials for the hadoop_prod catalog
    .config("spark.sql.catalog.hadoop_prod.hadoop.fs.s3a.endpoint", "http://s3.foo.org:9000")
    .config("spark.sql.catalog.hadoop_prod.hadoop.fs.s3a.access.key", appConfig.s3.accessKey)
    .config("spark.sql.catalog.hadoop_prod.hadoop.fs.s3a.secret.key", appConfig.s3.secretKey)

    .config("spark.sql.catalog.spark_catalog.warehouse.dir", "s3a://foo-iceberg-prod")
    .config("spark.sql.hive.metastore.version", "3.1.0")

    // Local Hadoop catalog for testing
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "./warehouse")
    .config("spark.sql.defaultCatalog", "spark_catalog")
    .enableHiveSupport()

    .getOrCreate()

  val icebergDF = spark.sql("select 1 as n")

  icebergDF.show()

  spark.stop()
}
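Note that select 1 as n never reads a table, so even once the session starts it doesn't prove the catalog wiring works. A smoke test that actually goes through the Iceberg catalog gives a stronger signal — a sketch, where the db namespace is hypothetical and must already exist in the REST catalog:

      // hypothetical smoke test against the configured spark_catalog
      spark.sql("CREATE TABLE IF NOT EXISTS db.smoke (n INT) USING iceberg")
      spark.sql("INSERT INTO db.smoke VALUES (1)")
      spark.sql("SELECT * FROM db.smoke").show()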



Solution

  • This solved exactly the same problem for me:

      "org.apache.hive" % "hive-metastore" % "2.3.7"
    

    and

      .config("spark.sql.catalog.iceberg_catalog.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    
      .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")