I'm building apache-spark application with Apache Spark Hive. So far everything was ok - I've been running tests and whole application in Intellij IDEA and all tests together using maven.
Now I want to run whole application from bash and let it run with local single-node cluster. I'm using maven-shade-plugin to build single executable JAR.
Application crashes when it tries to create new HiveContext out of SparkContext. Thrown exception tells me that hive can't create metastore because there is some problem with datanucleus and its plugin system. I tried to follow several questions how to run datanucleus plugin system with shade but out of luck. For example: Datanucleus, JDO and executable jar - how to do it?
What is the best way to compose executable JAR of application using hive and run it from bash? Perhaps some settings of datanucleus and its plugin system?
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>test</groupId>
<artifactId>hive-test</artifactId>
<version>1.0.0</version>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.7</version>
</dependency>
<!-- spark -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>1.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>1.6.0</version>
</dependency>
</dependencies>
<properties>
<!-- To be specified in child pom: <main.class></main.class> -->
<final.jar.name>${project.artifactId}-${project.version}</final.jar.name>
<main.class>com.test.HiveTest</main.class>
</properties>
<build>
<plugins>
<!-- the Maven compiler plugin will compile Java source files -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.3</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
<!-- the Maven Scala plugin will compile Scala source files -->
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>process-resources</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
<configuration>
<args>
<arg>-Xmax-classfile-name</arg>
<arg>110</arg>
</args>
</configuration>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
<arg>-Xmax-classfile-name</arg>
<arg>110</arg>
</args>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>${main.class}</mainClass>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>reference.conf</resource>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.XmlAppendingTransformer">
<resource>plugin.xml</resource>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
SampleCode
object HiveTest {
def main(args: Array[String]) {
// Set up Spark
val conf = new SparkConf(true)
.setMaster("local")
.setAppName("hive-test")
println("Initializing spark context")
val sc = new SparkContext(conf)
println("Initializing hive context")
val hc = new HiveContext(sc)
}
}
Thrown exception
java.lang.RuntimeException: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:522)
at org.apache.spark.sql.hive.client.ClientWrapper.<init>(ClientWrapper.scala:194)
at org.apache.spark.sql.hive.client.IsolatedClientLoader.createClient(IsolatedClientLoader.scala:238)
at org.apache.spark.sql.hive.HiveContext.executionHive$lzycompute(HiveContext.scala:218)
at org.apache.spark.sql.hive.HiveContext.executionHive(HiveContext.scala:208)
at org.apache.spark.sql.hive.HiveContext.functionRegistry$lzycompute(HiveContext.scala:462)
at org.apache.spark.sql.hive.HiveContext.functionRegistry(HiveContext.scala:461)
at org.apache.spark.sql.UDFRegistration.<init>(UDFRegistration.scala:40)
at org.apache.spark.sql.SQLContext.<init>(SQLContext.scala:330)
at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:97)
at org.apache.spark.sql.hive.HiveContext.<init>(HiveContext.scala:101)
at com.test.HiveTest$.main(HiveTest.scala:21)
at com.test.HiveTest.main(HiveTest.scala)
Caused by: java.lang.RuntimeException: Unable to instantiate org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient
at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1523)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.<init>(RetryingMetaStoreClient.java:86)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:132)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:104)
at org.apache.hadoop.hive.ql.metadata.Hive.createMetaStoreClient(Hive.java:3005)
at org.apache.hadoop.hive.ql.metadata.Hive.getMSC(Hive.java:3024)
at org.apache.hadoop.hive.ql.session.SessionState.start(SessionState.java:503)
... 12 more
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1521)
... 18 more
Caused by: javax.jdo.JDOFatalInternalException: Unexpected exception caught.
NestedThrowables:
java.lang.reflect.InvocationTargetException
at javax.jdo.JDOHelper.invokeGetPersistenceManagerFactoryOnImplementation(JDOHelper.java:1193)
at javax.jdo.JDOHelper.getPersistenceManagerFactory(JDOHelper.java:808)
at javax.jdo.JDOHelper.getPersistenceManagerFactory(JDOHelper.java:701)
at org.apache.hadoop.hive.metastore.ObjectStore.getPMF(ObjectStore.java:365)
at org.apache.hadoop.hive.metastore.ObjectStore.getPersistenceManager(ObjectStore.java:394)
at org.apache.hadoop.hive.metastore.ObjectStore.initialize(ObjectStore.java:291)
at org.apache.hadoop.hive.metastore.ObjectStore.setConf(ObjectStore.java:258)
at org.apache.hadoop.util.ReflectionUtils.setConf(ReflectionUtils.java:73)
at org.apache.hadoop.util.ReflectionUtils.newInstance(ReflectionUtils.java:133)
at org.apache.hadoop.hive.metastore.RawStoreProxy.<init>(RawStoreProxy.java:57)
at org.apache.hadoop.hive.metastore.RawStoreProxy.getProxy(RawStoreProxy.java:66)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newRawStore(HiveMetaStore.java:593)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.getMS(HiveMetaStore.java:571)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.createDefaultDB(HiveMetaStore.java:624)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.init(HiveMetaStore.java:461)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.<init>(RetryingHMSHandler.java:66)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.getProxy(RetryingHMSHandler.java:72)
at org.apache.hadoop.hive.metastore.HiveMetaStore.newRetryingHMSHandler(HiveMetaStore.java:5762)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.<init>(HiveMetaStoreClient.java:199)
at org.apache.hadoop.hive.ql.metadata.SessionHiveMetaStoreClient.<init>(SessionHiveMetaStoreClient.java:74)
... 23 more
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at javax.jdo.JDOHelper$16.run(JDOHelper.java:1965)
at java.security.AccessController.doPrivileged(Native Method)
at javax.jdo.JDOHelper.invoke(JDOHelper.java:1960)
at javax.jdo.JDOHelper.invokeGetPersistenceManagerFactoryOnImplementation(JDOHelper.java:1166)
... 42 more
Caused by: org.datanucleus.exceptions.NucleusUserException: Persistence process has been specified to use a ClassLoaderResolver of name "datanucleus" yet this has not been found by the DataNucleus plugin mechanism. Please check your CLASSPATH and plugin specification.
at org.datanucleus.NucleusContext.<init>(NucleusContext.java:283)
at org.datanucleus.NucleusContext.<init>(NucleusContext.java:247)
at org.datanucleus.NucleusContext.<init>(NucleusContext.java:225)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.<init>(JDOPersistenceManagerFactory.java:416)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.createPersistenceManagerFactory(JDOPersistenceManagerFactory.java:301)
at org.datanucleus.api.jdo.JDOPersistenceManagerFactory.getPersistenceManagerFactory(JDOPersistenceManagerFactory.java:202)
... 50 more
Thank you in advance
I have found solution to my problem. Solution is described in answer to original question for datanucleus in executable jar: https://stackoverflow.com/a/27030103/6390361
Edit MANIFEST.MF to pretend that it is datanucleus OSGi bundle. This can be done by adding Bundle-SymbolicName and Premain-Class entries from datanucleus-core manifest.
Create file plugin.xml in your classpath (resource folder) and use root tag from datanucleus-core project.
Put all extension-point tags from datanucleus-core and datanucleus-rdbms at the beginning of the plugin tag. All extension-points from RDBMS projects has to be prefixed with store.rdbms. This is very important because datanucleus uses fully classified IDs including part from root plugin tag.
Merge all extension tags from projects datanucleus-core, datanucleus-rdbms and datanucleus-api-jdo and put them behind all extension points. Be careful some extensions are present in more projects so you need to merge content of extensions with same IDs.
Manifest entries
Bundle-SymbolicName: org.datanucleus;singleton:=true
Premain-Class: org.datanucleus.enhancer.DataNucleusClassFileTransformer
plugin.xml
File plugin.xml is too big to be pasted here but you should be able to merge it by hand. Following code contains all RDBMS extension points with fixed IDs.
<?xml version="1.0" encoding="UTF-8"?>
<?eclipse version="3.2"?>
<plugin id="org.datanucleus" name="DataNucleus Core" provider-name="DataNucleus">
<!-- Extension points from datanucleus-core -->
<extension-point id="api_adapter" name="Api Adapter" schema="schema/apiadapter.exsd"/>
...
<!-- extension points from datanucleus-rdbms - fixed IDs -->
<extension-point id="store.rdbms.connectionprovider" name="Connection Provider" schema="schema/connectionprovider.exsd"/>
<extension-point id="store.rdbms.connectionpool" name="ConnectionPool" schema="schema/connectionpool.exsd"/>
<extension-point id="store.rdbms.sql_expression" name="SQL Expressions" schema="schema/sql_expression.exsd"/>
<extension-point id="store.rdbms.sql_method" name="SQL Methods" schema="schema/sql_method.exsd"/>
<extension-point id="store.rdbms.sql_operation" name="SQL Expressions" schema="schema/sql_operation.exsd"/>
<extension-point id="store.rdbms.sql_tablenamer" name="SQL Table Namer" schema="schema/sql_tablenamer.exsd"/>
<extension-point id="store.rdbms.rdbms_mapping" name="RDBMS Mapping" schema="schema/rdbms_mapping.exsd"/>
<!-- Merged extensions from datanucleus-core, datanucleus-rdbms and datanucleus-api-jdo -->
<extension point="org.datanucleus.persistence_properties">...</extension>
...
</plugin>
maven-shade-plugin
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<manifestEntries>
<Main-Class>${main.class}</Main-Class>
<Premain-Class>org.datanucleus.enhancer.DataNucleusClassFileTransformer</Premain-Class>
<Bundle-SymbolicName>org.datanucleus;singleton:=true</Bundle-SymbolicName>
</manifestEntries>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>reference.conf</resource>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>