Tags: elasticsearch, hive, elasticsearch-hadoop

Hive to Elasticsearch ingestion issues


Using Elasticsearch version 6.8.0.

The entire Hive job fails because of a single malformed JSON record. I tried setting 'es.write.rest.error.handler.es.return.default' to 'PASS' and to 'HANDLED', but neither helped.

Reference: https://www.elastic.co/guide/en/elasticsearch/hadoop/master/errorhandlers.html

Below is the DDL script that was run at the Hive prompt for the ingestion:

-- Make the elasticsearch-hadoop 6.8.0 connector (EsStorageHandler) available to the session.
ADD JAR /home/smrafi/elasticsearch-hadoop-6.8.0/dist/elasticsearch-hadoop-6.8.0.jar;

-- External table that writes to the Elasticsearch resource test_eshadoop/healthCareProvider.
-- The single STRING column is passed through as ready-made JSON (es.input.json = 'yes').
-- NOTE: this is the configuration that fails. The 'es' handler is registered under
-- es.write.data.error.handlers, while all of its settings below use the
-- es.write.rest.error.handler.es.* prefix, and the job aborts with
-- "Could not locate write resource for ES error handler".
CREATE EXTERNAL TABLE hive_es_with_handler10 (
    data STRING
)
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES (
    'es.resource' = 'test_eshadoop/healthCareProvider',
    'es.nodes' = 'xyzpqr',
    'es.port' = '443',
    'es.net.ssl' = 'true',
    'es.nodes.wan.only' = 'true',
    'es.input.json' = 'yes',
    'es.index.auto.create' = 'true',
    'es.write.operation' = 'upsert',
    'es.mapping.id' = 'id',
    'es.batch.size.entries' = '1',
    'es.batch.write.retry.count' = '-1',
    'es.batch.write.retry.wait' = '60s',
    'es.write.data.error.handlers' = 'es',
    'es.write.rest.error.handler.es.client.nodes' = 'vpc-pid-pre-prod-es-cluster-b7thvqfj3tp45arxl34gge3yyi.us-east-2.es.amazonaws.com',
    'es.write.rest.error.handler.es.client.port' = '443',
    'es.write.rest.error.handler.es.client.resource' = 'error_es_index',
    'es.write.rest.error.handler.es.return.default' = 'PASS',
    'es.write.rest.error.handler.es.return.error' = 'PASS'
);

-- Push every row of the source table through the storage handler into Elasticsearch.
INSERT INTO hive_es_with_handler10
SELECT *
FROM provider;

Below is the exception trace; the job failed, complaining that the write resource (index) for the ES error handler could not be located:

Caused by: org.elasticsearch.hadoop.EsHadoopIllegalArgumentException: Could not locate write resource for ES error handler.
        at org.elasticsearch.hadoop.util.Assert.hasText(Assert.java:30)
        at org.elasticsearch.hadoop.handler.impl.elasticsearch.ElasticsearchHandler.init(ElasticsearchHandler.java:145)
        at org.elasticsearch.hadoop.serialization.handler.write.impl.DelegatingErrorHandler.init(DelegatingErrorHandler.java:40)
        at org.elasticsearch.hadoop.handler.impl.AbstractHandlerLoader.loadHandlers(AbstractHandlerLoader.java:114)
        at org.elasticsearch.hadoop.serialization.bulk.BulkEntryWriter.<init>(BulkEntryWriter.java:56)
        at org.elasticsearch.hadoop.rest.RestRepository.lazyInitWriting(RestRepository.java:138)
        at org.elasticsearch.hadoop.rest.RestRepository.writeProcessedToIndex(RestRepository.java:185)
        at org.elasticsearch.hadoop.hive.EsHiveOutputFormat$EsHiveRecordWriter.write(EsHiveOutputFormat.java:64)
        at org.apache.hadoop.hive.ql.exec.FileSinkOperator.process(FileSinkOperator.java:762)
        at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:897)
        at org.apache.hadoop.hive.ql.exec.SelectOperator.process(SelectOperator.java:95)
        at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:897)
        at org.apache.hadoop.hive.ql.exec.TableScanOperator.process(TableScanOperator.java:130)
        at org.apache.hadoop.hive.ql.exec.MapOperator$MapOpCtx.forward(MapOperator.java:148)
        at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:550)
        ... 9 more

Solution

  • Below is the configuration that properly collects all the bad JSON record errors. There are still issues with Hive, though: Hive does not support malformed JSON records. Please see "ElasticSearch hive SerializationError handler".

    -- Make the elasticsearch-hadoop 6.8.0 connector (EsStorageHandler) available to the session.
    ADD JAR /home/smrafi/elasticsearch-hadoop-6.8.0/dist/elasticsearch-hadoop-6.8.0.jar;
    -- External table that writes JSON strings to test_eshadoop/healthCareProvider.
    -- Bulk-write (rest) errors go through the 'es' handler and a custom 'ignoreBadRecords' handler;
    -- serialization (data) errors go through 'log' and two custom handlers.
    -- The com.xyz.elshandler.* classes are not defined here; presumably they must be on the
    -- job classpath - TODO confirm.
    -- NOTE(review): es.write.rest.error.handler.log.logger.name is set although 'log' is not
    -- listed in es.write.rest.error.handlers, so it looks unused - confirm.
    CREATE EXTERNAL TABLE hive_es_with_handler32 (
        data STRING
    )
    STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
    TBLPROPERTIES (
        'es.resource' = 'test_eshadoop/healthCareProvider',
        'es.nodes' = 'xyz',
        'es.port' = '443',
        'es.net.ssl' = 'true',
        'es.nodes.wan.only' = 'true',
        'es.input.json' = 'yes',
        'es.index.auto.create' = 'true',
        'es.write.operation' = 'upsert',
        'es.mapping.id' = 'id',
        'es.batch.size.entries' = '1',
        'es.batch.write.retry.count' = '-1',
        'es.batch.write.retry.wait' = '60s',
        'es.write.rest.error.handlers' = 'es, ignoreBadRecords',
        'es.write.rest.error.handler.es.client.resource' = 'error_es_index/error',
        'es.write.rest.error.handler.es.return.default' = 'HANDLED',
        'es.write.rest.error.handler.es.return.error' = 'HANDLED',
        'es.write.rest.error.handler.ignoreBadRecords' = 'com.xyz.elshandler.IgnoreBadRecordHandler',
        'es.write.rest.error.handler.log.logger.name' = 'BulkErrors',
        'es.write.data.error.handlers' = 'log, customLog, badJsonHandler',
        'es.write.data.error.handler.customLog' = 'com.xyz.elshandler.CustomLogOnError',
        'es.write.data.error.handler.badJsonHandler' = 'com.xyz.elshandler.BadJsonHandler',
        'es.write.data.error.handler.log.logger.name' = 'SerializationErrors'
    );