elasticsearchelasticsearch-query

Return a value if field is missing in Elastic search document


I have an index user. I'm filtering user on LOCATION_ID and _source consists of PHONE_NUMBER & USER_ID, some of the documents do not have PHONE_NUMBER data. It returns a response with just USER_ID.

Is there any way I will get some default or predefined value (passing in query, like we do in count for missing field) for field PHONE_NUMBER in cases where its missing in the document?

Mapping:

{
  "PHONE_NUMBER": {
    "type": "long",
    "store": true
  },
  "USER_ID": {
    "type": "long",
    "store": true
  },
  "LOCATION_ID": {
    "type": "long",
    "store": true
  }
}

Query:

{
  "_source":[
     "PHONE_NUMBER",
     "USER_ID"
  ],
  "query":{
     "bool":{
        "must":[
           {
              "terms":{
                 "LOCATION_ID":[
                    "5001"
                 ]
              }
           }
        ],
        "must_not":[
           
        ]
     }
  },
  "from":0,
  "size":2000
}

Response:

{
  "took":0,
  "timed_out":false,
  "_shards":{
     "total":1,
     "successful":1,
     "skipped":0,
     "failed":0
  },
  "hits":{
     "total":{
        "value":4,
        "relation":"eq"
     },
     "max_score":2.0,
     "hits":[
        {
           "_index":"user",
           "_id":"39788",
           "_score":2.0,
           "_source":{
              "USER_ID":39788
           }
        },
        {
           "_index":"user",
           "_id":"30784",
           "_score":2.0,
           "_source":{
              "USER_ID":30784,
              "PHONE_NUMBER":1234567890
           }
        },
        {
           "_index":"user",
           "_id":"36373",
           "_score":2.0,
           "_source":{
              "USER_ID":36373,
              "PHONE_NUMBER":1234567893
           }
        },
        {
           "_index":"user",
           "_id":"36327",
           "_score":2.0,
           "_source":{
              "USER_PROJECT_USER_ID":36327
           }
        }
     ]
  }
}

In above response, PHONE_NUMBER is missing in first last document. I want some default or predefined value (set in query, like we do in count for missing field) to be returned if field is missing.

Expected response:

{
  "took":0,
  "timed_out":false,
  "_shards":{
     "total":1,
     "successful":1,
     "skipped":0,
     "failed":0
  },
  "hits":{
     "total":{
        "value":4,
        "relation":"eq"
     },
     "max_score":2.0,
     "hits":[
        {
           "_index":"user",
           "_id":"39788",
           "_score":2.0,
           "_source":{
              "USER_ID":39788,
              "PHONE_NUMBER":9876543210.     <- Default or Predefined value (set in query, like we do in count for missing field)
           }
        },
        {
           "_index":"user",
           "_id":"30784",
           "_score":2.0,
           "_source":{
              "USER_ID":30784,
              "PHONE_NUMBER":1234567890
           }
        },
        {
           "_index":"user",
           "_id":"36373",
           "_score":2.0,
           "_source":{
              "USER_ID":36373,
              "PHONE_NUMBER":1234567893
           }
        },
        {
           "_index":"user",
           "_id":"36327",
           "_score":2.0,
           "_source":{
              "USER_PROJECT_USER_ID":36327,
              "PHONE_NUMBER":9876543210      <- Default or predefined value (set in query, like we do in count for missing field)
           }
        }
     ]
  }
}

Any help would be greatly appreciated.


Solution

  • Tldr

    This exact result can not be achieved a query time. You will have to do it during the ingestion.

    But there is a solution that is close enough at query time, using the runtime fields.

    Solutions

    1. At ingest time

    You can set your mapping to be:

    {
      "PHONE_NUMBER": {
        "type": "long",
        "store": true,
        "null_value": "9876543210" <- the specific / default number
      },
      "USER_ID": {
        "type": "long",
        "store": true
      },
      "LOCATION_ID": {
        "type": "long",
        "store": true
      }
    }
    

    Your document with no phone number will now have a default value. The down side is, it is not dynamic. You can not update this value at query time.

    2. At query time

    set up:

    POST /_bulk
    {"index":{"_index":"75278567"}}
    {"USER_ID":123456,"PHONE_NUMBER":12345}
    {"index":{"_index":"75278567"}}
    {"USER_ID":234567,"PHONE_NUMBER":234567}
    {"index":{"_index":"75278567"}}
    {"USER_ID":345678}
    {"index":{"_index":"75278567"}}
    {"USER_ID":456789}
    

    Using the runtime fields you could create the following query:

    GET /75278567/_search
    {
      "runtime_mappings": {
        "NUMBER": {
          "type": "keyword",
          "script": {
            "source": """
            if (doc["PHONE_NUMBER"].size() == 0){
              emit("000000")
            } else
            {
              emit(doc["PHONE_NUMBER"].value.toString())
            }
            """
          }
        }
      },
      "fields": [
        "NUMBER"
      ]
    }
    

    This is going to give you the following results:

    {
      "took": 5,
      "timed_out": false,
      "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
      },
      "hits": {
        "total": {
          "value": 4,
          "relation": "eq"
        },
        "max_score": 1,
        "hits": [
          {
            "_index": "75278567",
            "_id": "3njWAYYBArbKoMpIcXFp",
            "_score": 1,
            "_source": {
              "USER_ID": 123456,
              "PHONE_NUMBER": 12345
            },
            "fields": {
              "NUMBER": [
                "12345"
              ]
            }
          },
          {
            "_index": "75278567",
            "_id": "33jWAYYBArbKoMpIcXFp",
            "_score": 1,
            "_source": {
              "USER_ID": 234567,
              "PHONE_NUMBER": 234567
            },
            "fields": {
              "NUMBER": [
                "234567"
              ]
            }
          },
          {
            "_index": "75278567",
            "_id": "4HjWAYYBArbKoMpIcXFp",
            "_score": 1,
            "_source": {
              "USER_ID": 345678
            },
            "fields": {
              "NUMBER": [
                "000000"
              ]
            }
          },
          {
            "_index": "75278567",
            "_id": "4XjWAYYBArbKoMpIcXFp",
            "_score": 1,
            "_source": {
              "USER_ID": 456789
            },
            "fields": {
              "NUMBER": [
                "000000"
              ]
            }
          }
        ]
      }
    }
    

    This not not in the _source but you can access the default values in fields.