elasticsearch logstash kibana elasticsearch-5 elasticsearch-dsl

How to discard duplicate values in Elasticsearch using a DSL query?


I am trying to get the attribute_name of the documents that match the query "CUSTOMER". The problem is that there are lots of duplicate values in attribute_name, which I want to discard. Can someone please help me with this?

 {
  "_source": [
    "attribute_name"
  ],
  "size": 500, 
  "query": {
    "multi_match": {
      "query": "CUSTOMER",
      "fields": [
        "hierarchy_name",
        "attribute_name"
      ]
    }
  }
}

Suppose this is my output; here I want to discard the duplicate attribute_name values:

 {
        "_index": "planlytx_records",
        "_type": "_doc",
        "_id": "tD6WDnkBQTXQIneq8Ypr",
        "_score": 2.5454113,
        "_source": {
          "attribute_name": "CUSTOMER"
        }
      },
      {
        "_index": "planlytx_records",
        "_type": "_doc",
        "_id": "3j6WDnkBQTXQIneq8Yps",
        "_score": 2.5454113,
        "_source": {
          "attribute_name": "CUSTOMER"
        }
      },
      {
        "_index": "planlytx_records",
        "_type": "_doc",
        "_id": "nT6WDnkBQTXQIneqyonu",
        "_score": 1.8101583,
        "_source": {
          "attribute_name": "REGION"
        }
      },
      {
        "_index": "planlytx_records",
        "_type": "_doc",
        "_id": "6D6WDnkBQTXQIneq8Yps",
        "_score": 1.8101583,
        "_source": {
          "attribute_name": "REGION"
        }
      },

My output should be something like this:

{
        "_index": "planlytx_records",
        "_type": "_doc",
        "_id": "3j6WDnkBQTXQIneq8Yps",
        "_score": 2.5454113,
        "_source": {
          "attribute_name": "CUSTOMER"
        }
      },
      {
        "_index": "planlytx_records",
        "_type": "_doc",
        "_id": "nT6WDnkBQTXQIneqyonu",
        "_score": 1.8101583,
        "_source": {
          "attribute_name": "REGION"
        }
      },

Solution

  • You can use the collapse parameter to remove duplicates from your search results based on a field's value.

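    Below is a working example with index mapping, index data, search query and search result. Note that the field used for collapsing must be a single-valued keyword or numeric field, which is why the query collapses on the attribute_name.keyword sub-field rather than on the analyzed text field.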

    Index Mapping:

    {
      "mappings": {
        "properties": {
          "attribute_name": {
            "type": "text",
            "fields": {
              "keyword": {
                "type": "keyword"
              }
            }
          }
        }
      }
    }
    

    Index Data:

    {
      "attribute_name": "CUSTOMER-ALL"
    }
    {
      "attribute_name": "CUSTOMER-ALL"
    }
    {
      "attribute_name": "CUSTOMER"
    }
    {
      "attribute_name": "CUSTOMER"
    }
    

    Search Query:

    {
      "query": {
        "multi_match": {
          "query": "CUSTOMER",
          "fields": [
            "attribute_name"
          ]
        }
      },
      "collapse": {
        "field": "attribute_name.keyword"
      }
    }
    

    Search Result:

     "hits": [
          {
            "_index": "67260491",
            "_type": "_doc",
            "_id": "1",
            "_score": 0.12199639,
            "_source": {
              "attribute_name": "CUSTOMER"
            },
            "fields": {
              "attribute_name.keyword": [
                "CUSTOMER"
              ]
            }
          },
          {
            "_index": "67260491",
            "_type": "_doc",
            "_id": "3",
            "_score": 0.09271726,
            "_source": {
              "attribute_name": "CUSTOMER-ALL"
            },
            "fields": {
              "attribute_name.keyword": [
                "CUSTOMER-ALL"
              ]
            }
          }
        ]
    

    Update 1:

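    If you just want the duplicates removed from the results without filtering by a search term, you can run the query below. Note that collapse only de-duplicates the returned hits; it does not delete any documents from the index.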

    {
      "collapse": {
        "field": "attribute_name.keyword"
      }
    }
    

    Search Result will be

    "hits": [
          {
            "_index": "67276433",
            "_type": "_doc",
            "_id": "1",
            "_score": 1.0,
            "_source": {
              "attribute_name": "CUSTOMER"
            },
            "fields": {
              "attribute_name.keyword": [
                "CUSTOMER"
              ]
            }
          },
          {
            "_index": "67276433",
            "_type": "_doc",
            "_id": "3",
            "_score": 1.0,
            "_source": {
              "attribute_name": "REGION"
            },
            "fields": {
              "attribute_name.keyword": [
                "REGION"
              ]
            }
          }
        ]