elasticsearch deduplication

Remove duplicate documents from a search in Elasticsearch


I have an index in which many documents share the same value for a given field. I want to deduplicate the search results on this field.

Aggregations only give me counts, but I would like a list of documents.

My index :

I want this result (deduplication result by domain field) :


Solution

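  • You could use field collapsing: group the results on the name field with a terms aggregation and set the size of the nested top_hits aggregator to 1. Note that a terms aggregation returns only the 10 largest buckets by default, so raise its size if the field has more distinct values.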

    POST http://localhost:9200/test/dedup/_search?pretty=true
    {
      "size": 0,
      "aggs": {
        "dedup": {
          "terms": {
            "field": "name"
          },
          "aggs": {
            "dedup_docs": {
              "top_hits": {
                "size": 1
              }
            }
          }
        }
      }
    }
    

    this returns:

    {
      "took" : 192,
      "timed_out" : false,
      "_shards" : {
        "total" : 1,
        "successful" : 1,
        "failed" : 0
      },
      "hits" : {
        "total" : 6,
        "max_score" : 0.0,
        "hits" : [ ]
      },
      "aggregations" : {
        "dedup" : {
          "buckets" : [ {
            "key" : "name1",
            "doc_count" : 2,
            "dedup_docs" : {
              "hits" : {
                "total" : 2,
                "max_score" : 1.0,
                "hits" : [ {
                  "_index" : "test",
                  "_type" : "dedup",
                  "_id" : "1",
                  "_score" : 1.0,
                  "_source":{domain: "domain1.fr", name: "name1", date: "01-01-2014"}
                } ]
              }
            }
          }, {
            "key" : "name2",
            "doc_count" : 2,
            "dedup_docs" : {
              "hits" : {
                "total" : 2,
                "max_score" : 1.0,
                "hits" : [ {
                  "_index" : "test",
                  "_type" : "dedup",
                  "_id" : "3",
                  "_score" : 1.0,
                  "_source":{domain: "domain1.fr", name: "name2", date: "01-03-2014"}
                } ]
              }
            }
          }, {
            "key" : "name3",
            "doc_count" : 2,
            "dedup_docs" : {
              "hits" : {
                "total" : 2,
                "max_score" : 1.0,
                "hits" : [ {
                  "_index" : "test",
                  "_type" : "dedup",
                  "_id" : "5",
                  "_score" : 1.0,
                  "_source":{domain: "domain1.fr", name: "name3", date: "01-05-2014"}
                } ]
              }
            }
          } ]
        }
      }
    }