json amazon-web-services elasticsearch amazon-elasticsearch

Elasticsearch: Aggregation of null fields in a facet bucket


I'm trying to implement facets with a date range aggregation in the current version of Amazon Elasticsearch Service (version 7.10). The field I want to group the article documents by is publishedAt, which is a date. I want three buckets: one where publishedAt is in the past, meaning the article is published; one where it is in the future, meaning it is scheduled; and one for all articles without a publishedAt, which are drafts. The published and scheduled buckets work as they should. For drafts, I can't use a filter or a date range because the value is null, so I want to make use of the "Missing Values" feature (the missing parameter of the aggregation). It should treat documents with publishedAt = null as if they had the date given in the missing field. Unfortunately, it has no effect on the results, even if I change the missing date so that it falls into the published or scheduled range.

My request:

GET https://es.amazonaws.com/articles/_search

{
    "size": 10,
    "aggs": {
        "facet_bucket_all": {
            "aggs": {
                "channel": {
                    "terms": {
                        "field": "channel.keyword",
                        "size": 5
                    }
                },
                "brand": {
                    "terms": {
                        "field": "brand.keyword",
                        "size": 5
                    }
                },
                "articleStatus": {
                    "date_range": {
                        "field": "publishedAt",
                        "format": "dd-MM-yyyy",
                        "missing": "01-07-1886",
                        "ranges": [
                            { "key": "published", "from": "now-99y/M", "to": "now/M" },
                            { "key": "scheduled", "from": "now+1s/M", "to": "now+99y/M" },
                            { "key": "drafts", "from": "01-01-1886", "to": "31-12-1886" }
                        ]
                    }
                }
            },
            "filter": {
                "bool": {
                    "must": []
                }
            }
        },
        "facet_bucket_publishedAt": {
            "aggs": {},
            "filter": {
                "bool": {
                    "must": []
                }
            }
        },
        "facet_bucket_author": {
            "aggs": {
                "author": {
                    "terms": {
                        "field": "author",
                        "size": 10
                    }
                }
            },
            "filter": {
                "bool": {
                    "must": []
                }
            }
        }
    },
    "query": {
        "bool": {
            "filter": [
                {
                    "range": {
                        "publishedAt": {
                            "lte": "2021-08-09T09:52:19.975Z"
                        }
                    }
                }
            ]
        }
    },
    "from": 0,
    "sort": [
        {
            "_score": "desc"
        }
    ]
}

In the result, the drafts bucket is empty:

"articleStatus": {
    "buckets": [
        {
            "key": "published",
            "from": -1.496448E12,
            "from_as_string": "01-08-1922",
            "to": 1.627776E12,
            "to_as_string": "01-08-2021",
            "doc_count": 47920
        },
        {
            "key": "scheduled",
            "from": 1.627776E12,
            "from_as_string": "01-08-2021",
            "to": 4.7519136E12,
            "to_as_string": "01-08-2120",
            "doc_count": 3
        },
        {
            "key": "drafts",
            "from": 1.67252256E13,
            "from_as_string": "01-01-1886",
            "to": 1.67566752E13,
            "to_as_string": "31-12-1886",
            "doc_count": 0
        }
    ]
}

Solution

  • SearchKit added this part to the query:

    "query": {
        "bool": {
            "filter": [
                {
                    "range": {
                        "publishedAt": {
                            "lte": "2021-08-09T09:52:19.975Z"
                        }
                    }
                }
            ]
        }
    }
    

    This had to be removed because the range filter on publishedAt excludes documents whose publishedAt is null before the aggregation's missing parameter can do its job.

    Now I get the correct result:

    "articleStatus": {
        "buckets": [
            {
                "key": "drafts",
                "from": -2.650752E12,
                "from_as_string": "01-01-1886",
                "to": -2.6193024E12,
                "to_as_string": "31-12-1886",
                "doc_count": 7
            },
            {
                "key": "published",
                "from": -1.496448E12,
                "from_as_string": "01-08-1922",
                "to": 1.627776E12,
                "to_as_string": "01-08-2021",
                "doc_count": 47920
            },
            {
                "key": "scheduled",
                "from": 1.627776E12,
                "from_as_string": "01-08-2021",
                "to": 4.7519136E12,
                "to_as_string": "01-08-2120",
                "doc_count": 3
            }
        ]
    }