elasticsearchelasticsearch-java-api

Elastic-search java - Nested field count and sum sub aggregation


I have following structure in elastic search index

      {
    "_index" : "hotel",
    "_type" : "_doc",
    "_id" : "13171",
    "_score" : 6.072218,
    "_source" : {
      "_class" : "hotel",
      "id" : 13171,
      "places" : [
        {
          "type" : "MAIN_LOCATION",
          "placeId" : 2032
        }
      ],
      "numberOfRecommendations" : 0
    }
  },
  {
    "_index" : "hotel",
    "_type" : "_doc",
    "_id" : "7146",
    "_score" : 6.072218,
    "_source" : {
      "_class" : "hotel",
      "id" : 7146,
      "places" : [
        {
          "type" : "MAIN_LOCATION",
          "placeId" : 2032
        }
      ],
      "numberOfRecommendations" : 1
    }  
  },
  {
    "_index" : "hotel",
    "_type" : "_doc",
    "_id" : "7146",
    "_score" : 6.072218,
    "_source" : {
      "_class" : "hotel",
      "id" : 7146,
      "places" : [
        {
          "type" : "AFFILIATE",
          "placeId" : 2032
        }
      ],
      "numberOfRecommendations" : 3
    }  
  }

Note the places is nested type and has two types "Main Location" and affiliate. I am creating an aggregation to count the hotels for a particular place and total number of recommendations for main location.

In the above example for main location i should get hotels 2 and numberOfRecommendations 1

I am using java and have created the following code

  public List<PlaceHotelStats> getHotelOfferStats() {

// Create aggregation filter for considering only places with PlaceType from filter(in current
// case main location)
String placeFilterAggregationName = "placeFilter";
BoolQueryBuilder nestedPlaceQuery = boolQuery();
nestedPlaceQuery.must(termQuery("places.type", "MAIN_LOCATION"));
nestedPlaceQuery.must(termsQuery("places.placeId", filter.getPlaceIds()));
AggregationBuilder placeAggregationFilter =
    AggregationBuilders.filters(placeFilterAggregationName, nestedPlaceQuery);

// Add Terms filter to group by field placeId and then add sub aggregation for
// totalRecommendations to have buckets
String aggregationGroupByPlaceId = "group_by_place_id";
var includedPlaceIds = filter.getPlaceIds().stream().mapToLong(l -> l).toArray();
TermsAggregationBuilder aggregationBuilders =
    AggregationBuilders.terms(aggregationGroupByPlaceId)
        .field("places.placeId")
        .size(filter.getPlaceIds().size())
        .includeExclude(new IncludeExclude(includedPlaceIds, null))
        .subAggregation(
            AggregationBuilders.sum("totalRecommendationsForPlace")
                .field("numberOfRecommendations"));

// Add place term aggregation along with recommendation to Filter aggregation
placeAggregationFilter.subAggregation(aggregationBuilders);

// The final aggregration which has filter first then subaggregation of place terms with buckets
// and review counts
var nestedPlacesAggregation =
    AggregationBuilders.nested(NESTED_PLACES_AGGREGATION_NAME, PLACES)
        .subAggregation(placeAggregationFilter);
var query =
    new NativeSearchQueryBuilder()
        .withQuery(builder.query())
        .addAggregation(nestedPlacesAggregation)
        .build();

var result = elasticsearchOperations.search(query, EsHotel.class, ALIAS_COORDS);

if (!result.hasAggregations()) {
  throw new IllegalStateException("No aggregations found after query with aggregations!");
}

ParsedFilters aggregationParsedFilters =
    ((ParsedNested) result.getAggregations().get(NESTED_PLACES_AGGREGATION_NAME))
        .getAggregations()
        .get(placeFilterAggregationName);
var buckets =
    ((ParsedTerms)
            aggregationParsedFilters
                .getBuckets()
                .get(0)
                .getAggregations()
                .get(aggregationGroupByPlaceId))
        .getBuckets();

List<PlaceHotelStats> placeHotelStats= new ArrayList<>();
buckets.forEach(
    bucket ->
        placeHotelStats.add(
            new PlaceHotelStats(
                bucket.getKeyAsNumber().longValue(),
                Math.toIntExact(bucket.getDocCount()),
                getTotalRecommendationsForPlace(bucket))));

return placeOfferStats;

}

  private int getTotalRecommendationsForPlace(Terms.Bucket bucket) {
    var aggregationTotalRecommendation =
        bucket.getAggregations().get("totalRecommendationsForPlace");
    if (aggregationTotalRecommendation != null) {
      return (int) ((ParsedSum) aggregationTotalRecommendation).getValue();
    }
    return 0;
  }

This gives me correct count of total places but not correct sum of all recommendations

I check elastic search query and it looks something like this

{
  "query": {
  "bool" : {
    "must" : [
      {
        "nested" : {
          "query" : {
            "bool" : {
              "must" : [
                {
                  "term" : {
                    "places.type" : {
                      "value" : "MAIN_LOCATION",
                      "boost" : 1.0
                    }
                  }
                },
                {
                  "terms" : {
                    "places.placeId" : [
                      7146
                    ],
                    "boost" : 1.0
                  }
                }
              ],
              "adjust_pure_negative" : true,
              "boost" : 1.0
            }
          },
          "path" : "places",
          "ignore_unmapped" : false,
          "score_mode" : "min",
          "boost" : 1.0
        }
      },
      
      
      {
        "nested" : {
          "query" : {
            "exists" : {
              "field" : "places",
              "boost" : 1.0
            }
          },
          "path" : "places",
          "ignore_unmapped" : false,
          "score_mode" : "none",
          "boost" : 1.0
        }
      }
    ],
    "adjust_pure_negative" : true,
    "boost" : 1.0
  }
},
"aggs": {
  "nestedPlaces":{
    "nested":{"path":"places"},
    "aggregations":{
      "placeFilter":{
        "filters":{
          "filters":[{
            "bool":{
              "must":[{
                "term":{"places.type":{"value":"MAIN_LOCATION","boost":1.0}}},
                {"terms":{"places.placeId":[7146],"boost":1.0}}],
                "adjust_pure_negative":true,
                "boost":1.0}
            
          }],
          "other_bucket":false,
          "other_bucket_key":"_other_"},
          "aggregations":{
            "group_by_place_id":{
              "terms":{
                "field":"places.placeId",
                "size":193,
                "min_doc_count":1,
                "shard_min_doc_count":0,
                "show_term_doc_count_error":false,
                "order":[
                  {"_count":"desc"},
                  {"_key":"asc"}],
                  "include":["7146"]},
                  "aggregations":{
                    "totalRecommendationsForPlace":{
                      "sum":{
                        "field":"numberOfRecommendations"
                        
                      }
                      
                    }
                    
                  }
              
            }
            
          }
        
      }
      
    }
    
  }
  
}
}

Current output of the query is where total hotels are corrent but totalrecommendations are wrong and is always 0 which means subaggregation is not working as expected

"aggregations" : {
    "nestedPlaces" : {
      "doc_count" : 7,
      "placeFilter" : {
        "buckets" : [
          {
            "doc_count" : 3,
            "group_by_place_id" : {
              "doc_count_error_upper_bound" : 0,
              "sum_other_doc_count" : 0,
              "buckets" : [
                {
                  "key" : 2032,
                  "doc_count" : 3,
                  "totalRecommendationsForPlace" : {
                    "value" : 0.0
                  }
                }
              ]
            }
          }
        ]
      }
    }
  }

Not sure where I went wrong


Solution

  • Your query is basically correct up to the point where you try to get the sum of numberOfRecommendations. Since that field is at the root-level of the document and not inside the nested document itself, you need to first add a reverse_nested aggregation to get back to the top-level document, and then only you can have your sum aggregation, like this:

            "group_by_place_id": {
              "terms": {
                "field": "places.placeId",
                "size": 193,
                "min_doc_count": 1,
                "shard_min_doc_count": 0,
                "show_term_doc_count_error": false,
                "order": [
                  {
                    "_count": "desc"
                  },
                  {
                    "_key": "asc"
                  }
                ],
                "include": [
                  "7146"
                ]
              },
              "aggregations": {
                "back_to_root": {               <----- add this
                  "reverse_nested": {},         <----- add this
                  "aggs": {
                    "totalRecommendationsForPlace": {
                      "sum": {
                        "field": "numberOfRecommendations"
                      }
                    }
                  }
                }
              }
            }
    

    PS: if you can have a different number of recommendations based on the type (main location or affiliate), then you should have that number at the nested level and your query would work as is.