elasticsearch kibana elastic-stack elasticsearch-aggregation elasticsearch-dsl

How to Count Unique Assets with Specific Severity in Elasticsearch?

I have data as follows:

{
        "_index": "index",
        "_id": "...",
        "_score": 4.512486,
        "_source": {
          "division": {
            "name": "Ben",
            "id": "678"
          },
          "summary": {
            "severity": 2,
            "last_issue_at": "...",
            "first_issue_at": "...",
            "issues_count": 1
          },
          "company": {
            "id": "..."
          },
          "ingest_timestamp": "..",
          "asset": {
            "name": "Bus",
            "vin": "123",
            "id": "...",
            "manufacturer": ...
          },
          "timestamp": "..."
        } 
    {
        "_index": "index",
        "_id": "...",
        "_score": 4.512486,
        "_source": {
          "division": {
            "name": "Ben",
            "id": "678"
          },
          "summary": {
            "severity": 1,
            "last_issue_at": "...",
            "first_issue_at": "...",
            "issues_count": 1
          },
          "company": {
            "id": "..."
          },
          "ingest_timestamp": "..",
          "asset": {
            "name": "Bus",
            "vin": "123",
            "id": "...",
            "manufacturer": ...
          },
          "timestamp": "..."
        }

Goal: Count unique assets with a severity of 1 or 2. If an asset has both severities, only count the one with severity 1.

Here is my query so far:

{
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "company.id": "..."
          }
        },
        {
          "range": {
            "timestamp": {
              "gte": "...",
              "lte": "..."
            }
          }
        }
      ]
    }
  },
  "runtime_mappings": {
    "asset_id_and_vin": {
      "type": "keyword",
      "script": {
        "source": "if (doc.containsKey('asset.vin')) { emit(doc['asset.id'] + ' ' + doc['asset.vin']) } else { emit(doc['asset.id'].value + ' ' + 'N/A') }"
      }
    }
  },
  "aggs": {
    "asset_count": {
      "cardinality": {
        "field": "asset_id_and_vin"
      }
    },
    "assets": {
      "multi_terms": {
        "terms": [
          {
            "field": "asset.name"
          },
          {
            "field": "asset.vin",
            "missing": "N/A"
          },
          {
            "field": "division.name"
          },
          {
            "field": "asset.id"
          },
          {
            "field": "division.id"
          }
        ],
        "order": {
          "earliest_date": "desc"
        },
        "size": 100
      },
      "aggs": {
        "severity": {
          "min": {
            "field": "summary.severity"
          }
        },
        "earliest_date": {
          "min": {
            "field": "summary.first_issue_at"
          }
        },
        "latest_date": {
          "max": {
            "field": "summary.last_issue_at"
          }
        },
        "issues": {
          "sum": {
            "field": "summary.issues_count"
          }
        },
        "filtered_assets": {
          "bucket_selector": {
            "buckets_path": {
              "min_severity": "severity"
            },
            "script": "params.min_severity == 1 || params.min_severity == 2"
          }
        },
        "pagination": {
          "bucket_sort": {
            "size": 100,
            "from": 0
          }
        }
      }
    }
  }
}

I added the bucket_selector to return severities that are either 1 or 2, not both. However, this doesn't account for assets that can have both severities, as it checks one document at a time. How can I modify my query to ensure I only count the asset with severity 1 if it has both severities?

Any advice or suggestions for improving my query would be greatly appreciated. Thanks in advance!

Solution

Instead of bucket_selector, you need to use bucket_sort, it is a bit tricky since bucket sort cannot work with terms directly, so you need to add another metric aggregation underneath that would simply mirror the keys in terms (min_value in the example below).

Unfortunately, your requirement of sorting by earliest date is not feasible in this setup. The earliest date can only be found in a pipeline aggregation, which cannot be used for composite aggs sorting. To sort by earliest date we would need to how you index the data.

DELETE test

PUT test
{
  "mappings": {
    "properties": {
      "division": {
        "properties": {
          "id": {
            "type": "keyword"
          },
          "name": {
            "type": "keyword"
          }
        }
      },
      "asset": {
        "properties": {
          "id": {
            "type": "keyword"
          },
          "vin": {
            "type": "keyword"
          },
          "name": {
            "type": "keyword"
          }
        }
      },
      "company": {
        "properties": {
          "id": {
            "type": "keyword"
          }
        }
      }
    }
  }
}


POST test/_bulk?refresh
{ "index": { "_id": "1" } }
{"division":{"name":"Ben","id":"678"},"summary":{"severity":1,"last_issue_at":"2016-01-01","first_issue_at":"2015-01-01","issues_count":1},"company":{"id":"..."},"ingest_timestamp":"..","asset":{"name":"Bus","vin":"123","id":"...","manufacturer":"..."},"timestamp":"..."}
{ "index": { "_id": "2" } }
{"division":{"name":"Ben","id":"678"},"summary":{"severity":2,"last_issue_at":"2024-01-01","first_issue_at":"2020-01-01","issues_count":1},"company":{"id":"..."},"ingest_timestamp":"..","asset":{"name":"Bus","vin":"123","id":"...","manufacturer":"..."},"timestamp":"..."}
  

POST test/_search
{
  "size": 0,
  "query": {
    "bool": {
      "must": [
        {
          "match": {
            "company.id": "..."
          }
        }
      ]
    }
  },
  "runtime_mappings": {
    "asset_id_and_vin": {
      "type": "keyword",
      "script": {
        "source": "if (doc.containsKey('asset.vin')) { emit(doc['asset.id'] + ' ' + doc['asset.vin']) } else { emit(doc['asset.id'].value + ' ' + 'N/A') }"
      }
    }
  },
  "aggs": {
    "asset_count": {
      "cardinality": {
        "field": "asset_id_and_vin"
      }
    },
    "assets": {
      "multi_terms": {
        "terms": [
          {
            "field": "asset.name"
          },
          {
            "field": "asset.vin",
            "missing": "N/A"
          },
          {
            "field": "division.name"
          },
          {
            "field": "asset.id"
          },
          {
            "field": "division.id"
          }
        ],
        "size": 100
      },
      "aggs": {
        "severity": {
          "terms": {
            "field": "summary.severity"
          },
          "aggs": {
            "min_value": {
              "min": {
                "field": "summary.severity"
              }
            },
            "earliest_date": {
              "min": {
                "field": "summary.first_issue_at"
              }
            },
            "latest_date": {
              "max": {
                "field": "summary.last_issue_at"
              }
            },
            "issues": {
              "sum": {
                "field": "summary.issues_count"
              }
            },
            "min_severity": {
              "bucket_sort": {
                "sort": [{
                  "min_value": {
                    "order": "asc"
                  }
                }],
                "size": 1
              }
            }
          }
        }
      }
    }
  }
}