kqlazure-data-explorer

How to combine records with overlapping date ranges?


Using kql, I'm trying to get records that overlap getting the min start date and the max start date for each group of records that overlap. The "g_id" field signifies a container of "s_id"s each having a datetime range (start and end). I want to get the start and end times for each "g_id" where any overlapping time ranges within a g_id will be a single record (min(start_time) and max(end_time)) of all the overlapping record and if there is a gap where a record exists but no other record overlaps it, then it should be a second record in the result (see g_id == 200 below).

let T = datatable(g_id:int, s_id:int, s_start_time:datetime, s_end_time:datetime)
[
   100, 1, datetime('2025-05-01 5:23:00'), datetime('2025-05-01 5:44:00'),
   100, 2, datetime('2025-05-01 5:00:00'), datetime('2025-05-01 5:10:00'),
   100, 3, datetime('2025-05-01 5:05:00'), datetime('2025-05-01 5:20:00'),
   100, 4, datetime('2025-05-01 5:10:00'), datetime('2025-05-01 5:30:00'),
   200, 1, datetime('2025-05-01 5:05:00'), datetime('2025-05-01 5:15:00'),
   100, 1, datetime('2025-05-01 5:30:00'), datetime('2025-05-01 5:35:00'),
   100, 2, datetime('2025-05-01 5:40:00'), datetime('2025-05-01 5:50:00'),
   100, 3, datetime('2025-05-01 5:45:00'), datetime('2025-05-01 5:47:00'),
   200, 2, datetime('2025-05-01 5:35:00'), datetime('2025-05-01 5:40:00'),
   200, 3, datetime('2025-05-01 5:37:00'), datetime('2025-05-01 5:38:00'),
   230, 4, datetime('2025-05-01 5:25:00'), datetime('2025-05-01 5:45:00'),
];
T
| order by g_id asc, s_id asc, s_start_time asc

Result should be below:

g_id s_start_time s_end_time
100 2025-05-01 5:00:00 2025-05-01 5:50:00
200 2025-05-01 5:05:00 2025-05-01 5:15:00
200 2025-05-01 5:35:00 2025-05-01 5:40:00
230 2025-05-01 5:25:00 2025-05-1 5:45:00

Solution

  • I figured it out. What I ended up doing is creating a single record per min based on the min and max dates per g_id. Then I do the same except only between the date ranges for each g_id. Let I can join the two together on g_id and timestamp. Then do a dense ranking function to group the overlap ranges and then filter out the non-overlaps.

    The solution works but there probably is a more optimized way to do this.

    let T = datatable(g_id:int, s_id:int, s_start_time:datetime, s_end_time:datetime)
    [
       100, 1, datetime('2025-05-01 5:23:00'), datetime('2025-05-01 5:44:00'),
       100, 2, datetime('2025-05-01 5:00:00'), datetime('2025-05-01 5:10:00'),
       100, 3, datetime('2025-05-01 5:05:00'), datetime('2025-05-01 5:20:00'),
       100, 4, datetime('2025-05-01 5:10:00'), datetime('2025-05-01 5:30:00'),
       200, 1, datetime('2025-05-01 5:05:00'), datetime('2025-05-01 5:15:00'),
       100, 1, datetime('2025-05-01 5:30:00'), datetime('2025-05-01 5:35:00'),
       100, 2, datetime('2025-05-01 5:40:00'), datetime('2025-05-01 5:50:00'),
       100, 3, datetime('2025-05-01 5:45:00'), datetime('2025-05-01 5:47:00'),
       200, 2, datetime('2025-05-01 5:35:00'), datetime('2025-05-01 5:40:00'),
       200, 3, datetime('2025-05-01 5:37:00'), datetime('2025-05-01 5:38:00'),
       230, 4, datetime('2025-05-01 5:25:00'), datetime('2025-05-01 5:45:00'),
    ];
    let filler_range =
        T
        | summarize min_start = min(s_start_time), max_end = max(s_end_time) by g_id
        | extend timestamp_range = range(min_start, max_end, 1m)
        | mv-expand ts = timestamp_range
        | project g_id, timestamp = todatetime(ts)
    ;
    let expanded_ranges = T
        | extend timestamp_range = range(s_start_time, s_end_time, 1m)
        | mv-expand ts = timestamp_range
        | project g_id, timestamp = todatetime(ts)
        | distinct g_id, timestamp
    ;
    filler_range
    | join kind=leftouter (expanded_ranges) on timestamp and g_id
    | extend overlaps = iff(isnotempty(timestamp1), true, false)
    | order by g_id asc, timestamp asc
    | extend grp=row_rank_dense(overlaps, isempty(g_id1) and overlaps)
    | summarize start=min(timestamp), end=max(timestamp) by g_id, grp, overlaps
    | where overlaps == true // only get me the overlaps
    | project g_id, start, end
    | order by g_id asc, start asc