sqlpostgresqlapache-sparkapache-spark-sqlduration

Consecutive Activity using Analytical Function


I have a table containing the fields: user_ip, datetime, year, month, day, hour, tag_id, country, device_type, brand. I need to check if given a IP was active for a continuous period of 4 or more hours and flag as true / false.

Query tried:

WITH sample_data (user_ip, datetime, year, month, day, hour, tag_id, country, device_type, brand) AS
(
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525000000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 0, 1001, 'US', 1, 'Samsung' union all
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525010000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 1, 1001, 'US', 1, 'Samsung' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525020000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 2, 1001, 'US', 1, 'Samsung' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525030000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 3, 1001, 'US', 1, 'Samsung' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525040000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 4, 1001, 'US', 2, 'Samsung' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525050000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 5, 1002, 'US', 2, 'Samsung' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525060000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 6, 1002, 'US', 2, 'Samsung' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525070000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 7, 1002, 'US', 2, 'Samsung' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525080000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 8, 1002, 'US', 3, 'Apple' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525090000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 9, 1003, 'US', 3, 'Apple' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525100000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 10, 1003, 'US', 3, 'Apple' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525110000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 11, 1003, 'US', 3, 'Apple' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525215100', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 21, 1004, 'CA', 3, 'Apple' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525215200', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 21, 1005, 'CA', 3, 'Apple' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525215300', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 21, 1006, 'CA', 3, 'Apple' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525215400', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 21, 1007, 'CA', 3, 'Apple' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525215500', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 21, 1008, 'CA', 3, 'Apple' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525220000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 22, 1004, 'CA', 3, 'Apple' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250525230000', 'YYYYMMDDHH24MISS'), 2025, 5, 25, 23, 1004, 'CA', 4, 'LG' union all 
  
  SELECT '192.168.0.101', TO_TIMESTAMP('20250526000000', 'YYYYMMDDHH24MISS'), 2025, 5, 26, 0, 1005, 'CA', 4, 'LG' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250526010000', 'YYYYMMDDHH24MISS'), 2025, 5, 26, 1, 1005, 'CA', 4, 'LG' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250526020000', 'YYYYMMDDHH24MISS'), 2025, 5, 26, 2, 1005, 'CA', 4, 'LG' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250526030000', 'YYYYMMDDHH24MISS'), 2025, 5, 26, 3, 1006, 'CA', 5, 'Sony' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250526040000', 'YYYYMMDDHH24MISS'), 2025, 5, 26, 4, 1006, 'CA', 5, 'Sony' union all 

  SELECT '192.168.0.101', TO_TIMESTAMP('20250526085100', 'YYYYMMDDHH24MISS'), 2025, 5, 26, 8, 1006, 'CA', 5, 'Sony' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250526085200', 'YYYYMMDDHH24MISS'), 2025, 5, 26, 8, 1007, 'CA', 5, 'Sony' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250526085300', 'YYYYMMDDHH24MISS'), 2025, 5, 26, 8, 1008, 'CA', 5, 'Sony' union all 
  SELECT '192.168.0.101', TO_TIMESTAMP('20250526085400', 'YYYYMMDDHH24MISS'), 2025, 5, 26, 8, 1009, 'CA', 5, 'Sony' 
  )
,
grid AS 
(
    SELECT user_ip,datetime, year,month,day,hour, 
                Coalesce( Case When LAG(hour) Over(Partition By user_ip Order By datetime, hour) IN (hour, hour - 1)
                            OR Min(hour) Over(Partition By user_ip) = hour 
                            Then 'Y' 
                    End, Cast(hour as Varchar(2))
                    ) as consequtive_hour,
                hour - LAG(hour) Over(Partition By user_ip Order By datetime, hour) as diff_hours,
                Row_Number() Over(Partition By user_ip Order By datetime, hour
                                Rows Between Unbounded Preceding And Current Row) as rn,
        tag_id,country,device_type,brand
    from sample_data
)

Select user_ip,datetime, year,month,day,hour, 
       Case When Count(consequtive_hour) 
                       Over(Partition By user_ip, consequtive_hour
                            Order By datetime, hour
                            Rows Between Unbounded Preceding And Current Row) >= 4
                AND consequtive_hour = 'Y' And Sum(Case When consequtive_hour = 'Y' Then diff_hours Else 0 End) Over(Partition By user_ip, consequtive_hour
                            Order By datetime, hour
                            Rows Between Unbounded Preceding And Current Row) >= 3
            Then 'TRUE'
       Else 'FALSE'
       End as is_active_min_4hr, 
       consequtive_hour, diff_hours,rn,tag_id,country,device_type,brand
From grid 
Order By user_ip, datetime, hour, rn

fiddle : https://dbfiddle.uk/Q0vyCsJN

Expected result:


Solution

  • Try it using datetime and intervals like below:
    1. CTE grid

    WITH 
    grid AS 
      ( SELECT     user_ip, datetime, year, month, day, hour, 
                   tag_id, country, device_type, brand, 
                   Row_Number() Over(Partition By user_ip Order By datetime) as rn,
                   datetime - LAG(datetime) Over(Partition By user_ip Order By datetime) as hour_diff
        FROM       sample_data
      ), 
    

    2. CTE grps

    grps AS
      ( Select   g.*, 
                 Case When Extract( Hour From g.datetime - LAG(g.datetime) Over(Partition By user_ip Order By datetime) ) > 1
                          OR g.hour_diff Is Null
                      Then rn
                 End grp_start_rn ,
            CASE WHEN 
                      Case When Extract( Hour From g.datetime - LAG(g.datetime) Over(Partition By user_ip Order By datetime) ) > 1
                                 OR g.hour_diff Is Null
                           Then rn
                      End Is Not Null
               THEN '1 Hour':: Interval
           ELSE g.hour_diff
           END as grp_hour_diff
       From         grid g
       Order By     g.rn
     )
    

    3. Main SQL

    SELECT    x.user_ip, x.datetime, x.year, x.month, x.day, x.hour, 
              Sum(Coalesce(x.grp_hour_diff, '1 Hour'::Interval)) 
                          Over(Partition By x.user_ip, x.grp Order By x.datetime
                          Rows Between Unbounded Preceding And Current Row) as total_grp_hours, 
        CASE WHEN
                Sum(Coalesce(x.grp_hour_diff, '1 Hour'::Interval)) 
                          Over(Partition By x.user_ip, x.grp Order By x.datetime
                          Rows Between Unbounded Preceding And Current Row) >= '4 Hour'::Interval
             THEN 'TRUE'
       ELSE 'False'
       END as flag, 
       x.grp, x.rn
    FROM    ( Select      g.*, Max(g.grp_start_rn) Over(Partition By user_ip Order By datetime
                                  Rows Between Unbounded Preceding And Current Row) as grp
              From        grps g
           ) x
    
    
    
    user_ip datetime year month day hour total_grp_hours flag grp rn
    192.168.0.101 2025-05-25 00:00:00+00 2025 5 25 0 01:00:00 False 1 1
    192.168.0.101 2025-05-25 01:00:00+00 2025 5 25 1 02:00:00 False 1 2
    192.168.0.101 2025-05-25 02:00:00+00 2025 5 25 2 03:00:00 False 1 3
    192.168.0.101 2025-05-25 03:00:00+00 2025 5 25 3 04:00:00 TRUE 1 4
    192.168.0.101 2025-05-25 04:00:00+00 2025 5 25 4 05:00:00 TRUE 1 5
    192.168.0.101 2025-05-25 05:00:00+00 2025 5 25 5 06:00:00 TRUE 1 6
    192.168.0.101 2025-05-25 06:00:00+00 2025 5 25 6 07:00:00 TRUE 1 7
    192.168.0.101 2025-05-25 07:00:00+00 2025 5 25 7 08:00:00 TRUE 1 8
    192.168.0.101 2025-05-25 08:00:00+00 2025 5 25 8 09:00:00 TRUE 1 9
    192.168.0.101 2025-05-25 09:00:00+00 2025 5 25 9 10:00:00 TRUE 1 10
    192.168.0.101 2025-05-25 10:00:00+00 2025 5 25 10 11:00:00 TRUE 1 11
    192.168.0.101 2025-05-25 11:00:00+00 2025 5 25 11 12:00:00 TRUE 1 12
    192.168.0.101 2025-05-25 21:51:00+00 2025 5 25 21 01:00:00 False 13 13
    192.168.0.101 2025-05-25 21:52:00+00 2025 5 25 21 01:01:00 False 13 14
    192.168.0.101 2025-05-25 21:53:00+00 2025 5 25 21 01:02:00 False 13 15
    192.168.0.101 2025-05-25 21:54:00+00 2025 5 25 21 01:03:00 False 13 16
    192.168.0.101 2025-05-25 21:55:00+00 2025 5 25 21 01:04:00 False 13 17
    192.168.0.101 2025-05-25 22:00:00+00 2025 5 25 22 01:09:00 False 13 18
    192.168.0.101 2025-05-25 23:00:00+00 2025 5 25 23 02:09:00 False 13 19
    192.168.0.101 2025-05-26 00:00:00+00 2025 5 26 0 03:09:00 False 13 20
    192.168.0.101 2025-05-26 01:00:00+00 2025 5 26 1 04:09:00 TRUE 13 21
    192.168.0.101 2025-05-26 02:00:00+00 2025 5 26 2 05:09:00 TRUE 13 22
    192.168.0.101 2025-05-26 03:00:00+00 2025 5 26 3 06:09:00 TRUE 13 23
    192.168.0.101 2025-05-26 04:00:00+00 2025 5 26 4 07:09:00 TRUE 13 24
    192.168.0.101 2025-05-26 08:51:00+00 2025 5 26 8 01:00:00 False 25 25
    192.168.0.101 2025-05-26 08:52:00+00 2025 5 26 8 01:01:00 False 25 26
    192.168.0.101 2025-05-26 08:53:00+00 2025 5 26 8 01:02:00 False 25 27
    192.168.0.101 2025-05-26 08:54:00+00 2025 5 26 8 01:03:00 False 25 28

    fiddle