sqlpostgresqlgaps-and-islandstraffic

use sql to sessionize traffic data - gaps and island problem


I have an activity table called myTable, the data looks like this:

CREATE TABLE myTable
(
    userid      text,
    webid         text,
    "ts" timestamp
);

INSERT INTO myTable
    ("userid", "webid", "ts")
VALUES ('1', 'A', '34', '2023-01-31 16:34:49.000'),
       ('2', 'A', '73', '2023-01-31 16:34:50.000'),
       ('3', 'A', '97', '2023-01-31 16:34:58.000'),
       ('4', 'A', '17', '2023-01-31 17:35:02.000'),
       ('5', 'A', '17', '2023-01-31 17:35:07.000'),
       ('6', 'A', '17', '2023-01-31 17:35:18.000'),
       ('7', 'A', '17', '2023-01-31 17:35:30.000'),
       ('8', 'A', '1', '2023-01-31 17:35:37.000'),
       ('9', 'A', '1', '2023-01-31 17:35:38.000'),
       ('10', 'A', '77', '2023-01-31 17:35:41.000'),
       ('11', 'A', '77', '2023-01-31 17:35:42.000'),
       ('12', 'A', '15', '2023-01-31 17:35:42.000'),
       ('13', 'A', '44', '2023-01-31 17:35:42.000'),
       ('14', 'A', '1', '2023-01-31 17:35:42.000'),
       ('15', 'A', '77', '2023-01-31 17:35:45.000'),
       ('16', 'A', '44', '2023-01-31 17:35:45.000'),
       ('17', 'A', '1', '2023-01-31 17:37:10.000'),
       ('18', 'A', '1', '2023-01-31 17:37:12.000'),
       ('19', 'A', '77', '2023-01-31 17:37:14.000'),
       ('20', 'A', '77', '2023-01-31 17:52:14.000'),
       ('21', 'A', '77', '2023-01-31 18:12:14.000'),
       ('22', 'A', '77', '2023-01-31 18:45:14.000'),
       ('23', 'A', '77', '2023-01-31 18:55:15.000'),
       ('1', 'B', '33', '2023-01-31 06:37:15.000'),
       ('2', 'B', '56', '2023-01-31 06:40:15.000')
  ;


rowid userid webid ts
1 A 34 2023-01-31 16:34:49
2 A 73 2023-01-31 16:34:50
3 A 97 2023-01-31 16:34:58
4 A 17 2023-01-31 17:35:02
5 A 17 2023-01-31 17:35:07
6 A 17 2023-01-31 17:35:18
7 A 17 2023-01-31 17:35:30
8 A 1 2023-01-31 17:35:37
9 A 1 2023-01-31 17:35:38
10 A 77 2023-01-31 17:35:41
11 A 77 2023-01-31 17:35:42
12 A 15 2023-01-31 17:35:42
13 A 44 2023-01-31 17:35:42
14 A 1 2023-01-31 17:35:42
15 A 77 2023-01-31 17:35:45
16 A 44 2023-01-31 17:35:45
17 A 1 2023-01-31 17:37:10
18 A 1 2023-01-31 17:37:12
19 A 77 2023-01-31 17:37:14
20 A 77 2023-01-31 17:52:14
21 A 77 2023-01-31 18:12:14
22 A 77 2023-01-31 18:45:14
23 A 77 2023-01-31 18:55:15
1 B 33 2023-01-31 06:37:15
2 B 56 2023-01-31 06:40:15

The output I want to return as following:

rowid userid webid session_id ts first_ts
1 A 34 1 2023-01-31 16:34:49 2023-01-31 16:34:49
2 A 73 2 2023-01-31 16:34:50 2023-01-31 16:34:50
3 A 97 3 2023-01-31 16:34:58 2023-01-31 16:34:58
4 A 17 4 2023-01-31 17:35:02 2023-01-31 17:35:02
5 A 17 4 2023-01-31 17:35:07 2023-01-31 17:35:02
6 A 17 4 2023-01-31 17:35:18 2023-01-31 17:35:02
7 A 17 4 2023-01-31 17:35:30 2023-01-31 17:35:02
8 A 1 5 2023-01-31 17:35:37 2023-01-31 17:35:37
9 A 1 5 2023-01-31 17:35:38 2023-01-31 17:35:37
10 A 77 6 2023-01-31 17:35:41 2023-01-31 17:35:41
11 A 77 6 2023-01-31 17:35:42 2023-01-31 17:35:42
12 A 15 7 2023-01-31 17:35:42 2023-01-31 17:35:42
13 A 44 8 2023-01-31 17:35:42 2023-01-31 17:35:42
14 A 1 9 2023-01-31 17:35:42 2023-01-31 17:35:42
15 A 77 10 2023-01-31 17:35:45 2023-01-31 17:35:45
16 A 44 11 2023-01-31 17:35:45 2023-01-31 17:35:45
17 A 1 12 2023-01-31 17:37:10 2023-01-31 17:37:10
18 A 1 12 2023-01-31 17:37:12 2023-01-31 17:37:10
19 A 77 13 2023-01-31 17:37:14 2023-01-31 17:37:14
20 A 77 13 2023-01-31 17:52:14 2023-01-31 17:37:14
21 A 77 13 2023-01-31 18:12:14 2023-01-31 17:37:14
22 A 77 14 2023-01-31 18:45:14 2023-01-31 18:45:14
23 A 77 14 2023-01-31 18:55:15 2023-01-31 18:45:14
1 B 33 1 2023-01-31 06:37:15 2023-01-31 06:37:15
2 B 56 2 2023-01-31 06:40:15 2023-01-31 06:40:15

first_ts means first timestamp. The logic for first_ts is If a userid visits same webid at consecutive timestamp and each timestamp's interval falls within 30 minutes, these will be recorded as a session. For example, row 4,5,6,7. Each time interval are less than 30 minutes, their first_ts is the timestamp of the first event, which is 2023-01-31 17:35:02.

If a userid visits a webid and jump to another webid and return back to the first webid, the first_ts will be refreshed to current timestamp. For example, row 8,9,10,11,12,13,14,15,16,17,18. userid A visits webid=1 at first, but jump to webid=77,15,44, after return back to webid=1, then row 12, row 13 and row 14 should be recorded as a new session with current first_ts.

If a userid visits a webid at consecutive timestamps, and one timestamp interval is greater than 30 minutes, it will also break the session and the first_ts will be refreshed as well. For example, row 19,20,21,22,23. Between row 21 and row 22, their time interval are over 30 minutes, so for row 22 and row 23, their first_ts refreshed to current timestamp and they are to be a new session.

I share my script here, the script includes rowid for view easily and it seems like it return correct first_ts, but session_id is wrong.

SQL version:

create temp table lags as
select *, lag(webid,1,webid) over(partition by userid order by ts) lag_webid,
lag(ts,1,ts) over(partition by userid order by ts) lag_ts 
from mytable;

create temp table is_sessions as 
select *,
case when webid!=lag_webid or abs(datediff(minutes,ts,lag_ts)) > 30 then 1
     else 0
end as is_session
from lags;

create temp table sessions as 
select *,
sum(is_session) over(partition by userid order by ts)+1 as session_id
from is_sessions;

create temp table sessions_ts as 
select *,
min(ts) over(partition by userid, session_id order by ts) as first_ts
from sessions;

Equivalent PostgreSQL version:

select rowid, userid, webid, session_id, ts, min(ts) over(partition by userid, session_id order by ts) first_ts
from (
    select t.*,
        1 + count(*) 
            filter(where webid != lag_webid or ts > lag_ts + interval '30' minute) 
            over(partition by userid order by ts) session_id
    from (
        select t.*, 
            lag(webid, 1, webid) over(partition by userid order by ts) lag_webid,
            lag(ts, 1, ts)       over(partition by userid order by ts) lag_ts
        from mytable t
    ) t
) t

Anyone has a better solution to do the task? Thanks!

script reference: https://dbfiddle.uk/-c3ZNO65

I tried several queries but couldn't get correct session_id.


Solution

  • Your data is a bit tricky, but as you have a rowid as sorting order, wecan use it to make your wanted result

    WITH CTE as (
      SELECT
    "rowid", "userid", "webid", "ts"
      , CASE WHEN "webid" = LAG("webid") OVER(PARTITION BY "userid" ORDER BY "rowid") then 
      CASE WHEN EXTRACT(MINUTE from"ts"-LAG("ts") OVER(PARTITION BY "userid" ORDER BY "rowid")) > 30 
      THEn 1 ELSE  0 END
      ELSE 1 END tab
      FROM mytable
      )
    , CTE2 As
    (
      SELECT "rowid", "userid", "webid", "ts",tab, SUM(tab) OVER(PARTITION BY "userid" ORDER BY "rowid") session_id
    FROM CTE
      )
    SELECT  
      "rowid", "userid", "webid","session_id",  "ts"
    ,FIRST_VALUE("ts") OVER(PARTITION BY "userid","session_id" ORDER BY "ts") as first_ts
    FROM CTE2
    
    
    rowid userid webid session_id ts first_ts
    1 A 34 1 2023-01-31 16:34:49 2023-01-31 16:34:49
    2 A 73 2 2023-01-31 16:34:50 2023-01-31 16:34:50
    3 A 97 3 2023-01-31 16:34:58 2023-01-31 16:34:58
    4 A 17 4 2023-01-31 17:35:02 2023-01-31 17:35:02
    5 A 17 4 2023-01-31 17:35:07 2023-01-31 17:35:02
    6 A 17 4 2023-01-31 17:35:18 2023-01-31 17:35:02
    7 A 17 4 2023-01-31 17:35:30 2023-01-31 17:35:02
    8 A 1 5 2023-01-31 17:35:37 2023-01-31 17:35:37
    9 A 1 5 2023-01-31 17:35:38 2023-01-31 17:35:37
    10 A 77 6 2023-01-31 17:35:41 2023-01-31 17:35:41
    11 A 77 6 2023-01-31 17:35:42 2023-01-31 17:35:41
    12 A 15 7 2023-01-31 17:35:42 2023-01-31 17:35:42
    13 A 44 8 2023-01-31 17:35:42 2023-01-31 17:35:42
    14 A 1 9 2023-01-31 17:35:42 2023-01-31 17:35:42
    15 A 77 10 2023-01-31 17:35:45 2023-01-31 17:35:45
    16 A 44 11 2023-01-31 17:35:45 2023-01-31 17:35:45
    17 A 1 12 2023-01-31 17:37:10 2023-01-31 17:37:10
    18 A 1 12 2023-01-31 17:37:12 2023-01-31 17:37:10
    19 A 77 13 2023-01-31 17:37:14 2023-01-31 17:37:14
    20 A 77 13 2023-01-31 17:52:14 2023-01-31 17:37:14
    21 A 77 13 2023-01-31 18:12:14 2023-01-31 17:37:14
    22 A 77 14 2023-01-31 18:45:14 2023-01-31 18:45:14
    23 A 77 14 2023-01-31 18:55:15 2023-01-31 18:45:14
    1 B 33 1 2023-01-31 06:37:15 2023-01-31 06:37:15
    2 B 56 2 2023-01-31 06:40:15 2023-01-31 06:40:15
    SELECT 25
    

    fiddle