I am looking at a series of session dates of a user, and I am trying to “collapse” the event counts in a way that any event occurring in a 7 day time window is only counted once, starting from the first event and then again after the initial period of 7 days is over, and a new event (and a 7 day window) starts again.
I am trying to figure out a logic, where the count “breaks” and resets the window. Thank you!
WITH data1 AS (
SELECT 'bob' AS user, DATE('2024-04-08') AS session_date
UNION ALL
SELECT 'bob', DATE('2024-04-11')
UNION ALL
SELECT 'bob', DATE('2024-04-12')
UNION ALL
SELECT 'bob', DATE('2024-04-17')
UNION ALL
SELECT 'bob', DATE('2024-04-18')
UNION ALL
SELECT 'bob', DATE('2024-04-22')
UNION ALL
SELECT 'bob', DATE('2024-04-23')
UNION ALL
SELECT 'bob', DATE('2024-04-25')
)
SELECT
*,
DATE_ADD(session_date, INTERVAL 7 day) AS session_date_end,
COUNT(session_date) OVER (PARTITION BY user
ORDER BY UNIX_DATE(session_date)
RANGE BETWEEN CURRENT ROW AND 6 FOLLOWING)
AS next_7_days
FROM data1
input
desired output
Additional edit: For example, if a user logs in on the 1st, 10th and 16th the windows should be 1st to 7th and 10th to 16th to accommodate varying user activity.
WITH data1 AS (
SELECT 'bob' AS user, DATE('2024-04-08') AS session_date
UNION ALL
SELECT 'bob', DATE('2024-04-11')
UNION ALL
SELECT 'bob', DATE('2024-04-12')
UNION ALL
SELECT 'bob', DATE('2024-04-17')
UNION ALL
SELECT 'bob', DATE('2024-04-18')
UNION ALL
SELECT 'bob', DATE('2024-04-22')
UNION ALL
SELECT 'bob', DATE('2024-04-23')
UNION ALL
SELECT 'bob', DATE('2024-04-25')
),
Lookup the preceding session date...
look_behind AS
(
SELECT
data.*,
LAG(session_date) OVER (
PARTITION BY user
ORDER BY session_date
)
AS previous_session_date
FROM
data1
),
Get the most recent session date which is more than 7 days after its preceding session's date.
(CASE expression ensures the first date is considered to be 7 days after, even though there is no preceding session date.)
partitioned AS
(
SELECT
look_behind.*,
LAST_VALUE(
CASE WHEN session_date <= DATE_ADD(previous_session_date, INTERVAL 7 DAY) THEN NULL ELSE session_date END
IGNORE NULLS
) OVER (
PARTITION BY user
ORDER BY session_date
)
AS base_session_date
FROM
look_behind
)
Use dense rank to calculate the session id.
SELECT
partitioned.*,
DENSE_RANK() OVER (
PARTITION BY user
ORDER BY base_session_date,
DIV(DATE_DIFF(session_date, base_session_date, DAY), 7)
)
AS user_session_id
FROM
partitioned
ORDER BY
user, session_date