sqldb2

Tracking when colors appear for the first time under different conditions


I have this table in SQL (each color only appears once in each year):

CREATE TABLE myt (
    color VARCHAR(20),
    year INTEGER,
    var INTEGER
);

INSERT INTO myt (color, year, var) VALUES
('red', 2020, 1),
('red', 2021, 1),
('red', 2022, 0),
('red', 2023, 1),

('blue', 2020, 0),
('blue', 2021, 0),
('blue', 2022, 0),
('blue', 2023, 1),

('green', 2021, 1),
('green', 2022, 1),
('green', 2023, 0),

('yellow', 2021, 0),
('yellow', 2022, 0),
('yellow', 2023, 0),

('purple', 2022, 1),
('purple', 2023, 1),

('orange', 2022, 0),
('orange', 2023, 1),

('pink', 2023, 1),

('black', 2023, 0),

('white', 2020, 0),
('white', 2022, 1),
('white', 2023, 0);

The table looks like this:

| Color  | Year | Var |
|--------|------|-----|
| red    | 2020 | 1   |
| red    | 2021 | 1   |
| red    | 2022 | 0   |
| red    | 2023 | 1   |
| blue   | 2020 | 0   |
| blue   | 2021 | 0   |
| blue   | 2022 | 0   |
| blue   | 2023 | 1   |
| green  | 2021 | 1   |
| green  | 2022 | 1   |
| green  | 2023 | 0   |
| yellow | 2021 | 0   |
| yellow | 2022 | 0   |
| yellow | 2023 | 0   |
| purple | 2022 | 1   |
| purple | 2023 | 1   |
| orange | 2022 | 0   |
| orange | 2023 | 1   |
| pink   | 2023 | 1   |
| black  | 2023 | 0   |
| white  | 2020 | 0   |
| white  | 2022 | 1   |
| white  | 2023 | 0   |

I want to write SQL to answer the following question. In each year, I want to count: (note that all categories will sum to the total number of colors in that year):

I think the final answer should look like this:

year category1 category2 category3 category4 category5 category6
2020 1 2 0 0 0 0
2021 1 1 1 1 0 0
2022 1 1 2 1 1 1
2023 1 1 1 2 2 2

I tried to organize the SQL the following way. First, I transpose the data to make columns for whether a color appears in each year or not. Then, for each year, I do the analysis and perform a series of UNION ALLs:

WITH pivoted_data AS (
    SELECT 
        color,
        MAX(CASE WHEN year = 2020 THEN var END) as in_2020,
        MAX(CASE WHEN year = 2021 THEN var END) as in_2021,
        MAX(CASE WHEN year = 2022 THEN var END) as in_2022,
        MAX(CASE WHEN year = 2023 THEN var END) as in_2023,
        MIN(year) as first_year,
        MAX(year) as last_year
    FROM myt
    GROUP BY color
),
categorized_by_year AS (
    SELECT 
        color,
        2020 as year,
        in_2020 as current_var,
        CASE 
            WHEN in_2020 IS NULL THEN NULL
            WHEN first_year = 2020 AND in_2020 = 1 THEN 'category1'
            WHEN first_year = 2020 AND in_2020 = 0 THEN 'category2'
            ELSE 'other'
        END as category
    FROM pivoted_data
    WHERE in_2020 IS NOT NULL
    
    UNION ALL
    
    SELECT 
        color,
        2021 as year,
        in_2021 as current_var,
        CASE 
            WHEN in_2021 IS NULL THEN NULL
            WHEN first_year = 2021 AND in_2021 = 1 THEN 'category1'
            WHEN first_year = 2021 AND in_2021 = 0 THEN 'category2'
            WHEN first_year < 2021 AND in_2020 = 0 AND in_2021 = 0 THEN 'category3'
            WHEN first_year < 2021 AND in_2021 = 1 AND in_2020 = 0 THEN 'category6'
            WHEN first_year < 2021 AND in_2021 = 1 AND in_2020 = 1 THEN 'category4'
            WHEN first_year < 2021 AND in_2021 = 0 AND in_2020 = 1 THEN 'category5'
            ELSE 'other'
        END as category
    FROM pivoted_data
    WHERE in_2021 IS NOT NULL
    
    UNION ALL
    
    SELECT 
        color,
        2022 as year,
        in_2022 as current_var,
        CASE 
            WHEN in_2022 IS NULL THEN NULL
            WHEN first_year = 2022 AND in_2022 = 1 THEN 'category1'
            WHEN first_year = 2022 AND in_2022 = 0 THEN 'category2'
            WHEN first_year < 2022 AND 
                 COALESCE(in_2020, 0) = 0 AND COALESCE(in_2021, 0) = 0 AND in_2022 = 0 THEN 'category3'
            WHEN first_year < 2022 AND in_2022 = 1 AND 
                 COALESCE(in_2020, 0) = 0 AND COALESCE(in_2021, 0) = 0 THEN 'category6'
            WHEN first_year < 2022 AND in_2022 = 1 AND 
                 (COALESCE(in_2020, 0) = 1 OR COALESCE(in_2021, 0) = 1) THEN 'category4'
            WHEN first_year < 2022 AND in_2022 = 0 AND 
                 (COALESCE(in_2020, 0) = 1 OR COALESCE(in_2021, 0) = 1) THEN 'category5'
            ELSE 'other'
        END as category
    FROM pivoted_data
    WHERE in_2022 IS NOT NULL
    
    UNION ALL
    
    SELECT 
        color,
        2023 as year,
        in_2023 as current_var,
        CASE 
            WHEN in_2023 IS NULL THEN NULL
            WHEN first_year = 2023 AND in_2023 = 1 THEN 'category1'
            WHEN first_year = 2023 AND in_2023 = 0 THEN 'category2'
            WHEN first_year < 2023 AND 
                 COALESCE(in_2020, 0) = 0 AND COALESCE(in_2021, 0) = 0 AND COALESCE(in_2022, 0) = 0 AND in_2023 = 0 THEN 'category3'
            WHEN first_year < 2023 AND in_2023 = 1 AND 
                 COALESCE(in_2020, 0) = 0 AND COALESCE(in_2021, 0) = 0 AND COALESCE(in_2022, 0) = 0 THEN 'category6'
            WHEN first_year < 2023 AND in_2023 = 1 AND 
                 (COALESCE(in_2020, 0) = 1 OR COALESCE(in_2021, 0) = 1 OR COALESCE(in_2022, 0) = 1) THEN 'category4'
            WHEN first_year < 2023 AND in_2023 = 0 AND 
                 (COALESCE(in_2020, 0) = 1 OR COALESCE(in_2021, 0) = 1 OR COALESCE(in_2022, 0) = 1) THEN 'category5'
            ELSE 'other'
        END as category
    FROM pivoted_data
    WHERE in_2023 IS NOT NULL
)
SELECT 
    year,
    COUNT(CASE WHEN category = 'category1' THEN 1 END) as category1,
    COUNT(CASE WHEN category = 'category2' THEN 1 END) as category2,
    COUNT(CASE WHEN category = 'category3' THEN 1 END) as category3,
    COUNT(CASE WHEN category = 'category4' THEN 1 END) as category4,
    COUNT(CASE WHEN category = 'category5' THEN 1 END) as category5,
    COUNT(CASE WHEN category = 'category6' THEN 1 END) as category6,
    COUNT(*) as total_rows,
    COUNT(DISTINCT color) as distinct_colors
FROM categorized_by_year
WHERE category IS NOT NULL
GROUP BY year
ORDER BY year;

Is there a different way to do this so that I don't have to manually UNION ALL the results of each year? This approach will get quite lengthy when there are many years.


Solution

  • I recommend using window functions to work out the "what happened historically" conditions, then just aggregate.

    WITH
      summary AS
    (
      SELECT
        color, year, var,
        ROW_NUMBER() OVER (PARTITION BY color ORDER BY year)   AS occurrence,
        SUM(var)     OVER (PARTITION BY color ORDER BY year)   AS ones
      FROM
        myt
    )
    SELECT
      year,
      SUM(occurrence = 1 AND var = 1             )  AS category1,
      SUM(occurrence = 1 AND var = 0             )  AS category2,
      SUM(occurrence > 1 AND             ones = 0)  AS category3,
      SUM(occurrence > 1 AND var = 1             )  AS category4,
      SUM(occurrence > 1 AND var = 0 AND ones > 0)  AS category5,
      SUM(occurrence > 1 AND var = 1 AND ones = 1)  AS category6
    FROM
      summary
    GROUP BY
      year
    ORDER BY
      year
    
    YEAR CATEGORY1 CATEGORY2 CATEGORY3 CATEGORY4 CATEGORY5 CATEGORY6
    2020 1 2 0 0 0 0
    2021 1 1 1 1 0 0
    2022 1 1 2 2 1 1
    2023 1 1 1 4 2 2

    fiddle