I have this table in SQL (each color only appears once in each year):
CREATE TABLE myt (
color VARCHAR(20),
year INTEGER,
var INTEGER
);
INSERT INTO myt (color, year, var) VALUES
('red', 2020, 1),
('red', 2021, 1),
('red', 2022, 0),
('red', 2023, 1),
('blue', 2020, 0),
('blue', 2021, 0),
('blue', 2022, 0),
('blue', 2023, 1),
('green', 2021, 1),
('green', 2022, 1),
('green', 2023, 0),
('yellow', 2021, 0),
('yellow', 2022, 0),
('yellow', 2023, 0),
('purple', 2022, 1),
('purple', 2023, 1),
('orange', 2022, 0),
('orange', 2023, 1),
('pink', 2023, 1),
('black', 2023, 0),
('white', 2020, 0),
('white', 2022, 1),
('white', 2023, 0);
The table looks like this:
| Color | Year | Var |
|--------|------|-----|
| red | 2020 | 1 |
| red | 2021 | 1 |
| red | 2022 | 0 |
| red | 2023 | 1 |
| blue | 2020 | 0 |
| blue | 2021 | 0 |
| blue | 2022 | 0 |
| blue | 2023 | 1 |
| green | 2021 | 1 |
| green | 2022 | 1 |
| green | 2023 | 0 |
| yellow | 2021 | 0 |
| yellow | 2022 | 0 |
| yellow | 2023 | 0 |
| purple | 2022 | 1 |
| purple | 2023 | 1 |
| orange | 2022 | 0 |
| orange | 2023 | 1 |
| pink | 2023 | 1 |
| black | 2023 | 0 |
| white | 2020 | 0 |
| white | 2022 | 1 |
| white | 2023 | 0 |
I want to write SQL to answer the following question. In each year, I want to count: (note that all categories will sum to the total number of colors in that year):
I think the final answer should look like this:
year | category1 | category2 | category3 | category4 | category5 | category6 |
---|---|---|---|---|---|---|
2020 | 1 | 2 | 0 | 0 | 0 | 0 |
2021 | 1 | 1 | 1 | 1 | 0 | 0 |
2022 | 1 | 1 | 2 | 1 | 1 | 1 |
2023 | 1 | 1 | 1 | 2 | 2 | 2 |
I tried to organize the SQL the following way. First, I transpose the data to make columns for whether a color appears in each year or not. Then, for each year, I do the analysis and perform a series of UNION ALLs:
WITH pivoted_data AS (
SELECT
color,
MAX(CASE WHEN year = 2020 THEN var END) as in_2020,
MAX(CASE WHEN year = 2021 THEN var END) as in_2021,
MAX(CASE WHEN year = 2022 THEN var END) as in_2022,
MAX(CASE WHEN year = 2023 THEN var END) as in_2023,
MIN(year) as first_year,
MAX(year) as last_year
FROM myt
GROUP BY color
),
categorized_by_year AS (
SELECT
color,
2020 as year,
in_2020 as current_var,
CASE
WHEN in_2020 IS NULL THEN NULL
WHEN first_year = 2020 AND in_2020 = 1 THEN 'category1'
WHEN first_year = 2020 AND in_2020 = 0 THEN 'category2'
ELSE 'other'
END as category
FROM pivoted_data
WHERE in_2020 IS NOT NULL
UNION ALL
SELECT
color,
2021 as year,
in_2021 as current_var,
CASE
WHEN in_2021 IS NULL THEN NULL
WHEN first_year = 2021 AND in_2021 = 1 THEN 'category1'
WHEN first_year = 2021 AND in_2021 = 0 THEN 'category2'
WHEN first_year < 2021 AND in_2020 = 0 AND in_2021 = 0 THEN 'category3'
WHEN first_year < 2021 AND in_2021 = 1 AND in_2020 = 0 THEN 'category6'
WHEN first_year < 2021 AND in_2021 = 1 AND in_2020 = 1 THEN 'category4'
WHEN first_year < 2021 AND in_2021 = 0 AND in_2020 = 1 THEN 'category5'
ELSE 'other'
END as category
FROM pivoted_data
WHERE in_2021 IS NOT NULL
UNION ALL
SELECT
color,
2022 as year,
in_2022 as current_var,
CASE
WHEN in_2022 IS NULL THEN NULL
WHEN first_year = 2022 AND in_2022 = 1 THEN 'category1'
WHEN first_year = 2022 AND in_2022 = 0 THEN 'category2'
WHEN first_year < 2022 AND
COALESCE(in_2020, 0) = 0 AND COALESCE(in_2021, 0) = 0 AND in_2022 = 0 THEN 'category3'
WHEN first_year < 2022 AND in_2022 = 1 AND
COALESCE(in_2020, 0) = 0 AND COALESCE(in_2021, 0) = 0 THEN 'category6'
WHEN first_year < 2022 AND in_2022 = 1 AND
(COALESCE(in_2020, 0) = 1 OR COALESCE(in_2021, 0) = 1) THEN 'category4'
WHEN first_year < 2022 AND in_2022 = 0 AND
(COALESCE(in_2020, 0) = 1 OR COALESCE(in_2021, 0) = 1) THEN 'category5'
ELSE 'other'
END as category
FROM pivoted_data
WHERE in_2022 IS NOT NULL
UNION ALL
SELECT
color,
2023 as year,
in_2023 as current_var,
CASE
WHEN in_2023 IS NULL THEN NULL
WHEN first_year = 2023 AND in_2023 = 1 THEN 'category1'
WHEN first_year = 2023 AND in_2023 = 0 THEN 'category2'
WHEN first_year < 2023 AND
COALESCE(in_2020, 0) = 0 AND COALESCE(in_2021, 0) = 0 AND COALESCE(in_2022, 0) = 0 AND in_2023 = 0 THEN 'category3'
WHEN first_year < 2023 AND in_2023 = 1 AND
COALESCE(in_2020, 0) = 0 AND COALESCE(in_2021, 0) = 0 AND COALESCE(in_2022, 0) = 0 THEN 'category6'
WHEN first_year < 2023 AND in_2023 = 1 AND
(COALESCE(in_2020, 0) = 1 OR COALESCE(in_2021, 0) = 1 OR COALESCE(in_2022, 0) = 1) THEN 'category4'
WHEN first_year < 2023 AND in_2023 = 0 AND
(COALESCE(in_2020, 0) = 1 OR COALESCE(in_2021, 0) = 1 OR COALESCE(in_2022, 0) = 1) THEN 'category5'
ELSE 'other'
END as category
FROM pivoted_data
WHERE in_2023 IS NOT NULL
)
SELECT
year,
COUNT(CASE WHEN category = 'category1' THEN 1 END) as category1,
COUNT(CASE WHEN category = 'category2' THEN 1 END) as category2,
COUNT(CASE WHEN category = 'category3' THEN 1 END) as category3,
COUNT(CASE WHEN category = 'category4' THEN 1 END) as category4,
COUNT(CASE WHEN category = 'category5' THEN 1 END) as category5,
COUNT(CASE WHEN category = 'category6' THEN 1 END) as category6,
COUNT(*) as total_rows,
COUNT(DISTINCT color) as distinct_colors
FROM categorized_by_year
WHERE category IS NOT NULL
GROUP BY year
ORDER BY year;
Is there a different way to do this so that I don't have to manually UNION ALL the results of each year? This approach will get quite lengthy when there are many years.
I recommend using window functions to work out the "what happened historically" conditions, then just aggregate.
WITH
summary AS
(
SELECT
color, year, var,
ROW_NUMBER() OVER (PARTITION BY color ORDER BY year) AS occurrence,
SUM(var) OVER (PARTITION BY color ORDER BY year) AS ones
FROM
myt
)
SELECT
year,
SUM(occurrence = 1 AND var = 1 ) AS category1,
SUM(occurrence = 1 AND var = 0 ) AS category2,
SUM(occurrence > 1 AND ones = 0) AS category3,
SUM(occurrence > 1 AND var = 1 ) AS category4,
SUM(occurrence > 1 AND var = 0 AND ones > 0) AS category5,
SUM(occurrence > 1 AND var = 1 AND ones = 1) AS category6
FROM
summary
GROUP BY
year
ORDER BY
year
YEAR | CATEGORY1 | CATEGORY2 | CATEGORY3 | CATEGORY4 | CATEGORY5 | CATEGORY6 |
---|---|---|---|---|---|---|
2020 | 1 | 2 | 0 | 0 | 0 | 0 |
2021 | 1 | 1 | 1 | 1 | 0 | 0 |
2022 | 1 | 1 | 2 | 2 | 1 | 1 |
2023 | 1 | 1 | 1 | 4 | 2 | 2 |