I have this table in SQL of student grades:
CREATE TABLE myt
(
id INT PRIMARY KEY,
name VARCHAR(50),
letter CHAR(1),
date DATE
);
INSERT INTO myt (id, name, letter, date)
VALUES
(1, 'Alice', 'C', '2023-01-28'),
(2, 'Alice', 'B', '2023-02-28'),
(3, 'Alice', 'A', '2023-03-28'),
(4, 'Bob', 'B', '2023-01-09'),
(5, 'Bob', 'C', '2023-02-09'),
(6, 'Bob', 'B', '2023-03-09'),
(7, 'Charlie', 'B', '2023-01-19'),
(8, 'Charlie', 'A', '2023-02-19'),
(9, 'Charlie', 'A', '2023-03-19'),
(10, 'Charlie', 'A', '2023-04-19'),
(11, 'Charlie', 'A', '2023-05-19'),
(12, 'David', 'B', '2023-01-05'),
(13, 'David', 'C', '2023-02-05'),
(14, 'David', 'A', '2023-03-05'),
(15, 'David', 'B', '2023-04-05'),
(16, 'David', 'B', '2023-05-05'),
(17, 'David', 'A', '2023-06-05'),
(18, 'Emma', 'B', '2023-01-15'),
(19, 'Emma', 'A', '2023-02-15'),
(20, 'Emma', 'A', '2023-03-15'),
(21, 'Emma', 'A', '2023-04-15'),
(22, 'Emma', 'A', '2023-05-15'),
(23, 'Frank', 'B', '2023-01-06'),
(24, 'Frank', 'A', '2023-02-06'),
(25, 'Frank', 'A', '2023-03-06'),
(26, 'Frank', 'A', '2023-04-06'),
(27, 'Grace', 'A', '2023-01-27'),
(28, 'Grace', 'B', '2023-02-27'),
(29, 'Grace', 'B', '2023-03-27'),
(30, 'Henry', 'B', '2023-01-31'),
(31, 'Henry', 'A', '2023-03-03'),
(32, 'Henry', 'A', '2023-03-31'),
(33, 'Henry', 'A', '2023-05-01'),
(34, 'Henry', 'A', '2023-05-31'),
(35, 'Isabel', 'B', '2023-01-20'),
(36, 'Isabel', 'C', '2023-02-20'),
(37, 'Isabel', 'A', '2023-03-20'),
(38, 'Isabel', 'A', '2023-04-20'),
(39, 'Isabel', 'C', '2023-05-20'),
(40, 'Jack', 'B', '2023-01-30'),
(41, 'Jack', 'C', '2023-03-02'),
(42, 'Jack', 'A', '2023-03-30'),
(43, 'Jack', 'B', '2023-04-30'),
(44, 'Jack', 'B', '2023-05-30'),
(45, 'Jack', 'C', '2023-06-30');
Consider the following problem: I want to see that for someone who begins with a B, once they get the first A: how many of them remain as purely A students for their remaining rows, and how many do not.
I wrote this SQL code which first identifies students that have their first grade as B and then sees how many of them only get A's after their first A:
WITH RankedGrades AS
(
SELECT
name,
letter,
date,
ROW_NUMBER() OVER (PARTITION BY name ORDER BY date) as grade_sequence
FROM
myt
),
BStarters AS
(
SELECT DISTINCT name
FROM RankedGrades
WHERE grade_sequence = 1 AND letter = 'B'
),
FirstASequence AS
(
SELECT
r.name,
MIN(r.grade_sequence) as first_a_sequence
FROM
RankedGrades r
JOIN
BStarters b ON r.name = b.name
WHERE
r.letter = 'A'
GROUP BY
r.name
),
FirstADetails AS
(
SELECT
r.name,
r.date as first_a_date,
fas.first_a_sequence as a_sequence_number
FROM
RankedGrades r
JOIN
FirstASequence fas ON r.name = fas.name
WHERE
r.grade_sequence = fas.first_a_sequence
),
SubsequentPerformance AS
(
SELECT
f.name,
CASE WHEN MAX(CASE WHEN r.letter != 'A' THEN 1 ELSE 0 END) = 0
THEN 'Pure A'
ELSE 'Mixed Grades'
END as grade_pattern
FROM
FirstADetails f
JOIN
RankedGrades r ON f.name = r.name
WHERE
r.grade_sequence > f.a_sequence_number
GROUP BY
f.name
),
Summary AS
(
SELECT
grade_pattern,
COUNT(*) as student_count,
SUM(COUNT(*)) OVER () as total_students
FROM
SubsequentPerformance
GROUP BY
grade_pattern
)
SELECT
grade_pattern,
student_count,
ROUND(CAST(student_count AS FLOAT) / total_students * 100, 2) as percentage
FROM
Summary
ORDER BY
student_count DESC;
The final answer looks like this:
grade_pattern student_count percentage
--------------------------------------
Pure A 4 57.14
Mixed Grades 3 42.86
I am trying to extend this code for all combinations. E.g. Students that start with B, after the first C how many have pure C's ... Students that start with B, after the first A how many have pure A's ... Students that start with A, after the first C how many have pure C's etc etc.
Is it possible to do this in the same query? Or does a new query have to be written each time?
You did the hard work, there only needed to replace the hardcoded A
and B
s with a list of all previously found values for the given user.
Below is the diff from the code you posted,
you'll find the full solution on an SQLFiddle (in PostgreSQL but this should be mostly compatible)
(you may want to uncomment the 'Koala' student, to introduce diversity, because as your example was B-to-A focused, all other categories fall 100 % into the "Mixed" category).
Note that to minimize the diff I kept the names as is,
but if you adopt the solution, first_a_sequence
and first_a_letter
should be rewritten first_other_letter_sequence
and first_other_letter
(in fact the "other letter" may be the same as the starter one, in case you want to know how many people started at A and kept an A the full time along).
--- /tmp/1.sql 2025-01-14 23:50:06.645054000 +0100
+++ /tmp/2.sql 2025-01-14 23:49:40.995233000 +0100
@@ -10,28 +10,28 @@
),
BStarters AS
(
- SELECT DISTINCT name
+ SELECT DISTINCT name, letter first_letter
FROM RankedGrades
- WHERE grade_sequence = 1 AND letter = 'B'
+ WHERE grade_sequence = 1
),
FirstASequence AS
(
SELECT
- r.name,
+ r.name, b.first_letter, r.letter,
MIN(r.grade_sequence) as first_a_sequence
FROM
RankedGrades r
JOIN
BStarters b ON r.name = b.name
- WHERE
- r.letter = 'A'
GROUP BY
- r.name
+ r.name, b.first_letter, r.letter
),
FirstADetails AS
(
SELECT
r.name,
+ fas.first_letter,
+ r.letter as first_a_letter,
r.date as first_a_date,
fas.first_a_sequence as a_sequence_number
FROM
@@ -44,9 +44,9 @@
SubsequentPerformance AS
(
SELECT
- f.name,
- CASE WHEN MAX(CASE WHEN r.letter != 'A' THEN 1 ELSE 0 END) = 0
- THEN 'Pure A'
+ f.name, f.first_letter, f.first_a_letter,
+ CASE WHEN MAX(CASE WHEN r.letter != f.first_a_letter THEN 1 ELSE 0 END) = 0
+ THEN 'Pure '||f.first_a_letter
ELSE 'Mixed Grades'
END as grade_pattern
FROM
@@ -56,24 +56,24 @@
WHERE
r.grade_sequence > f.a_sequence_number
GROUP BY
- f.name
+ f.name, f.first_letter, f.first_a_letter
),
Summary AS
(
SELECT
- grade_pattern,
+ first_letter, first_a_letter, grade_pattern,
COUNT(*) as student_count,
- SUM(COUNT(*)) OVER () as total_students
+ SUM(COUNT(*)) OVER (partition by first_letter, first_a_letter) as total_students
FROM
SubsequentPerformance
GROUP BY
- grade_pattern
+ first_letter, first_a_letter, grade_pattern
)
SELECT
- grade_pattern,
+ first_letter "Starting with", first_a_letter "entering a sequence of", grade_pattern "finished as",
student_count,
- ROUND(CAST(student_count AS FLOAT) / total_students * 100, 2) as percentage
+ ROUND(CAST(student_count AS FLOAT) / total_students * 10000) / 100 as percentage -- Adapt to PostgreSQL which doesn't have round(x, precision).
FROM
Summary
ORDER BY
- student_count DESC;
+ first_letter desc, first_a_letter, student_count DESC;