I've been trying to write a T-SQL script - I am aware it's probably not the best tool for this scenario - that can do a topological sort given a table of nodes and their edges.
I've eventually come up with the following script inspired by Khan's algorithm:
DROP TABLE IF EXISTS #Edges
CREATE TABLE #Edges (EdgeId INT IDENTITY(1,1), FromNode INT, ToNode INT);
INSERT INTO #Edges (FromNode, ToNode) VALUES
(1, 2), (1, 3), (3, 4), (4, 2), (4, 5), (5, 6), (7, NULL), (8, NULL)--, (6, 4);
DROP TABLE IF EXISTS #AllNodesMaster CREATE TABLE #AllNodesMaster (node INT);
INSERT INTO #AllNodesMaster SELECT DISTINCT FromNode AS node FROM #Edges UNION SELECT DISTINCT ToNode AS Node FROM #Edges
DECLARE @CurrentLevel INT = 1;
DROP TABLE IF EXISTS #AllNodes CREATE TABLE #AllNodes (node INT);
DROP TABLE IF EXISTS #OrderedNodes CREATE TABLE #OrderedNodes (Id INT IDENTITY(1,1), node INT, level INT);
DROP TABLE IF EXISTS #InDegrees CREATE TABLE #InDegrees (node INT, InDegreeCount INT);
DROP TABLE IF EXISTS #Queue CREATE TABLE #Queue (node INT);
DROP TABLE IF EXISTS #DeletedEdges CREATE TABLE #DeletedEdges (node INT);
WHILE EXISTS (SELECT FromNode FROM #Edges)
BEGIN
INSERT INTO #AllNodes SELECT DISTINCT FromNode AS node FROM #Edges UNION SELECT DISTINCT ToNode AS Node FROM #Edges
INSERT INTO #InDegrees
SELECT T.node, ISNULL (C.InDegreeCount, 0) AS inDegreeCount
FROM #AllNodes T
LEFT JOIN (SELECT ToNode AS node, COUNT(FromNode) AS InDegreeCount FROM #Edges GROUP BY ToNode) C ON T.node = C.node
WHERE T.node IS NOT NULL
INSERT INTO #Queue SELECT node FROM #InDegrees WHERE inDegreeCount = 0
INSERT INTO #OrderedNodes (node, level) SELECT node, @CurrentLevel FROM #Queue
DELETE FROM #Edges OUTPUT deleted.FromNode INTO #DeletedEdges WHERE EdgeId IN (SELECT EdgeId FROM #Edges WHERE FromNode IN (SELECT node FROM #Queue))
TRUNCATE TABLE #Queue
INSERT INTO #Queue SELECT node FROM #AllNodes WHERE node NOT IN (SELECT DISTINCT FromNode AS node FROM #Edges UNION SELECT DISTINCT ToNode AS Node FROM #Edges) AND node NOT IN (SELECT node FROM #DeletedEdges)
TRUNCATE TABLE #AllNodes
TRUNCATE TABLE #DeletedEdges
TRUNCATE TABLE #InDegrees
SET @CurrentLevel = @CurrentLevel + 1;
END
SELECT
node, level
FROM
(
SELECT
nm.node,
CASE
WHEN orn.node IS NOT NULL THEN orn.level
ELSE @CurrentLevel
END AS level
FROM
#AllNodesMaster nm
LEFT JOIN #OrderedNodes orn ON nm.node = orn.node
) T
WHERE
node IS NOT NULL
ORDER BY
level
ASC
I chose to go for an iterative approach using a WHILE loop instead of a recursive CTE, as I found the latter harder to debug and often hanged endlessly.
The question is, how could cycles be best handled in this scenario using this implementation? Currently, if a cycle is introduced (the commented line "--, (6, 4)
" does this), then the WHILE gets caught in an endless loop.
Any suggestions or guidance would be greatly appreciated
The solution to this that worked for me is to do the cycle detection before doing the topological search.
The implementation for cycle detection in a directed graph was taken from here. Which also came from this answer.