Let's say you have a table with columns, Date, GroupID, X and Y.
CREATE TABLE #sample
(
[Date] DATETIME,
GroupID INT,
X FLOAT,
Y FLOAT
)
DECLARE @date DATETIME = getdate()
INSERT INTO #sample VALUES(@date, 1, 1,3)
INSERT INTO #sample VALUES(DATEADD(d, 1, @date), 1, 1,1)
INSERT INTO #sample VALUES(DATEADD(d, 2, @date), 1, 4,2)
INSERT INTO #sample VALUES(DATEADD(d, 3, @date), 1, 3,3)
INSERT INTO #sample VALUES(DATEADD(d, 4, @date), 1, 6,4)
INSERT INTO #sample VALUES(DATEADD(d, 5, @date), 1, 7,5)
INSERT INTO #sample VALUES(DATEADD(d, 6, @date), 1, 1,6)
and you want to calculate the correlation of X and Y for each group. Currently I use CTEs which get a little messy:
;WITH DataAvgStd
AS (SELECT GroupID,
AVG(X) AS XAvg,
AVG(Y) AS YAvg,
STDEV(X) AS XStdev,
STDEV(Y) AS YSTDev,
COUNT(*) AS SampleSize
FROM #sample
GROUP BY GroupID),
ExpectedVal
AS (SELECT s.GroupID,
SUM(( X - XAvg ) * ( Y - YAvg )) AS ExpectedValue
FROM #sample s
JOIN DataAvgStd das
ON s.GroupID = das.GroupID
GROUP BY s.GroupID)
SELECT das.GroupID,
ev.ExpectedValue / ( das.SampleSize - 1 ) / ( das.XStdev * das.YSTDev )
AS
Correlation
FROM DataAvgStd das
JOIN ExpectedVal ev
ON das.GroupID = ev.GroupID
DROP TABLE #sample
It seems like there should be a way to use OVER and PARTITION to do this in one fell swoop without any subqueries. Ideally TSQL would have a function so you could write:
SELECT GroupID, CORR(X, Y) OVER(PARTITION BY GroupID)
FROM #sample
GROUP BY GroupID
Using this formula of corellation you cannot avoid all the nested queries even if you use over(). The thing is that you cannot use both group by and over in the same query and also you can not have nested aggregation functions e.g. sum(x - avg(x)). So you in best case scenario, according to your data, you will need to keep at least the with.
Your code will look like something like that
;WITH DataAvgStd
AS (SELECT GroupID,
STDEV(X) over(partition by GroupID) AS XStdev,
STDEV(Y) over(partition by GroupID) AS YSTDev,
COUNT(*) over(partition by GroupID) AS SampleSize,
( X - AVG(X) over(partition by GroupID)) * ( Y - AVG(Y) over(partition by GroupID)) AS ExpectedValue
FROM #sample s)
SELECT distinct GroupID,
SUM(ExpectedValue) over(partition by GroupID) / (SampleSize - 1 ) / ( XStdev * YSTDev ) AS Correlation
FROM DataAvgStd
An alternative is to use an equilevant formula for correlation as Wikipedia describes.
This can be written as
SELECT GroupID,
Correlation=(COUNT(*) * SUM(X * Y) - SUM(X) * SUM(Y)) /
(SQRT(COUNT(*) * SUM(X * X) - SUM(X) * SUM(x))
* SQRT(COUNT(*) * SUM(Y* Y) - SUM(Y) * SUM(Y)))
FROM #sample s
GROUP BY GroupID;