Monday, August 12, 2013

TSQL Linear Regression

USE DWH;
GO

CREATE PROCEDURE LinearRegression (
  @TableName nvarchar(250)  = '[dbo].[linear]'
, @XColumnName nvarchar(250) = '[Xvalue]'
, @YColumnName nvarchar(250) = '[Yvalue]'
, @Where    nvarchar(250) = '[RecordId] <=11'
)
-----------------------------------------------------------------------
-- Liear regression using T-Sql
-- John Minkjan 2013
-----------------------------------------------------------------------
AS
BEGIN
declare @sqlstatement nvarchar(4000)
declare @MeanX decimal(18,6)
declare @MeanY decimal(18,6)
declare @VarX decimal(18,6)
declare @VarY decimal(18,6)
declare @StdX decimal(18,6)
declare @StdY decimal(18,6)
declare @sumX decimal(18,6)
declare @sumY decimal(18,6)
declare @sumXY decimal(18,6)
declare @X2 decimal(18,6)
declare @Y2 decimal(18,6)
declare @XY decimal(18,6)
declare @X2t decimal(18,6)
declare @Y2t decimal(18,6)
declare @XYt decimal(18,6)
declare @CorXY decimal(18,6)
declare @Slope decimal(18,6)
declare @Intercept decimal(18,6)
declare @CountXY bigint


-- get the basic stats
set @sqlstatement = 'Declare  users_cursor CURSOR FOR SELECT AVG('
    +@XColumnName +'), AVG(' + @YColumnName
    +') , COUNT(*), VAR('
    +@XColumnName +'), VAR(' + @YColumnName
    +') , STDEV('
    +@XColumnName +'), STDEV(' + @YColumnName
    +') , SUM('
    +@XColumnName +'), SUM(' + @YColumnName
    +') , SUM(' + @YColumnName
    +' * ' +@XColumnName +') FROM ' + @TableName + case when @Where <> '' then ' WHERE ' + @Where end

exec sp_executesql @sqlstatement

OPEN users_cursor
FETCH NEXT FROM users_cursor
INTO @MeanX ,@MeanY,@CountXY,@VarX ,@VarY,@StdX,@StdY,@sumX,@sumY,@sumXY

CLOSE users_cursor
DEALLOCATE users_cursor

-- Get the Linear regression info

set @X2 = 0
set @y2 = 0
set @xy = 0

set @sqlstatement = 'Declare  users_cursor CURSOR FOR SELECT ('
    + @XColumnName +' - ' +cast( @MeanX as varchar(250)) + ') * ('
    + @YColumnName +' - ' + cast( @MeanY as varchar(250)) + ') as XY, (('
    + @XColumnName +' - ' +cast( @MeanX as varchar(250))
    +') * (' + @XColumnName +' - ' +cast( @MeanX as varchar(250))+')) as X2 , (('
    + @YColumnName +' - ' + cast( @MeanY as varchar(250)) +') * ('
    + @YColumnName +' - ' + cast( @MeanY as varchar(250))+')) as Y2 '
    + ' FROM ' + @TableName + case when @Where <> '' then ' WHERE ' + @Where end

exec sp_executesql @sqlstatement

OPEN users_cursor
FETCH NEXT FROM users_cursor
INTO @xyt, @x2t, @y2t

WHILE @@FETCH_STATUS = 0
BEGIN

set @X2 = @x2+  @x2t
set @y2 = @y2+  @y2t
set @xy = @xy+  @xyt

FETCH NEXT FROM users_cursor
INTO @xyt, @x2t, @y2t

END

CLOSE users_cursor
DEALLOCATE users_cursor

-- Calculate the correlation
set @CorXY = (@xy / SQRT(@x2 * @y2))

-- Get the slope and intercept
set @Slope = (@xy/ @x2)
set @Intercept =  @MeanY-((@xy/ @x2) * @MeanX)

-- Output the result
set @sqlstatement = 'SELECT '
    +@XColumnName +' as X '
    +' ,'+@YColumnName +' as Y '
    +' ,'+cast(@MeanX as varchar(250)) + ' as MeanX  '
    +' ,'+cast(@MeanY  as varchar(250)) + ' as MeanY  '
    +' ,'+cast(@VarX as varchar(250))  +' as VarianceX  '
    +' ,'+cast(@VarY  as varchar(250)) + ' as VarianceY '
    +' ,'+cast(@CorXY  as varchar(250)) + ' as CorrelationXY'
    +' ,' +cast(@Slope  as varchar(250)) + ' as Slope '
    +' ,' +cast(@Intercept  as varchar(250)) + ' as Intercept '
        +' , (' + @XColumnName +' * ' + cast( @slope  as varchar(250)) +' ) +' +cast( @Intercept as varchar(250)) + ' as PredictedY'
    +' ,' +    cast(@StdX as varchar(250)) +' as  StandardDeviationX'
    +' ,' + cast(@StdY as varchar(250)) +' as  StandardDeviationY'
    +' ,' + cast(@sumX as varchar(250)) +' as  SumX'
    +' ,' +    cast(@sumY as varchar(250)) +' as  SumY'
    +' ,' +    cast(@sumXY as varchar(250)) +' as  SumXY'

    +' FROM ' + @TableName + case when @Where <> '' then ' WHERE ' + @Where end

exec sp_executesql @sqlstatement;
end

Till Next Time