diff --git a/examples/gpce/multi-lin-regress.sql b/examples/gpce/multi-lin-regress.sql index df596311e..7434f4483 100644 --- a/examples/gpce/multi-lin-regress.sql +++ b/examples/gpce/multi-lin-regress.sql @@ -29,13 +29,13 @@ COPY houses FROM STDIN DELIMITER '|'; select * from houses limit 5; \qecho === Calculate Coefficients ====================================== -select madlib.mregr_coef(price, array[1, bedroom, bath, size])::REAL[] from houses; +select madlib.linregr_coef(price, array[1, bedroom, bath, size])::REAL[] from houses; \qecho === Calculate R square value ==================================== -select madlib.mregr_r2(price, array[1, bedroom, bath, size])::REAL from houses; +select madlib.linregr_r2(price, array[1, bedroom, bath, size])::REAL from houses; \qecho === Calculate t statistics ====================================== -select madlib.mregr_tstats(price, array[1, bedroom, bath, size])::REAL[] from houses; +select madlib.linregr_tstats(price, array[1, bedroom, bath, size])::REAL[] from houses; \qecho === Calculate p values ========================================== -select madlib.mregr_pvalues(price, array[1, bedroom, bath, size])::REAL[] from houses; +select madlib.linregr_pvalues(price, array[1, bedroom, bath, size])::REAL[] from houses; diff --git a/src/modules/regress/linear.cpp b/src/modules/regress/linear.cpp index 96b7cc573..e47dcff1e 100644 --- a/src/modules/regress/linear.cpp +++ b/src/modules/regress/linear.cpp @@ -214,18 +214,23 @@ AnyValue LinearRegression::final(AbstractDBInterface &db, - ((state.y_sum * state.y_sum) / state.numRows) ); + // total sum of squares + double tss + = state.y_square_sum + - ((state.y_sum * state.y_sum) / state.numRows); + // coefficient of determination - if (what == kRSquare) { - // total sum of squares - double tss - = state.y_square_sum - - ((state.y_sum * state.y_sum) / state.numRows); - + if (what == kRSquare) return ess / tss; - } + + // In the case of linear regression: + // residual sum of squares (rss) = total sum of squares (tss) - explained + // sum of squares (ess) + // Proof: http://en.wikipedia.org/wiki/Sum_of_squares + double rss = tss - ess; // Variance is also called the mean square error - double variance = ess / (state.numRows - state.widthOfX); + double variance = rss / (state.numRows - state.widthOfX); // Precompute (X^T * X)^{-1} mat inverse_of_X_transp_X = inv(state.X_transp_X); diff --git a/src/ports/postgres/modules/regress/logistic.sql_in b/src/ports/postgres/modules/regress/logistic.sql_in index a9c6ef3d0..203b645e8 100644 --- a/src/ports/postgres/modules/regress/logistic.sql_in +++ b/src/ports/postgres/modules/regress/logistic.sql_in @@ -255,8 +255,8 @@ RETURNS DOUBLE PRECISION[] AS $$ except: sys.path.append("PLPYTHON_LIBDIR") import logistic + global MADlibSchema MADlibSchema = "MADLIB_SCHEMA" - return logistic.compute_logregr_coef(**globals()) $$ LANGUAGE plpythonu VOLATILE; @@ -272,8 +272,8 @@ RETURNS DOUBLE PRECISION[] AS $$ except: sys.path.append("PLPYTHON_LIBDIR") import logistic + global MADlibSchema MADlibSchema = "MADLIB_SCHEMA" - return logistic.compute_logregr_coef(**globals()) $$ LANGUAGE plpythonu VOLATILE; @@ -290,8 +290,8 @@ RETURNS DOUBLE PRECISION[] AS $$ except: sys.path.append("PLPYTHON_LIBDIR") import logistic + global MADlibSchema MADlibSchema = "MADLIB_SCHEMA" - return logistic.compute_logregr_coef(**globals()) $$ LANGUAGE plpythonu VOLATILE; @@ -339,7 +339,7 @@ RETURNS DOUBLE PRECISION[] AS $$ except: sys.path.append("PLPYTHON_LIBDIR") import logistic + global MADlibSchema MADlibSchema = "MADLIB_SCHEMA" - return logistic.compute_logregr_coef(**globals()) $$ LANGUAGE plpythonu VOLATILE;