Skip to content

Commit

Permalink
Fixed: Linear-regression t-statistics were computed incorrectly.
Browse files (browse the repository at this point in the history)
  • Loading branch information
Florian Schoppmann authored and committed on May 18, 2011
1 parent f75969c commit 379de83
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 16 deletions.
8 changes: 4 additions & 4 deletions examples/gpce/multi-lin-regress.sql
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ COPY houses FROM STDIN DELIMITER '|';
  select * from houses limit 5;

  \qecho === Calculate Coefficients ======================================
- select madlib.mregr_coef(price, array[1, bedroom, bath, size])::REAL[] from houses;
+ select madlib.linregr_coef(price, array[1, bedroom, bath, size])::REAL[] from houses;

  \qecho === Calculate R square value ====================================
- select madlib.mregr_r2(price, array[1, bedroom, bath, size])::REAL from houses;
+ select madlib.linregr_r2(price, array[1, bedroom, bath, size])::REAL from houses;

  \qecho === Calculate t statistics ======================================
- select madlib.mregr_tstats(price, array[1, bedroom, bath, size])::REAL[] from houses;
+ select madlib.linregr_tstats(price, array[1, bedroom, bath, size])::REAL[] from houses;

  \qecho === Calculate p values ==========================================
- select madlib.mregr_pvalues(price, array[1, bedroom, bath, size])::REAL[] from houses;
+ select madlib.linregr_pvalues(price, array[1, bedroom, bath, size])::REAL[] from houses;
21 changes: 13 additions & 8 deletions src/modules/regress/linear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,18 +214,23 @@ AnyValue LinearRegression::final(AbstractDBInterface &db,
  - ((state.y_sum * state.y_sum) / state.numRows)
  );

+ // total sum of squares
+ double tss
+ = state.y_square_sum
+ - ((state.y_sum * state.y_sum) / state.numRows);
+
  // coefficient of determination
- if (what == kRSquare) {
- // total sum of squares
- double tss
- = state.y_square_sum
- - ((state.y_sum * state.y_sum) / state.numRows);
-
+ if (what == kRSquare)
  return ess / tss;
- }

+ // In the case of linear regression:
+ // residual sum of squares (rss) = total sum of squares (tss) - explained
+ // sum of squares (ess)
+ // Proof: http://en.wikipedia.org/wiki/Sum_of_squares
+ double rss = tss - ess;
+
  // Variance is also called the mean square error
- double variance = ess / (state.numRows - state.widthOfX);
+ double variance = rss / (state.numRows - state.widthOfX);

  // Precompute (X^T * X)^{-1}
  mat inverse_of_X_transp_X = inv(state.X_transp_X);
Expand Down
8 changes: 4 additions & 4 deletions src/ports/postgres/modules/regress/logistic.sql_in
Original file line number Diff line number Diff line change
Expand Up @@ -255,8 +255,8 @@ RETURNS DOUBLE PRECISION[] AS $$
except:
sys.path.append("PLPYTHON_LIBDIR")
import logistic
global MADlibSchema
MADlibSchema = "MADLIB_SCHEMA"

return logistic.compute_logregr_coef(**globals())
$$ LANGUAGE plpythonu VOLATILE;

Expand All @@ -272,8 +272,8 @@ RETURNS DOUBLE PRECISION[] AS $$
except:
sys.path.append("PLPYTHON_LIBDIR")
import logistic
global MADlibSchema
MADlibSchema = "MADLIB_SCHEMA"

return logistic.compute_logregr_coef(**globals())
$$ LANGUAGE plpythonu VOLATILE;

Expand All @@ -290,8 +290,8 @@ RETURNS DOUBLE PRECISION[] AS $$
except:
sys.path.append("PLPYTHON_LIBDIR")
import logistic
global MADlibSchema
MADlibSchema = "MADLIB_SCHEMA"

return logistic.compute_logregr_coef(**globals())
$$ LANGUAGE plpythonu VOLATILE;

Expand Down Expand Up @@ -339,7 +339,7 @@ RETURNS DOUBLE PRECISION[] AS $$
except:
sys.path.append("PLPYTHON_LIBDIR")
import logistic
global MADlibSchema
MADlibSchema = "MADLIB_SCHEMA"

return logistic.compute_logregr_coef(**globals())
$$ LANGUAGE plpythonu VOLATILE;

0 comments on commit 379de83

Please sign in to comment.