1818use const Rubix \ML \LOG_EPSILON ;
1919
2020/**
21- * LODA
21+ * Loda
2222 *
23- * Lightweight Online Detector of Anomalies uses sparse random projection vectors
24- * to generate an ensemble of unique one dimensional equi-width histograms able
25- * to estimate the probability density of an unknown sample. The anomaly score is
26- * given by the negative log likelihood whose upper threshold can be set by the
27- * user through the *contamination* hyper-parameter.
23+ * Lightweight Online Detector of Anomalies (a.k.a. Loda) uses sparse random
24+ * projection vectors to generate an ensemble of unique one dimensional
25+ * equi-width histograms able to estimate the probability density of an unknown
26+ * sample. The anomaly score is given by the negative log likelihood whose upper
27+ * threshold can be set by the user through the *contamination* hyper-parameter.
2828 *
2929 * References:
3030 * [1] T. Pevný. (2015). Loda: Lightweight on-line detector of anomalies.
3434 * @package Rubix/ML
3535 * @author Andrew DalPino
3636 */
37- class LODA implements Estimator, Learner, Online, Ranking, Persistable
37+ class Loda implements Estimator, Learner, Online, Ranking, Persistable
3838{
3939 protected const MIN_SPARSE_DIMENSIONS = 3 ;
4040
@@ -74,7 +74,7 @@ class LODA implements Estimator, Learner, Online, Ranking, Persistable
7474 protected $ r ;
7575
7676 /**
77- * The edges, counts, and precomputed probability densities of each histogram.
77+ * The edges and bin counts of each histogram.
7878 *
7979 * @var array[]|null
8080 */
@@ -83,9 +83,9 @@ class LODA implements Estimator, Learner, Online, Ranking, Persistable
8383 /**
8484 * The number of samples that have been learned so far.
8585 *
86- * @var int|null
86+ * @var int
8787 */
88- protected $ n ;
88+ protected $ n = 0 ;
8989
9090 /**
9191 * Estimate the number of bins from the number of samples in a dataset.
@@ -212,15 +212,7 @@ public function train(Dataset $dataset) : void
212212 $ counts [$ this ->bins ]++;
213213 }
214214
215- $ densities = [];
216-
217- foreach ($ counts as $ count ) {
218- $ densities [] = $ count > 0
219- ? -log ($ count / $ m )
220- : -LOG_EPSILON ;
221- }
222-
223- $ this ->histograms [] = [$ edges , $ counts , $ densities ];
215+ $ this ->histograms [] = [$ edges , $ counts ];
224216 }
225217
226218 $ this ->n = $ m ;
@@ -234,22 +226,20 @@ public function train(Dataset $dataset) : void
234226 */
235227 public function partial (Dataset $ dataset ) : void
236228 {
237- if (!$ this ->r or !$ this ->histograms or ! $ this -> n ) {
229+ if (!$ this ->r or !$ this ->histograms ) {
238230 $ this ->train ($ dataset );
239231
240232 return ;
241233 }
242234
243235 DatasetIsCompatibleWithEstimator::check ($ dataset , $ this );
244236
245- $ this ->n += $ dataset ->numRows ();
246-
247237 $ projections = Matrix::quick ($ dataset ->samples ())
248238 ->matmul ($ this ->r )
249239 ->transpose ();
250240
251241 foreach ($ projections as $ i => $ values ) {
252- [$ edges , $ counts, $ densities ] = $ this ->histograms [$ i ];
242+ [$ edges , $ counts ] = $ this ->histograms [$ i ];
253243
254244 $ interior = array_slice ($ edges , 1 , $ this ->bins , true );
255245
@@ -265,14 +255,10 @@ public function partial(Dataset $dataset) : void
265255 $ counts [$ this ->bins ]++;
266256 }
267257
268- foreach ($ counts as $ j => $ count ) {
269- $ densities [$ j ] = $ count > 0
270- ? -log ($ count / $ this ->n )
271- : -LOG_EPSILON ;
272- }
273-
274- $ this ->histograms [$ i ] = [$ edges , $ counts , $ densities ];
258+ $ this ->histograms [$ i ] = [$ edges , $ counts ];
275259 }
260+
261+ $ this ->n += $ dataset ->numRows ();
276262 }
277263
278264 /**
@@ -308,27 +294,31 @@ public function rank(Dataset $dataset) : array
308294 ->matmul ($ this ->r )
309295 ->transpose ();
310296
311- $ scores = array_fill (0 , $ projections ->n (), 0. );
297+ $ densities = array_fill (0 , $ projections ->n (), 0. );
312298
313299 foreach ($ projections as $ i => $ values ) {
314- [$ edges , $ counts, $ densities ] = $ this ->histograms [$ i ];
300+ [$ edges , $ counts ] = $ this ->histograms [$ i ];
315301
316302 foreach ($ values as $ j => $ value ) {
317303 foreach ($ edges as $ k => $ edge ) {
318304 if ($ value < $ edge ) {
319- $ scores [$ j ] += $ densities [$ k ];
305+ $ count = $ counts [$ k ];
306+
307+ $ densities [$ j ] += $ count > 0
308+ ? -log ($ counts [$ k ] / $ this ->n )
309+ : -LOG_EPSILON ;
320310
321311 break 1 ;
322312 }
323313 }
324314 }
325315 }
326316
327- foreach ($ scores as &$ score ) {
328- $ score /= $ this ->estimators ;
317+ foreach ($ densities as &$ density ) {
318+ $ density /= $ this ->estimators ;
329319 }
330320
331- return $ scores ;
321+ return $ densities ;
332322 }
333323
334324 /**
0 commit comments