feat: allow OOV processing in bowOf method of BM25

sanjayaksaxena · sanjayaksaxena · commit 39319d7bfd1b · 2022-05-10T14:19:14.000+05:30
references #73
diff --git a/test/bm25-vectorizer-specs.js b/test/bm25-vectorizer-specs.js
@@ -33,6 +33,7 @@
 var chai = require( 'chai' );
 var mocha = require( 'mocha' );
 var bm25 = require( '../utilities/bm25-vectorizer.js' );
+var similarity = require( '../utilities/similarity.js' );
 var its = require( '../src/its.js' );
 
 
@@ -231,7 +232,7 @@ describe( 'bm25-vectorizer', function () {
     } );
 
     it( 'should throw error learn() is called after out()', function () {
-      expect( v.learn.bind( [ 'hello', 'world' ] ) ).to.throw( 'wink-nlp: learn can not be used after a call to out() API in BM25 Vectorizer' );
+      expect( v.learn.bind( null, [ 'hello', 'world' ] ) ).to.throw( 'wink-nlp: learn can not be used after a call to out() API in BM25 Vectorizer' );
     } );
   } );
 
@@ -323,4 +324,54 @@ describe( 'bm25-vectorizer', function () {
       expect( () => v3.loadModel( JSON.stringify( { uid: 'WinkNLP-BM25Vectorizer-Model/1.0.0', 0: 0, 1: 1, 2: 2, 3: 3, 4: 4 } ) ) ).to.throw( 'wink-nlp: invalid model format/version' );
     } );
   } );
+
+  describe( 'similarity computation using bm25 output', function () {
+    const v = bm25( { norm: 'none' } ); // shared vectorizer, trained on the 3 documents below
+    v.learn( 'cat was black'.toLowerCase().split( /\s+/g ) );
+    v.learn( 'dog is white'.toLowerCase().split( /\s+/g ) );
+    v.learn( 'dog is cute'.toLowerCase().split( /\s+/g ) );
+
+    it( 'completely different sentences should return 0', function () {
+      const b1 = v.bowOf( 'cat is not yellow'.toLowerCase().split( /\s+/g ) );
+      const b2 = v.bowOf( 'dog were rarely pink'.toLowerCase().split( /\s+/g ) );
+      expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 0 );
+    } );
+
+    it( 'completely different sentences with all OOV should return 1 with processOOV as false', function () {
+      // This happens as both bows are empty because OOVs are ignored!
+      const b1 = v.bowOf( 'ugly bat'.toLowerCase().split( /\s+/g ) );
+      const b2 = v.bowOf( 'dangerous snake'.toLowerCase().split( /\s+/g ) );
+
+      expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 1 );
+    } );
+
+    it( 'completely different sentences with all OOV should return 0 with processOOV as true', function () {
+      // Here OOVs are not ignored, so we get non-empty bows.
+      const b1 = v.bowOf( 'ugly bat'.toLowerCase().split( /\s+/g ), true );
+      const b2 = v.bowOf( 'dangerous snake'.toLowerCase().split( /\s+/g ), true );
+      expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 0 );
+    } );
+
+    it( 'With processOOV as non boolean, it should throw error', function () {
+      expect( v.bowOf.bind( null, [ 'ugly', 'bat' ], 99 ) ).to.throw( 'wink-nlp: processOOV must be a boolean.' );
+    } );
+
+    it( 'partially different sentences should return <1', function () {
+      const b1 = v.bowOf( 'cat is black'.toLowerCase().split( /\s+/g ) );
+      const b2 = v.bowOf( 'dog is black'.toLowerCase().split( /\s+/g ) );
+      expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 0.681698 );
+    } );
+
+    it( 'identical sentences should return 1', function () {
+      const b1 = v.bowOf( 'dog is white'.toLowerCase().split( /\s+/g ) ); // tokens passed directly: a nested array would yield an empty bow
+      const b2 = v.bowOf( 'dog is white'.toLowerCase().split( /\s+/g ) );
+      expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 1 );
+    } );
+
+    it( 'empty sentences should return 1', function () {
+      const b1 = v.bowOf( ''.toLowerCase().split( /\s+/g ) ); // split yields [ '' ], which is not in the vocabulary, so the bow is empty
+      const b2 = v.bowOf( ''.toLowerCase().split( /\s+/g ) );
+      expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 1 );
+    } );
+  } );
 } );
diff --git a/utilities/bm25-vectorizer.js b/utilities/bm25-vectorizer.js
@@ -309,24 +309,37 @@ var bm25Vectorizer = function ( config ) {
   // ## bowOf
   /**
    * Computes the bag-of-words (bowOf) of the input document, using the tf-idf
-   * learned so far.
-   * @param  {string[]}   tokens  tokenized text, usually obtained via winkNLP.
-   * @return {object}             its bow.
+   * learned so far. If `processOOV` is true, then each OOV token's frequency is
+   * computed and its `idf` is assumed to be **1**; otherwise all OOVs are ignored.
+   * @param  {string[]}   tokens      tokenized text, usually obtained via winkNLP.
+   * @param  {boolean}    processOOV  true — process OOV, false — ignore OOV (default).
+   * @return {object}                 its bow.
    */
-  methods.bowOf = function ( tokens ) {
+  methods.bowOf = function ( tokens, processOOV = false ) {
     computeWeights();
     const bow = Object.create( null );
     const avgDL = sumOfAllDLs / docId;
     let thisNorm = 0;
 
+    if ( typeof processOOV !== 'boolean' ) {
+      throw Error( 'wink-nlp: processOOV must be a boolean.' );
+    }
+
     for ( let i = 0; i < tokens.length; i += 1 ) {
       const t = tokens[ i ];
-      // bow applies only if the token is not an unseen one!
-      if ( idf[ t ] ) bow[ t ] = 1 + ( bow[ t ] || 0 );
+      // `processOOV` true means count every term otherwise count only if it is
+      // in the vocabulary i.e. `idf`.
+      if ( processOOV ) {
+        bow[ t ] = 1 + ( bow[ t ] || 0 );
+      } else if ( idf[ t ] ) bow[ t ] = 1 + ( bow[ t ] || 0 );
     }
 
     for ( const t in bow ) { // eslint-disable-line guard-for-in
-      bow[ t ] = idf[ t ] * ( ( k1 + 1 ) * bow[ t ] ) / ( ( k1 * ( 1 - b + ( b * ( tokens.length / avgDL ) ) ) ) + bow[ t ] );
+      // `bow` tokens are determined by `processOOV` i.e. if true it will contain
+      // OOVs also otherwise it will not have any OOV. On the other hand `idf`
+      // always contains all the seen tokens. Therefore when `processOOV` is true,
+      // the `idf[ t ]` for all OOV will be taken as **1** (highest possible value).
+      bow[ t ] = ( idf[ t ] || 1 ) * ( ( k1 + 1 ) * bow[ t ] ) / ( ( k1 * ( 1 - b + ( b * ( tokens.length / avgDL ) ) ) ) + bow[ t ] );
       thisNorm += normFn[ norm ]( bow[ t ] );
     }