Skip to content

Commit 39319d7

Browse files
feat: allow OOV processing in bowOf method of BM25
references #73
1 parent 80c3a10 commit 39319d7

File tree

2 files changed

+72
-8
lines changed

2 files changed

+72
-8
lines changed

test/bm25-vectorizer-specs.js

+52-1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
var chai = require( 'chai' );
3434
var mocha = require( 'mocha' );
3535
var bm25 = require( '../utilities/bm25-vectorizer.js' );
36+
var similarity = require( '../utilities/similarity.js' );
3637
var its = require( '../src/its.js' );
3738

3839

@@ -231,7 +232,7 @@ describe( 'bm25-vectorizer', function () {
231232
} );
232233

233234
it( 'should throw error learn() is called after out()', function () {
234-
expect( v.learn.bind( [ 'hello', 'world' ] ) ).to.throw( 'wink-nlp: learn can not be used after a call to out() API in BM25 Vectorizer' );
235+
expect( v.learn.bind( null, [ 'hello', 'world' ] ) ).to.throw( 'wink-nlp: learn can not be used after a call to out() API in BM25 Vectorizer' );
235236
} );
236237
} );
237238

@@ -323,4 +324,54 @@ describe( 'bm25-vectorizer', function () {
323324
expect( () => v3.loadModel( JSON.stringify( { uid: 'WinkNLP-BM25Vectorizer-Model/1.0.0', 0: 0, 1: 1, 2: 2, 3: 3, 4: 4 } ) ) ).to.throw( 'wink-nlp: invalid model format/version' );
324325
} );
325326
} );
327+
328+
describe( 'similarity computation using bm25 output', function () {
  // Lower-cases `text` and splits it on runs of whitespace, producing the flat
  // array of string tokens that is handed to `learn()` and `bowOf()` below.
  // Note that an empty string yields `[ '' ]`, i.e. a single empty token.
  const tokensOf = ( text ) => text.toLowerCase().split( /\s+/ );

  // `norm: 'none'` is used so that the bows are compared without normalization.
  const v = bm25( { norm: 'none' } );
  v.learn( tokensOf( 'cat was black' ) );
  v.learn( tokensOf( 'dog is white' ) );
  v.learn( tokensOf( 'dog is cute' ) );

  it( 'completely different sentences should return 0', function () {
    const b1 = v.bowOf( tokensOf( 'cat is not yellow' ) );
    const b2 = v.bowOf( tokensOf( 'dog were rarely pink' ) );
    expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 0 );
  } );

  it( 'completely different sentences with all OOV should return 1 with processOOV as false', function () {
    // This happens as both bows are empty because OOVs are ignored!
    const b1 = v.bowOf( tokensOf( 'ugly bat' ) );
    const b2 = v.bowOf( tokensOf( 'dangerous snake' ) );

    expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 1 );
  } );

  it( 'completely different sentences with all OOV should return 0 with processOOV as true', function () {
    // Here OOVs are not ignored, so we get non-empty bows.
    const b1 = v.bowOf( tokensOf( 'ugly bat' ), true );
    const b2 = v.bowOf( tokensOf( 'dangerous snake' ), true );
    expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 0 );
  } );

  it( 'With processOOV as non boolean, it should throw error', function () {
    expect( v.bowOf.bind( null, [ 'ugly', 'bat' ], 99 ) ).to.throw( 'wink-nlp: processOOV must be a boolean.' );
  } );

  it( 'partially different sentences should return <1', function () {
    const b1 = v.bowOf( tokensOf( 'cat is black' ) );
    const b2 = v.bowOf( tokensOf( 'dog is black' ) );
    expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 0.681698 );
  } );

  it( 'identical sentences should return 1', function () {
    // The tokens must be passed as a flat array of strings. Wrapping them in
    // another array would hand `bowOf` a single array-valued "token" that is
    // never in the vocabulary, so both bows would be empty and the test would
    // pass without comparing any real term weights.
    const b1 = v.bowOf( tokensOf( 'dog is white' ) );
    const b2 = v.bowOf( tokensOf( 'dog is white' ) );
    expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 1 );
  } );

  it( 'empty sentences should return 1', function () {
    // An empty sentence tokenizes to `[ '' ]`; the empty token is not in the
    // learned vocabulary, so both bows are empty.
    const b1 = v.bowOf( tokensOf( '' ) );
    const b2 = v.bowOf( tokensOf( '' ) );
    expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 1 );
  } );
} );
326377
} );

utilities/bm25-vectorizer.js

+20-7
Original file line numberDiff line numberDiff line change
@@ -309,24 +309,37 @@ var bm25Vectorizer = function ( config ) {
309309
// ## bowOf
310310
/**
311311
* Computes the bag-of-words (bowOf) of the input document, using the tf-idf
312-
* learned so far.
313-
* @param {string[]} tokens tokenized text, usually obtained via winkNLP.
314-
* @return {object} its bow.
312+
* learned so far. If `processOOV` is true, then each OOV token's frequency is
313+
* computed and its `idf` is assumed to be **1**; otherwise all OOVs are ignored.
314+
* @param {string[]} tokens tokenized text, usually obtained via winkNLP.
315+
* @param {boolean} processOOV true — process OOV, false — ignore OOV (default).
316+
* @return {object} its bow.
315317
*/
316-
methods.bowOf = function ( tokens ) {
318+
methods.bowOf = function ( tokens, processOOV = false ) {
317319
computeWeights();
318320
const bow = Object.create( null );
319321
const avgDL = sumOfAllDLs / docId;
320322
let thisNorm = 0;
321323

324+
if ( typeof processOOV !== 'boolean' ) {
325+
throw Error( 'wink-nlp: processOOV must be a boolean.' );
326+
}
327+
322328
for ( let i = 0; i < tokens.length; i += 1 ) {
323329
const t = tokens[ i ];
324-
// bow applies only if the token is not an unseen one!
325-
if ( idf[ t ] ) bow[ t ] = 1 + ( bow[ t ] || 0 );
330+
// `processOOV` true means count every term; otherwise count only if it is
331+
// in the vocabulary i.e. `idf`.
332+
if ( processOOV ) {
333+
bow[ t ] = 1 + ( bow[ t ] || 0 );
334+
} else if ( idf[ t ] ) bow[ t ] = 1 + ( bow[ t ] || 0 );
326335
}
327336

328337
for ( const t in bow ) { // eslint-disable-line guard-for-in
329-
bow[ t ] = idf[ t ] * ( ( k1 + 1 ) * bow[ t ] ) / ( ( k1 * ( 1 - b + ( b * ( tokens.length / avgDL ) ) ) ) + bow[ t ] );
338+
// `bow` tokens are determined by `processOOV` i.e. if true it will contain
339+
// OOVs as well; otherwise it will not have any OOV. On the other hand, `idf`
340+
// always contains all the seen tokens. Therefore when `processOOV` is true,
341+
// the `idf[ t ]` for all OOV will be taken as **1** (highest possible value).
342+
bow[ t ] = ( idf[ t ] || 1 ) * ( ( k1 + 1 ) * bow[ t ] ) / ( ( k1 * ( 1 - b + ( b * ( tokens.length / avgDL ) ) ) ) + bow[ t ] );
330343
thisNorm += normFn[ norm ]( bow[ t ] );
331344
}
332345

0 commit comments

Comments
 (0)