|
33 | 33 | var chai = require( 'chai' );
|
34 | 34 | var mocha = require( 'mocha' );
|
35 | 35 | var bm25 = require( '../utilities/bm25-vectorizer.js' );
|
| 36 | +var similarity = require( '../utilities/similarity.js' ); |
36 | 37 | var its = require( '../src/its.js' );
|
37 | 38 |
|
38 | 39 |
|
@@ -231,7 +232,7 @@ describe( 'bm25-vectorizer', function () {
|
231 | 232 | } );
|
232 | 233 |
|
233 | 234 | it( 'should throw error learn() is called after out()', function () {
|
234 |
| - expect( v.learn.bind( [ 'hello', 'world' ] ) ).to.throw( 'wink-nlp: learn can not be used after a call to out() API in BM25 Vectorizer' ); |
| 235 | + expect( v.learn.bind( null, [ 'hello', 'world' ] ) ).to.throw( 'wink-nlp: learn can not be used after a call to out() API in BM25 Vectorizer' ); |
235 | 236 | } );
|
236 | 237 | } );
|
237 | 238 |
|
@@ -323,4 +324,54 @@ describe( 'bm25-vectorizer', function () {
|
323 | 324 | expect( () => v3.loadModel( JSON.stringify( { uid: 'WinkNLP-BM25Vectorizer-Model/1.0.0', 0: 0, 1: 1, 2: 2, 3: 3, 4: 4 } ) ) ).to.throw( 'wink-nlp: invalid model format/version' );
|
324 | 325 | } );
|
325 | 326 | } );
|
| 327 | + |
| 328 | + describe( 'similarity computation using bm25 output', function () { |
| 329 | + const v = bm25( { norm: 'none' } ); // Vectorizer shared by every test in this suite. |
| 330 | + v.learn( 'cat was black'.toLowerCase().split( /\s+/g ) ); |
| 331 | + v.learn( 'dog is white'.toLowerCase().split( /\s+/g ) ); |
| 332 | + v.learn( 'dog is cute'.toLowerCase().split( /\s+/g ) ); |
| 333 | + |
| 334 | + it( 'completely different sentences should return 0', function () { |
| 335 | + const b1 = v.bowOf( 'cat is not yellow'.toLowerCase().split( /\s+/g ) ); |
| 336 | + const b2 = v.bowOf( 'dog were rarely pink'.toLowerCase().split( /\s+/g ) ); |
| 337 | + expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 0 ); // The two sentences share no token. |
| 338 | + } ); |
| 339 | + |
| 340 | + it( 'completely different sentences with all OOV should return 1 with processOOV as false', function () { |
| 341 | + // None of these tokens were learned, so with OOVs ignored both bows are empty; two empty bows are expected to compare as 1. |
| 342 | + const b1 = v.bowOf( 'ugly bat'.toLowerCase().split( /\s+/g ) ); |
| 343 | + const b2 = v.bowOf( 'dangerous snake'.toLowerCase().split( /\s+/g ) ); |
| 344 | + |
| 345 | + expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 1 ); |
| 346 | + } ); |
| 347 | + |
| 348 | + it( 'completely different sentences with all OOV should return 0 with processOOV as true', function () { |
| 349 | + // Passing true as the second argument keeps the OOV tokens, so the bows are non-empty and share no token. |
| 350 | + const b1 = v.bowOf( 'ugly bat'.toLowerCase().split( /\s+/g ), true ); |
| 351 | + const b2 = v.bowOf( 'dangerous snake'.toLowerCase().split( /\s+/g ), true ); |
| 352 | + expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 0 ); |
| 353 | + } ); |
| 354 | + |
| 355 | + it( 'With processOOV as non boolean, it should throw error', function () { |
| 356 | + expect( v.bowOf.bind( null, [ 'ugly', 'bat' ], 99 ) ).to.throw( 'wink-nlp: processOOV must be a boolean.' ); |
| 357 | + } ); |
| 358 | + |
| 359 | + it( 'partially different sentences should return <1', function () { |
| 360 | + const b1 = v.bowOf( 'cat is black'.toLowerCase().split( /\s+/g ) ); |
| 361 | + const b2 = v.bowOf( 'dog is black'.toLowerCase().split( /\s+/g ) ); |
| 362 | + expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 0.681698 ); // NOTE(review): exact match presumably relies on cosine() rounding its result - confirm. |
| 363 | + } ); |
| 364 | + |
| 365 | + it( 'identical sentences should return 1', function () { |
| 366 | + const b1 = v.bowOf( 'dog is white'.toLowerCase().split( /\s+/g ) ); // Pass the token array itself, not an array wrapping it. |
| 367 | + const b2 = v.bowOf( 'dog is white'.toLowerCase().split( /\s+/g ) ); |
| 368 | + expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 1 ); |
| 369 | + } ); |
| 370 | + |
| 371 | + it( 'empty sentences should return 1', function () { |
| 372 | + const b1 = v.bowOf( ''.toLowerCase().split( /\s+/g ) ); // ''.split() yields [ '' ], i.e. a single empty-string token. |
| 373 | + const b2 = v.bowOf( ''.toLowerCase().split( /\s+/g ) ); |
| 374 | + expect( similarity.bow.cosine( b1, b2 ) ).to.equal( 1 ); |
| 375 | + } ); |
| 376 | + } ); |
326 | 377 | } );
|
0 commit comments