Skip to content

Commit 7ffb5b6

Browse files
feat(*): handle UNK created by contextual vectors inside as.vector helper
1 parent 3f76c8f commit 7ffb5b6

File tree

2 files changed

+8
-2
lines changed

2 files changed

+8
-2
lines changed

src/as.js

+6-2
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ as.vector = function ( tokens, rdd ) {
193193
const size = rdd.wordVectors.dimensions;
194194
const precision = rdd.wordVectors.precision;
195195
const vectors = rdd.wordVectors.vectors;
196+
const l2NormIndex = rdd.wordVectors.l2NormIndex;
196197

197198
// Set up a new initialized vector of `size`
198199
const v = new Array( size );
@@ -203,8 +204,11 @@ as.vector = function ( tokens, rdd ) {
203204
for ( let i = 0; i < tokens.length; i += 1 ) {
204205
// Extract token vector for the current token.
205206
const tv = vectors[ tokens[ i ].toLowerCase() ];
206-
// Increment `numOfTokens` if the above operation was successful.
207-
if ( tv !== undefined ) numOfTokens += 1;
207+
// Increment `numOfTokens` if the above operation was successful
208+
// AND l2Norm is non-zero, because for UNK vectors it is set to 0.
209+
// The latter is applicable for the contextual vectors, where, in the event
210+
// of UNK, an all-zero vector is set for the UNK word.
211+
if ( tv !== undefined && tv[ l2NormIndex ] !== 0 ) numOfTokens += 1;
208212
for ( let j = 0; j < size; j += 1 ) {
209213
// Keep summing, eventually it will be divided by `numOfTokens` to obtain average.
210214
v[ j ] += ( tv === undefined ) ? 0 : tv[ j ];

src/doc-v2.js

+2
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,8 @@ var doc = function ( docData, addons ) {
487487
.out( its.lemma )
488488
.map( ( t ) => t.toLowerCase() );
489489

490+
// NOTE: For UNK words an all-zero vector is set up, with `l2Norm = 0`, which may be used in the `as.vector` helper
491+
// to detect an UNK word.
490492
for ( let i = 0; i < docTokens.length; i += 1 ) cv.vectors[ docTokens[ i ] ] = ( awvs[ docTokens[ i ] ] || cv.unkVector ).slice( 0 );
491493
for ( let i = 0; i < docTokensLemma.length; i += 1 ) cv.vectors[ docTokensLemma[ i ] ] = ( awvs[ docTokensLemma[ i ] ] || cv.unkVector ).slice( 0 );
492494
for ( let i = 0; i < specificWordVectors.length; i += 1 ) {

0 commit comments

Comments
 (0)