Skip to content

Commit 379a444

Browse files
feat(sentence-wise-importance): complete sentence wise importance between 0 & 1
1 parent 1a910dc commit 379a444

File tree

1 file changed

+80
-0
lines changed

1 file changed

+80
-0
lines changed

src/sentence-wise-importance.js

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// Token-encoding constants, destructured from the shared constants module:
// `bits4lemma` — bits reserved for the lemma, `posMask` — mask for extracting
// the pos bits, `tkSize` — size of a single token.
const { bits4lemma, posMask, tkSize } = require( './constants.js' );
/**
 * Computes the relative importance of every sentence in the document on a
 * scale of 0 to 1. Each sentence's pos (part-of-speech) sequence is split into
 * 4-grams ("pos groups"); a group's weight is its document frequency minus the
 * count of closed class pos in it. A sentence's raw weight is the sum of the
 * weights of the groups occurring in it, finally normalized by the maximum.
 *
 * @param {object} rdd raw document data — must expose `sentences`, `tokens`
 * and `cache` (with `cache.valueOf( 'pos', … )`).
 * @returns {object[]} one `{ index, importance }` per sentence, importance
 * rounded to 4 decimal places; sentences shorter than the n-gram size, or
 * documents where no pos group survives the weight filter, get 0 importance.
 */
const sentenceWiseImportance = function ( rdd ) {
  // Define open class part-of-speeches; used to compute initial information content.
  const openClassPOS = Object.create( null );
  openClassPOS.ADJ = true;
  openClassPOS.ADV = true;
  openClassPOS.INTJ = true;
  openClassPOS.NOUN = true;
  openClassPOS.PROPN = true;
  openClassPOS.VERB = true;
  openClassPOS.NUM = true;
  openClassPOS.SYM = true;
  // N-gram to use to construct a pos group.
  const NGram = 4;
  const sentences = rdd.sentences;
  const tokens = rdd.tokens;
  const cache = rdd.cache;

  // Used to build table of weights of pos groups. Apart from frequency, it also maintains
  // (a) array of sentences, where a given pos group was found, (b) total weight computed as
  // frequency minus count of closed class part-of-speech in the group.
  const posGroupWeightTable = Object.create( null );

  for ( let s = 0; s < sentences.length; s += 1 ) {
    const pos = [];
    const [ start, end ] = sentences[ s ];
    // Decode the pos of every token in this sentence; drop whitespace & punctuation.
    for ( let t = start; t <= end; t += 1 ) {
      const p = cache.valueOf( 'pos', ( tokens[ ( t * tkSize ) + 2 ] & posMask ) >>> bits4lemma ); // eslint-disable-line no-bitwise
      if ( p !== 'SPACE' && p !== 'PUNCT' ) pos.push( p );
    }

    // Ignore sentences where we cannot build NGram i.e. sentences shorter than NGram —
    // they simply retain zero weight. (Fix: the earlier `return []` here discarded the
    // entire document's result as soon as one short sentence was encountered; also use
    // the `NGram` constant instead of a duplicated hard-coded `4`.)
    if ( pos.length < NGram ) continue; // eslint-disable-line no-continue
    // Determine NGrams.
    for ( let k = 0; k + NGram - 1 < pos.length; k += 1 ) {
      const pos4Gram = pos.slice( k, k + NGram );
      // Initial information content: 0 minus the count of closed class pos in the
      // group; used to seed the weight for a pos group.
      const initInfoContent = pos4Gram.reduce(
        ( pv, cv ) => pv - ( ( openClassPOS[ cv ] ) ? 0 : 1 ),
        0
      );
      const posGroup = pos4Gram.join( '_' );
      posGroupWeightTable[ posGroup ] = posGroupWeightTable[ posGroup ] || Object.create( null );
      posGroupWeightTable[ posGroup ].group = posGroup;
      posGroupWeightTable[ posGroup ].sentences = posGroupWeightTable[ posGroup ].sentences || [];
      posGroupWeightTable[ posGroup ].sentences.push( s );
      // Weight = frequency + initial information content; each repeat occurrence adds 1.
      posGroupWeightTable[ posGroup ].weight = ( posGroupWeightTable[ posGroup ].weight === undefined ) ?
        initInfoContent + 1 :
        ( posGroupWeightTable[ posGroup ].weight + 1 );
      posGroupWeightTable[ posGroup ].iv = initInfoContent;
    }
  }

  // Transform object into an array, and filter out elements with weight <= 0.
  const posGroupWeights = Object.keys( posGroupWeightTable )
    .map( ( e ) => posGroupWeightTable[ e ] )
    .filter( ( e ) => e.weight > 0 );
  // This is an array indexed by each sentence's index and would contain the total weight
  // computed by adding all the weights of each pos group found in that sentence.
  const sentenceWiseWeights = new Array( sentences.length );
  sentenceWiseWeights.fill( 0 );
  posGroupWeights.forEach( ( pgw ) => {
    pgw.sentences.forEach( ( e ) => {
      sentenceWiseWeights[ e ] += pgw.weight;
    } );
  } );
  // Normalize weights by dividing them by the max.
  const max = Math.max( ...sentenceWiseWeights );
  // Guard: when no pos group survived the weight filter, `max` is 0 — and with
  // zero sentences `Math.max()` is -Infinity — either way `e / max` would yield
  // NaN; report zero importance instead.
  if ( max <= 0 ) return sentenceWiseWeights.map( ( e, i ) => ( { index: i, importance: 0 } ) );

  return sentenceWiseWeights.map( ( e, i ) => ( { index: i, importance: +( e / max ).toFixed( 4 ) } ) );
};
// Public API: the sentence-wise importance analyzer.
module.exports = sentenceWiseImportance;

0 commit comments

Comments
 (0)