const constants = require( './constants.js' );
// Bits reserved for `lemma`.
const bits4lemma = constants.bits4lemma;
// Mask for extracting pos.
const posMask = constants.posMask;
// Size of a single token.
const tkSize = constants.tkSize;

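// ## sentenceWiseImportance
/**
 * Computes the importance of every sentence in the document from the weights of
 * the part-of-speech 4-grams found in it; such scores can be used, for example,
 * to rank sentences for extractive summarization.
 *
 * @param {object} rdd raw document data containing `sentences`, `tokens` and `cache`.
 * @return {object[]} one `{ index, importance }` pair per sentence, with the
 * importance normalized to values between 0 and 1.
 */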
const sentenceWiseImportance = function ( rdd ) {
  // Define open class parts-of-speech; used to compute the initial information
  // content of a pos group.
  const openClassPOS = Object.create( null );
  openClassPOS.ADJ = true;
  openClassPOS.ADV = true;
  openClassPOS.INTJ = true;
  openClassPOS.NOUN = true;
  openClassPOS.PROPN = true;
  openClassPOS.VERB = true;
  openClassPOS.NUM = true;
  openClassPOS.SYM = true;
  // Size of the n-gram used to construct a pos group.
  const NGram = 4;
  const sentences = rdd.sentences;
  const tokens = rdd.tokens;
  const cache = rdd.cache;

  // Used to build a table of weights of pos groups. Apart from frequency, each entry
  // also maintains (a) the array of sentences where a given pos group was found, and
  // (b) the total weight, computed as frequency minus the count of closed class
  // parts-of-speech in the group.
  const posGroupWeightTable = Object.create( null );
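  // For instance, a hypothetical entry may look like:
  // posGroupWeightTable[ 'NOUN_VERB_DET_NOUN' ] =
  //   { group: 'NOUN_VERB_DET_NOUN', sentences: [ 0, 3 ], weight: 1, iv: -1 };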

  for ( let s = 0; s < sentences.length; s += 1 ) {
    const pos = [];
    const [ start, end ] = sentences[ s ];
    for ( let t = start; t <= end; t += 1 ) {
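      // Each token occupies `tkSize` slots; the slot at offset 2 packs the lemma in
      // its lower `bits4lemma` bits along with the pos. Masking & shifting recover
      // the numeric pos id, which the cache maps back to its textual tag.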
      const p = cache.valueOf( 'pos', ( tokens[ ( t * tkSize ) + 2 ] & posMask ) >>> bits4lemma ); // eslint-disable-line no-bitwise
      if ( p !== 'SPACE' && p !== 'PUNCT' ) pos.push( p );
    }

    // Ignore sentences where we cannot build an n-gram, i.e. sentences shorter
    // than NGram; their weight simply remains 0.
    if ( pos.length < NGram ) continue;
    // Determine n-grams.
    for ( let k = 0; k + NGram - 1 < pos.length; k += 1 ) {
      const pos4Gram = pos.slice( k, k + NGram );
      // Used to compute the weight for a pos group.
      const initInfoContent = pos4Gram.reduce(
        ( pv, cv ) => pv - ( ( openClassPOS[ cv ] ) ? 0 : 1 ),
        0
      );
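      // For example, a hypothetical group [ 'DET', 'NOUN', 'ADP', 'NOUN' ] contains
      // two closed class tags ( DET & ADP ), giving an initInfoContent of -2.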
      const posGroup = pos4Gram.join( '_' );
      posGroupWeightTable[ posGroup ] = posGroupWeightTable[ posGroup ] || Object.create( null );
      posGroupWeightTable[ posGroup ].group = posGroup;
      posGroupWeightTable[ posGroup ].sentences = posGroupWeightTable[ posGroup ].sentences || [];
      posGroupWeightTable[ posGroup ].sentences.push( s );
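      // The first occurrence seeds the weight with initInfoContent + 1; every repeat
      // adds 1, so the weight ends up as frequency plus initInfoContent.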
      posGroupWeightTable[ posGroup ].weight = ( posGroupWeightTable[ posGroup ].weight === undefined ) ?
        initInfoContent + 1 :
        ( posGroupWeightTable[ posGroup ].weight + 1 );
      posGroupWeightTable[ posGroup ].iv = initInfoContent;
    }
  }

  // Transform the object into an array, and filter out elements with weight <= 0.
  const posGroupWeights = Object.keys( posGroupWeightTable )
    .map( ( e ) => posGroupWeightTable[ e ] )
    .filter( ( e ) => e.weight > 0 );
  // This array is indexed by each sentence's index and contains the total weight,
  // computed by adding the weights of every pos group found in that sentence.
  const sentenceWiseWeights = new Array( sentences.length );
  sentenceWiseWeights.fill( 0 );
  posGroupWeights.forEach( ( pgw ) => {
    pgw.sentences.forEach( ( e ) => {
      sentenceWiseWeights[ e ] += pgw.weight;
    } );
  } );
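  // For example, if a hypothetical group with weight 2 was seen in sentences 0 and 3,
  // then sentenceWiseWeights[ 0 ] and sentenceWiseWeights[ 3 ] both gained 2 above.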
  // Normalize weights by dividing them by the max; a max of 0 (i.e. no n-gram could
  // be built from any sentence) is guarded against in the mapping below.
  const max = Math.max( ...sentenceWiseWeights );

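  // Illustrative output shape (the values shown are made up):
  // [ { index: 0, importance: 1 }, { index: 1, importance: 0.5714 }, ... ]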
  return sentenceWiseWeights.map( ( e, i ) => ( { index: i, importance: ( max === 0 ) ? 0 : +( e / max ).toFixed( 4 ) } ) );
};

module.exports = sentenceWiseImportance;
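// A minimal usage sketch, assuming this module is surfaced via wink-nlp's
// `its.sentenceWiseImportance` helper and that `wink-eng-lite-web-model` is installed:
//
//   const winkNLP = require( 'wink-nlp' );
//   const model = require( 'wink-eng-lite-web-model' );
//   const nlp = winkNLP( model );
//   const doc = nlp.readDoc( 'First sentence here. A second, longer sentence follows it.' );
//   console.log( doc.out( nlp.its.sentenceWiseImportance ) );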