// bow-cosine.js

//     wink-distance
//     Distance functions for Bag of Words, Strings,
//     Vectors and more.
//
//     Copyright (C) GRAYPE Systems Private Limited
//
//     This file is part of “wink-distance”.
//
//     Permission is hereby granted, free of charge, to any person obtaining a
//     copy of this software and associated documentation files (the "Software"),
//     to deal in the Software without restriction, including without limitation
//     the rights to use, copy, modify, merge, publish, distribute, sublicense,
//     and/or sell copies of the Software, and to permit persons to whom the
//     Software is furnished to do so, subject to the following conditions:
//
//     The above copyright notice and this permission notice shall be included
//     in all copies or substantial portions of the Software.
//
//     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
//     OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
//     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
//     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
//     DEALINGS IN THE SOFTWARE.

//
// Because we want to logically group the variables.
/* eslint sort-vars: 0 */

// ## bow

// ### cosine
/**
 *
 * Computes the cosine distance (i.e. `1 - cosine similarity`) between the
 * input bag of words (bow) `a` and `b`. For non-negative frequencies the
 * returned value is between 0 and 1: 0 means both bows point in the same
 * direction, 1 means they share no words.
 *
 * Neither `a` nor `b` is modified. Special cases:
 * 1. both bows are empty (or all-zero) — distance is 0;
 * 2. exactly one bow is empty (or all-zero) — distance is 1.
 *
 * @method bow.cosine
 * @param {object} a the first set of bows i.e word (i.e. key) and its frequency
 * (i.e. value) pairs.
 * @param {object} b the second set of bows.
 * @return {number} cosine distance between `a` and `b`.
 *
 * @example
 * // bow for "the dog chased the cat"
 * var a = { the: 2, dog: 1, chased: 1, cat: 1 };
 * // bow  for "the cat chased the mouse"
 * var b = { the: 2, cat: 1, chased: 1, mouse: 1 };
 * cosine( a, b );
 * // -> 0.14285714285714302
 * // Note the bow could have been created directly by
 * // using "tokens.bow()" from the "wink-nlp-utils".
 */
var cosine = function ( a, b ) {
  // `ab` & `ba` are working copies so that the caller's `a` & `b` are never
  // modified. They are prototype-less to keep words such as "constructor" or
  // "toString" from colliding with inherited properties.
  // Copy of `a`, extended with every word of `b` that is missing in `a` as 0.
  var ab = Object.create( null );
  // Copy of `b`, extended with every word of `a` that is missing in `b` as 0.
  var ba = Object.create( null );
  var sa2 = 0,  // sum of ai^2
     saxb = 0, // sum of ai x bi
     sb2 = 0,  // sum of bi^2
     va, vb;  // value of ai and bi
  var similarity;
  var w; // a word!

  // Fill up `ab` and `ba` so that both end up with the same set of words.
  for ( w in a ) { // eslint-disable-line guard-for-in
    ab[ w ] = a[ w ];
    ba[ w ] = 0;
  }
  for ( w in b ) { // eslint-disable-line guard-for-in
    ba[ w ] = b[ w ];
    ab[ w ] = ab[ w ] || 0;
  }

  // `ab` & `ba` are now two frequency vectors over a common vocabulary. There
  // is no need to materialize them as arrays; the three sums required for the
  // cosine are accumulated in a single pass. Iterating over `ba` would work
  // equally well, as both have the same words now.
  for ( w in ab ) { // eslint-disable-line guard-for-in
    va = ab[ w ];
    vb = ba[ w ];
    sa2 += va * va;
    sb2 += vb * vb;
    saxb += va * vb;
  }

  // A zero-length vector would lead to `0/0` i.e. `NaN`; handle it explicitly:
  // two empty bows are treated as identical (distance 0), whereas an empty bow
  // against a non-empty one is treated as completely different (distance 1).
  if ( !sa2 || !sb2 ) {
    return ( !sa2 && !sb2 ) ? 0 : 1;
  }

  similarity = saxb / ( Math.sqrt( sa2 ) * Math.sqrt( sb2 ) );
  // Floating point rounding of the square roots may push the similarity of
  // (near) parallel vectors marginally above 1 — e.g. `sqrt(3) * sqrt(3)` is
  // `2.9999999999999996` — which would yield a tiny negative distance. Cap
  // the similarity at 1 so that the distance never drops below 0.
  return 1 - Math.min( 1, similarity );
}; // cosine()

// Export `cosine` as this module's value via CommonJS `module.exports`, so
// that requiring this file yields the function directly.
module.exports = cosine;