streaming-freq-table.js

//     wink-statistics
//     Fast and Numerically Stable Statistical Analysis Utilities.
//
//     Copyright (C) GRAYPE Systems Private Limited
//
//     This file is part of “wink-statistics”.
//
//     Permission is hereby granted, free of charge, to any person obtaining a
//     copy of this software and associated documentation files (the "Software"),
//     to deal in the Software without restriction, including without limitation
//     the rights to use, copy, modify, merge, publish, distribute, sublicense,
//     and/or sell copies of the Software, and to permit persons to whom the
//     Software is furnished to do so, subject to the following conditions:
//
//     The above copyright notice and this permission notice shall be included
//     in all copies or substantial portions of the Software.
//
//     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
//     OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
//     THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
//     FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
//     DEALINGS IN THE SOFTWARE.

// ## streaming

// Load wink helpers for object to array conversion & sorting.
var helpers = require( 'wink-helpers' );
var getValidFD = require( './get-valid-fd.js' );

// ### freqTable
/**
 *
 * Frequency table is built incrementally with arrival of each value from the
 * stream of data.
 *
 * The [`build()`](https://winkjs.org/wink-statistics/Stream.html#build) requires
 * a single argument, which could be either a string or numeric value.
 *
 * The [`result()`](https://winkjs.org/wink-statistics/Stream.html#result) returns
 * an object containing the frequency `table` sorted in descending order of
 * category frequency, along with table `size`, `sum` of frequencies,
 * `x2` — chi-squared statistic, `df` — degree of freedom, and the
 * `entropy`.
 *
 * The `x2` along with the `df` can be used to test the hypothesis, "the distribution is uniform". The
 * `percentage` in `table` represents %age of a category share in the `sum`; and `expected` count
 * assuming uniform distribution.
 *
 * @memberof streaming#
 * @return {Stream} Object containing methods such as `build()`, `result()` & `reset()`.
 * @example
 * var ft = freqTable();
 * ft.build( 'Tea' );
 * ft.build( 'Tea' );
 * ft.build( 'Tea' );
 * ft.build( 'Pepsi' );
 * ft.build( 'Pepsi' );
 * ft.build( 'Gin' );
 * ft.build( 'Coke' );
 * ft.build( 'Coke' );
 * ft.value();
 * // returns { Tea: 3, Pepsi: 2, Gin: 1, Coke: 2 }
 * ft.result();
 * // returns {
 * //   table: [
 * //     { category: 'Tea', observed: 3, percentage: 37.5, expected: 2 },
 * //     { category: 'Pepsi', observed: 2, percentage: 25, expected: 2 },
 * //     { category: 'Coke', observed: 2, percentage: 25, expected: 2 },
 * //     { category: 'Gin', observed: 1, percentage: 12.5, expected: 2 }
 * //   ],
 * //   size: 4,
 * //   sum: 8,
 * //   x2: 1,
 * //   df: 3,
 * //   entropy: 1.9056
 * // }
 */
var freqTable = function () {
  var obj = Object.create( null );
  var methods = Object.create( null );
  var sum = 0;

  methods.build = function ( x ) {
    obj[ x ] = 1 + ( obj[ x ] || 0 );
    sum += 1;
    return undefined;
  }; // compute()

  methods.value = function () {
    return obj;
  }; // value()

  methods.result = function ( fractionDigits ) {
    var fd = getValidFD( fractionDigits );
    var t = helpers.object.table( obj );
    var imax = t.length;
    var table = new Array( imax );
    var expectedVal = sum / imax;
    var x2 = 0;
    var entropy = 0;
    var p;
    var diff;
    var ft = Object.create( null );

    t.sort( helpers.array.descendingOnValue );
    for ( var i = 0;  i < imax; i += 1 ) {
      table[ i ] = Object.create( null );
      table[ i ].category = t[ i ][ 0 ];
      table[ i ].observed = t[ i ][ 1 ];
      p = t[ i ][ 1 ] / sum;
      table[ i ].percentage = +( p * 100 ).toFixed( fd );
      table[ i ].expected = +expectedVal.toFixed( fd );
      diff = ( t[ i ][ 1 ] - expectedVal );
      x2 += ( diff * ( diff / expectedVal ) );
      entropy += -p * Math.log2( p );
    }

    ft.table = table;
    ft.size = imax;
    ft.sum = sum;
    ft.x2 = +x2.toFixed( fd );
    ft.df = ( imax - 1 );
    ft.entropy = +entropy.toFixed( fd );

    return ft;
  }; // result()

  methods.reset = function () {
    obj = Object.create( null );
    sum = 0;
  }; // reset()

  methods.compute = methods.build;
  return methods;
}; // freqTable()

module.exports = freqTable;