// wink-statistics
// Fast and Numerically Stable Statistical Analysis Utilities.
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of “wink-statistics”.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
// ## data
// Load percentile.
var percentile = require( './data-percentile.js' );
// Load accessor.
var value = require( './accessor.js' );
// Load mad.
var mad = require( './data-mad.js' );
// ### distribution
/**
*
* Internal function to compute distribution from `bin` and `binWidth`.
*
* @param {number} bins number of bins as computed by `histogram()`.
* @param {number} binWidth width of each bin.
* @param {array} sortedData sorted in ascending order of value.
* @param {object} rs robust stats containing `min`, `size`, etc.
* @param {number} precision of the data.
* @param {(string|number|function)} [accessor=undefined] required when elements of
* `x` are objects or arrays instead of numbers.
* For objects, use key (string) to access the value; in case of arrays, use
* index (number) to access the value; or it could be a function
* that extracts the value from the element passed to it.
* @returns {object} histogram conatining arrays `classes` and corresponding `frequencies`.
* Each element of `classes` array is an object having `min/mid/max` values.
* @private
*/
var distribution = function ( bins, binWidth, sortedData, rs, precision, accessor ) {
// Helpers.
var cutoff, i, k, limit, mid, min;
// Hold x axis and y axis values.
var x, y;
// Distribution object.
var dist = Object.create( null );
// Find distribution now.
x = new Array( bins );
y = new Array( bins );
cutoff = new Array( bins );
limit = +( rs.min + binWidth ).toFixed( precision );
min = +( rs.min ).toFixed( precision );
for ( i = 0; i < bins; i += 1 ) {
y[ i ] = 0;
mid = +( limit - ( binWidth / 2 ) ).toFixed( precision );
x[ i ] = { min: min, mid: mid, max: limit };
cutoff[ i ] = limit;
min = +( min + binWidth).toFixed( precision );
limit = +( limit + binWidth).toFixed( precision );
}
i = 0;
for ( k = 0; k < bins; k += 1 ) {
// > REVIEW: Make it faster by deploying binary search approach.
for ( ; ( ( i < rs.size ) && ( value( sortedData[ i ], accessor ) <= cutoff[ k ] ) ); i += 1 ) {
y[ k ] += 1;
}
}
dist.classes = x;
dist.frequencies = y;
return ( dist );
}; // distribution()
// ### histogram
/**
*
* Generates histogram using Freedman–Diaconis method.
* If both IQR and MAD are `0` then it automatically
* switches to Sturges' Rule while ensuring minimum of 5 bins.
* It attempts to reduce excessive sparsity of distribution,
* if any, by adjusting the number of bins using Sturges' Rule.
*
* @memberof data
* @param {array} sortedData sorted in ascending order of value.
* @param {number} [dataPrecision=0] typically the minumum number of
* decimal places observed in the `sortedData`.
* @param {(string|number|function)} [accessor=undefined] required when elements of
* `x` are objects or arrays instead of numbers.
* For objects, use key (string) to access the value; in case of arrays, use
* index (number) to access the value; or it could be a function
* that extracts the value from the element passed to it.
* @returns {object} conatining arrays `classes` and the corresponding `frequencies`.
* Each element of `classes` array is an object with values for `min/max (class intervals)`
* and `mid` point of a class. <br/><br/>In addition, the returned object
* contains useful statistics like `q1`, `q3`, `iqr`, `min`, `max`, and `range`.
* @example
* var data = [
* 12, 14, 14, 14, 16, 18, 20, 20, 21, 23, 27, 27, 27, 29, 31,
* 31, 32, 32, 34, 36, 40, 40, 40, 40, 40, 42, 51, 56, 60, 65
* ];
* histogram( data );
* // returns {
* // classes: [
* // { min: 12, mid: 19, max: 25 },
* // { min: 25, mid: 32, max: 38 },
* // { min: 38, mid: 45, max: 51 },
* // { min: 51, mid: 58, max: 64 },
* // { min: 64, mid: 71, max: 77 } ],
* // frequencies: [ 10, 10, 7, 2, 1 ],
* // q1: 20, q3: 40, iqr: 20, size: 30, min: 12, max: 65,range: 53
* // }
*/
var histogram = function ( sortedData, dataPrecision, accessor ) {
var rs = Object.create( null );
rs.q1 = percentile( sortedData, 0.25, accessor );
rs.q3 = percentile( sortedData, 0.75, accessor );
rs.iqr = ( rs.q3 - rs.q1 );
rs.size = sortedData.length;
rs.min = value( sortedData[ 0 ], accessor );
rs.max = value( sortedData[ rs.size - 1 ], accessor );
rs.range = ( rs.max - rs.min );
// The histogram.
var histo;
// Number of bins.
var bins;
// Class interval or bin width.
var binWidth = rs.iqr;
// The `precision` is extremely important to get a quality histogram - in terms
// of number of classes and counting data points in a class interval.
var precision = Math.round( Math.abs( dataPrecision || 0 ) );
// Compute `bins` and `binWidth`.
if ( ( binWidth === 0 ) ) {
rs.mad = mad( sortedData, accessor );
binWidth = 2 * rs.mad;
}
if ( binWidth > 0 ) {
// Apply Freedman–Diaconis formula.
binWidth = 2 * binWidth * Math.pow( rs.size, -( 1 / 3 ) );
// Adjust `binWidth` according to the `precision`.
binWidth = +binWidth.toFixed( precision );
if ( binWidth === 0 ) binWidth = 1;
bins = Math.ceil( rs.range / binWidth );
histo = distribution( bins, binWidth, sortedData, rs, precision, accessor );
// Check how sparse is the distribution - # of 0s > 20% of the total frequencies.
// If yes then attempt its reduction by using the Sturges' Rule (as above).
if ( histo.frequencies.filter( function ( e ) { return ( e === 0 ); } ).length > histo.frequencies.length * 0.20 ) { // eslint-disable-line
// Sparse! Apply Sturge's Rule now.
bins = Math.max( Math.ceil( Math.log2( rs.size ) + 1 ), 5 );
binWidth = rs.range / bins;
binWidth = +binWidth.toFixed( precision );
binWidth = Math.max( binWidth, 1 );
bins = Math.ceil( rs.range / binWidth );
histo = distribution( bins, binWidth, sortedData, rs, precision, accessor );
}
} else {
// Nothing is working out, downgrade to Sturges' Rule, but ensure minimum 5 bins.
bins = Math.max( Math.ceil( Math.log2( rs.size ) + 1 ), 5 );
binWidth = rs.range / bins;
// Adjust `binWidth` according to `precision` and recompute everything.
binWidth = +binWidth.toFixed( precision );
binWidth = Math.max( binWidth, 1 );
bins = Math.max( Math.ceil( rs.range / binWidth ), 1 );
histo = distribution( bins, binWidth, sortedData, rs, precision, accessor );
}
return ( Object.assign( histo, rs ) );
}; // histogram()
module.exports = histogram;