// wink-tokenizer
// Multilingual tokenizer that automatically tags each token with its type.
//
// Copyright (C) GRAYPE Systems Private Limited
//
// This file is part of “wink-tokenizer”.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
var emojiRegex = require( 'emoji-regex' );
var contractions = require( './eng-contractions.js' );
// Whitespace runs; used for the final "alien" split when no regexes remain.
var rgxSpaces = /\s+/g;
// Ordinals only for Latin like 1st, 2nd or 12th or 33rd.
var rgxOrdinalL1 = /1\dth|[04-9]th|1st|2nd|3rd|[02-9]1st|[02-9]2nd|[02-9]3rd|[02-9][04-9]th|\d+\d[04-9]th|\d+\d1st|\d+\d2nd|\d+\d3rd/g;
// Apart from detecting pure integers or decimals, also detect numbers containing
// `. - / ,` so that dates, ip addresses, fractions and things like codes or part
// numbers are also detected as numbers only. These regexes will therefore detect
// 8.8.8.8 or 12-12-1924 or 1,1,1,1.00 or 1/4 or 1/4/66/777 as numbers.
// Latin-1 Numbers.
var rgxNumberL1 = /\d+\/\d+|\d(?:[\.,-\/]?\d)*(?:\.\d+)?/g;
// Devanagari Numbers.
var rgxNumberDV = /[\u0966-\u096F]+\/[\u0966-\u096F]+|[\u0966-\u096F](?:[\.,-\/]?[\u0966-\u096F])*(?:\.[\u0966-\u096F]+)?/g;
// Mentions as in twitter/github, e.g. @superman.
var rgxMention = /@\w+/g;
// Latin-1 Hashtags.
// Include entire Latin-1 script and not just English alphas.
var rgxHashtagL1 = /#[a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF_][a-z0-9\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF_]*/gi;
// Devanagari Hashtags.
var rgxHashtagDV = /#[\u0900-\u0963\u0970-\u097F_][\u0900-\u0963\u0970-\u097F\u0966-\u096F0-9_]*/gi;
// EMail is EN character set.
var rgxEmail = /[-!#$%&'*+\/=?^\w{|}~](?:\.?[-!#$%&'*+\/=?^\w`{|}~])*@[a-z0-9](?:-?\.?[a-z0-9])*(?:\.[a-z](?:-?[a-z0-9])*)+/gi;
// Bitcoin, Ruble, Indian Rupee, Other Rupee, Dollar, Pound, Yen, Euro, Won.
var rgxCurrency = /[₿₽₹₨$£¥€₩]/g;
// These include both the punctuations: Latin-1 & Devanagari (danda/double danda).
var rgxPunctuation = /[’'‘’`“”"\[\]\(\){}…,\.!;\?\-:\u0964\u0965]/g;
// Anything between a pair of straight double quotes, quotes included.
var rgxQuotedPhrase = /"[^"]*"/g;
// NOTE: URL will support only EN character set for now.
var rgxURL = /(?:https?:\/\/)(?:[\da-z\.-]+)\.(?:[a-z\.]{2,6})(?:[\/\w\.\-\?#=]*)*\/?/gi;
var rgxEmoji = emojiRegex();
// Common ASCII emoticons such as :-) :D ;) or <3.
var rgxEmoticon = /:-?[dps\*\/\[\]{}\(\)]|;-?[/(/)d]|<3/gi;
// Times such as 4pm, 16:00 hours or 10:30 a.m.
var rgxTime = /(?:\d|[01]\d|2[0-3]):?(?:[0-5][0-9])?\s?(?:[ap]\.?m\.?|hours|hrs)/gi;
// Include [Latin-1 Supplement Unicode Block](https://en.wikipedia.org/wiki/Latin-1_Supplement_(Unicode_block))
var rgxWordL1 = /[a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF][a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF']*/gi;
// Define [Devanagari Unicode Block](https://unicode.org/charts/PDF/U0900.pdf)
var rgxWordDV = /[\u0900-\u094F\u0951-\u0963\u0970-\u097F]+/gi;
// Symbols go here; including Om (\u0950).
var rgxSymbol = /[\u0950~@#%\^\+=\*\|\/<>&]/g;
// For detecting if the word is a potential contraction.
var rgxContraction = /'/;
// Singular & Plural possessive, e.g. "john's" and "cats'".
var rgxPosSingular = /([a-z]+)('s)$/i;
var rgxPosPlural = /([a-z]+s)(')$/i;
// Regexes and their categories; used for tokenizing via match/split. The
// sequence is *critical* for correct tokenization.
// Regexes and their categories; used for tokenizing via match/split. The
// sequence is *critical* for correct tokenization: earlier entries claim
// their matches first (e.g. quoted phrases before URLs, ordinals before
// plain numbers, words before punctuation). Categories appearing twice
// (hashtag, number, word) cover the Latin-1 and Devanagari scripts.
var rgxsMaster = [
{ regex: rgxQuotedPhrase, category: 'quoted_phrase' },
{ regex: rgxURL, category: 'url' },
{ regex: rgxEmail, category: 'email' },
{ regex: rgxMention, category: 'mention' },
{ regex: rgxHashtagL1, category: 'hashtag' },
{ regex: rgxHashtagDV, category: 'hashtag' },
{ regex: rgxEmoji, category: 'emoji' },
{ regex: rgxEmoticon, category: 'emoticon' },
{ regex: rgxTime, category: 'time' },
{ regex: rgxOrdinalL1, category: 'ordinal' },
{ regex: rgxNumberL1, category: 'number' },
{ regex: rgxNumberDV, category: 'number' },
{ regex: rgxCurrency, category: 'currency' },
{ regex: rgxWordL1, category: 'word' },
{ regex: rgxWordDV, category: 'word' },
{ regex: rgxPunctuation, category: 'punctuation' },
{ regex: rgxSymbol, category: 'symbol' }
];
// Used to generate finger print from the tokens.
// NOTE: this variable is being reset in `defineConfig()`.
// Used to generate finger print from the tokens: maps a token's tag to its
// one-character code. Tags absent here (punctuation, symbol) contribute the
// token's own value to the finger print instead — see `getTokensFP()`.
// NOTE: this variable is being reset in `defineConfig()`.
var fingerPrintCodes = {
emoticon: 'c',
email: 'e',
emoji: 'j',
hashtag: 'h',
mention: 'm',
number: 'n',
ordinal: 'o',
quoted_phrase: 'q', // eslint-disable-line camelcase
currency: 'r',
// symbol: 's',
time: 't',
url: 'u',
word: 'w',
alien: 'z'
};
// ### tokenizer
/**
*
* Creates an instance of {@link Tokenizer}.
*
* @return {Tokenizer} object containing a set of API methods for tokenizing a sentence
* and defining configuration, plugin etc.
* @example
* // Load wink tokenizer.
* var tokenizer = require( 'wink-tokenizer' );
* // Create your instance of wink tokenizer.
* var myTokenizer = tokenizer();
*/
var tokenizer = function () {
// Default configuration: most comprehensive tokenization. Note `slice( 0 )`
// makes a *shallow* copy — only the array is new; the regex/category entries
// remain shared with `rgxsMaster`, which is fine as they are never mutated.
var rgxs = rgxsMaster.slice( 0 );
// The result of last call to `tokenize()` is retained here.
var finalTokens = [];
// Returned!
/**
 * @classdesc Tokenizer class
 * @class Tokenizer
 * @hideconstructor
 */
var methods = Object.create( null );
// ### manageContraction
/**
*
* Splits a contraction into words by first trying a lookup in the standard
* `contractions`; if the lookup fails, it checks for possessive in `'s` or
* `s'` forms and separates the possessive part from the word. Otherwise the
* contraction is treated as a normal word and no splitting occurs.
*
* @param {string} word that could be a potential contraction.
* @param {object[]} tokens where the outcome is pushed.
* @return {object[]} updated tokens according to the `word.`
* @private
*/
var manageContraction = function ( word, tokens ) {
  var entry = contractions[ word ];
  if ( entry !== undefined ) {
    // Known contraction — emit its pre-split tokens; clone each one so the
    // lookup table is never shared with (or mutated via) the output.
    tokens.push( Object.assign( {}, entry[ 0 ] ) );
    tokens.push( Object.assign( {}, entry[ 1 ] ) );
    if ( entry[ 2 ] ) tokens.push( Object.assign( {}, entry[ 2 ] ) );
    return tokens;
  }
  // Not in the table — try singular ("john's") and then plural ("cats'")
  // possessive forms.
  var parts = word.match( rgxPosSingular ) || word.match( rgxPosPlural );
  if ( parts ) {
    tokens.push( { value: parts[ 1 ], tag: 'word' } );
    tokens.push( { value: parts[ 2 ], tag: 'word' } );
  } else {
    // Neither contraction nor possessive (e.g. o'clock) — keep it whole.
    tokens.push( { value: word, tag: 'word' } );
  }
  return tokens;
}; // manageContraction()
// ### tokenizeTextUnit
/**
*
* Attempts to tokenize the input `text` using the `rgxSplit`. The tokenization
* is carried out by combining the regex matches and splits in the right sequence.
* The matches are the *real tokens*, whereas splits are text units that are
* tokenized in later rounds! The real tokens (i.e. matches) are pushed as
* `object` and splits as `string`.
*
* @param {string} text unit that is to be tokenized.
* @param {object} rgxSplit object containing the regex and its category.
* @return {array} of tokens.
* @private
*/
var tokenizeTextUnit = function ( text, rgxSplit ) {
  // Every regex match is a *real* token carrying the category as its tag.
  var tag = rgxSplit.category;
  var matched = text.match( rgxSplit.regex ) || [];
  // The split remainders are the "balance" — returned as plain strings so
  // that later rounds can tokenize them further.
  var remainders = text.split( rgxSplit.regex );
  var tokens = [];
  // Interleave in original text order: [ balance0, match0, balance1, ... ].
  // Note splitting yields exactly one remainder per match plus one, so the
  // remainder index doubles as the match index.
  remainders.forEach( function ( part, idx ) {
    var trimmed = part.trim();
    if ( trimmed ) tokens.push( trimmed );
    if ( idx >= matched.length ) return;
    var match = matched[ idx ];
    if ( tag === 'word' && rgxContraction.test( match ) ) {
      // A word containing an apostrophe may be a contraction — delegate.
      tokens = manageContraction( match, tokens );
    } else {
      tokens.push( { value: match, tag: tag } );
    }
  } );
  return tokens;
}; // tokenizeTextUnit()
// ### tokenizeTextRecursively
/**
*
* Tokenizes the input text recursively using the array of `regexes` and then
* the `tokenizeTextUnit()` function. If (or whenever) the `regexes` becomes
* empty, it simply splits the text on non-word characters instead of using
* the `tokenizeTextUnit()` function.
*
* @param {string} text unit that is to be tokenized.
* @param {object} regexes object containing the regex and its category.
* @return {undefined} nothing!
* @private
*/
var tokenizeTextRecursively = function ( text, regexes ) {
  var sentence = text.trim();
  var tokens = [];
  var i, imax;
  if ( !regexes.length ) {
    // No regex left, split on `spaces` and tag every token as **alien**.
    // FIX: split the *trimmed* `sentence` (not raw `text`) and skip empty
    // fragments — previously leading/trailing whitespace produced tokens
    // with an empty string as `value`.
    sentence.split( rgxSpaces ).forEach( function ( tkn ) {
      var value = tkn.trim();
      if ( value ) finalTokens.push( { value: value, tag: 'alien' } );
    } );
    return;
  }
  var rgx = regexes[ 0 ];
  tokens = tokenizeTextUnit( sentence, rgx );
  for ( i = 0, imax = tokens.length; i < imax; i += 1 ) {
    if ( typeof tokens[ i ] === 'string' ) {
      // Strings become candidates for further tokenization with the
      // remaining regexes.
      tokenizeTextRecursively( tokens[ i ], regexes.slice( 1 ) );
    } else {
      finalTokens.push( tokens[ i ] );
    }
  }
}; // tokenizeTextRecursively()
// ### defineConfig
/**
*
* Defines the configuration in terms of the types of token that will be
* extracted by [`tokenize()`](#tokenize) method. Note by default, all types
* of tokens will be detected and tagged automatically.
*
* @method Tokenizer#defineConfig
* @param {object} config It defines 0 or more properties from the list of
* **14** properties. A true value for a property ensures tokenization
* for that type of text; whereas false value will mean that the tokenization of that
* type of text will not be attempted. It also **resets** the effect of any previous
* call(s) to the [`addRegex()`](#addregex) API.
*
* *An empty config object is equivalent to splitting on spaces. Whatever tokens
* are created like this are tagged as **alien** and **`z`** is the
* [finger print](#gettokensfp) code of this token type.*
*
* The table below gives the name of each property and it's description including
* examples. The character with in paranthesis is the [finger print](#gettokensfp) code for the
* token of that type.
* @param {boolean} [config.currency=true] such as **$** or **£** symbols (**`r`**)
* @param {boolean} [config.email=true] for example **john@acme.com** or **superman1@gmail.com** (**`e`**)
* @param {boolean} [config.emoji=true] any standard unicode emojis e.g. 😊 or 😂 or 🎉 (**`j`**)
* @param {boolean} [config.emoticon=true] common emoticons such as **`:-)`** or **`:D`** (**`c`**)
* @param {boolean} [config.hashtag=true] hash tags such as **`#happy`** or **`#followme`** (**`h`**)
* @param {boolean} [config.number=true] any integer, decimal number, fractions such as **19**, **2.718**
* or **1/4** and numerals containing "**`, - / .`**", for example 12-12-1924 (**`n`**)
* @param {boolean} [config.ordinal=true] ordinals like **1st**, **2nd**, **3rd**, **4th** or **12th** or **91st** (**`o`**)
* @param {boolean} [config.punctuation=true] common punctuation such as **`?`** or **`,`**
* ( token becomes fingerprint )
* @param {boolean} [config.quoted_phrase=false] any **"quoted text"** in the sentence. _Note: its default value is **false**._ (**`q`**)
* @param {boolean} [config.symbol=true] for example **`~`** or **`+`** or **`&`** or **`%`** or **`/`** ( token becomes fingerprint )
* @param {boolean} [config.time=true] common representation of time such as **4pm** or **16:00 hours** (**`t`**)
* @param {boolean} [config.mention=true] **@mention** as in github or twitter (**`m`**)
* @param {boolean} [config.url=true] URL such as **https://github.com** (**`u`**)
* @param {boolean} [config.word=true] word such as **faster** or **résumé** or **prévenir** (**`w`**)
* @return {number} number of properties set to true from the list of above 14.
* @example
* // Do not tokenize & tag @mentions.
* var myTokenizer.defineConfig( { mention: false } );
* // -> 13
* // Only tokenize words as defined above.
* var myTokenizer.defineConfig( {} );
* // -> 0
*/
var defineConfig = function ( config ) {
  // FIX: `typeof null` is also 'object'; guard it explicitly, otherwise
  // `Object.keys( null )` throws a TypeError. A `null` config is treated
  // exactly like an empty one — split on spaces only, return 0.
  if ( config !== null && typeof config === 'object' && Object.keys( config ).length ) {
    rgxs = rgxsMaster.filter( function ( rgx ) {
      // Config for the Category of `rgx`.
      var cc = config[ rgx.category ];
      // Means `undefined` & `null` values are taken as true; otherwise
      // standard **truthy** and **falsy** interpretation applies!!
      return ( cc === undefined || cc === null || !!cc );
    } );
  } else rgxs = [];
  // Count normalized length i.e. ignore multi-script entries (hashtag,
  // number & word each have 2 regexes but count as one category).
  const uniqueCats = Object.create( null );
  rgxs.forEach( function ( rgx ) {
    uniqueCats[ rgx.category ] = true;
  } );
  // Reset the `fingerPrintCodes` variable, discarding any tags added via
  // `addTag()`/`addRegex()`.
  fingerPrintCodes = {
    emoticon: 'c',
    email: 'e',
    emoji: 'j',
    hashtag: 'h',
    mention: 'm',
    number: 'n',
    ordinal: 'o',
    quoted_phrase: 'q', // eslint-disable-line camelcase
    currency: 'r',
    // symbol: 's',
    time: 't',
    url: 'u',
    word: 'w',
    alien: 'z'
  };
  return ( ( Object.keys( uniqueCats ) ).length );
}; // defineConfig()
// ### tokenize
/**
*
* Tokenizes the input `sentence` using the configuration specified via
* [`defineConfig()`](#defineconfig).
* Common contractions and possessive nouns are split into 2 separate tokens;
* for example **I'll** splits as `'I'` and `'\'ll'` or **won't** splits as
* `'wo'` and `'n\'t'`.
*
* @method Tokenizer#tokenize
* @param {string} sentence the input sentence.
* @return {object[]} of tokens; each one of them is an object with 2-keys viz.
* `value` and its `tag` identifying the type of the token.
* @example
* var s = 'For detailed API docs, check out http://winkjs.org/wink-regression-tree/ URL!';
* myTokenizer.tokenize( s );
* // -> [ { value: 'For', tag: 'word' },
* // { value: 'detailed', tag: 'word' },
* // { value: 'API', tag: 'word' },
* // { value: 'docs', tag: 'word' },
* // { value: ',', tag: 'punctuation' },
* // { value: 'check', tag: 'word' },
* // { value: 'out', tag: 'word' },
* // { value: 'http://winkjs.org/wink-regression-tree/', tag: 'url' },
* // { value: 'URL', tag: 'word' },
* // { value: '!', tag: 'punctuation' } ]
*/
var tokenize = function ( sentence ) {
  // Discard the outcome of any previous call — `getTokensFP()` always
  // reflects the most recent tokenization only.
  finalTokens = [];
  // The recursive pass fills `finalTokens` as a side effect.
  tokenizeTextRecursively( sentence, rgxs );
  return finalTokens;
}; // tokenize()
// ### getTokensFP
/**
*
* Returns the finger print of the tokens generated by the last call to
* [`tokenize()`](#tokenize). A finger print is a string created by sequentially
* joining the unique code of each token's type. Refer to table given under
* [`defineConfig()`](#defineconfig) for values of these codes.
*
* A finger print is extremely useful in spotting patterns present in the sentence
* using `regexes`, which is otherwise a complex and time consuming task.
*
* @method Tokenizer#getTokensFP
* @return {string} finger print of tokens generated by the last call to `tokenize()`.
* @example
* // Generate finger print of sentence given in the previous example
* // under tokenize().
* myTokenizer.getTokensFP();
* // -> 'wwww,wwuw!'
*/
var getTokensFP = function () {
  // Map each token's tag to its one-character code; tags without a code
  // (punctuation, symbol) contribute the token's value verbatim.
  return finalTokens.map( function ( token ) {
    var code = fingerPrintCodes[ token.tag ];
    return ( code ) ? code : token.value;
  } ).join( '' );
}; // getTokensFP()
// ### addTag
/**
 * Registers a new token tag along with its finger print code; used by
 * [`addRegex()`](#addregex) when a regex introduces a new token type and
 * may also be called directly. Note: [`defineConfig()`](#defineconfig)
 * resets the finger print codes and thereby removes added tags.
 *
 * @method Tokenizer#addTag
 * @param {string} name of the new tag; must not be an existing tag.
 * @param {string} fingerprintCode used in [`getTokensFP()`](#gettokensfp);
 * conventionally a single character, unique across tags.
 * @return {void} nothing!
 * @throws {Error} if `name` already exists or `fingerprintCode` is missing.
 */
var addTag = function (name, fingerprintCode) {
  if (fingerPrintCodes[name]) {
    throw new Error( 'Tag ' + name + ' already exists' );
  }
  // Guard against silently registering `undefined` as a code, which would
  // corrupt the finger print produced by `getTokensFP()`.
  if (!fingerprintCode) {
    throw new Error( 'Tag ' + name + ' requires a fingerprintCode' );
  }
  fingerPrintCodes[name] = fingerprintCode;
}; // addTag()
// ### addRegex
/**
* Adds a regex for parsing a new type of token. This regex can either be mapped
* to an existing tag or it allows creation of a new tag along with its finger print.
* The uniqueness of the [finger prints](#defineconfig) has to be ensured by the user.
*
* *The added regex(s) will supersede the internal parsing.*
*
* @method Tokenizer#addRegex
* @param {RegExp} regex the new regular expression.
* @param {string} tag tokens matching the `regex` will be assigned this tag.
* @param {string} [fingerprintCode=undefined] required if adding a new
* tag; ignored if using an existing tag.
* @return {void} nothing!
* @example
* // Adding a regex for an existing tag
* myTokenizer.addRegex( /\(oo\)/gi, 'emoticon' );
* myTokenizer.tokenize( '(oo) Hi!' )
* // -> [ { value: '(oo)', tag: 'emoticon' },
* // { value: 'Hi', tag: 'word' },
* // { value: '!', tag: 'punctuation' } ]
*
* // Adding a regex to parse a new token type
* myTokenizer.addRegex( /hello/gi, 'greeting', 'g' );
* myTokenizer.tokenize( 'hello, how are you?' );
* // -> [ { value: 'hello', tag: 'greeting' },
* // { value: ',', tag: 'punctuation' },
* // { value: 'how', tag: 'word' },
* // { value: 'are', tag: 'word' },
* // { value: 'you', tag: 'word' },
* // { value: '?', tag: 'punctuation' } ]
* // Notice how "hello" is now tagged as "greeting" and not as "word".
*
* // Using definConfig will reset the above!
* myTokenizer.defineConfig( { word: true } );
* myTokenizer.tokenize( 'hello, how are you?' );
* // -> [ { value: 'hello', tag: 'word' },
* // { value: ',', tag: 'punctuation' },
* // { value: 'how', tag: 'word' },
* // { value: 'are', tag: 'word' },
* // { value: 'you', tag: 'word' },
* // { value: '?', tag: 'punctuation' } ]
*/
var addRegex = function (regex, tag, fingerprintCode) {
  var isKnownTag = Boolean( fingerPrintCodes[tag] );
  if ( !isKnownTag ) {
    // Unknown tag: it can only be created if a finger print code came along.
    if ( !fingerprintCode ) {
      throw new Error( 'Tag ' + tag + ' doesn\'t exist; Provide a \'fingerprintCode\' to add it as a tag.' );
    }
    addTag(tag, fingerprintCode);
  }
  // Prepend so the added regex supersedes the internal parsing sequence.
  rgxs.unshift( { regex: regex, category: tag } );
}; // addRegex()
// Set quoted_phrase as false because mostly it is not required.
defineConfig( { quoted_phrase: false } ); // eslint-disable-line camelcase
// Expose the public API on the returned object.
methods.defineConfig = defineConfig;
methods.tokenize = tokenize;
methods.getTokensFP = getTokensFP;
methods.addTag = addTag;
methods.addRegex = addRegex;
return methods;
};
module.exports = tokenizer;