// wink-pos-tagger
// English Part-of-speech (POS) tagger
//
// Copyright (C) 2017-19 GRAYPE Systems Private Limited
//
// This file is part of “wink-pos-tagger”.
//
// Permission is hereby granted, free of charge, to any person obtaining a
// copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included
// in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
var helpers = require( 'wink-helpers' );
var winkLexicon = require( 'wink-lexicon/src/lexicon.js' );
var unigramPOSTagger = require( './unigram-tagger.js' );
var applyContextRules = require( './rules-engine.js' );
var wl = require( 'wink-lemmatizer' );
var lemmatizeVBX = wl.lemmatizeVerb;
var lemmatizeNNX = wl.lemmatizeNoun;
var lemmatizeJJX = wl.lemmatizeAdjective;
// Load tokenizer, instantiate and get tokenize method; use default config.
var tokenize = require( 'wink-tokenizer' )().tokenize;
// Extract string normalization function from `wink-helpers`.
var normalize = helpers.string.normalize;
// Lemmas for the fragments produced when contractions are split (e.g. the
// `ca` and `n't` of "can't"). Keys are normalized token values; `lemmatize()`
// consults this table before falling back to the lemmatizer. The table has no
// prototype, so a lookup can never hit an inherited property.
// NOTE(review): `'m` maps to 'am' whereas `'re` and `ai` map to the base form
// 'be' — confirm whether this difference is intentional.
var lemmaExceptions = Object.assign( Object.create( null ), {
  'ai': 'be',
  'ca': 'can',
  'sha': 'shall',
  'wo': 'will',
  '\'ll': 'will',
  '\'ve': 'have',
  '\'m': 'am',
  '\'re': 'be',
  'n\'t': 'not',
  '\'d': 'would'
} );
// Needed for simple NNP transformation rule: `lemmatize()` treats a token as
// capitalized when the first character of its value lies between these bounds.
const capA = 'A';
const capZ = 'Z';
// Required in raw tokens tagging.
// `rgxNumber` accepts a fraction (`1/2`) or a digit run that may contain single
// `.`, `,`, `-` or `/` separators (`1,000`, `3.14`). The alternation is grouped
// so that BOTH alternatives are anchored at start and end; without the group
// `^` applied only to the fraction and `$` only to the digit run, which let
// tokens such as `abc123` or `1/2abc` be classified as numbers.
const rgxNumber = /^(?:\d+\/\d+|\d(?:[\.\,\-\/]?\d)*(?:\.\d+)?)$/;
// `rgxPunctuation` accepts a token made up entirely of punctuation characters.
const rgxPunctuation = /^[\’\'\‘\’\`\“\”\"\[\]\(\)\{\}\…\,\.\!\;\?\-\:]+$/;
// Used in tagging years: `lemmatize()` looks up a token's normalized value
// here and, on a hit, forces its pos to 'CD'. The table holds every decade
// from the 1910s to the 1990s in both the plain (`1990s`) and the `mid-`
// (`mid-1990s`) form. It is generated per decade so that the two forms cannot
// drift apart — the hand-written list had `mid-1920s` but lacked `1920s`.
const year = Object.create( null );
for ( let decade = 1910; decade <= 1990; decade += 10 ) {
  year[ decade + 's' ] = 'CD';
  year[ 'mid-' + decade + 's' ] = 'CD';
}
// ### posTagger
/**
*
* Creates an instance of {@link Tagger}.
*
* @return {Tagger} object containing set of API methods for pos-tagging.
* @example
* // Load wink pos tagger.
* var posTagger = require( 'wink-pos-tagger' );
* // Create your instance of wink pos tagger.
* var myTagger = posTagger();
*/
var posTagger = function ( ) {
/**
* @classdesc Tagger class
* @class Tagger
* @hideconstructor
*/
var methods = Object.create( null );
// ### updateLexicon
/**
 *
 * Merges the `word/pos` pairs of the input `lexicon` into the internal
 * lexicon. A word that is already present has its value replaced by the new
 * pos; a word that is absent is added. Note that the lexicon object loaded at
 * module level is modified in place, so the update is not private to one
 * tagger instance.
 *
 * @method Tagger#updateLexicon
 * @param {object} lexicon containing **`word/pos`** pairs to be added to or
 * replaced in the existing lexicon. The `pos` should be an array containing
 * pos tags, with the first one as the most frequently used POS. The `word` is
 * normalized before updating the internal lexicon.
 * @return {undefined} Nothing!
 * @throws {Error} if `lexicon` is not a valid JS object.
 * @example
 * myTagger.updateLexicon( { Obama: [ 'NNP' ] } );
 */
var updateLexicon = function ( lexicon ) {
  var word;
  if ( helpers.validate.isObject( lexicon ) ) {
    // Store every enumerable entry under its **normalized** word.
    for ( word in lexicon ) { // eslint-disable-line guard-for-in
      winkLexicon[ normalize( word ) ] = lexicon[ word ];
    }
    return;
  }
  throw new Error( 'wink-pos-tagger/updateLexicon: lexicon must be an object, instead found: ' + JSON.stringify( lexicon ) );
}; // updateLexicon()
// ### defineConfig
/**
 *
 * This API has no effect; it is retained only for backward compatibility.
 * Any argument passed is ignored. The tagger always adds the **normal** form
 * and attempts the **lemma**. Note, lemmas are added only for **nouns**
 * (excluding proper noun), **verbs** and **adjectives**.
 *
 * @method Tagger#defineConfig
 * @return {object} always as `{ lemma: true, normal: true }`; a new object is
 * returned on every call.
 * @example
 * // There will not be any effect:
 * myTagger.defineConfig( { lemma: false } );
 * // -> { lemma: true, normal: true }
 */
var defineConfig = function ( ) {
  // A literal creates a fresh object on each call, so callers can never
  // share (or mutate) one another's copy.
  return { lemma: true, normal: true };
}; // defineConfig()
// ### lemmatize
/**
 *
 * Adds a `lemma` to the tokens, in place. Along the way it also applies the
 * NNP transformation rule for capitalized nouns and adjectives and the CD
 * rule for the decades listed in the `year` table.
 *
 * For each token, in this order:
 * 1. if its normal form is in the `year` table, its pos becomes `CD`;
 * 2. if its normal form is a contraction exception, that lemma is used;
 * 3. otherwise the first letter of its pos selects the rule — adjectives
 *    (`J`), verbs (`V`), nouns (`N`) and modals (`M`) receive a lemma, every
 *    other pos is left without one.
 *
 * @method Tagger#lemmatize
 * @param {object[]} tokens to be lemmatized; each must already carry `value`,
 * `normal` and `pos`.
 * @return {object[]} the same `tokens` array, lemmatized.
 * @private
 */
var lemmatize = function ( tokens ) {
  for ( let k = 0, count = tokens.length; k < count; k += 1 ) {
    const token = tokens[ k ];
    const normal = token.normal;
    const firstChar = token.value[ 0 ];
    // True when the original (un-normalized) value starts with A-Z.
    const isCapitalized = ( firstChar >= capA ) && ( firstChar <= capZ );

    // CD rule for years.
    if ( year[ normal ] ) token.pos = 'CD';

    // Exceptions arising out of contractions take precedence over everything.
    const exception = lemmaExceptions[ normal ];
    if ( exception ) {
      token.lemma = exception;
      continue;
    }

    const pos = token.pos;
    const posClass = pos[ 0 ];
    // A two letter pos (JJ, NN, VB) marks the base form: the normal form is
    // used as the lemma without calling the lemmatizer.
    const isInflected = pos.length > 2;

    if ( posClass === 'J' || posClass === 'N' ) {
      if ( isCapitalized ) {
        // Capitalized adjective or noun: treat as a proper noun.
        token.lemma = normal;
        token.pos = 'NNP';
      } else if ( posClass === 'J' ) {
        token.lemma = ( isInflected ) ? lemmatizeJJX( normal ) : normal;
      } else {
        // No lemmatization of NNPs please!
        token.lemma = ( pos !== 'NNP' && isInflected ) ? lemmatizeNNX( normal ) : normal;
      }
    } else if ( posClass === 'V' ) {
      if ( !isInflected ) {
        token.lemma = normal;
      } else {
        // A verbal `'s` is lemmatized directly to 'be'.
        token.lemma = ( normal === '\'s' ) ? 'be' : lemmatizeVBX( normal );
      }
    } else if ( posClass === 'M' ) {
      token.lemma = lemmatizeVBX( normal );
    }
    // Any other pos: do nothing!
  }
  return tokens;
}; // lemmatize()
// ### tag
/**
 *
 * Tags the input **`tokens`** with their **pos**. It has another alias – **`tagTokens()`**.
 * The tokens are updated in place: every token receives its `normal` form and
 * `pos`, and a `lemma` where the lemmatization rules apply.
 *
 * *In order to pos tag a sentence directly, use
 * [`tagSentence`](http://winkjs.org/wink-pos-tagger/Tagger.html#tagSentence)
 * API instead.*
 *
 * @method Tagger#tag
 * @param {object[]} tokens to be pos tagged. They are array of objects and
 * must follow the [**`wink-tokenizer`**](http://winkjs.org/wink-tokenizer/)
 * standard.
 * @return {object[]} pos tagged `tokens` – the same array that was passed in.
 * @example
 * // Get `tokenizer` method from the instance of `wink-tokenizer`.
 * var tokenize = require( 'wink-tokenizer' )().tokenize;
 * // Tag the tokenized sentence.
 * myTagger.tag( tokenize( 'I ate the entire pizza as I was feeling hungry.' ) );
 * // -> [ { value: 'I', tag: 'word', normal: 'i', pos: 'PRP' },
 * //      { value: 'ate', tag: 'word', normal: 'ate', pos: 'VBD', lemma: 'eat' },
 * //      { value: 'the', tag: 'word', normal: 'the', pos: 'DT' },
 * //      { value: 'entire', tag: 'word', normal: 'entire', pos: 'JJ', lemma: 'entire' },
 * //      { value: 'pizza', tag: 'word', normal: 'pizza', pos: 'NN', lemma: 'pizza' },
 * //      { value: 'as', tag: 'word', normal: 'as', pos: 'IN' },
 * //      { value: 'I', tag: 'word', normal: 'i', pos: 'PRP' },
 * //      { value: 'was', tag: 'word', normal: 'was', pos: 'VBD', lemma: 'be' },
 * //      { value: 'feeling', tag: 'word', normal: 'feeling', pos: 'VBG', lemma: 'feel' },
 * //      { value: 'hungry', tag: 'word', normal: 'hungry', pos: 'JJ', lemma: 'hungry' },
 * //      { value: '.', tag: 'punctuation', normal: '.', pos: '.' } ]
 */
var tag = function ( tokens ) {
  // One entry per token: whatever the unigram tagger returned for it.
  const candidates = [];
  const total = tokens.length;
  let idx = 0;
  // Pass 1: normalize each token and run the unigram tagger on it.
  while ( idx < total ) {
    const token = tokens[ idx ];
    token.normal = normalize( token.value );
    candidates.push( unigramPOSTagger( token, winkLexicon ) );
    idx += 1;
  }
  // Pass 2: apply the context rules over the whole token sequence.
  applyContextRules( tokens, candidates );
  // Pass 3: lemmatize (this also applies the NNP and year/CD rules).
  lemmatize( tokens );
  return tokens;
}; // tag()
// ### tagRawTokens
/**
 *
 * Tags the **`raw tokens`** with their **pos**. Each raw string is first
 * wrapped into a token object and placed in one of the following 3-categories:
 * (a) number, or (b) punctuation, or (c) word – tested in that order, with
 * word as the fallback. The resulting tokens are then tagged via `tag()`.
 *
 * *In order to pos tag a sentence directly, use
 * [`tagSentence`](http://winkjs.org/wink-pos-tagger/Tagger.html#tagSentence)
 * API instead.*
 *
 * @method Tagger#tagRawTokens
 * @param {string[]} rawTokens to be pos tagged. They are simple array of string.
 * @return {object[]} pos tagged `tokens`.
 * @example
 * var rawTokens = [ 'I', 'ate', 'the', 'entire', 'pizza', 'as', 'I', 'was', 'feeling', 'hungry', '.' ];
 * // Tag the raw tokens.
 * myTagger.tagRawTokens( rawTokens );
 * // -> [ { value: 'I', tag: 'word', normal: 'i', pos: 'PRP' },
 * //      { value: 'ate', tag: 'word', normal: 'ate', pos: 'VBD', lemma: 'eat' },
 * //      { value: 'the', tag: 'word', normal: 'the', pos: 'DT' },
 * //      { value: 'entire', tag: 'word', normal: 'entire', pos: 'JJ', lemma: 'entire' },
 * //      { value: 'pizza', tag: 'word', normal: 'pizza', pos: 'NN', lemma: 'pizza' },
 * //      { value: 'as', tag: 'word', normal: 'as', pos: 'IN' },
 * //      { value: 'I', tag: 'word', normal: 'i', pos: 'PRP' },
 * //      { value: 'was', tag: 'word', normal: 'was', pos: 'VBD', lemma: 'be' },
 * //      { value: 'feeling', tag: 'word', normal: 'feeling', pos: 'VBG', lemma: 'feel' },
 * //      { value: 'hungry', tag: 'word', normal: 'hungry', pos: 'JJ', lemma: 'hungry' },
 * //      { value: '.', tag: 'punctuation', normal: '.', pos: '.' } ]
 */
var tagRawTokens = function ( rawTokens ) {
  // Raw strings converted into wink format tokens.
  const winkTokens = [];
  for ( let k = 0, count = rawTokens.length; k < count; k += 1 ) {
    const raw = rawTokens[ k ];
    // Word is the fallback category; number is tested before punctuation.
    let category = 'word';
    if ( rgxNumber.test( raw ) ) {
      category = 'number';
    } else if ( rgxPunctuation.test( raw ) ) {
      category = 'punctuation';
    }
    winkTokens.push( { value: raw, tag: category } );
  }
  return tag( winkTokens );
}; // tagRawTokens()
// ### tagSentence
/**
 *
 * Tags the input `sentence` with their **pos**: the sentence is tokenized
 * using `wink-tokenizer` (default configuration) and the resulting tokens are
 * passed to `tag()`.
 *
 * @method Tagger#tagSentence
 * @param {string} sentence to be pos tagged.
 * @return {object[]} pos tagged `tokens.`
 * @throws {Error} if `sentence` is not a valid string.
 * @example
 * myTagger.tagSentence( 'A bear just crossed the road.' );
 * // -> [ { value: 'A', tag: 'word', normal: 'a', pos: 'DT' },
 * //      { value: 'bear', tag: 'word', normal: 'bear', pos: 'NN', lemma: 'bear' },
 * //      { value: 'just', tag: 'word', normal: 'just', pos: 'RB' },
 * //      { value: 'crossed', tag: 'word', normal: 'crossed', pos: 'VBD', lemma: 'cross' },
 * //      { value: 'the', tag: 'word', normal: 'the', pos: 'DT' },
 * //      { value: 'road', tag: 'word', normal: 'road', pos: 'NN', lemma: 'road' },
 * //      { value: '.', tag: 'punctuation', normal: '.', pos: '.' } ]
 * //
 * //
 * myTagger.tagSentence( 'I will bear all the expenses.' );
 * // -> [ { value: 'I', tag: 'word', normal: 'i', pos: 'PRP' },
 * //      { value: 'will', tag: 'word', normal: 'will', pos: 'MD', lemma: 'will' },
 * //      { value: 'bear', tag: 'word', normal: 'bear', pos: 'VB', lemma: 'bear' },
 * //      { value: 'all', tag: 'word', normal: 'all', pos: 'PDT' },
 * //      { value: 'the', tag: 'word', normal: 'the', pos: 'DT' },
 * //      { value: 'expenses', tag: 'word', normal: 'expenses', pos: 'NNS', lemma: 'expense' },
 * //      { value: '.', tag: 'punctuation', normal: '.', pos: '.' } ]
 */
var tagSentence = function ( sentence ) {
  const kind = typeof sentence;
  // Only primitive strings are accepted; report the offending type otherwise.
  if ( kind === 'string' ) {
    return tag( tokenize( sentence ) );
  }
  throw new Error( 'wink-pos-tagger: input sentence must be a string, instead found: ' + kind );
}; // tagSentence()
// Assemble the public API on the prototype-less `methods` object and hand it
// back to the caller. `lemmatize` is deliberately not exposed.
methods.updateLexicon = updateLexicon;
methods.tag = tag;
// `tagTokens` is an alias: it is bound to the very same function as `tag`.
methods.tagTokens = tag;
methods.tagRawTokens = tagRawTokens;
methods.tagSentence = tagSentence;
// Retained for backward compatibility only; see `defineConfig`.
methods.defineConfig = defineConfig;
return methods;
}; // posTagger()
module.exports = posTagger;