# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import json
import logging
import sys
from gensim.models import FastText, Word2Vec, KeyedVectors
from gensim.models.word2vec import LineSentence
from gensim import utils
import nltk
from nltk.corpus import conll2000
from six import iteritems
from smart_open import open as smart_open
logger = logging.getLogger(__name__)
# pylint: disable-msg=too-many-instance-attributes
class NP2vec:
    """
    Initialize the np2vec model, train it, save it and load it.
    """

    def is_marked(self, s):
        """
        Check if a string is marked as an NP.

        Args:
            s (str): string to check

        Returns:
            bool: True if `s` ends with the mark character (e.g. "New_York_" with
            mark_char "_"), False otherwise.
        """
        return len(s) > 0 and s[-1] == self.mark_char

    # pylint: disable-msg=too-many-arguments
    # pylint: disable-msg=too-many-locals
    # pylint: disable-msg=too-many-branches
    def __init__(  # noqa: C901
        self,
        corpus,
        corpus_format="txt",
        mark_char="_",
        word_embedding_type="word2vec",
        sg=0,
        size=100,
        window=10,
        alpha=0.025,
        min_alpha=0.0001,
        min_count=5,
        sample=1e-5,
        workers=20,
        hs=0,
        negative=25,
        cbow_mean=1,
        iterations=15,
        min_n=3,
        max_n=6,
        word_ngrams=1,
        prune_non_np=True,
    ):
"""
Initialize np2vec model and train it.
Args:
corpus (str): path to the corpus.
corpus_format (str {json,txt,conll2000}): format of the input marked corpus; txt and json
formats are supported. For json format, the file should contain an iterable of
sentences. Each sentence is a list of terms (unicode strings) that will be used for
training.
mark_char (char): special character that marks NP's suffix.
word_embedding_type (str {word2vec,fasttext}): word embedding model type; word2vec and
fasttext are supported.
np2vec_model_file (str): path to the file where the trained np2vec model has to be
stored.
binary (bool): boolean indicating whether the model is stored in binary format; if
word_embedding_type is fasttext and word_ngrams is 1, binary should be set to True.
sg (int {0,1}): model training hyperparameter, skip-gram. Defines the training
algorithm. If 1, CBOW is used,otherwise, skip-gram is employed.
size (int): model training hyperparameter, size of the feature vectors.
window (int): model training hyperparameter, maximum distance between the current and
predicted word within a sentence.
alpha (float): model training hyperparameter. The initial learning rate.
min_alpha (float): model training hyperparameter. Learning rate will linearly drop to
`min_alpha` as training progresses.
min_count (int): model training hyperparameter, ignore all words with total frequency
lower than this.
sample (float): model training hyperparameter, threshold for configuring which
higher-frequency words are randomly downsampled, useful range is (0, 1e-5)
workers (int): model training hyperparameter, number of worker threads.
hs (int {0,1}): model training hyperparameter, hierarchical softmax. If set to 1,
hierarchical softmax will be used for model training. If set to 0, and `negative` is non-
zero, negative sampling will be used.
negative (int): model training hyperparameter, negative sampling. If > 0, negative
sampling will be used, the int for negative specifies how many "noise words" should be
drawn (usually between 5-20). If set to 0, no negative sampling is used.
cbow_mean (int {0,1}): model training hyperparameter. If 0, use the sum of the context
word vectors. If 1, use the mean, only applies when cbow is used.
iterations (int): model training hyperparameter, number of iterations.
min_n (int): fasttext training hyperparameter. Min length of char ngrams to be used
for training word representations.
max_n (int): fasttext training hyperparameter. Max length of char ngrams to be used for
training word representations. Set `max_n` to be lesser than `min_n` to avoid char
ngrams being used.
word_ngrams (int {0,1}): fasttext training hyperparameter. If 1, uses enrich word
vectors with subword (ngrams) information. If 0, this is equivalent to word2vec training.
prune_non_np (bool): indicates whether to prune non-NP's after training process.
"""
        self.mark_char = mark_char
        self.word_embedding_type = word_embedding_type
        self.sg = sg
        self.size = size
        self.window = window
        self.alpha = alpha
        self.min_alpha = min_alpha
        self.min_count = min_count
        self.sample = sample
        self.workers = workers
        self.hs = hs
        self.negative = negative
        self.cbow_mean = cbow_mean
        self.iter = iterations
        self.min_n = min_n
        self.max_n = max_n
        self.word_ngrams = word_ngrams
        self.prune_non_np = prune_non_np

        if corpus_format == "txt":
            self._sentences = LineSentence(corpus)
        elif corpus_format == "json":
            with open(corpus) as json_data:
                self._sentences = json.load(json_data)
        # pylint: disable-msg=too-many-nested-blocks
        elif corpus_format == "conll2000":
            try:
                self._sentences = list()
                for chunked_sent in conll2000.chunked_sents(corpus):
                    tokens = list()
                    for chunk in chunked_sent:
                        # pylint: disable-msg=protected-access
                        if hasattr(chunk, "_label") and chunk._label == "NP":
                            # concatenate the tokens of an NP chunk into a single marked
                            # term, e.g. "New York" -> "New_York_"
                            s = ""
                            for w in chunk:
                                s += w[0] + self.mark_char
                            tokens.append(s)
                        else:
                            if isinstance(chunk, nltk.Tree):
                                for w in chunk:
                                    tokens.append(w[0])
                            else:
                                tokens.append(chunk[0])
                    self._sentences.append(tokens)
            # pylint: disable-msg=broad-except
            except Exception:
                logger.error(
                    "Conll2000 dataset is missing. See downloading details in the README file."
                )
        else:
            logger.error("invalid corpus format: %s", corpus_format)
            sys.exit(0)
        if word_embedding_type == "fasttext" and word_ngrams == 1:
            # remove the marking character at the end for subword fasttext model training
            self._sentences = [
                [w[:-1] if self.is_marked(w) else w for w in sentence]
                for sentence in self._sentences
            ]

        logger.info("training np2vec model")
        self._train()
    def _train(self):
        """
        Train the np2vec model.
        """
        if self.word_embedding_type == "word2vec":
            self.model = Word2Vec(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter,
            )
        elif self.word_embedding_type == "fasttext":
            self.model = FastText(
                self._sentences,
                sg=self.sg,
                size=self.size,
                window=self.window,
                alpha=self.alpha,
                min_alpha=self.min_alpha,
                min_count=self.min_count,
                sample=self.sample,
                workers=self.workers,
                hs=self.hs,
                negative=self.negative,
                cbow_mean=self.cbow_mean,
                iter=self.iter,
                min_n=self.min_n,
                max_n=self.max_n,
                word_ngrams=self.word_ngrams,
            )
        else:
            logger.error("invalid word embedding type: %s", self.word_embedding_type)
            sys.exit(0)
    def save(self, np2vec_model_file="np2vec.model", binary=False, word2vec_format=True):
        """
        Save the np2vec model.

        Args:
            np2vec_model_file (str): path to the file where the trained np2vec model will be
                stored.
            binary (bool): boolean indicating whether to store the model in binary format; if
                word_embedding_type is fasttext and word_ngrams is 1, binary should be set to
                True.
            word2vec_format (bool): boolean indicating whether to save the model in the
                original word2vec format.
        """
        if self.word_embedding_type == "fasttext" and self.word_ngrams == 1:
            if not binary:
                logger.error(
                    "if word_embedding_type is fasttext and word_ngrams is 1, "
                    "binary should be set to True."
                )
                sys.exit(0)
            # not relevant to prune a fasttext subword model
            self.model.save(np2vec_model_file)
        else:
            # prune non-NP terms
            if self.prune_non_np:
                logger.info("pruning np2vec model")
                total_vec = 0
                vector_size = self.model.vector_size
                for word in self.model.wv.vocab.keys():
                    if self.is_marked(word) and len(word) > 1:
                        total_vec += 1
                logger.info(
                    "storing %sx%s projection weights for NP's into %s",
                    total_vec,
                    vector_size,
                    np2vec_model_file,
                )
                with smart_open(np2vec_model_file, "wb") as fout:
                    fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
                    # store NP vectors in sorted order: most frequent NP's at the top
                    for word, vocab in sorted(
                        iteritems(self.model.wv.vocab), key=lambda item: -item[1].count
                    ):
                        if self.is_marked(word) and len(word) > 1:  # discard empty marked NP's
                            embedding_vec = self.model.wv.syn0[vocab.index]
                            if binary:
                                fout.write(utils.to_utf8(word) + b" " + embedding_vec.tostring())
                            else:
                                fout.write(
                                    utils.to_utf8(
                                        "%s %s\n"
                                        % (word, " ".join("%f" % val for val in embedding_vec))
                                    )
                                )
                if not word2vec_format:
                    # pylint: disable=attribute-defined-outside-init
                    self.model = KeyedVectors.load_word2vec_format(
                        np2vec_model_file, binary=binary
                    )
            if not word2vec_format:
                self.model.save(np2vec_model_file)
    @classmethod
    def load(cls, np2vec_model_file, binary=False, word_ngrams=0, word2vec_format=True):
        """
        Load the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load.
            binary (bool): boolean indicating whether the np2vec model to load is in binary
                format.
            word_ngrams (int {0,1}): If 1, the np2vec model to load uses word vectors enriched
                with subword (ngram) information.
            word2vec_format (bool): boolean indicating whether the model to load has been
                stored in the original word2vec format.

        Returns:
            The loaded np2vec model.
        """
        if word_ngrams == 0:
            if word2vec_format:
                return KeyedVectors.load_word2vec_format(np2vec_model_file, binary=binary)
            return KeyedVectors.load(np2vec_model_file, mmap="r")
        if word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        logger.error("invalid value for 'word_ngrams'")
        return None
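# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the library API).
# It assumes a whitespace-tokenized text corpus at the hypothetical path
# "train_corpus.txt" in which every NP appears as a single token suffixed
# with the mark character, e.g. "New_York_". The paths and defaults below
# are placeholders, not values mandated by this module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # train a word2vec-based np2vec model on the marked corpus (default hyperparameters)
    np2vec = NP2vec(corpus="train_corpus.txt", corpus_format="txt", mark_char="_")

    # save the pruned NP vectors in the original word2vec text format ...
    np2vec.save("np2vec.model", binary=False, word2vec_format=True)

    # ... and load them back as gensim KeyedVectors for querying,
    # e.g. loaded_model.most_similar("New_York_") for NPs in the vocabulary
    loaded_model = NP2vec.load(
        "np2vec.model", binary=False, word_ngrams=0, word2vec_format=True
    )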