# Source code for nlp_architect.utils.embedding

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import os
import sys
from typing import List

import numpy as np

from gensim.models import FastText

from nlp_architect.utils.text import Vocabulary

logger = logging.getLogger(__name__)


def load_word_embeddings(file_path, vocab=None):
    """
    Loads a word embedding model text file into a word(str) to numpy vector dictionary

    Lines with fewer than 5 whitespace-separated fields (for example a
    "<count> <dim>" header line) are skipped. A line that begins with a space
    character is stored under the ``" "`` key, with every field of the line
    taken as a vector component; such a line is kept regardless of ``vocab``.

    Args:
        file_path (str): path to model file
        vocab (list of str): optional - vocabulary; when given, only words
            contained in it are loaded

    Returns:
        dict: a dictionary mapping each loaded word to its float32 numpy.ndarray vector
        int: detected word embedding vector size (None if no vector was loaded)
    """
    # The membership test runs once per line of the file; hashing a list/tuple
    # vocabulary once avoids a linear scan for every line.
    if isinstance(vocab, (list, tuple)):
        try:
            vocab = frozenset(vocab)
        except TypeError:
            # Unhashable entries: keep the sequence and its linear lookups.
            pass

    word_vectors = {}
    size = None
    with open(file_path, encoding="utf-8") as fp:
        for line in fp:
            line_fields = line.split()
            if len(line_fields) < 5:
                continue
            if line[0] == " ":
                # The token itself is a space, so split() dropped it and all
                # remaining fields belong to the vector.
                word, values = " ", line_fields
            elif vocab is None or line_fields[0] in vocab:
                word, values = line_fields[0], line_fields[1:]
            else:
                continue
            word_vectors[word] = np.asarray(values, dtype="float32")
            if size is None:
                size = len(values)
    return word_vectors, size


def fill_embedding_mat(src_mat, src_lex, emb_lex, emb_size):
    """
    Creates a new matrix from given matrix of int words using the embedding
    model provided.

    Entries of ``src_mat`` that are not greater than 0, ids missing from
    ``src_lex`` and words missing from ``emb_lex`` are left as zero vectors.
    Words are lower-cased before the embedding lookup.

    Args:
        src_mat (numpy.ndarray): source matrix (2D, indexed as [row][column])
        src_lex (dict): source matrix lexicon, mapping word id to word
        emb_lex (dict): embedding lexicon, mapping lower-cased word to vector
        emb_size (int): embedding vector size

    Returns:
        numpy.ndarray: matrix of shape (src_mat.shape[0], src_mat.shape[1], emb_size)
    """
    emb_mat = np.zeros((src_mat.shape[0], src_mat.shape[1], emb_size))
    for i, sen in enumerate(src_mat):
        for j, w in enumerate(sen):
            if not w > 0:
                continue
            word = src_lex.get(w)
            if word is None:
                # Without this guard str(None).lower() == "none" would fetch
                # the embedding of the real word "none" for unknown ids.
                continue
            w_emb = emb_lex.get(str(word).lower())
            if w_emb is not None:
                emb_mat[i][j] = w_emb
    return emb_mat


def get_embedding_matrix(
    embeddings: dict, vocab: Vocabulary, embedding_size: int = None, lowercase_only: bool = False
) -> np.ndarray:
    """
    Generate a matrix of word embeddings given a vocabulary

    Row ``wid`` of the result holds the vector of the word whose id in
    ``vocab.vocab`` is ``wid``; words without an embedding keep a zero row.

    Args:
        embeddings (dict): a dictionary of embedding vectors; the vector length
            is taken from its first value, so it must not be empty
        vocab (Vocabulary): a Vocabulary
        embedding_size (int): custom number of rows for the matrix; when not
            given (or 0) the matrix has ``len(vocab)`` rows
        lowercase_only (bool): when True only the lower-cased form of each word
            is looked up; otherwise the exact form is tried first and the
            lower-cased form is used as a fallback

    Returns:
        a 2D numpy matrix of lexicon embeddings
    """
    emb_size = len(next(iter(embeddings.values())))
    rows = embedding_size if embedding_size else len(vocab)
    mat = np.zeros((rows, emb_size))
    for word, wid in vocab.vocab.items():
        # Exact-case lookup is skipped entirely in lowercase-only mode.
        vec = None if lowercase_only else embeddings.get(word, None)
        if vec is None:
            vec = embeddings.get(word.lower(), None)
        if vec is not None:
            mat[wid] = vec
    return mat


def load_embedding_file(filename: str, dim: int = None) -> dict:
    """Load a word embedding file

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are parsed as floats into its vector. Blank
    lines are skipped.

    Args:
        filename (str): path to embedding file
        dim (int): currently unused; kept for backward compatibility

    Returns:
        dict: dictionary with embedding vectors

    Raises:
        ValueError: if a vector component cannot be parsed as a float
    """
    if filename is not None and os.path.exists(filename):
        logger.info("Loading external word embeddings from %s", filename)
    embedding_dict = {}
    with open(filename, encoding="utf-8") as fp:
        for line in fp:
            split_line = line.split()
            if not split_line:
                # A blank line has no word field; indexing it would raise IndexError.
                continue
            word = split_line[0]
            embedding_dict[word] = np.array([float(val) for val in split_line[1:]])
    return embedding_dict


# pylint: disable=not-context-manager
class ELMoEmbedderTFHUB(object):
    """ELMo embedder that runs the TensorFlow Hub ELMo module in its own graph/session.

    On construction the module at https://tfhub.dev/google/elmo/3 is loaded
    into a dedicated ``tf.Graph`` and a session is opened and kept on the
    instance for later ``get_vector`` calls.
    """

    def __init__(self):
        # tensorflow / tensorflow_hub are optional dependencies, so they are
        # imported lazily here instead of at module import time.
        try:
            import tensorflow as tf
            import tensorflow_hub as hub
        except (AttributeError, ImportError):
            logger.error(
                "tensorflow_hub is not installed, "
                + "please install nlp_architect with [all] package. "
                + "for example: pip install nlp_architect[all]"
            )
            # Exit with a non-zero status so the failure is visible to callers
            # of the process (a bare sys.exit() would report success).
            sys.exit(1)

        self.g = tf.Graph()

        with self.g.as_default():
            text_input = tf.compat.v1.placeholder(dtype=tf.string)
            text_input_size = tf.compat.v1.placeholder(dtype=tf.int32)
            print(
                "Loading Tensorflow hub ELMo model, "
                "might take a while on first load (model is downloaded from web)"
            )
            self.elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=False)
            self.inputs = {"tokens": text_input, "sequence_len": text_input_size}
            self.embedding = self.elmo(inputs=self.inputs, signature="tokens", as_dict=True)["elmo"]

            sess = tf.compat.v1.Session(graph=self.g)
            sess.run(tf.compat.v1.global_variables_initializer())
            sess.run(tf.compat.v1.tables_initializer())
            self.s = sess

    def get_vector(self, tokens):
        """Run the ELMo graph on a single tokenized sentence.

        Args:
            tokens (list of str): tokens of one sentence; they are fed as a
                batch of size one together with their count

        Returns:
            numpy.ndarray: the session output with the leading batch axis
            removed (presumably one vector per token - confirm against the
            TF Hub module documentation)
        """
        vec = self.s.run(
            self.embedding,
            feed_dict={self.inputs["tokens"]: [tokens], self.inputs["sequence_len"]: [len(tokens)]},
        )
        return np.squeeze(vec, axis=0)


class FasttextEmbeddingsModel(object):
    """Fasttext embedding trainer class

    Thin wrapper around a gensim ``FastText`` model, stored in ``self.model``.

    Args:
        size (int, optional): embedding size
        window (int, optional): The maximum distance between
            the current and predicted word within a sentence
        min_count (int, optional): forwarded to gensim as ``min_count``
        skipgram (bool, optional): forwarded to gensim as ``sg``

    """

    def __init__(self, size: int = 5, window: int = 3, min_count: int = 1, skipgram: bool = True):
        try:
            # gensim >= 4.0 renamed the ``size`` keyword to ``vector_size``.
            model = FastText(vector_size=size, window=window, min_count=min_count, sg=skipgram)
        except TypeError:
            # Older gensim releases only know the ``size`` keyword.
            model = FastText(size=size, window=window, min_count=min_count, sg=skipgram)
        self.model = model

    def train(self, texts: List[List[str]], epochs: int = 100):
        """Build the vocabulary from ``texts`` and train the model on them.

        Args:
            texts (List[List[str]]): list of tokenized sentences
            epochs (int, optional): number of epochs to train
        """
        self.model.build_vocab(texts)
        # The corpus is passed positionally because its keyword name differs
        # between gensim versions (``sentences`` vs ``corpus_iterable``).
        self.model.train(texts, total_examples=len(texts), epochs=epochs)

    def vec(self, word: str) -> np.ndarray:
        """return vector corresponding given word"""
        return self.model.wv[word]

    def __getitem__(self, item):
        """Shorthand for ``vec(item)``."""
        return self.vec(item)

    def save(self, path) -> None:
        """save model to path"""
        self.model.save(path)

    @classmethod
    def load(cls, path):
        """load model from path"""
        loaded_model = FastText.load(path)
        new_model = cls()
        new_model.model = loaded_model
        return new_model