Source code for nlp_architect.data.cdc_resources.embedding.embed_glove

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

import logging
import pickle

import numpy as np

from nlp_architect.common.cdc.mention_data import MentionDataLight

logger = logging.getLogger(__name__)


class GloveEmbedding(object):
    """GloVe lookup table built by parsing a GloVe text file at construction time.

    After construction the instance exposes:

    * ``word_to_ix`` -- dict mapping each word to its row index.
    * ``word_embeddings`` -- float array holding one row per parsed line.

    Args:
        glove_file: path of the GloVe text file to parse.
    """

    def __init__(self, glove_file):
        logger.info("Loading Glove Online Embedding module, This my take a while...")
        self.word_to_ix, self.word_embeddings = self.load_glove_for_vocab(glove_file)
        # This is the online loader; the message used to say "Offline".
        logger.info("Glove Online Embedding module lead successfully")

    @staticmethod
    def load_glove_for_vocab(glove_filename):
        """Parse a GloVe text file into a word index and an embedding matrix.

        Every non-blank line is split on single spaces: the first field is the
        word, the remaining fields are the vector components.

        Args:
            glove_filename: path of the GloVe text file (read as UTF-8).

        Returns:
            tuple: ``(word_to_ix, embeddings)`` where ``embeddings`` is a float
            array with one row per parsed line and ``word_to_ix`` maps a word
            to its row. If a word appears on several lines, the index of its
            last occurrence is kept.

        Raises:
            ValueError: if a component cannot be converted to ``float`` or the
                lines do not all carry the same number of components.
        """
        vocab = []
        embd = []
        # GloVe files are UTF-8; do not depend on the platform default encoding.
        with open(glove_filename, encoding="utf-8") as glove_file:
            for line in glove_file:
                line = line.strip()
                if not line:
                    # A blank line (typically at end of file) would otherwise add
                    # an empty word with an empty vector and make the rows ragged.
                    continue
                row = line.split(" ")
                vocab.append(row[0])
                embd.append(row[1:])

        embeddings = np.asarray(embd, dtype=float)
        word_to_ix = {word: i for i, word in enumerate(vocab)}
        return word_to_ix, embeddings
class GloveEmbeddingOffline(object):
    """GloVe lookup table restored from a pickled ``(word_to_ix, word_embeddings)`` pair.

    After construction the instance exposes:

    * ``word_to_ix`` -- mapping from word to index into ``word_embeddings``.
    * ``word_embeddings`` -- indexable collection of embedding vectors.

    Args:
        embed_resources: path of the pickle file holding the two objects.
    """

    # Size of the all-zero fallback vector when the loaded table does not
    # expose a two-dimensional ``shape``.
    _DEFAULT_DIM = 300

    def __init__(self, embed_resources):
        logger.info("Loading Glove Offline Embedding module")
        # NOTE(security): unpickling can execute arbitrary code. Only point
        # ``embed_resources`` at dump files that come from a trusted source.
        with open(embed_resources, "rb") as out:
            self.word_to_ix, self.word_embeddings = pickle.load(out, encoding="latin1")
        logger.info("Glove Offline Embedding module lead successfully")

    def _embedding_dim(self):
        """Return the vector size of the loaded table, or ``_DEFAULT_DIM`` if unknown."""
        shape = getattr(self.word_embeddings, "shape", ())
        if len(shape) == 2 and shape[1] > 0:
            return shape[1]
        return self._DEFAULT_DIM

    def get_feature_vector(self, mention: "MentionDataLight"):
        """Return the embedding of the mention head, falling back to its lemma.

        Args:
            mention: object providing ``mention_head`` and ``mention_head_lemma``.

        Returns:
            The stored embedding of the head word if it is in the vocabulary,
            otherwise the embedding of the head lemma, otherwise ``None``.
        """
        # The head form takes precedence over the lemma.
        for key in (mention.mention_head, mention.mention_head_lemma):
            if key in self.word_to_ix:
                return self.word_embeddings[self.word_to_ix[key]]
        return None

    def get_avrg_feature_vector(self, tokens_str):
        """Return the mean embedding of the in-vocabulary tokens of a string.

        Args:
            tokens_str: text that is split on whitespace into tokens.

        Returns:
            A ``float64`` vector holding the element-wise average of the
            embeddings of every token found in the vocabulary. Unknown tokens
            are ignored; if no token is known, an all-zero vector is returned.
        """
        # Size the accumulator from the loaded table instead of assuming 300,
        # so tables with other vector sizes work as well.
        embed = np.zeros(self._embedding_dim(), dtype=np.float64)
        mention_size = 0
        for token in tokens_str.split():
            if token in self.word_to_ix:
                embed = np.add(embed, self.word_embeddings[self.word_to_ix[token]])
                mention_size += 1

        # Dividing by at least 1 keeps the all-zero vector when nothing matched.
        return np.true_divide(embed, max(mention_size, 1))