Source code for nlp_architect.data.cdc_resources.gen_scripts.create_word_embed_elmo_dump

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import argparse
import logging
import os
import pickle
from os.path import join

from nlp_architect.common.cdc.mention_data import MentionData
from nlp_architect.data.cdc_resources.embedding.embed_elmo import ElmoEmbedding
from nlp_architect.utils import io

logger = logging.getLogger(__name__)


def load_elmo_for_vocab(mentions):
    """
    Build the ELMo vocabulary cache for a collection of mentions.

    A fresh ``ElmoEmbedding`` is created and ``get_head_feature_vector`` is
    invoked once per mention. The returned vectors are discarded; the calls
    are made only so the embedding object fills its own cache (the caching
    logic itself lives in the embedding class).

    Args:
        mentions: iterable of mention objects accepted by
            ``ElmoEmbedding.get_head_feature_vector``

    Returns:
        The ``cache`` attribute of the ``ElmoEmbedding`` instance after all
        mentions were processed.
    """
    embedder = ElmoEmbedding()

    # Return values are intentionally ignored - only the cache side effect matters.
    for mention_data in mentions:
        embedder.get_head_feature_vector(mention_data)

    vocab_cache = embedder.cache
    logger.info("Total words/contexts in vocabulary %d", len(vocab_cache))
    return vocab_cache


def elmo_dump(mentions_path=None, out_file=None):
    """
    Create the ELMo embedding cache for a set of mention files and pickle it.

    Args:
        mentions_path (str, optional): path of a single mentions json file, or
            of a directory that is walked recursively and whose files (except
            ``.DS_Store``) are all read as mentions files. When omitted, the
            ``--mentions`` value of the module-level ``args`` is used.
        out_file (str, optional): path of the pickle file to write. When
            omitted, the ``--output`` value of the module-level ``args`` is used.

    Raises:
        NameError: if an argument is omitted while the module-level ``args``
            does not exist (it is only created when the module runs as a script).
    """
    # Fall back to the parsed command line only for values the caller did not
    # supply, so the function is usable from importing code as well.
    if out_file is None:
        out_file = args.output
    if mentions_path is None:
        mentions_path = args.mentions

    mention_files = []
    if os.path.isdir(mentions_path):
        for dirpath, _, file_names in os.walk(mentions_path):
            for file_name in file_names:
                # Skip the macOS Finder metadata file, it is not a mentions file.
                if file_name != ".DS_Store":
                    mention_files.append(join(dirpath, file_name))
    else:
        mention_files.append(mentions_path)

    mentions = []
    for mention_file in mention_files:
        mentions.extend(MentionData.read_mentions_json_to_mentions_data_list(mention_file))

    elmo_ecb_embeddings = load_elmo_for_vocab(mentions)

    with open(out_file, "wb") as f:
        pickle.dump(elmo_ecb_embeddings, f)

    logger.info("Saving dump to file-%s", out_file)


if __name__ == "__main__":
    # Script entry point: parse the command line, validate the mentions path,
    # then build and save the embedding dump.
    arg_parser = argparse.ArgumentParser(description="Create Elmo Embedding dataset only dump")
    arg_parser.add_argument("--mentions", type=str, help="mentions_file file", required=True)
    arg_parser.add_argument(
        "--output", type=str, help="location were to create dump file", required=True
    )

    # NOTE: this must remain a module-level name called `args` - elmo_dump()
    # reads it as a global when called without arguments.
    args = arg_parser.parse_args()

    # Pick the validator matching the kind of path that was supplied.
    validate_mentions_path = (
        io.validate_existing_directory
        if os.path.isdir(args.mentions)
        else io.validate_existing_filepath
    )
    validate_mentions_path(args.mentions)

    elmo_dump()
    print("Done!")