# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
from __future__ import print_function, division
import io
import os
import time
import numpy as np
import scipy.linalg
import tensorflow as tf


class Discriminator:
def __init__(self, input_data, Y, lr_ph):
self.input_data = input_data
self.lr_ph = lr_ph
self.do_ph = tf.placeholder(name="dropout_ph", dtype=tf.float32)
self.Y = Y
self.hid_dim = 2048
        # Handles filled in later by build_train_graph()
        self.disc_cost = None
        self.disc_opt = None
        # Build Graph
        self._build_network_graph()

def _build_network_graph(self):
"""
Builds the basic inference graph for discriminator
"""
with tf.variable_scope("Discriminator", reuse=tf.AUTO_REUSE):
w_init = tf.contrib.layers.xavier_initializer()
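            # Note: do_ph is the dropout *keep* probability (TF1 tf.nn.dropout
            # semantics), so feeding 1.0 disables dropout entirely.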
noisy_input = tf.nn.dropout(self.input_data, self.do_ph, name="DO1")
fc1 = tf.layers.dense(
noisy_input,
self.hid_dim,
kernel_initializer=w_init,
activation=tf.nn.leaky_relu,
name="Dense1",
)
fc2 = tf.layers.dense(
fc1,
self.hid_dim,
kernel_initializer=w_init,
activation=tf.nn.leaky_relu,
name="Dense2",
)
self.prediction = tf.layers.dense(fc2, 1, kernel_initializer=w_init, name="Dense_Sig")

    def build_train_graph(self, disc_pred):
"""
Builds training graph for discriminator
Arguments:
            disc_pred (Tensor): Discriminator output logits for the mapped
                source and target embeddings
"""
        # Variables in discriminator scope
disc_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "Discriminator")
# Binary Cross entropy
disc_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_pred, labels=self.Y)
# Cost
self.disc_cost = tf.reduce_mean(disc_entropy)
# Optimizer
disc_opt = tf.train.GradientDescentOptimizer(self.lr_ph)
self.disc_opt = disc_opt.minimize(self.disc_cost, var_list=disc_vars)


class Generator:
def __init__(self, src_ten, tgt_ten, emb_dim, batch_size, smooth_val, lr_ph, beta, vocab_size):
self.src_ten = src_ten
self.tgt_ten = tgt_ten
self.emb_dim = emb_dim
self.batch_size = batch_size
self.smooth_val = smooth_val
self.beta = beta
self.lr_ph = lr_ph
self.vocab_size = vocab_size
# Placeholders
self.src_ph = tf.placeholder(name="src_ph", shape=[None], dtype=tf.int32)
self.tgt_ph = tf.placeholder(name="tgt_ph", shape=[None], dtype=tf.int32)
        # Handles filled in later by _mapper() and build_train_graph()
        self.W = None
        self.map_opt = None
        # Build Graph
        self._build_network_graph()
        ortho_weight = self._build_ortho_graph(self.W)
        self.assign_weight = self._assign_ortho_weight(ortho_weight)

def _build_network_graph(self):
"""
Builds basic inference graph for generator
"""
with tf.variable_scope("Generator", reuse=tf.AUTO_REUSE):
# Look up tables
self.src_emb = tf.nn.embedding_lookup(self.src_ten, self.src_ph, name="src_lut")
self.tgt_emb = tf.nn.embedding_lookup(self.tgt_ten, self.tgt_ph, name="tgt_lut")
# Map them
self.mapWX = self._mapper(self.src_emb)
# Concatenate them
self.X = tf.concat([self.mapWX, self.tgt_emb], 0, name="X")
# Set target for discriminator
Y = np.zeros(shape=(2 * self.batch_size, 1), dtype=np.float32)
# Label smoothing
Y[: self.batch_size] = 1 - self.smooth_val
Y[self.batch_size :] = self.smooth_val
# Convert to tensor
self.Y = tf.convert_to_tensor(Y, name="Y")
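            # Worked example (illustrative): with batch_size=2 and smooth_val=0.1,
            # Y = [[0.9], [0.9], [0.1], [0.1]] -- the first half labels the mapped
            # source embeddings WX, the second half labels the real target embeddings.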

    def build_train_graph(self, disc_pred):
"""
Builds training graph for generator
Arguments:
            disc_pred (Tensor): Discriminator output logits
"""
# Variables in Mapper scope
map_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "Generator/Mapper")
        # Binary cross-entropy with flipped labels so the mapper is trained to fool the discriminator
map_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_pred, labels=(1 - self.Y))
# Cost
map_cost = tf.reduce_mean(map_entropy)
map_opt = tf.train.GradientDescentOptimizer(self.lr_ph)
self.map_opt = map_opt.minimize(map_cost, var_list=map_vars)
def _build_ortho_graph(self, W):
"""
Builds a graph to orthogonalize weight W
Arguments:
W (Tensor): Weight in the mapper
"""
with tf.variable_scope("Ortho", reuse=tf.AUTO_REUSE):
            a = tf.scalar_mul((1 + self.beta), W)  # (1 + beta) * W
            b = tf.matmul(tf.transpose(W), W)  # W^T W
            c = tf.matmul(W, b)  # W (W^T W)
            d = tf.scalar_mul(self.beta, c)  # beta * W (W^T W)
            ortho_weight = a - d  # (1 + beta) * W - beta * W (W^T W)
return ortho_weight
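    # For reference, the graph above implements the iterative orthogonalization
    #     W <- (1 + beta) * W - beta * W (W^T W)
    # which, for a small beta, nudges W toward the manifold of orthogonal matrices
    # so the learned mapping stays close to a rotation (the usual recipe in
    # adversarial word-translation training).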
def _assign_ortho_weight(self, ortho_weight):
"""
Builds a graph to assign weight W after it is orthogonalized
Arguments:
ortho_weight(Tensor): Weight after it is orthogonalized
"""
return tf.assign(self.W, ortho_weight)
def _mapper(self, src_emb):
"""
        Applies the mapping W to the source embeddings; W is trained so that the
        mapped source vectors WX move closer to the target embeddings
Arguments:
src_emb(Tensor): Source embeddings after lookup
"""
with tf.variable_scope("Mapper", reuse=tf.AUTO_REUSE):
# Initialize as an eye of emb_dim x emb_dim
self.W = tf.Variable(name="W", initial_value=tf.eye(self.emb_dim, self.emb_dim))
# Do Matrix Multiply
WX = tf.matmul(src_emb, self.W)
            # Return the mapped source embeddings
return WX


class WordTranslator:
"""
    Main network that trains the cross-lingual embedding mapping
"""
def __init__(self, hparams, src_vec, tgt_vec, vocab_size):
# Hyperparameters
self.batch_size = hparams.batch_size
self.smooth_val = hparams.smooth_val
self.beta = hparams.beta
self.most_freq = hparams.most_freq
self.emb_dim = hparams.emb_dim
self.vocab_size = vocab_size
self.disc_runs = hparams.disc_runs
self.iters_epoch = hparams.iters_epoch
self.src_vec = src_vec
self.tgt_vec = tgt_vec
self.src_ten = tf.convert_to_tensor(src_vec)
self.tgt_ten = tf.convert_to_tensor(tgt_vec)
self.save_dir = hparams.weight_dir
self.slang = hparams.src_lang
self.tlang = hparams.tgt_lang
# Placeholders
self.lr_ph = tf.placeholder(tf.float32, name="lrPh")
# Build Graph
self._build_network_graph()
self._build_train_graph()
def _build_network_graph(self):
"""
Builds inference graph for the GAN
"""
self.generator = Generator(
self.src_ten,
self.tgt_ten,
self.emb_dim,
self.batch_size,
self.smooth_val,
self.lr_ph,
self.beta,
self.vocab_size,
)
self.discriminator = Discriminator(self.generator.X, self.generator.Y, self.lr_ph)
def _build_train_graph(self):
"""
Builds training graph for the GAN
"""
self.generator.build_train_graph(self.discriminator.prediction)
self.discriminator.build_train_graph(self.discriminator.prediction)
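    # Both optimizers are driven by the same discriminator logits: the
    # discriminator minimizes cross-entropy against the smoothed labels Y, while
    # the generator (mapper) minimizes it against the flipped labels 1 - Y.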

    @staticmethod
def report_metrics(iters, n_words_proc, disc_cost_acc, tic):
"""
        Reports training metrics every 500 iterations
        Arguments:
            iters(int): Current iteration within the epoch
            n_words_proc(int): Number of words processed so far in the epoch
            disc_cost_acc(list): Discriminator costs accumulated since the last report
            tic(float): Timestamp when the epoch started
        """
if iters > 0 and iters % 500 == 0:
mean_cost = str(sum(disc_cost_acc) / len(disc_cost_acc))
print(
str(int(n_words_proc / (time.time() - tic)))
+ " Samples/Sec - Iter "
+ str(iters)
+ " Discriminator Cost: "
+ mean_cost
)
            # Reset the cost window in place so the caller's list is cleared too;
            # rebinding the local name (disc_cost_acc = []) would not affect the caller.
            del disc_cost_acc[:]

    def run_generator(self, sess, local_lr):
"""
Runs generator part of GAN
Arguments:
sess(tf.session): Tensorflow Session
local_lr(float): Learning rate
Returns:
Returns number of words processed
"""
# Generate random ids to look up
src_ids = np.random.choice(self.vocab_size, self.batch_size, replace=False)
tgt_ids = np.random.choice(self.vocab_size, self.batch_size, replace=False)
train_dict = {
self.generator.src_ph: src_ids,
self.generator.tgt_ph: tgt_ids,
self.discriminator.do_ph: 1.0,
self.lr_ph: local_lr,
}
sess.run(self.generator.map_opt, feed_dict=train_dict)
# Run orthogonalize
sess.run(self.generator.assign_weight)
return 2 * self.batch_size

    def run_discriminator(self, sess, local_lr):
"""
Runs discriminator part of GAN
Arguments:
sess(tf.session): Tensorflow Session
local_lr(float): Learning rate
"""
# Generate random ids to look up
src_ids = np.random.choice(self.most_freq, self.batch_size, replace=False)
tgt_ids = np.random.choice(self.most_freq, self.batch_size, replace=False)
train_dict = {
self.generator.src_ph: src_ids,
self.generator.tgt_ph: tgt_ids,
self.discriminator.do_ph: 0.9,
self.lr_ph: local_lr,
}
return sess.run(
[self.discriminator.disc_cost, self.discriminator.disc_opt], feed_dict=train_dict
)

    def run(self, sess, local_lr):
"""
Runs whole GAN
Arguments:
sess(tf.session): Tensorflow Session
local_lr(float): Learning rate
"""
disc_cost_acc = []
n_words_proc = 0
tic = time.time()
for iters in range(0, self.iters_epoch, self.batch_size):
# 1.Run the discriminator
for _ in range(self.disc_runs):
disc_result = self.run_discriminator(sess, local_lr)
disc_cost_acc.append(disc_result[0])
# 2.Run the Generator
n_words_proc += self.run_generator(sess, local_lr)
# 3.Report the metrics
self.report_metrics(iters, n_words_proc, disc_cost_acc, tic)

    @staticmethod
def set_lr(local_lr, drop_lr):
"""
Drops learning rate based on CSLS criterion
Arguments:
local_lr(float): Learning Rate
drop_lr(bool): Drop learning rate by 2 if True
"""
new_lr = local_lr * 0.98
print("Dropping learning rate to " + str(new_lr) + " from " + str(local_lr))
if drop_lr:
new_lr = new_lr / 2.0
            print(
                "Dividing learning rate by 2 as validation criterion decreased. "
                "New lr is " + str(new_lr)
            )
return new_lr
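    # Example: with local_lr=0.1 the decayed rate is 0.098; if drop_lr is True it
    # is further halved to 0.049.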

    def save_model(self, save_model, sess):
"""
Saves W in mapper as numpy array based on CSLS criterion
Arguments:
save_model(bool): Save model if True
sess(tf.session): Tensorflow Session
"""
if save_model:
print("Saving model ....")
model_W = sess.run(self.generator.W)
path = os.path.join(self.save_dir, "W_best_mapping")
np.save(path, model_W)

    def apply_procrustes(self, sess, final_pairs):
"""
        Applies the orthogonal Procrustes solution to the W matrix for a better mapping
Arguments:
sess(tf.session): Tensorflow Session
final_pairs(ndarray): Array of pairs which are mutual neighbors
"""
print("Applying solution of Procrustes problem to get better mapping...")
proc_dict = {
self.generator.src_ph: final_pairs[:, 0],
self.generator.tgt_ph: final_pairs[:, 1],
}
A, B = sess.run([self.generator.src_emb, self.generator.tgt_emb], feed_dict=proc_dict)
# pylint: disable=no-member
R = scipy.linalg.orthogonal_procrustes(A, B)
sess.run(tf.assign(self.generator.W, R[0]))
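    # scipy.linalg.orthogonal_procrustes(A, B) returns (R, scale) with R = U V^T,
    # where U S V^T is the SVD of A^T B; R minimizes ||A R - B||_F over orthogonal
    # matrices, so assigning R to W snaps the mapping to the best orthogonal map
    # for the mutual-neighbor pairs.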

    def generate_xling_embed(self, sess, src_dict, tgt_dict, tgt_vec):
"""
        Generates cross-lingual embeddings and writes them to disk
        Arguments:
            sess(tf.session): Tensorflow session
            src_dict: Source index-to-word mapping
            tgt_dict: Target index-to-word mapping
            tgt_vec(ndarray): Target embedding matrix
        """
print("Generating Cross-lingual embeddings...")
src_emb_x = []
batch_size = 512
for i in range(0, self.vocab_size, batch_size):
            sids = list(range(i, min(i + batch_size, self.vocab_size)))
src_emb_x.append(
sess.run(self.generator.mapWX, feed_dict={self.generator.src_ph: sids})
)
src_emb_x = np.concatenate(src_emb_x)
print("Writing cross-lingual embeddings to file...")
src_path = os.path.join(self.save_dir, "vectors-%s.txt" % self.slang)
tgt_path = os.path.join(self.save_dir, "vectors-%s.txt" % self.tlang)
with io.open(src_path, "w", encoding="utf-8") as f:
f.write("%i %i\n" % src_emb_x.shape)
for i in range(len(src_dict)):
f.write("%s %s\n" % (src_dict[i], " ".join("%.5f" % x for x in src_emb_x[i])))
with io.open(tgt_path, "w", encoding="utf-8") as f:
f.write("%i %i\n" % tgt_vec.shape)
for i in range(len(tgt_dict)):
f.write("%s %s\n" % (tgt_dict[i], " ".join("%.5f" % x for x in tgt_vec[i])))