# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
from __future__ import print_function, division
import io
import os
import time
import numpy as np
import scipy.linalg
import tensorflow as tf


class Discriminator:
def __init__(self, input_data, Y, lr_ph):
self.input_data = input_data
self.lr_ph = lr_ph
self.do_ph = tf.placeholder(name="dropout_ph", dtype=tf.float32)
self.Y = Y
self.hid_dim = 2048
        # Handles filled in later by build_train_graph()
        self.disc_cost = None
        self.disc_opt = None
        # Build Graph
        self._build_network_graph()

def _build_network_graph(self):
"""
Builds the basic inference graph for discriminator
"""
with tf.variable_scope("Discriminator", reuse=tf.AUTO_REUSE):
w_init = tf.contrib.layers.xavier_initializer()
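            # Note: do_ph is the dropout *keep* probability (TF1 tf.nn.dropout
            # semantics), so feeding 1.0 disables dropout entirely.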
noisy_input = tf.nn.dropout(self.input_data, self.do_ph, name="DO1")
fc1 = tf.layers.dense(
noisy_input,
self.hid_dim,
kernel_initializer=w_init,
activation=tf.nn.leaky_relu,
name="Dense1",
)
fc2 = tf.layers.dense(
fc1,
self.hid_dim,
kernel_initializer=w_init,
activation=tf.nn.leaky_relu,
name="Dense2",
)
self.prediction = tf.layers.dense(fc2, 1, kernel_initializer=w_init, name="Dense_Sig")

    def build_train_graph(self, disc_pred):
"""
Builds training graph for discriminator
Arguments:
            disc_pred (Tensor): Discriminator output logits for the mapped
                source and target embeddings
"""
        # Variables in discriminator scope
disc_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "Discriminator")
# Binary Cross entropy
disc_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_pred, labels=self.Y)
# Cost
self.disc_cost = tf.reduce_mean(disc_entropy)
# Optimizer
disc_opt = tf.train.GradientDescentOptimizer(self.lr_ph)
self.disc_opt = disc_opt.minimize(self.disc_cost, var_list=disc_vars)


class Generator:
def __init__(self, src_ten, tgt_ten, emb_dim, batch_size, smooth_val, lr_ph, beta, vocab_size):
self.src_ten = src_ten
self.tgt_ten = tgt_ten
self.emb_dim = emb_dim
self.batch_size = batch_size
self.smooth_val = smooth_val
self.beta = beta
self.lr_ph = lr_ph
self.vocab_size = vocab_size
# Placeholders
self.src_ph = tf.placeholder(name="src_ph", shape=[None], dtype=tf.int32)
self.tgt_ph = tf.placeholder(name="tgt_ph", shape=[None], dtype=tf.int32)
        # Handles filled in later by _mapper() and build_train_graph()
        self.W = None
        self.map_opt = None
        # Build Graph
        self._build_network_graph()
        ortho_weight = self._build_ortho_graph(self.W)
        self.assign_weight = self._assign_ortho_weight(ortho_weight)

def _build_network_graph(self):
"""
Builds basic inference graph for generator
"""
with tf.variable_scope("Generator", reuse=tf.AUTO_REUSE):
# Look up tables
self.src_emb = tf.nn.embedding_lookup(self.src_ten, self.src_ph, name="src_lut")
self.tgt_emb = tf.nn.embedding_lookup(self.tgt_ten, self.tgt_ph, name="tgt_lut")
# Map them
self.mapWX = self._mapper(self.src_emb)
# Concatenate them
self.X = tf.concat([self.mapWX, self.tgt_emb], 0, name="X")
# Set target for discriminator
Y = np.zeros(shape=(2 * self.batch_size, 1), dtype=np.float32)
# Label smoothing
Y[: self.batch_size] = 1 - self.smooth_val
Y[self.batch_size :] = self.smooth_val
# Convert to tensor
self.Y = tf.convert_to_tensor(Y, name="Y")
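            # Worked example (illustrative): with batch_size=2 and smooth_val=0.1,
            # Y = [[0.9], [0.9], [0.1], [0.1]] -- the first half labels the mapped
            # source embeddings WX, the second half labels the real target embeddings.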

    def build_train_graph(self, disc_pred):
"""
Builds training graph for generator
Arguments:
            disc_pred (Tensor): Discriminator output logits
"""
# Variables in Mapper scope
map_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "Generator/Mapper")
        # Binary cross-entropy with flipped labels so the mapper is trained to fool the discriminator
map_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=disc_pred, labels=(1 - self.Y))
# Cost
map_cost = tf.reduce_mean(map_entropy)
map_opt = tf.train.GradientDescentOptimizer(self.lr_ph)
self.map_opt = map_opt.minimize(map_cost, var_list=map_vars)
def _build_ortho_graph(self, W):
"""
Builds a graph to orthogonalize weight W
Arguments:
W (Tensor): Weight in the mapper
"""
with tf.variable_scope("Ortho", reuse=tf.AUTO_REUSE):
            a = tf.scalar_mul((1 + self.beta), W)  # (1 + beta) * W
            b = tf.matmul(tf.transpose(W), W)  # W^T W
            c = tf.matmul(W, b)  # W (W^T W)
            d = tf.scalar_mul(self.beta, c)  # beta * W (W^T W)
            ortho_weight = a - d  # (1 + beta) * W - beta * W (W^T W)
return ortho_weight
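    # For reference, the graph above implements the iterative orthogonalization
    #     W <- (1 + beta) * W - beta * W (W^T W)
    # which, for a small beta, nudges W toward the manifold of orthogonal matrices
    # so the learned mapping stays close to a rotation (the usual recipe in
    # adversarial word-translation training).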
def _assign_ortho_weight(self, ortho_weight):
"""
Builds a graph to assign weight W after it is orthogonalized
Arguments:
ortho_weight(Tensor): Weight after it is orthogonalized
"""
return tf.assign(self.W, ortho_weight)
def _mapper(self, src_emb):
"""
        Applies the mapping W to the source embeddings; W is trained so that the
        mapped source vectors WX move closer to the target embeddings
Arguments:
src_emb(Tensor): Source embeddings after lookup
"""
with tf.variable_scope("Mapper", reuse=tf.AUTO_REUSE):
# Initialize as an eye of emb_dim x emb_dim
self.W = tf.Variable(name="W", initial_value=tf.eye(self.emb_dim, self.emb_dim))
# Do Matrix Multiply
WX = tf.matmul(src_emb, self.W)
            # Return the mapped source embeddings
return WX


class WordTranslator:
"""
    Main network that trains the cross-lingual embedding mapping
"""
def __init__(self, hparams, src_vec, tgt_vec, vocab_size):
# Hyperparameters
self.batch_size = hparams.batch_size
self.smooth_val = hparams.smooth_val
self.beta = hparams.beta
self.most_freq = hparams.most_freq
self.emb_dim = hparams.emb_dim
self.vocab_size = vocab_size
self.disc_runs = hparams.disc_runs
self.iters_epoch = hparams.iters_epoch
self.src_vec = src_vec
self.tgt_vec = tgt_vec
self.src_ten = tf.convert_to_tensor(src_vec)
self.tgt_ten = tf.convert_to_tensor(tgt_vec)
self.save_dir = hparams.weight_dir
self.slang = hparams.src_lang
self.tlang = hparams.tgt_lang
# Placeholders
self.lr_ph = tf.placeholder(tf.float32, name="lrPh")
# Build Graph
self._build_network_graph()
self._build_train_graph()
def _build_network_graph(self):
"""
Builds inference graph for the GAN
"""
self.generator = Generator(
self.src_ten,
self.tgt_ten,
self.emb_dim,
self.batch_size,
self.smooth_val,
self.lr_ph,
self.beta,
self.vocab_size,
)
self.discriminator = Discriminator(self.generator.X, self.generator.Y, self.lr_ph)
def _build_train_graph(self):
"""
Builds training graph for the GAN
"""
self.generator.build_train_graph(self.discriminator.prediction)
self.discriminator.build_train_graph(self.discriminator.prediction)
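    # Both optimizers are driven by the same discriminator logits: the
    # discriminator minimizes cross-entropy against the smoothed labels Y, while
    # the generator (mapper) minimizes it against the flipped labels 1 - Y.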

    @staticmethod
def report_metrics(iters, n_words_proc, disc_cost_acc, tic):
"""
        Reports training metrics every 500 iterations
        Arguments:
            iters(int): Current iteration within the epoch
            n_words_proc(int): Number of words processed so far in the epoch
            disc_cost_acc(list): Discriminator costs accumulated since the last report
            tic(float): Timestamp when the epoch started
        """
if iters > 0 and iters % 500 == 0:
mean_cost = str(sum(disc_cost_acc) / len(disc_cost_acc))
print(
str(int(n_words_proc / (time.time() - tic)))
+ " Samples/Sec - Iter "
+ str(iters)
+ " Discriminator Cost: "
+ mean_cost
)
            # Reset the cost window in place so the caller's list is cleared too;
            # rebinding the local name (disc_cost_acc = []) would not affect the caller.
            del disc_cost_acc[:]

    def run_generator(self, sess, local_lr):
"""
Runs generator part of GAN
Arguments:
sess(tf.session): Tensorflow Session
local_lr(float): Learning rate
Returns:
Returns number of words processed
"""
# Generate random ids to look up
src_ids = np.random.choice(self.vocab_size, self.batch_size, replace=False)
tgt_ids = np.random.choice(self.vocab_size, self.batch_size, replace=False)
train_dict = {
self.generator.src_ph: src_ids,
self.generator.tgt_ph: tgt_ids,
self.discriminator.do_ph: 1.0,
self.lr_ph: local_lr,
}
sess.run(self.generator.map_opt, feed_dict=train_dict)
# Run orthogonalize
sess.run(self.generator.assign_weight)
return 2 * self.batch_size

    def run_discriminator(self, sess, local_lr):
"""
Runs discriminator part of GAN
Arguments:
sess(tf.session): Tensorflow Session
local_lr(float): Learning rate
"""
# Generate random ids to look up
src_ids = np.random.choice(self.most_freq, self.batch_size, replace=False)
tgt_ids = np.random.choice(self.most_freq, self.batch_size, replace=False)
train_dict = {
self.generator.src_ph: src_ids,
self.generator.tgt_ph: tgt_ids,
self.discriminator.do_ph: 0.9,
self.lr_ph: local_lr,
}
return sess.run(
[self.discriminator.disc_cost, self.discriminator.disc_opt], feed_dict=train_dict
)

    def run(self, sess, local_lr):
"""
Runs whole GAN
Arguments:
sess(tf.session): Tensorflow Session
local_lr(float): Learning rate
"""
disc_cost_acc = []
n_words_proc = 0
tic = time.time()
for iters in range(0, self.iters_epoch, self.batch_size):
# 1.Run the discriminator
for _ in range(self.disc_runs):
disc_result = self.run_discriminator(sess, local_lr)
disc_cost_acc.append(disc_result[0])
# 2.Run the Generator
n_words_proc += self.run_generator(sess, local_lr)
# 3.Report the metrics
self.report_metrics(iters, n_words_proc, disc_cost_acc, tic)

    @staticmethod
def set_lr(local_lr, drop_lr):
"""
Drops learning rate based on CSLS criterion
Arguments:
local_lr(float): Learning Rate
drop_lr(bool): Drop learning rate by 2 if True
"""
new_lr = local_lr * 0.98
print("Dropping learning rate to " + str(new_lr) + " from " + str(local_lr))
if drop_lr:
new_lr = new_lr / 2.0
            print(
                "Dividing learning rate by 2 as validation criterion decreased. "
                "New lr is " + str(new_lr)
            )
return new_lr
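    # Example: with local_lr=0.1 the decayed rate is 0.098; if drop_lr is True it
    # is further halved to 0.049.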

    def save_model(self, save_model, sess):
"""
Saves W in mapper as numpy array based on CSLS criterion
Arguments:
save_model(bool): Save model if True
sess(tf.session): Tensorflow Session
"""
if save_model:
print("Saving model ....")
model_W = sess.run(self.generator.W)
path = os.path.join(self.save_dir, "W_best_mapping")
np.save(path, model_W)

    def apply_procrustes(self, sess, final_pairs):
"""
        Applies the orthogonal Procrustes solution to the W matrix for a better mapping
Arguments:
sess(tf.session): Tensorflow Session
final_pairs(ndarray): Array of pairs which are mutual neighbors
"""
print("Applying solution of Procrustes problem to get better mapping...")
proc_dict = {
self.generator.src_ph: final_pairs[:, 0],
self.generator.tgt_ph: final_pairs[:, 1],
}
A, B = sess.run([self.generator.src_emb, self.generator.tgt_emb], feed_dict=proc_dict)
# pylint: disable=no-member
R = scipy.linalg.orthogonal_procrustes(A, B)
sess.run(tf.assign(self.generator.W, R[0]))
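    # scipy.linalg.orthogonal_procrustes(A, B) returns (R, scale) with R = U V^T,
    # where U S V^T is the SVD of A^T B; R minimizes ||A R - B||_F over orthogonal
    # matrices, so assigning R to W snaps the mapping to the best orthogonal map
    # for the mutual-neighbor pairs.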

    def generate_xling_embed(self, sess, src_dict, tgt_dict, tgt_vec):
"""
        Generates cross-lingual embeddings and writes them to disk
        Arguments:
            sess(tf.session): Tensorflow session
            src_dict: Source index-to-word mapping
            tgt_dict: Target index-to-word mapping
            tgt_vec(ndarray): Target embedding matrix
        """
print("Generating Cross-lingual embeddings...")
src_emb_x = []
batch_size = 512
for i in range(0, self.vocab_size, batch_size):
            sids = list(range(i, min(i + batch_size, self.vocab_size)))
src_emb_x.append(
sess.run(self.generator.mapWX, feed_dict={self.generator.src_ph: sids})
)
src_emb_x = np.concatenate(src_emb_x)
print("Writing cross-lingual embeddings to file...")
src_path = os.path.join(self.save_dir, "vectors-%s.txt" % self.slang)
tgt_path = os.path.join(self.save_dir, "vectors-%s.txt" % self.tlang)
with io.open(src_path, "w", encoding="utf-8") as f:
f.write("%i %i\n" % src_emb_x.shape)
for i in range(len(src_dict)):
f.write("%s %s\n" % (src_dict[i], " ".join("%.5f" % x for x in src_emb_x[i])))
with io.open(tgt_path, "w", encoding="utf-8") as f:
f.write("%i %i\n" % tgt_vec.shape)
for i in range(len(tgt_dict)):
f.write("%s %s\n" % (tgt_dict[i], " ".join("%.5f" % x for x in tgt_vec[i])))