# ******************************************************************************
# Copyright 2017-2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import argparse
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F

from nlp_architect.models import TrainableModel

logger = logging.getLogger(__name__)

MSE_loss = nn.MSELoss(reduction="mean")
KL_loss = nn.KLDivLoss(reduction="batchmean")

losses = {
    "kl": KL_loss,
    "mse": MSE_loss,
}

TEACHER_TYPES = ["bert"]


class TeacherStudentDistill:
"""
Teacher-Student knowledge distillation helper.
Use this object when training a model with KD and a teacher model.
Args:
teacher_model (TrainableModel): teacher model
temperature (float, optional): KD temperature. Defaults to 1.0.
dist_w (float, optional): distillation loss weight. Defaults to 0.1.
loss_w (float, optional): student loss weight. Defaults to 1.0.
loss_function (str, optional): loss function to use (kl for KLDivLoss,
mse for MSELoss)
"""

    def __init__(
        self,
        teacher_model: TrainableModel,
        temperature: float = 1.0,
        dist_w: float = 0.1,
        loss_w: float = 1.0,
        loss_function: str = "kl",
    ):
        self.teacher = teacher_model
        self.t = temperature
        self.dist_w = dist_w
        self.loss_w = loss_w
        self.loss_fn = losses.get(loss_function, KL_loss)

    def get_teacher_logits(self, inputs):
        """
        Get the teacher model's logits for the given inputs.

        Args:
            inputs: input batch

        Returns:
            teacher logits
        """
        return self.teacher.get_logits(inputs)

    @staticmethod
    def add_args(parser: argparse.ArgumentParser):
        """
        Add knowledge distillation arguments to a command-line parser.

        Args:
            parser (argparse.ArgumentParser): parser
        """
        parser.add_argument(
            "--teacher_model_path", type=str, required=True, help="Path to teacher model"
        )
        parser.add_argument(
            "--teacher_model_type",
            type=str,
            required=True,
            choices=TEACHER_TYPES,
            help="Teacher model class type",
        )
        parser.add_argument("--kd_temp", type=float, default=1.0, help="KD temperature value")
        parser.add_argument(
            "--kd_loss_fn", type=str, choices=["kl", "mse"], default="mse", help="KD loss function"
        )
        parser.add_argument(
            "--kd_dist_w", type=float, default=0.1, help="Weight of the distillation loss"
        )
        parser.add_argument(
            "--kd_student_w", type=float, default=1.0, help="Weight of the student loss"
        )

    def distill_loss(self, loss, student_logits, teacher_logits):
        """
        Combine the student loss with the knowledge distillation loss.

        Args:
            loss: student loss
            student_logits: student model logits
            teacher_logits: teacher model logits

        Returns:
            combined student and KD loss
        """
        student_log_sm = F.log_softmax(student_logits / self.t, dim=-1)
        teacher_sm = F.softmax(teacher_logits / self.t, dim=-1)
        distill_loss = self.loss_fn(input=student_log_sm, target=teacher_sm)
        return (self.loss_w * loss) + (distill_loss * self.dist_w * (self.t ** 2))

    def distill_loss_dict(self, loss, student_logits_dict, teacher_logits_dict):
        """
        Combine the student loss with the knowledge distillation loss, for models
        that produce a dictionary of logits (one entry per output head).

        Args:
            loss: student loss
            student_logits_dict: student model logits, keyed by output index
            teacher_logits_dict: teacher model logits, keyed by output index

        Returns:
            combined student and KD loss
        """
        student_sm_dict = {}
        for i in range(len(student_logits_dict)):
            student_sm_dict[i] = F.log_softmax(student_logits_dict[i] / self.t, dim=-1)
        teacher_sm_dict = {}
        for i in range(len(teacher_logits_dict)):
            teacher_sm_dict[i] = F.softmax(teacher_logits_dict[i] / self.t, dim=-1)
        distill_losses = [
            self.loss_fn(input=student_sm_dict[i], target=teacher_sm_dict[i])
            for i in range(len(student_sm_dict))
        ]
        distill_loss = torch.mean(torch.stack(distill_losses))
        return (self.loss_w * loss) + (distill_loss * self.dist_w * (self.t ** 2))
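

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). It shows how the
# helper is typically wired into a training step: fetch teacher logits for the
# batch, compute the ordinary student loss, then combine the two with
# ``distill_loss``. ``_ToyTeacher`` and the random tensors below are stand-ins
# invented for this sketch; in real use the teacher would be a loaded
# TrainableModel (e.g. a trained BERT tagger) and the CLI flags would come from
# the training script's argument parser via ``TeacherStudentDistill.add_args``.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class _ToyTeacher:
        """Minimal stand-in exposing the ``get_logits`` hook the helper calls."""

        def get_logits(self, inputs):
            # Pretend the teacher maps each input row to 3-class logits
            return torch.randn(inputs.shape[0], 3)

    # CLI wiring: add_args registers the KD flags on an existing parser
    parser = argparse.ArgumentParser()
    TeacherStudentDistill.add_args(parser)
    args = parser.parse_args(
        ["--teacher_model_path", "/tmp/teacher", "--teacher_model_type", "bert"]
    )

    distill = TeacherStudentDistill(
        _ToyTeacher(),
        temperature=args.kd_temp,
        dist_w=args.kd_dist_w,
        loss_w=args.kd_student_w,
        loss_function=args.kd_loss_fn,
    )

    # One hypothetical training step on a random batch
    batch = torch.randn(4, 8)
    student_logits = torch.randn(4, 3, requires_grad=True)
    student_loss = F.cross_entropy(student_logits, torch.tensor([0, 1, 2, 0]))

    teacher_logits = distill.get_teacher_logits(batch)
    total_loss = distill.distill_loss(student_loss, student_logits, teacher_logits)
    total_loss.backward()
    print("combined loss:", total_loss.item())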