# ******************************************************************************
# Copyright 2017-2019 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import logging
import os

from sklearn.metrics import matthews_corrcoef

from nlp_architect.data.sequence_classification import SequenceClsInputExample

# NOTE: InputFeatures is required by convert_examples_to_features below; the
# import location here is an assumption and may need adjusting to wherever
# the class is actually defined in this codebase.
from nlp_architect.data.utils import DataProcessor, InputFeatures, Task, read_tsv
from nlp_architect.utils.metrics import acc_and_f1, pearson_and_spearman, simple_accuracy

logger = logging.getLogger(__name__)

class MrpcProcessor(DataProcessor):
    """Processor for the MRPC data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        return ["0", "1"]

@staticmethod
def _create_examples(lines, set_type):
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
text_a = line[3]
text_b = line[4]
if set_type in ["train", "dev"]:
label = line[0]
examples.append(
SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b, label=label)
)
else:
examples.append(SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b))
return examples
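
# Usage sketch (hedged: "glue_data/MRPC" is a hypothetical local path to a
# GLUE MRPC download; every processor in this module follows the same
# get_*_examples pattern):
#
#     processor = MrpcProcessor()
#     train_examples = processor.get_train_examples("glue_data/MRPC")
#     print(train_examples[0].guid, train_examples[0].label)
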
class MnliProcessor(DataProcessor):
    """Processor for the MultiNLI data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(
            read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched"
        )

    def get_test_examples(self, data_dir):
        return self._create_examples(
            read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test_matched"
        )

    def get_labels(self):
        return ["contradiction", "entailment", "neutral"]

@staticmethod
def _create_examples(lines, set_type):
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[8]
text_b = line[9]
if set_type in ["train", "dev_matched"]:
label = line[-1]
examples.append(
SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b, label=label)
)
else:
examples.append(SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b))
return examples
class MnliMismatchedProcessor(MnliProcessor):
    """Processor for the MultiNLI Mismatched data set (GLUE version)."""

    def get_dev_examples(self, data_dir):
        # set_type is deliberately "dev_matched": the inherited _create_examples
        # only reads labels when set_type is "train" or "dev_matched", and the
        # mismatched dev split is labeled too.
        return self._create_examples(
            read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched"
        )

    def get_test_examples(self, data_dir):
        return self._create_examples(
            read_tsv(os.path.join(data_dir, "test_mismatched.tsv")), "test_mismatched"
        )

class ColaProcessor(DataProcessor):
    """Processor for the CoLA data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        return ["0", "1"]

@staticmethod
def _create_examples(lines, set_type):
examples = []
for (i, line) in enumerate(lines):
            # CoLA's train/dev files have no header row; only the test file
            # does, so the first line is skipped for test only.
            if i == 0 and set_type not in ["train", "dev"]:
                continue
guid = "%s-%s" % (set_type, i)
if set_type in ["train", "dev"]:
text_a = line[3]
label = line[1]
examples.append(
SequenceClsInputExample(guid=guid, text=text_a, text_b=None, label=label)
)
else:
text_a = line[1]
examples.append(SequenceClsInputExample(guid=guid, text=text_a))
return examples
class Sst2Processor(DataProcessor):
    """Processor for the SST-2 data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        return ["0", "1"]

@staticmethod
def _create_examples(lines, set_type):
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, i)
if set_type in ["train", "dev"]:
text_a = line[0]
label = line[1]
examples.append(
SequenceClsInputExample(guid=guid, text=text_a, text_b=None, label=label)
)
else:
text_a = line[1]
examples.append(SequenceClsInputExample(guid=guid, text=text_a))
return examples
class StsbProcessor(DataProcessor):
    """Processor for the STS-B data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        # STS-B is a regression task, so there is no discrete label set.
        return [None]

@staticmethod
def _create_examples(lines, set_type):
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[7]
text_b = line[8]
if set_type in ["train", "dev"]:
label = line[-1]
examples.append(
SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b, label=label)
)
else:
examples.append(SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b))
return examples
class QqpProcessor(DataProcessor):
    """Processor for the QQP data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        return ["0", "1"]

@staticmethod
def _create_examples(lines, set_type):
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
if set_type in ["train", "dev"]:
try:
text_a = line[3]
text_b = line[4]
label = line[5]
except IndexError:
continue
examples.append(
SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b, label=label)
)
else:
try:
text_a = line[1]
text_b = line[2]
except IndexError:
continue
examples.append(SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b))
return examples
class QnliProcessor(DataProcessor):
    """Processor for the QNLI data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        return ["entailment", "not_entailment"]

@staticmethod
def _create_examples(lines, set_type):
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[1]
text_b = line[2]
if set_type in ["train", "dev"]:
label = line[-1]
examples.append(
SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b, label=label)
)
else:
examples.append(SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b))
return examples
class RteProcessor(DataProcessor):
    """Processor for the RTE data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        return ["entailment", "not_entailment"]

@staticmethod
def _create_examples(lines, set_type):
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[1]
text_b = line[2]
if set_type in ["train", "dev"]:
label = line[-1]
examples.append(
SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b, label=label)
)
else:
examples.append(SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b))
return examples
class WnliProcessor(DataProcessor):
    """Processor for the WNLI data set (GLUE version)."""

    def get_train_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        return self._create_examples(read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        return ["0", "1"]

@staticmethod
def _create_examples(lines, set_type):
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[1]
text_b = line[2]
if set_type in ["train", "dev"]:
label = line[-1]
examples.append(
SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b, label=label)
)
else:
examples.append(SequenceClsInputExample(guid=guid, text=text_a, text_b=text_b))
return examples
def convert_examples_to_features(
examples,
label_list,
max_seq_length,
tokenizer,
output_mode,
cls_token_at_end=False,
pad_on_left=False,
cls_token="[CLS]",
sep_token="[SEP]",
pad_token=0,
sequence_a_segment_id=0,
sequence_b_segment_id=1,
cls_token_segment_id=1,
pad_token_segment_id=0,
mask_padding_with_zero=True,
):
"""Loads a data file into a list of `InputBatch`s
`cls_token_at_end` define the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
`cls_token_segment_id` define the segment id associated to the CLS token
(0 for BERT, 2 for XLNet)
"""
label_map = {label: i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
logger.info("Writing example {} of {}".format(ex_index, len(examples)))
        # SequenceClsInputExample stores the first sequence as `text` (see the
        # processors above), so `example.text` rather than `example.text_a`.
        tokens_a = tokenizer.tokenize(example.text)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
else:
# Account for [CLS] and [SEP] with "- 2"
if len(tokens_a) > max_seq_length - 2:
tokens_a = tokens_a[: (max_seq_length - 2)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = tokens_a + [sep_token]
segment_ids = [sequence_a_segment_id] * len(tokens)
if tokens_b:
tokens += tokens_b + [sep_token]
segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
if cls_token_at_end:
tokens = tokens + [cls_token]
segment_ids = segment_ids + [cls_token_segment_id]
else:
tokens = [cls_token] + tokens
segment_ids = [cls_token_segment_id] + segment_ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
# Zero-pad up to the sequence length.
padding_length = max_seq_length - len(input_ids)
if pad_on_left:
input_ids = ([pad_token] * padding_length) + input_ids
input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
else:
input_ids = input_ids + ([pad_token] * padding_length)
input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
if output_mode == "classification":
label_id = label_map[example.label]
elif output_mode == "regression":
label_id = float(example.label)
else:
raise KeyError(output_mode)
features.append(
InputFeatures(
input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id,
)
)
return features
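
# Usage sketch (assumes a HuggingFace-style WordPiece tokenizer; the
# `transformers` import and model name are illustrative, not dependencies of
# this module):
#
#     from transformers import BertTokenizer
#
#     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#     features = convert_examples_to_features(
#         examples=MrpcProcessor().get_dev_examples("glue_data/MRPC"),
#         label_list=["0", "1"],
#         max_seq_length=128,
#         tokenizer=tokenizer,
#         output_mode="classification",
#     )
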
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
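
# Worked example: with max_length=4 the longer sequence loses tokens first.
#
#     a, b = ["it", "is", "rain", "##ing"], ["yes"]
#     _truncate_seq_pair(a, b, 4)
#     # a == ["it", "is", "rain"], b == ["yes"]
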
processors = {
"cola": ColaProcessor,
"mnli": MnliProcessor,
"mnli-mm": MnliMismatchedProcessor,
"mrpc": MrpcProcessor,
"sst-2": Sst2Processor,
"sts-b": StsbProcessor,
"qqp": QqpProcessor,
"qnli": QnliProcessor,
"rte": RteProcessor,
"wnli": WnliProcessor,
}
output_modes = {
"cola": "classification",
"mnli": "classification",
"mnli-mm": "classification",
"mrpc": "classification",
"sst-2": "classification",
"sts-b": "regression",
"qqp": "classification",
"qnli": "classification",
"rte": "classification",
"wnli": "classification",
}
# Keys match the task names in `processors` so that get_glue_task can resolve
# a default data folder for every registered task.
DEFAULT_FOLDER_NAMES = {
    "cola": "CoLA",
    "sst-2": "SST-2",
    "mrpc": "MRPC",
    "sts-b": "STS-B",
    "qqp": "QQP",
    "mnli": "MNLI",
    "mnli-mm": "MNLI",
    "qnli": "QNLI",
    "rte": "RTE",
    "wnli": "WNLI",
    "snli": "SNLI",
}
# GLUE task metrics
def get_metric_fn(task_name):
    if task_name == "cola":
        return lambda preds, labels: {"mcc": matthews_corrcoef(preds, labels)}
    if task_name in ("mrpc", "qqp"):
        return acc_and_f1
    if task_name == "sts-b":
        return pearson_and_spearman
    if task_name in ("sst-2", "mnli", "mnli-mm", "qnli", "rte", "wnli"):
        return lambda preds, labels: {"acc": simple_accuracy(preds, labels)}
    raise KeyError(task_name)
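
# Sketch: the returned callable maps (predictions, labels) to a metric dict
# (assumes simple_accuracy compares NumPy arrays elementwise):
#
#     import numpy as np
#
#     metric_fn = get_metric_fn("sst-2")
#     metric_fn(np.array([1, 0, 1]), np.array([1, 1, 1]))  # -> {"acc": 0.666...}
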
def get_glue_task(task_name: str, data_dir: str = None):
    """Return a GLUE task object.

    Args:
        task_name (str): name of the GLUE task.
        data_dir (str, optional): path to the dataset. If not provided, it is
            derived from the GLUE_DIR environment variable.
    """
task_name = task_name.lower()
if task_name not in processors:
raise ValueError("Task not found: {}".format(task_name))
task_processor = processors[task_name]()
    if data_dir is None:
        try:
            data_dir = os.path.join(os.environ["GLUE_DIR"], DEFAULT_FOLDER_NAMES[task_name])
        except KeyError:
            # GLUE_DIR is unset or the task has no default folder name.
            data_dir = None
task_type = output_modes[task_name]
return Task(task_name, task_processor, data_dir, task_type)
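
# End-to-end sketch (assumes GLUE_DIR points at a standard GLUE download and
# that Task exposes its constructor arguments as attributes; both are
# assumptions, not guarantees of this module):
#
#     task = get_glue_task("mrpc")
#     examples = task.processor.get_train_examples(task.data_dir)
#     label_list = task.processor.get_labels()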