# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import os
import sys
# Things that were changed from the original:
# - Added legal header
# - Reformatted code and variable names to conform with PEP8
# - Added pointer to 'weights.clas' file
# - Added run_conllu_eval()
# - Removed tests and command-line usage option
# - Removed unnecessary imports
# - Added pylint check disable flags
# !/usr/bin/env python
# CoNLL 2017 UD Parsing evaluation script.
#
# Compatible with Python 2.7 and 3.2+, can be used either as a module
# or a standalone executable.
#
# Copyright 2017 Institute of Formal and Applied Linguistics (UFAL),
# Faculty of Mathematics and Physics, Charles University, Czech Republic.
#
# Changelog:
# - [02 Jan 2017] Version 0.9: Initial release
# - [25 Jan 2017] Version 0.9.1: Fix bug in LCS alignment computation
# - [10 Mar 2017] Version 1.0: Add documentation and test
# Compare HEADs correctly using aligned words
# Allow evaluation with erroneous spaces in forms
# Compare forms in LCS case insensitively
# Detect cycles and multiple root nodes
# Compute AlignedAccuracy
# API usage
# ---------
# - load_conllu(file)
# - loads CoNLL-U file from given file object to an internal representation
# - the file object should return str on both Python 2 and Python 3
# - raises UDError exception if the given file cannot be loaded
# - evaluate(gold_ud, system_ud)
# - evaluate the given gold and system CoNLL-U files (loaded with
# load_conllu)
# - raises UDError if the concatenated tokens of gold and system file do not
# match
# - returns a dictionary with the metrics described above, each metrics
# having three fields: precision, recall and f1
#
# Description of token matching
# -----------------------------
# In order to match tokens of gold file and system file, we consider the text
# resulting from concatenation of gold tokens and text resulting from
# concatenation of system tokens. These texts should match -- if they do not,
# the evaluation fails.
#
# If the texts do match, every token is represented as a range in this original
# text, and tokens are equal only if their range is the same.
#
# Description of word matching
# ----------------------------
# When matching words of gold file and system file, we first match the tokens.
# The words which are also tokens are matched as tokens, but words in
# multi-word tokens have to be handled differently.
#
# To handle multi-word tokens, we start by finding "multi-word spans".
# Multi-word span is a span in the original text such that
# - it contains at least one multi-word token
# - all multi-word tokens in the span (considering both gold and system ones)
# are completely inside the span (i.e., they do not "stick out")
# - the multi-word span is as small as possible
#
# For every multi-word span, we align the gold and system words completely
# inside this span using LCS on their FORMs. The words not intersecting
# (even partially) any multi-word span are then aligned as tokens.
# pylint: disable=too-many-statements
# CoNLL-U column names
# Indices of the ten CoNLL-U columns within a parsed line.
ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10)
# Default UD-relation weights file shipped alongside this module.
WEIGHTS = os.path.join(os.path.abspath(os.path.dirname(__file__)), "weights.clas")
# UD Error is used when raising exceptions in this module
class UDError(Exception):
    """Raised for malformed CoNLL-U input or mismatched gold/system data."""
# Load given CoNLL-U file into internal representation
def load_conllu(file):
    """Parse an open CoNLL-U file object into an internal UDRepresentation.

    The file object must yield ``str`` lines via ``readline()`` (on both
    Python 2 and 3).  Returns a ``UDRepresentation`` holding the concatenated
    token characters, token spans, words, and sentence spans.

    Raises:
        UDError: on any malformed input -- wrong column count, unparsable
            IDs/HEADs, HEAD pointing outside the sentence, a HEAD cycle,
            multiple roots, or a missing final empty line.
    """
    # pylint: disable=too-many-locals
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-statements
    # Internal representation classes
    class UDRepresentation:
        # pylint: disable=too-few-public-methods
        def __init__(self):
            # Characters of all the tokens in the whole file.
            # Whitespace between tokens is not included.
            self.characters = []
            # List of UDSpan instances with start&end indices into `characters`
            self.tokens = []
            # List of UDWord instances.
            self.words = []
            # List of UDSpan instances with start&end indices into `characters`
            self.sentences = []

    class UDSpan:
        # pylint: disable=too-few-public-methods
        def __init__(self, start, end):
            self.start = start
            # Note that self.end marks the first position **after the end** of
            # span, so we can use characters[start:end] or range(start, end).
            self.end = end

    class UDWord:
        # pylint: disable=too-few-public-methods
        def __init__(self, span, columns, is_multiword):
            # Span of this word (or MWT, see below) within
            # ud_representation.characters.
            self.span = span
            # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,...
            self.columns = columns
            # is_multiword==True means that this word is part of a multi-word
            # token.
            # In that case, self.span marks the span of the whole multi-word
            # token.
            self.is_multiword = is_multiword
            # Reference to the UDWord instance representing the HEAD (or None
            # if root).
            self.parent = None
            # Let's ignore language-specific deprel subtypes.
            self.columns[DEPREL] = columns[DEPREL].split(":")[0]

    ud = UDRepresentation()

    # Load the CoNLL-U file
    index, sentence_start = 0, None
    while True:
        line = file.readline()
        if not line:
            break
        line = line.rstrip("\r\n")

        # Handle sentence start boundaries
        if sentence_start is None:
            # Skip comments
            if line.startswith("#"):
                continue
            # Start a new sentence
            ud.sentences.append(UDSpan(index, 0))
            sentence_start = len(ud.words)
        if not line:
            # Add parent UDWord links and check there are no cycles
            def process_word(word):
                # "remapping" is a temporary marker: seeing it again while
                # resolving the ancestor chain means HEADs form a cycle.
                if word.parent == "remapping":
                    raise UDError("There is a cycle in a sentence")
                if word.parent is None:
                    head = int(word.columns[HEAD])
                    if head > len(ud.words) - sentence_start:
                        raise UDError(
                            "HEAD '{}' points outside of the sentence".format(word.columns[HEAD])
                        )
                    if head:
                        parent = ud.words[sentence_start + head - 1]
                        word.parent = "remapping"
                        process_word(parent)
                        word.parent = parent

            for word in ud.words[sentence_start:]:
                process_word(word)
            # Check there is a single root node
            if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1:
                raise UDError("There are multiple roots in a sentence")
            # End the sentence
            ud.sentences[-1].end = index
            sentence_start = None
            continue

        # Read next token/word
        columns = line.split("\t")
        if len(columns) != 10:
            raise UDError(
                "The CoNLL-U line does not contain 10 tab-separated columns: " "'{}'".format(line)
            )

        # Skip empty nodes
        if "." in columns[ID]:
            continue

        # Delete spaces from FORM so gold.characters == system.characters
        # even if one of them tokenizes the space.
        columns[FORM] = columns[FORM].replace(" ", "")
        if not columns[FORM]:
            raise UDError("There is an empty FORM in the CoNLL-U file")

        # Save token
        ud.characters.extend(columns[FORM])
        ud.tokens.append(UDSpan(index, index + len(columns[FORM])))
        index += len(columns[FORM])

        # Handle multi-word tokens to save word(s)
        if "-" in columns[ID]:
            try:
                start, end = list(map(int, columns[ID].split("-")))
            except Exception:
                raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID]))
            for _ in range(start, end + 1):
                word_line = file.readline().rstrip("\r\n")
                word_columns = word_line.split("\t")
                if len(word_columns) != 10:
                    raise UDError(
                        "The CoNLL-U line does not contain 10 tab-separated "
                        "columns: '{}'".format(word_line)
                    )
                # Every word of a multi-word token shares the token's span.
                ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True))
        # Basic tokens/words
        else:
            try:
                word_id = int(columns[ID])
            except Exception:
                raise UDError("Cannot parse word ID '{}'".format(columns[ID]))
            if word_id != len(ud.words) - sentence_start + 1:
                raise UDError(
                    "Incorrect word ID '{}' for word '{}', expected"
                    " '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1)
                )
            try:
                head_id = int(columns[HEAD])
            except Exception:
                raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD]))
            if head_id < 0:
                raise UDError("HEAD cannot be negative")
            ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False))

    if sentence_start is not None:
        raise UDError("The CoNLL-U file does not end with empty line")
    return ud
# Evaluate the gold and system treebanks (loaded using load_conllu).
def evaluate(gold_ud, system_ud, deprel_weights=None):
    """Evaluate a system CoNLL-U file against a gold one.

    Args:
        gold_ud: UDRepresentation of the gold file (from load_conllu).
        system_ud: UDRepresentation of the system file (from load_conllu).
        deprel_weights: optional {deprel: weight} dict; when given, an
            additional "WeightedLAS" metric is computed.

    Returns:
        Dict mapping metric names ("Tokens", "Sentences", "Words", "UPOS",
        "XPOS", "Feats", "AllTags", "Lemmas", "UAS", "LAS", and optionally
        "WeightedLAS") to Score objects carrying precision, recall, f1 and,
        for word-aligned metrics, aligned_accuracy.

    Raises:
        UDError: if the concatenated token characters of the two files
            do not match.
    """
    # pylint: disable=too-many-locals
    class Score:
        # pylint: disable=too-few-public-methods
        def __init__(self, gold_total, system_total, correct, aligned_total=None):
            self.precision = correct / system_total if system_total else 0.0
            self.recall = correct / gold_total if gold_total else 0.0
            self.f1 = (
                2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0
            )
            # None for span metrics (no aligned_total supplied); otherwise the
            # accuracy computed over aligned words only.
            self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total

    class AlignmentWord:
        # pylint: disable=too-few-public-methods
        def __init__(self, gold_word, system_word):
            self.gold_word = gold_word
            self.system_word = system_word
            self.gold_parent = None
            self.system_parent_gold_aligned = None

    class Alignment:
        def __init__(self, gold_words, system_words):
            self.gold_words = gold_words
            self.system_words = system_words
            self.matched_words = []
            self.matched_words_map = {}

        def append_aligned_words(self, gold_word, system_word):
            self.matched_words.append(AlignmentWord(gold_word, system_word))
            self.matched_words_map[system_word] = gold_word

        def fill_parents(self):
            # We represent root parents in both gold and system data by '0'.
            # For gold data, we represent non-root parent by corresponding gold
            # word.
            # For system data, we represent non-root parent by either gold word
            # aligned to the parent system node, or by None if no gold word is
            # aligned to the parent.
            for words in self.matched_words:
                words.gold_parent = (
                    words.gold_word.parent if words.gold_word.parent is not None else 0
                )
                words.system_parent_gold_aligned = (
                    self.matched_words_map.get(words.system_word.parent, None)
                    if words.system_word.parent is not None
                    else 0
                )

    def lower(text):
        # Python 2 byte strings must be decoded before lowercasing.
        if sys.version_info < (3, 0) and isinstance(text, str):
            return text.decode("utf-8").lower()
        return text.lower()

    def spans_score(gold_spans, system_spans):
        # Merge-style pass over two lists sorted by span start; a span is
        # correct only when both start and end match.
        correct, gi, si = 0, 0, 0
        while gi < len(gold_spans) and si < len(system_spans):
            if system_spans[si].start < gold_spans[gi].start:
                si += 1
            elif gold_spans[gi].start < system_spans[si].start:
                gi += 1
            else:
                correct += gold_spans[gi].end == system_spans[si].end
                si += 1
                gi += 1
        return Score(len(gold_spans), len(system_spans), correct)

    def alignment_score(alignment, key_fn, weight_fn=lambda w: 1):
        gold, system, aligned, correct = 0, 0, 0, 0
        for word in alignment.gold_words:
            gold += weight_fn(word)
        for word in alignment.system_words:
            system += weight_fn(word)
        for words in alignment.matched_words:
            aligned += weight_fn(words.gold_word)
        if key_fn is None:
            # Return score for whole aligned words
            return Score(gold, system, aligned)
        for words in alignment.matched_words:
            if key_fn(words.gold_word, words.gold_parent) == key_fn(
                words.system_word, words.system_parent_gold_aligned
            ):
                correct += weight_fn(words.gold_word)
        return Score(gold, system, correct, aligned)

    def beyond_end(words, i, multiword_span_end):
        if i >= len(words):
            return True
        if words[i].is_multiword:
            return words[i].span.start >= multiword_span_end
        return words[i].span.end > multiword_span_end

    def extend_end(word, multiword_span_end):
        if word.is_multiword and word.span.end > multiword_span_end:
            return word.span.end
        return multiword_span_end

    def find_multiword_span(gold_words, system_words, gi, si):
        # We know gold_words[gi].is_multiword or system_words[si].is_multiword.
        # Find the start of the multiword span (gs, ss), so the multiword span
        # is minimal.
        # Initialize multiword_span_end characters index.
        if gold_words[gi].is_multiword:
            multiword_span_end = gold_words[gi].span.end
            if (
                not system_words[si].is_multiword
                and system_words[si].span.start < gold_words[gi].span.start
            ):
                si += 1
        else:  # if system_words[si].is_multiword
            multiword_span_end = system_words[si].span.end
            if (
                not gold_words[gi].is_multiword
                and gold_words[gi].span.start < system_words[si].span.start
            ):
                gi += 1
        gs, ss = gi, si

        # Find the end of the multiword span
        # (so both gi and si are pointing to the word following the multiword
        # span end).
        while not beyond_end(gold_words, gi, multiword_span_end) or not beyond_end(
            system_words, si, multiword_span_end
        ):
            # Check the bound BEFORE indexing: one list may already be
            # exhausted while the span still extends on the other side, and
            # unconditionally reading gold_words[gi]/system_words[si] here
            # raised IndexError in that case.
            if gi < len(gold_words) and (
                si >= len(system_words)
                or gold_words[gi].span.start <= system_words[si].span.start
            ):
                multiword_span_end = extend_end(gold_words[gi], multiword_span_end)
                gi += 1
            else:
                multiword_span_end = extend_end(system_words[si], multiword_span_end)
                si += 1
        return gs, ss, gi, si

    def compute_lcs(gold_words, system_words, gi, si, gs, ss):
        # pylint: disable=too-many-arguments
        # Bottom-up LCS table over FORMs, compared case-insensitively.
        lcs = [[0] * (si - ss) for _ in range(gi - gs)]
        for g in reversed(list(range(gi - gs))):
            for s in reversed(list(range(si - ss))):
                if lower(gold_words[gs + g].columns[FORM]) == lower(
                    system_words[ss + s].columns[FORM]
                ):
                    lcs[g][s] = 1 + (
                        lcs[g + 1][s + 1] if g + 1 < gi - gs and s + 1 < si - ss else 0
                    )
                lcs[g][s] = max(lcs[g][s], lcs[g + 1][s] if g + 1 < gi - gs else 0)
                lcs[g][s] = max(lcs[g][s], lcs[g][s + 1] if s + 1 < si - ss else 0)
        return lcs

    def align_words(gold_words, system_words):
        alignment = Alignment(gold_words, system_words)

        gi, si = 0, 0
        while gi < len(gold_words) and si < len(system_words):
            if gold_words[gi].is_multiword or system_words[si].is_multiword:
                # A: Multi-word tokens => align via LCS within the whole
                # "multiword span".
                gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si)

                if si > ss and gi > gs:
                    lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss)

                    # Store aligned words
                    s, g = 0, 0
                    while g < gi - gs and s < si - ss:
                        if lower(gold_words[gs + g].columns[FORM]) == lower(
                            system_words[ss + s].columns[FORM]
                        ):
                            alignment.append_aligned_words(gold_words[gs + g], system_words[ss + s])
                            g += 1
                            s += 1
                        elif lcs[g][s] == (lcs[g + 1][s] if g + 1 < gi - gs else 0):
                            g += 1
                        else:
                            s += 1
            else:
                # B: No multi-word token => align according to spans.
                if (gold_words[gi].span.start, gold_words[gi].span.end) == (
                    system_words[si].span.start,
                    system_words[si].span.end,
                ):
                    alignment.append_aligned_words(gold_words[gi], system_words[si])
                    gi += 1
                    si += 1
                elif gold_words[gi].span.start <= system_words[si].span.start:
                    gi += 1
                else:
                    si += 1

        alignment.fill_parents()
        return alignment

    # Check that underlying character sequences do match
    if gold_ud.characters != system_ud.characters:
        index = 0
        while gold_ud.characters[index] == system_ud.characters[index]:
            index += 1
        raise UDError(
            "The concatenation of tokens in gold file and in system file "
            "differ!\n" + "First 20 differing characters in gold file: '{}' and system file:"
            " '{}'".format(
                "".join(gold_ud.characters[index : index + 20]),
                "".join(system_ud.characters[index : index + 20]),
            )
        )

    # Align words
    alignment = align_words(gold_ud.words, system_ud.words)

    # Compute the F1-scores
    result = {
        "Tokens": spans_score(gold_ud.tokens, system_ud.tokens),
        "Sentences": spans_score(gold_ud.sentences, system_ud.sentences),
        "Words": alignment_score(alignment, None),
        "UPOS": alignment_score(alignment, lambda w, parent: w.columns[UPOS]),
        "XPOS": alignment_score(alignment, lambda w, parent: w.columns[XPOS]),
        "Feats": alignment_score(alignment, lambda w, parent: w.columns[FEATS]),
        "AllTags": alignment_score(
            alignment, lambda w, parent: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])
        ),
        "Lemmas": alignment_score(alignment, lambda w, parent: w.columns[LEMMA]),
        "UAS": alignment_score(alignment, lambda w, parent: parent),
        "LAS": alignment_score(alignment, lambda w, parent: (parent, w.columns[DEPREL])),
    }

    # Add WeightedLAS if weights are given
    if deprel_weights is not None:

        def weighted_las(word):
            return deprel_weights.get(word.columns[DEPREL], 1.0)

        result["WeightedLAS"] = alignment_score(
            alignment, lambda w, parent: (parent, w.columns[DEPREL]), weighted_las
        )

    return result
def load_deprel_weights(weights_file):
    """Parse a UD-relation weights file into a {deprel: weight} dict.

    Lines starting with '#' and blank lines are ignored; every other line
    must contain exactly two whitespace-separated columns: a DEPREL label
    and its float weight.  Returns None when weights_file is None.

    Raises:
        ValueError: for a line that does not have exactly two columns.
    """
    if weights_file is None:
        return None
    weights = {}
    with open(weights_file) as source:
        for raw_line in source:
            # Skip comments and lines containing only whitespace.
            if raw_line.startswith("#") or not raw_line.strip():
                continue
            parts = raw_line.rstrip("\r\n").split()
            if len(parts) != 2:
                raise ValueError(
                    "Expected two columns in the UD Relations weights file on line"
                    " '{}'".format(raw_line)
                )
            label, weight = parts
            weights[label] = float(weight)
    return weights
def load_conllu_file(path):
    """Open the CoNLL-U file at *path* and parse it with load_conllu.

    On Python 3 the file is opened as UTF-8 text; on Python 2 no encoding
    argument is passed, keeping the script runnable on both versions.
    """
    open_kwargs = {"encoding": "utf-8"} if sys.version_info >= (3, 0) else {}
    with open(path, mode="r", **open_kwargs) as conllu_file:
        return load_conllu(conllu_file)
def evaluate_wrapper(gold_file: str, system_file: str, weights_file: str):
    """Load both CoNLL-U files (and optional weights), then run evaluate().

    Returns the metric dictionary produced by evaluate().
    """
    gold_ud = load_conllu_file(gold_file)
    system_ud = load_conllu_file(system_file)
    # weights_file may be None, in which case no WeightedLAS is computed.
    weights = load_deprel_weights(weights_file)
    return evaluate(gold_ud, system_ud, weights)
def run_conllu_eval(gold_file, test_file, weights_file=WEIGHTS, verbose=True):
    """Evaluate *test_file* against *gold_file* and write a report to disk.

    The report is written to "<test_file stem>_eval.txt" beside the test
    file.  Verbose mode prints the full metric table; non-verbose prints
    only the LAS F1 score.  Supplying a weights file forces verbose mode
    and adds a WeightedLAS row.
    """
    # Weighted evaluation always produces the full table.
    if weights_file is not None and not verbose:
        verbose = True

    evaluation = evaluate_wrapper(gold_file, test_file, weights_file)

    report_path = test_file[: test_file.rindex(".")] + "_eval.txt"
    with open(report_path, "w") as out_file:
        if not verbose:
            out_file.write("LAS F1 Score: {:.2f}".format(100 * evaluation["LAS"].f1) + "\n")
        else:
            metric_names = [
                "Tokens", "Sentences", "Words", "UPOS", "XPOS",
                "Feats", "AllTags", "Lemmas", "UAS", "LAS",
            ]
            if weights_file is not None:
                metric_names.append("WeightedLAS")
            out_file.write("Metrics | Precision | Recall | F1 Score | AligndAcc" + "\n")
            out_file.write("-----------+-----------+-----------+-----------+-----------" + "\n")
            for name in metric_names:
                score = evaluation[name]
                # Span metrics have no aligned accuracy; leave the cell empty.
                if score.aligned_accuracy is not None:
                    acc_cell = "{:10.2f}".format(100 * score.aligned_accuracy)
                else:
                    acc_cell = ""
                row = "{:11}|{:10.2f} |{:10.2f} |{:10.2f} |{}".format(
                    name,
                    100 * score.precision,
                    100 * score.recall,
                    100 * score.f1,
                    acc_cell,
                )
                out_file.write(row + "\n")