Source code for nlp_architect.utils.metrics

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
##
# This file contains code from
#  (https://github.com/chakki-works/seqeval/tree/master/seqeval/metrics)
#
# The code was changed to support BILOU format evaluation
#
#  MIT License
#
# Copyright (c) 2018 chakki
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import absolute_import, division, print_function, unicode_literals

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score as classification_f1_score
from collections import defaultdict
import numpy as np


def get_conll_scores(predictions, y, y_lex, unk="O"):
    """Get CoNLL-style scores (precision, recall, F1).

    Args:
        predictions: predicted label ids, shape (batch, seq_len), or one-hot/probability tensors.
        y: gold label ids, shape (batch, seq_len), or one-hot/probability tensors.
        y_lex (dict): mapping from label id to label string.
        unk (str): label used when a predicted id is missing from ``y_lex``.

    Returns:
        str: text report produced by ``classification_report``.
    """
    if isinstance(predictions, list):
        predictions = predictions[-1]
    test_p = predictions
    if len(test_p.shape) > 2:
        test_p = test_p.argmax(2)
    test_y = y
    if len(test_y.shape) > 2:
        test_y = test_y.argmax(2)

    prediction_data = []
    for n in range(test_y.shape[0]):
        test_yval = []
        for i in list(test_y[n]):
            try:
                test_yval.append(y_lex[i])
            except KeyError:
                pass
        test_pval = [unk] * len(test_yval)
        for e, i in enumerate(list(test_p[n])[: len(test_pval)]):
            try:
                test_pval[e] = y_lex[i]
            except KeyError:
                pass
        prediction_data.append((test_yval, test_pval))
    y_true, y_pred = list(zip(*prediction_data))
    return classification_report(y_true, y_pred, digits=3)

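# Illustrative usage (added note, not part of the original module): ``y_lex``
# is assumed to map label ids to tag strings; ids absent from the mapping
# (e.g. padding) are skipped on the gold side and fall back to ``unk`` on the
# prediction side. A minimal sketch with made-up labels:
#
#   >>> y_lex = {1: "O", 2: "B-PER", 3: "L-PER"}
#   >>> gold = np.array([[2, 3, 1]])
#   >>> pred = np.array([[2, 3, 1]])
#   >>> print(get_conll_scores(pred, gold, y_lex))  # per-entity P/R/F1 report
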
def simple_accuracy(preds, labels):
    """Return simple accuracy."""
    return (preds == labels).mean()

def accuracy(preds, labels):
    """Return simple accuracy in expected dict format."""
    acc = simple_accuracy(preds, labels)
    return {"acc": acc}

def acc_and_f1(preds, labels):
    """Return accuracy and F1 score."""
    acc = simple_accuracy(preds, labels)
    f1 = classification_f1_score(y_true=labels, y_pred=preds)
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
    }

def pearson_and_spearman(preds, labels):
    """Get Pearson and Spearman correlation."""
    pearson_corr = pearsonr(preds, labels)[0]
    spearman_corr = spearmanr(preds, labels)[0]
    return {
        "pearson": pearson_corr,
        "spearmanr": spearman_corr,
        "corr": (pearson_corr + spearman_corr) / 2,
    }

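# Illustrative example (added note, not part of the original module): the
# classification metrics above expect NumPy arrays of equal shape. For
# preds = np.array([1, 0, 1, 1]) and labels = np.array([1, 0, 0, 1]),
# acc_and_f1(preds, labels) yields acc = 0.75, f1 = 0.8, acc_and_f1 = 0.775.
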
def tagging(preds, labels):
    """Return entity-level precision, recall and F1 for a tagging task."""
    p = sequence_precision_score(labels, preds)
    r = sequence_recall_score(labels, preds)
    f1 = sequence_f1_score(labels, preds)
    return p, r, f1

##
# The code below is taken and changed from the package chakki-works/seqeval
# (seqeval/metrics/sequence_labeling.py). The code was changed to support
# BILOU format evaluation.
##

"""Metrics to assess performance on a sequence labeling task given predictions

Functions named as ``*_score`` return a scalar value to maximize: the higher
the better
"""

def get_entities(seq, suffix=False):
    """Gets entities from sequence.

    Args:
        seq (list): sequence of labels.

    Returns:
        list: list of (chunk_type, chunk_start, chunk_end).

    Example:
        >>> from seqeval.metrics.sequence_labeling import get_entities
        >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
        >>> get_entities(seq)
        [('PER', 0, 1), ('LOC', 3, 3)]
    """
    # flatten a nested list of sequences, separating sentences with "O"
    if any(isinstance(s, list) for s in seq):
        seq = [item for sublist in seq for item in sublist + ["O"]]

    prev_tag = "O"
    prev_type = ""
    begin_offset = 0
    chunks = []
    for i, chunk in enumerate(seq + ["O"]):
        if suffix:
            tag = chunk[-1]
            type_ = chunk.split("-")[0]
        else:
            tag = chunk[0]
            type_ = chunk.split("-")[-1]

        if end_of_chunk(prev_tag, tag, prev_type, type_):
            chunks.append((prev_type, begin_offset, i - 1))
        if start_of_chunk(prev_tag, tag, prev_type, type_):
            begin_offset = i
        prev_tag = tag
        prev_type = type_

    return chunks

def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk ended between the previous and current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        chunk_end: boolean.
    """
    chunk_end = False

    end_tag = ("L", "E")  # last token of a multi-token chunk (BILOU "L" / IOBES "E")
    start_tag = ("U", "S")  # single-token chunk (BILOU "U" / IOBES "S")

    if prev_tag in end_tag:
        chunk_end = True
    if prev_tag in start_tag:
        chunk_end = True

    if prev_tag == "B" and tag == "B":
        chunk_end = True
    if prev_tag == "B" and tag in start_tag:
        chunk_end = True
    if prev_tag == "B" and tag == "O":
        chunk_end = True
    if prev_tag == "I" and tag == "B":
        chunk_end = True
    if prev_tag == "I" and tag in start_tag:
        chunk_end = True
    if prev_tag == "I" and tag == "O":
        chunk_end = True

    if prev_tag != "O" and prev_tag != "." and prev_type != type_:
        chunk_end = True

    return chunk_end

def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk started between the previous and current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        chunk_start: boolean.
    """
    chunk_start = False

    end_tag = ("L", "E")  # last token of a multi-token chunk (BILOU "L" / IOBES "E")
    start_tag = ("U", "S")  # single-token chunk (BILOU "U" / IOBES "S")

    if tag == "B":
        chunk_start = True
    if tag in start_tag:
        chunk_start = True

    if prev_tag in end_tag and tag in end_tag:
        chunk_start = True
    if prev_tag in end_tag and tag == "I":
        chunk_start = True
    if prev_tag in start_tag and tag in end_tag:
        chunk_start = True
    if prev_tag in start_tag and tag == "I":
        chunk_start = True
    if prev_tag == "O" and tag in end_tag:
        chunk_start = True
    if prev_tag == "O" and tag == "I":
        chunk_start = True

    if tag != "O" and tag != "." and prev_type != type_:
        chunk_start = True

    return chunk_start

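# Illustrative example (added note, not part of the original seqeval code):
# with the L/U (and E/S) cases handled above, BILOU-tagged sequences are
# chunked the same way as their BIO equivalents, e.g.
#
#   >>> get_entities(["B-PER", "L-PER", "O", "U-LOC"])
#   [('PER', 0, 1), ('LOC', 3, 3)]
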
def sequence_f1_score(y_true, y_pred, suffix=False):
    """Compute the F1 score.

    The F1 score can be interpreted as a weighted average of the precision and
    recall, where an F1 score reaches its best value at 1 and worst score at 0.
    The relative contribution of precision and recall to the F1 score are
    equal. The formula for the F1 score is::

        F1 = 2 * (precision * recall) / (precision + recall)

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import f1_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> f1_score(y_true, y_pred)
        0.5
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    nb_correct = len(true_entities & pred_entities)
    nb_pred = len(pred_entities)
    nb_true = len(true_entities)

    p = nb_correct / nb_pred if nb_pred > 0 else 0
    r = nb_correct / nb_true if nb_true > 0 else 0
    score = 2 * p * r / (p + r) if p + r > 0 else 0

    return score

def sequence_accuracy_score(y_true, y_pred):
    """Accuracy classification score.

    In multilabel classification, this function computes subset accuracy:
    the set of labels predicted for a sample must *exactly* match the
    corresponding set of labels in y_true.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import accuracy_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> accuracy_score(y_true, y_pred)
        0.8
    """
    if any(isinstance(s, list) for s in y_true):
        y_true = [item for sublist in y_true for item in sublist]
        y_pred = [item for sublist in y_pred for item in sublist]

    nb_correct = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred))
    nb_true = len(y_true)

    score = nb_correct / nb_true

    return score

def sequence_precision_score(y_true, y_pred, suffix=False):
    """Compute the precision.

    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The precision is
    intuitively the ability of the classifier not to label as positive a sample
    that is negative.

    The best value is 1 and the worst value is 0.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import precision_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> precision_score(y_true, y_pred)
        0.5
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    nb_correct = len(true_entities & pred_entities)
    nb_pred = len(pred_entities)

    score = nb_correct / nb_pred if nb_pred > 0 else 0

    return score

def sequence_recall_score(y_true, y_pred, suffix=False):
    """Compute the recall.

    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
    true positives and ``fn`` the number of false negatives. The recall is
    intuitively the ability of the classifier to find all the positive samples.

    The best value is 1 and the worst value is 0.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import recall_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> recall_score(y_true, y_pred)
        0.5
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    nb_correct = len(true_entities & pred_entities)
    nb_true = len(true_entities)

    score = nb_correct / nb_true if nb_true > 0 else 0

    return score

def sequence_performance_measure(y_true, y_pred):
    """Compute the performance metrics: TP, FP, FN, TN.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        performance_dict : dict

    Example:
        >>> from seqeval.metrics import performance_measure
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'O', 'B-ORG'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> performance_measure(y_true, y_pred)
        {'TP': 3, 'FP': 3, 'FN': 1, 'TN': 4}
    """
    performance_dict = dict()
    if any(isinstance(s, list) for s in y_true):
        y_true = [item for sublist in y_true for item in sublist]
        y_pred = [item for sublist in y_pred for item in sublist]
    performance_dict["TP"] = sum(
        y_t == y_p for y_t, y_p in zip(y_true, y_pred) if ((y_t != "O") or (y_p != "O"))
    )
    performance_dict["FP"] = sum(y_t != y_p for y_t, y_p in zip(y_true, y_pred))
    performance_dict["FN"] = sum(((y_t != "O") and (y_p == "O")) for y_t, y_p in zip(y_true, y_pred))
    performance_dict["TN"] = sum((y_t == y_p == "O") for y_t, y_p in zip(y_true, y_pred))

    return performance_dict

def classification_report(y_true, y_pred, digits=2, suffix=False):
    """Build a text report showing the main classification metrics.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a classifier.
        digits : int. Number of digits for formatting output floating point values.

    Returns:
        report : string. Text summary of the precision, recall, F1 score for each class.

    Examples:
        >>> from seqeval.metrics import classification_report
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> print(classification_report(y_true, y_pred))
                   precision    recall  f1-score   support
        <BLANKLINE>
             MISC       0.00      0.00      0.00         1
              PER       1.00      1.00      1.00         1
        <BLANKLINE>
        micro avg       0.50      0.50      0.50         2
        macro avg       0.50      0.50      0.50         2
        <BLANKLINE>
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    name_width = 0
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    for e in true_entities:
        d1[e[0]].add((e[1], e[2]))
        name_width = max(name_width, len(e[0]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))

    last_line_heading = "macro avg"
    width = max(name_width, len(last_line_heading), digits)

    headers = ["precision", "recall", "f1-score", "support"]
    head_fmt = "{:>{width}s} " + " {:>9}" * len(headers)
    report = head_fmt.format("", *headers, width=width)
    report += "\n\n"

    row_fmt = "{:>{width}s} " + " {:>9.{digits}f}" * 3 + " {:>9}\n"

    ps, rs, f1s, s = [], [], [], []
    for type_name, true_entities in d1.items():
        pred_entities = d2[type_name]
        nb_correct = len(true_entities & pred_entities)
        nb_pred = len(pred_entities)
        nb_true = len(true_entities)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0

        report += row_fmt.format(*[type_name, p, r, f1, nb_true], width=width, digits=digits)

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        s.append(nb_true)

    report += "\n"

    # compute averages
    report += row_fmt.format(
        "micro avg",
        sequence_precision_score(y_true, y_pred, suffix=suffix),
        sequence_recall_score(y_true, y_pred, suffix=suffix),
        sequence_f1_score(y_true, y_pred, suffix=suffix),
        np.sum(s),
        width=width,
        digits=digits,
    )
    report += row_fmt.format(
        last_line_heading,
        np.average(ps, weights=s),
        np.average(rs, weights=s),
        np.average(f1s, weights=s),
        np.sum(s),
        width=width,
        digits=digits,
    )

    return report

# up to here code from seqeval/metrics/sequence_labeling.py
# (https://github.com/chakki-works/seqeval/tree/master/seqeval/metrics)
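
# ---------------------------------------------------------------------------
# Illustrative usage sketch (added note, not part of the original module): a
# quick, self-contained check of the entity-level metrics on made-up BILOU
# tag sequences.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    gold = [["B-PER", "L-PER", "O", "U-LOC"], ["O", "U-ORG", "O"]]
    pred = [["B-PER", "L-PER", "O", "O"], ["O", "U-ORG", "O"]]
    # Both predicted entities are correct; 2 of the 3 gold entities are found.
    print(sequence_precision_score(gold, pred))  # 1.0
    print(sequence_recall_score(gold, pred))     # ~0.667
    print(sequence_f1_score(gold, pred))         # ~0.8
    print(classification_report(gold, pred, digits=3))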