# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
##
# This file contains code from
# (https://github.com/chakki-works/seqeval/tree/master/seqeval/metrics)
#
# The code was changed to support BILOU format evaluation
#
# MIT License
#
# Copyright (c) 2018 chakki
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from __future__ import absolute_import, division, print_function, unicode_literals
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score as classification_f1_score
from collections import defaultdict
import numpy as np
def get_conll_scores(predictions, y, y_lex, unk="O"):
    """Get CoNLL-style scores (precision, recall, f1)"""
    if isinstance(predictions, list):
        predictions = predictions[-1]
    test_p = predictions
    if len(test_p.shape) > 2:
        test_p = test_p.argmax(2)
    test_y = y
    if len(test_y.shape) > 2:
        test_y = test_y.argmax(2)

    prediction_data = []
    for n in range(test_y.shape[0]):
        test_yval = []
        for i in list(test_y[n]):
            try:
                test_yval.append(y_lex[i])
            except KeyError:
                pass
        test_pval = [unk] * len(test_yval)
        for e, i in enumerate(list(test_p[n])[: len(test_pval)]):
            try:
                test_pval[e] = y_lex[i]
            except KeyError:
                pass
        prediction_data.append((test_yval, test_pval))
    y_true, y_pred = list(zip(*prediction_data))
    return classification_report(y_true, y_pred, digits=3)
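# A minimal usage sketch (illustrative, not part of the original module). The
# label vocabulary ``y_lex`` below is hypothetical; in practice it is the
# dataset's index-to-tag mapping. Gold indices missing from ``y_lex`` are
# skipped, and unmapped predictions fall back to the ``unk`` tag.
#
#   >>> y_lex = {1: "O", 2: "B-PER", 3: "I-PER"}
#   >>> gold = np.array([[2, 3, 1]])    # gold tag indices, shape (batch, seq_len)
#   >>> preds = np.array([[2, 3, 1]])   # predicted tag indices, same shape
#   >>> print(get_conll_scores(preds, gold, y_lex))   # CoNLL-style report per entity type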
def simple_accuracy(preds, labels):
    """return simple accuracy"""
    return (preds == labels).mean()
def accuracy(preds, labels):
    """return simple accuracy in expected dict format"""
    acc = simple_accuracy(preds, labels)
    return {"acc": acc}
def acc_and_f1(preds, labels):
    """return accuracy and f1 score"""
    acc = simple_accuracy(preds, labels)
    f1 = classification_f1_score(y_true=labels, y_pred=preds)
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
    }
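# A minimal sketch (hypothetical binary inputs): ``classification_f1_score`` is
# sklearn's ``f1_score`` with its default (binary) averaging, so ``preds`` and
# ``labels`` are expected to be equal-length arrays of 0/1 class ids.
#
#   >>> preds = np.array([1, 0, 1, 1])
#   >>> labels = np.array([1, 0, 0, 1])
#   >>> acc_and_f1(preds, labels)
#   # -> acc 0.75, f1 0.8 and their mean 0.775 for this toy input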
def pearson_and_spearman(preds, labels):
    """get Pearson and Spearman correlation"""
    pearson_corr = pearsonr(preds, labels)[0]
    spearman_corr = spearmanr(preds, labels)[0]
    return {
        "pearson": pearson_corr,
        "spearmanr": spearman_corr,
        "corr": (pearson_corr + spearman_corr) / 2,
    }
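# A minimal sketch (hypothetical regression outputs): ``preds`` and ``labels``
# are 1-d arrays of scores; for monotonically increasing pairs like the ones
# below Spearman is 1.0 and Pearson is close to 1.0, and "corr" is their mean.
#
#   >>> pearson_and_spearman(np.array([0.1, 0.4, 0.9]), np.array([1.0, 2.0, 3.0]))
#   # returns a dict with keys "pearson", "spearmanr" and "corr"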
def tagging(preds, labels):
    """return entity-level precision, recall and f1 for tag sequences"""
    p = sequence_precision_score(labels, preds)
    r = sequence_recall_score(labels, preds)
    f1 = sequence_f1_score(labels, preds)
    return p, r, f1
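# A minimal sketch (hypothetical BILOU-tagged sentences): ``tagging`` scores
# predictions at the entity level using the sequence_* functions defined below.
#
#   >>> gold = [["B-PER", "L-PER", "O", "U-LOC"]]
#   >>> pred = [["B-PER", "L-PER", "O", "O"]]
#   >>> tagging(pred, gold)
#   # -> precision 1.0, recall 0.5, F1 ~0.67 (one of two gold entities found)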
##
# The code below is taken from the chakki-works/seqeval package
# (seqeval/metrics/sequence_labeling.py) and was changed to support
# BILOU format evaluation
##
"""Metrics to assess performance on a sequence labeling task given predictions.

Functions named as ``*_score`` return a scalar value to maximize: the higher
the better.
"""
def get_entities(seq, suffix=False):
    """Gets entities from sequence.

    Args:
        seq (list): sequence of labels.

    Returns:
        list: list of (chunk_type, chunk_start, chunk_end).

    Example:
        >>> from seqeval.metrics.sequence_labeling import get_entities
        >>> seq = ['B-PER', 'I-PER', 'O', 'B-LOC']
        >>> get_entities(seq)
        [('PER', 0, 1), ('LOC', 3, 3)]
    """
    # for nested list
    if any(isinstance(s, list) for s in seq):
        seq = [item for sublist in seq for item in sublist + ["O"]]

    prev_tag = "O"
    prev_type = ""
    begin_offset = 0
    chunks = []
    for i, chunk in enumerate(seq + ["O"]):
        if suffix:
            tag = chunk[-1]
            type_ = chunk.split("-")[0]
        else:
            tag = chunk[0]
            type_ = chunk.split("-")[-1]

        if end_of_chunk(prev_tag, tag, prev_type, type_):
            chunks.append((prev_type, begin_offset, i - 1))
        if start_of_chunk(prev_tag, tag, prev_type, type_):
            begin_offset = i
        prev_tag = tag
        prev_type = type_

    return chunks
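# BILOU example (illustration added here, not part of the upstream seqeval
# docstring): the adapted end_of_chunk/start_of_chunk below treat "L"/"U" like
# IOBES "E"/"S", so chunk-final and single-token tags are recognized as well.
#
#   >>> get_entities(["B-PER", "L-PER", "O", "U-LOC"])
#   [('PER', 0, 1), ('LOC', 3, 3)]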
def end_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk ended between the previous and current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        chunk_end: boolean.
    """
    chunk_end = False

    end_tag = ("L", "E")
    start_tag = ("U", "S")

    if prev_tag in end_tag:
        chunk_end = True
    if prev_tag in start_tag:
        chunk_end = True

    if prev_tag == "B" and tag == "B":
        chunk_end = True
    if prev_tag == "B" and tag in start_tag:
        chunk_end = True
    if prev_tag == "B" and tag == "O":
        chunk_end = True
    if prev_tag == "I" and tag == "B":
        chunk_end = True
    if prev_tag == "I" and tag in start_tag:
        chunk_end = True
    if prev_tag == "I" and tag == "O":
        chunk_end = True

    if prev_tag != "O" and prev_tag != "." and prev_type != type_:
        chunk_end = True

    return chunk_end
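# Illustrative checks (hypothetical tag pairs): a chunk ends after an "L"/"E"
# or "U"/"S" tag, after a "B"/"I" tag that is followed by "B", "U"/"S" or "O",
# and whenever the entity type changes after a non-"O" tag.
#
#   >>> end_of_chunk(prev_tag="L", tag="O", prev_type="PER", type_="O")
#   True
#   >>> end_of_chunk(prev_tag="I", tag="I", prev_type="PER", type_="PER")
#   False
#   >>> end_of_chunk(prev_tag="I", tag="I", prev_type="PER", type_="LOC")
#   True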
def start_of_chunk(prev_tag, tag, prev_type, type_):
    """Checks if a chunk started between the previous and current word.

    Args:
        prev_tag: previous chunk tag.
        tag: current chunk tag.
        prev_type: previous type.
        type_: current type.

    Returns:
        chunk_start: boolean.
    """
    chunk_start = False

    end_tag = ("L", "E")
    start_tag = ("U", "S")

    if tag == "B":
        chunk_start = True
    if tag in start_tag:
        chunk_start = True

    if prev_tag in end_tag and tag in end_tag:
        chunk_start = True
    if prev_tag in end_tag and tag == "I":
        chunk_start = True
    if prev_tag in start_tag and tag in end_tag:
        chunk_start = True
    if prev_tag in start_tag and tag == "I":
        chunk_start = True
    if prev_tag == "O" and tag in end_tag:
        chunk_start = True
    if prev_tag == "O" and tag == "I":
        chunk_start = True

    if tag != "O" and tag != "." and prev_type != type_:
        chunk_start = True

    return chunk_start
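# Illustrative checks (hypothetical tag pairs): a chunk starts on every "B" or
# "U"/"S" tag, on an "I" or "L"/"E" that does not continue a chunk, and
# whenever the entity type changes on a non-"O" tag.
#
#   >>> start_of_chunk(prev_tag="O", tag="U", prev_type="O", type_="LOC")
#   True
#   >>> start_of_chunk(prev_tag="B", tag="I", prev_type="PER", type_="PER")
#   False
#   >>> start_of_chunk(prev_tag="O", tag="I", prev_type="O", type_="PER")
#   True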
def sequence_f1_score(y_true, y_pred, suffix=False):
    """Compute the F1 score.

    The F1 score can be interpreted as a weighted average of the precision and
    recall, where an F1 score reaches its best value at 1 and worst score at 0.
    The relative contributions of precision and recall to the F1 score are
    equal. The formula for the F1 score is::

        F1 = 2 * (precision * recall) / (precision + recall)

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import f1_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
        ...           ['B-PER', 'I-PER', 'O']]
        >>> f1_score(y_true, y_pred)
        0.50
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    nb_correct = len(true_entities & pred_entities)
    nb_pred = len(pred_entities)
    nb_true = len(true_entities)

    p = nb_correct / nb_pred if nb_pred > 0 else 0
    r = nb_correct / nb_true if nb_true > 0 else 0
    score = 2 * p * r / (p + r) if p + r > 0 else 0

    return score
def sequence_accuracy_score(y_true, y_pred):
    """Accuracy classification score.

    In multilabel classification, this function computes subset accuracy:
    the set of labels predicted for a sample must *exactly* match the
    corresponding set of labels in y_true.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import accuracy_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
        ...           ['B-PER', 'I-PER', 'O']]
        >>> accuracy_score(y_true, y_pred)
        0.80
    """
    if any(isinstance(s, list) for s in y_true):
        y_true = [item for sublist in y_true for item in sublist]
        y_pred = [item for sublist in y_pred for item in sublist]

    nb_correct = sum(y_t == y_p for y_t, y_p in zip(y_true, y_pred))
    nb_true = len(y_true)

    score = nb_correct / nb_true

    return score
def sequence_precision_score(y_true, y_pred, suffix=False):
    """Compute the precision.

    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The precision is
    intuitively the ability of the classifier not to label as positive a sample
    that is negative.

    The best value is 1 and the worst value is 0.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import precision_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
        ...           ['B-PER', 'I-PER', 'O']]
        >>> precision_score(y_true, y_pred)
        0.50
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    nb_correct = len(true_entities & pred_entities)
    nb_pred = len(pred_entities)

    score = nb_correct / nb_pred if nb_pred > 0 else 0

    return score
def sequence_recall_score(y_true, y_pred, suffix=False):
    """Compute the recall.

    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
    true positives and ``fn`` the number of false negatives. The recall is
    intuitively the ability of the classifier to find all the positive samples.

    The best value is 1 and the worst value is 0.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a tagger.

    Returns:
        score : float.

    Example:
        >>> from seqeval.metrics import recall_score
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
        ...           ['B-PER', 'I-PER', 'O']]
        >>> recall_score(y_true, y_pred)
        0.50
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    nb_correct = len(true_entities & pred_entities)
    nb_true = len(true_entities)

    score = nb_correct / nb_true if nb_true > 0 else 0

    return score
def classification_report(y_true, y_pred, digits=2, suffix=False):
    """Build a text report showing the main classification metrics.

    Args:
        y_true : 2d array. Ground truth (correct) target values.
        y_pred : 2d array. Estimated targets as returned by a classifier.
        digits : int. Number of digits for formatting output floating point values.

    Returns:
        report : string. Text summary of the precision, recall, F1 score for each class.

    Examples:
        >>> from seqeval.metrics import classification_report
        >>> y_true = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
        >>> y_pred = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'],
        ...           ['B-PER', 'I-PER', 'O']]
        >>> print(classification_report(y_true, y_pred))
                   precision    recall  f1-score   support
        <BLANKLINE>
             MISC       0.00      0.00      0.00         1
              PER       1.00      1.00      1.00         1
        <BLANKLINE>
        micro avg       0.50      0.50      0.50         2
        macro avg       0.50      0.50      0.50         2
        <BLANKLINE>
    """
    true_entities = set(get_entities(y_true, suffix))
    pred_entities = set(get_entities(y_pred, suffix))

    name_width = 0
    d1 = defaultdict(set)
    d2 = defaultdict(set)
    for e in true_entities:
        d1[e[0]].add((e[1], e[2]))
        name_width = max(name_width, len(e[0]))
    for e in pred_entities:
        d2[e[0]].add((e[1], e[2]))

    last_line_heading = "macro avg"
    width = max(name_width, len(last_line_heading), digits)

    headers = ["precision", "recall", "f1-score", "support"]
    head_fmt = "{:>{width}s} " + " {:>9}" * len(headers)
    report = head_fmt.format("", *headers, width=width)
    report += "\n\n"

    row_fmt = "{:>{width}s} " + " {:>9.{digits}f}" * 3 + " {:>9}\n"

    ps, rs, f1s, s = [], [], [], []
    for type_name, true_entities in d1.items():
        pred_entities = d2[type_name]
        nb_correct = len(true_entities & pred_entities)
        nb_pred = len(pred_entities)
        nb_true = len(true_entities)

        p = nb_correct / nb_pred if nb_pred > 0 else 0
        r = nb_correct / nb_true if nb_true > 0 else 0
        f1 = 2 * p * r / (p + r) if p + r > 0 else 0

        report += row_fmt.format(*[type_name, p, r, f1, nb_true], width=width, digits=digits)

        ps.append(p)
        rs.append(r)
        f1s.append(f1)
        s.append(nb_true)

    report += "\n"

    # compute averages
    report += row_fmt.format(
        "micro avg",
        sequence_precision_score(y_true, y_pred, suffix=suffix),
        sequence_recall_score(y_true, y_pred, suffix=suffix),
        sequence_f1_score(y_true, y_pred, suffix=suffix),
        np.sum(s),
        width=width,
        digits=digits,
    )
    report += row_fmt.format(
        last_line_heading,
        np.average(ps, weights=s),
        np.average(rs, weights=s),
        np.average(f1s, weights=s),
        np.sum(s),
        width=width,
        digits=digits,
    )

    return report
# up to here code from seqeval/metrics/sequence_labeling.py
# (https://github.com/chakki-works/seqeval/tree/master/seqeval/metrics)