Source code for nlp_architect.common.cdc.mention_data

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import sys
from typing import List

from nlp_architect.utils.io import load_json_file
from nlp_architect.utils.string_utils import StringUtils


class MentionDataLight:
    """A mention described only by its text-level values."""

    def __init__(
        self,
        tokens_str: str,
        mention_context: str = None,
        mention_head: str = None,
        mention_head_lemma: str = None,
        mention_pos: str = None,
        mention_ner: str = None,
    ):
        """
        Object represent a mention with only text values

        When both ``mention_head`` and ``mention_head_lemma`` are missing
        (or empty), head, lemma, POS and NER are all derived from
        ``tokens_str`` with ``StringUtils.find_head_lemma_pos_ner`` and the
        ``mention_pos`` / ``mention_ner`` arguments are ignored. Otherwise
        the four arguments are stored exactly as given.

        Args:
            tokens_str: str the tokens combine text (join with space)
            mention_context: context of the mention, stored unchanged
            mention_head: str
            mention_head_lemma: str
            mention_pos: str, stored as ``self.mention_head_pos``
            mention_ner: str, stored as ``self.mention_ner``
        """
        self.tokens_str = tokens_str
        self.mention_context = mention_context

        if mention_head or mention_head_lemma:
            # At least one of head/lemma was supplied: trust the caller's values.
            self.mention_head = mention_head
            self.mention_head_lemma = mention_head_lemma
            self.mention_head_pos = mention_pos
            self.mention_ner = mention_ner
        else:
            # Nothing supplied: compute everything from the mention text.
            (
                self.mention_head,
                self.mention_head_lemma,
                self.mention_head_pos,
                self.mention_ner,
            ) = StringUtils.find_head_lemma_pos_ner(str(tokens_str))


class MentionData(MentionDataLight):
    """A mention with its topic/document/sentence/token location and coreference data."""

    def __init__(
        self,
        topic_id: str,
        doc_id: str,
        sent_id: int,
        tokens_numbers: List[int],
        tokens_str: str,
        mention_context: List[str],
        mention_head: str,
        mention_head_lemma: str,
        coref_chain: str,
        mention_type: str = "NA",
        is_continuous: bool = True,
        is_singleton: bool = False,
        score: float = float(-1),
        predicted_coref_chain: str = None,
        mention_pos: str = None,
        mention_ner: str = None,
        mention_index: int = -1,
    ) -> None:
        """
        Object represent a mention

        Args:
            topic_id: str topic ID
            doc_id: str document ID
            sent_id: int sentence number
            tokens_numbers: List[int] - tokens numbers
                (stored on the instance as ``tokens_number``)
            tokens_str: str the tokens combine text (join with space)
            mention_context: List[str] - list of tokens strings
            mention_head: str
            mention_head_lemma: str
            coref_chain: str
            mention_type: str one of (HUM/NON/TIM/LOC/ACT/NEG)
            is_continuous: bool
            is_singleton: bool
            score: float
            predicted_coref_chain: str (should be field while evaluated)
            mention_pos: str
            mention_ner: str
            mention_index: in case order is of value (default = -1)
        """
        super().__init__(
            tokens_str, mention_context, mention_head, mention_head_lemma, mention_pos, mention_ner
        )
        self.topic_id = topic_id
        self.doc_id = doc_id
        self.sent_id = sent_id
        self.tokens_number = tokens_numbers
        self.mention_type = mention_type
        self.coref_chain = coref_chain
        self.is_continuous = is_continuous
        self.is_singleton = is_singleton
        self.score = score
        self.predicted_coref_chain = predicted_coref_chain
        # The id is built from doc_id / sent_id / tokens_number, so it must be
        # generated only after those attributes are set.
        self.mention_id = self.gen_mention_id()
        self.mention_index = mention_index

    @staticmethod
    def read_json_mention_data_line(mention_line: dict) -> "MentionData":
        """
        Build a MentionData from one parsed JSON mention entry.

        Only ``tokens_str`` is mandatory; every other key is optional and
        falls back to the default shown in the code below.

        Args:
            mention_line: a Json representation of a single mention
                (accessed by key, i.e. a mapping)

        Returns:
            MentionData object

        Raises:
            Exception: if anything fails while reading the entry or building
                the object; the original error is attached as ``__cause__``.
        """
        try:
            mention_text = mention_line["tokens_str"]

            doc_id = None
            if "doc_id" in mention_line:
                doc_id = mention_line["doc_id"]
                # Document ids are normalised to carry the ".xml" marker.
                if ".xml" not in doc_id:
                    doc_id = doc_id + ".xml"

            if "mention_head" in mention_line and "mention_head_lemma" in mention_line:
                mention_head = mention_line["mention_head"]
                mention_head_lemma = mention_line["mention_head_lemma"]
                mention_pos = mention_line.get("mention_head_pos")
                mention_ner = mention_line.get("mention_ner")
            else:
                # Head/lemma not both present: derive all four values from the
                # mention text (any POS/NER keys in the entry are not read).
                (
                    mention_head,
                    mention_head_lemma,
                    mention_pos,
                    mention_ner,
                ) = StringUtils.find_head_lemma_pos_ner(str(mention_text))

            # NOTE(review): the fall-backs used here for a missing
            # "is_continuous" (False) and "mention_type" (None) differ from
            # the constructor defaults (True / "NA"). Kept as-is to preserve
            # existing behaviour - confirm whether this is intended.
            return MentionData(
                topic_id=mention_line.get("topic_id"),
                doc_id=doc_id,
                sent_id=mention_line.get("sent_id"),
                tokens_numbers=mention_line.get("tokens_number"),
                tokens_str=mention_text,
                mention_context=mention_line.get("mention_context"),
                mention_head=mention_head,
                mention_head_lemma=mention_head_lemma,
                coref_chain=mention_line.get("coref_chain"),
                mention_type=mention_line.get("mention_type"),
                is_continuous=mention_line.get("is_continuous", False),
                is_singleton=mention_line.get("is_singleton", False),
                score=mention_line.get("score", -1),
                predicted_coref_chain=mention_line.get("predicted_coref_chain"),
                mention_pos=mention_pos,
                mention_ner=mention_ner,
                mention_index=mention_line.get("mention_index", -1),
            )
        except Exception as err:
            print("Unexpected error:", type(err))
            # Same exception type and message as before, now explicitly
            # chained so the root cause is visible in the traceback.
            raise Exception("failed reading json line-" + str(mention_line)) from err

    @staticmethod
    def read_mentions_json_to_mentions_data_list(mentions_json_file: str) -> List["MentionData"]:
        """
        Load a mentions json file and convert every entry to a MentionData.

        Args:
            mentions_json_file: the path of the mentions json file to read

        Returns:
            List[MentionData]
        """
        all_mentions_only = load_json_file(mentions_json_file)
        return [
            MentionData.read_json_mention_data_line(mention_line)
            for mention_line in all_mentions_only
        ]

    def get_tokens(self):
        """Return the tokens numbers of this mention (``self.tokens_number``)."""
        return self.tokens_number

    def gen_mention_id(self) -> str:
        """
        Build the mention id.

        Returns:
            ``<doc_id>_<sent_id>_<token>_<token>...`` when doc_id, sent_id and
            tokens_number are all available; otherwise the whitespace-split
            words of ``tokens_str`` joined with ``_``.
        """
        # sent_id is compared against None because 0 is a valid sentence number.
        if self.doc_id and self.sent_id is not None and self.tokens_number:
            tokens_ids = [str(self.doc_id), str(self.sent_id)]
            tokens_ids.extend(str(token_id) for token_id in self.tokens_number)
            return "_".join(tokens_ids)

        return "_".join(self.tokens_str.split())

    def get_mention_id(self) -> str:
        """Return the mention id, generating it first if it is empty."""
        if not self.mention_id:
            self.mention_id = self.gen_mention_id()
        return self.mention_id

    @staticmethod
    def static_gen_token_unique_id(doc_id: int, sent_id: int, token_id: int) -> str:
        """Return ``<doc_id>_<sent_id>_<token_id>`` as a string id for one token."""
        return "_".join([str(doc_id), str(sent_id), str(token_id)])