Source code for nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import re
from typing import Dict

from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page_extracted_relations import (
    WikipediaPageExtractedRelations,
    DISAMBIGUATION_TITLE,
)
from nlp_architect.utils.string_utils import StringUtils


[docs]class WikipediaPage(object):
    def __init__(
        self,
        orig_phrase: str = None,
        orig_phrase_norm: str = None,
        wiki_title: str = None,
        wiki_title_norm: str = None,
        score: int = 0,
        pageid: int = 0,
        description: str = None,
        relations: WikipediaPageExtractedRelations = None,
    ) -> None:
        """
        Object represent a Wikipedia Page and extracted fields.

        Args:
            orig_phrase (str): original search phrase
            orig_phrase_norm (str): original search phrase normalized
            wiki_title (str): page title
            wiki_title_norm (str): page title normalized
            score (int): score for getting wiki_title from orig_phrase
            pageid (int): the unique page identifier
            description (str, optional): the page description
            relations (WikipediaPageExtractedRelations): Object that represent all
                                                         extracted Wikipedia relations
        """
        self.orig_phrase = orig_phrase
        if orig_phrase_norm is None:
            self.orig_phrase_norm = StringUtils.normalize_str(orig_phrase)
        else:
            self.orig_phrase_norm = orig_phrase_norm

        self.wiki_title = wiki_title.replace(DISAMBIGUATION_TITLE, "")
        if wiki_title_norm is None:
            self.wiki_title_norm = StringUtils.normalize_str(wiki_title)
        else:
            self.wiki_title_norm = wiki_title_norm

        self.score = score
        self.pageid = int(pageid)
        self.description = description
        self.relations = relations

[docs]    def toJson(self) -> Dict:
        result_dict = {}
        result_dict["orig_phrase"] = self.orig_phrase
        result_dict["orig_phrase_norm"] = self.orig_phrase_norm
        result_dict["wiki_title"] = self.wiki_title
        result_dict["wiki_title_norm"] = self.wiki_title_norm
        result_dict["score"] = self.score
        result_dict["pageid"] = self.pageid
        result_dict["description"] = self.description
        result_dict["relations"] = self.relations.toJson()
        return result_dict

    def __eq__(self, other):
        return (
            self.orig_phrase == other.orig_phrase
            and self.wiki_title == other.wiki_title
            and self.pageid == other.pageid
        )

    def __hash__(self):
        return hash(self.orig_phrase) + hash(self.pageid) + hash(self.wiki_title)

    def __str__(self) -> str:
        result_str = ""
        try:
            title_strip = re.sub("(\u2018|\u2019)", "'", self.orig_phrase)
            wiki_title_strip = re.sub("(\u2018|\u2019)", "'", self.wiki_title)
            result_str = (
                str(title_strip)
                + ", "
                + str(wiki_title_strip)
                + ", "
                + str(self.score)
                + ", "
                + str(self.pageid)
                + ", "
                + str(self.description)
                + ", "
                + str(self.relations)
            )
        except Exception:
            result_str = "error in to_string()"

        return result_str