Source code for nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import re
from typing import Dict

from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page_extracted_relations import (
    WikipediaPageExtractedRelations,
    DISAMBIGUATION_TITLE,
)
from nlp_architect.utils.string_utils import StringUtils


class WikipediaPage:
    """A Wikipedia page matched to a search phrase, plus its extracted fields.

    Two pages compare equal (and hash alike) when their ``orig_phrase``,
    ``wiki_title`` and ``pageid`` are all equal; the remaining attributes do
    not take part in equality.
    """

    # Typographic single quotes that ``__str__`` rewrites to a plain apostrophe.
    _CURLY_QUOTES = re.compile("[\u2018\u2019]")

    def __init__(
        self,
        orig_phrase: str = None,
        orig_phrase_norm: str = None,
        wiki_title: str = None,
        wiki_title_norm: str = None,
        score: int = 0,
        pageid: int = 0,
        description: str = None,
        relations: "WikipediaPageExtractedRelations" = None,
    ) -> None:
        """
        Object represent a Wikipedia Page and extracted fields.

        Args:
            orig_phrase (str): original search phrase
            orig_phrase_norm (str): original search phrase normalized; computed
                with ``StringUtils.normalize_str(orig_phrase)`` when omitted
            wiki_title (str): page title; ``DISAMBIGUATION_TITLE`` is removed
                from the stored value. ``None`` is stored unchanged.
            wiki_title_norm (str): page title normalized; computed from the
                title as passed in (before the disambiguation marker is
                removed) when omitted, and left as ``None`` when there is no
                title to normalize
            score (int): score for getting wiki_title from orig_phrase
            pageid (int): the unique page identifier (coerced with ``int()``)
            description (str, optional): the page description
            relations (WikipediaPageExtractedRelations): Object that represent all
                extracted Wikipedia relations
        """
        self.orig_phrase = orig_phrase
        if orig_phrase_norm is None:
            self.orig_phrase_norm = StringUtils.normalize_str(orig_phrase)
        else:
            self.orig_phrase_norm = orig_phrase_norm

        # wiki_title defaults to None, so it must not be dereferenced blindly.
        if wiki_title is None:
            self.wiki_title = None
        else:
            self.wiki_title = wiki_title.replace(DISAMBIGUATION_TITLE, "")

        if wiki_title_norm is not None:
            self.wiki_title_norm = wiki_title_norm
        elif wiki_title is None:
            self.wiki_title_norm = None
        else:
            self.wiki_title_norm = StringUtils.normalize_str(wiki_title)

        self.score = score
        self.pageid = int(pageid)
        self.description = description
        self.relations = relations

    def toJson(self) -> Dict:
        """Return the page as a plain dictionary.

        Returns:
            Dict: one entry per attribute. ``relations`` holds the result of
            ``self.relations.toJson()``, or ``None`` when no relations object
            is attached.
        """
        relations_json = None if self.relations is None else self.relations.toJson()
        return {
            "orig_phrase": self.orig_phrase,
            "orig_phrase_norm": self.orig_phrase_norm,
            "wiki_title": self.wiki_title,
            "wiki_title_norm": self.wiki_title_norm,
            "score": self.score,
            "pageid": self.pageid,
            "description": self.description,
            "relations": relations_json,
        }

    def __eq__(self, other):
        # Let Python fall back to its default handling for unrelated types
        # instead of failing on a missing attribute.
        if not isinstance(other, WikipediaPage):
            return NotImplemented
        return (
            self.orig_phrase == other.orig_phrase
            and self.wiki_title == other.wiki_title
            and self.pageid == other.pageid
        )

    def __hash__(self):
        # Hash exactly the fields that __eq__ compares.
        return hash((self.orig_phrase, self.pageid, self.wiki_title))

    def __str__(self) -> str:
        """Return a comma-separated, human-readable summary of the page.

        Typographic single quotes in the phrase and title are replaced with a
        plain apostrophe. Rendering is best effort: if any field fails to
        convert, the fixed string ``"error in to_string()"`` is returned
        instead of raising.
        """
        try:
            # Convert to str first so a missing phrase/title renders as "None"
            # rather than aborting the whole summary.
            phrase_text = self._CURLY_QUOTES.sub("'", str(self.orig_phrase))
            title_text = self._CURLY_QUOTES.sub("'", str(self.wiki_title))
            fields = (
                phrase_text,
                title_text,
                self.score,
                self.pageid,
                self.description,
                self.relations,
            )
            return ", ".join(str(field) for field in fields)
        except Exception:
            return "error in to_string()"