Source code for nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page_extracted_relations

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import re
import string
from typing import Set, Dict

from nlp_architect.utils.string_utils import StringUtils

# Category names that flag a page title as (part of) a person's name; the
# relation extractor sets ``is_part_name`` when any of them appears among the
# categories extracted from a line of page text.
PART_NAME_CATEGORIES = ["name", "given name", "surname"]
# Marker carried by disambiguation titles/categories; it is stripped from
# extracted category names and excluded from extracted title parentheses.
DISAMBIGUATION_TITLE = "(disambiguation)"
# Presumably the category names that identify a disambiguation page; not
# referenced in the visible part of this file -- verify against callers.
DISAMBIGUATION_CATEGORY = ["disambig", "disambiguation"]


class WikipediaPageExtractedRelations(object):
    """Relations extracted from a single Wikipedia page.

    Holds the raw relation sets (links, categories, aliases, ...) together
    with their normalized counterparts, and provides helpers that extract
    those relations from raw wiki markup.
    """

    # Patterns are compiled once at class creation instead of on every line.
    # The link/category patterns are non-greedy so that several ``[[...]]``
    # occurrences on one line are captured as separate matches.
    _CATEGORY_LINK_RE = re.compile(r"\[\[Category:(.*?)\]\]")
    _DISAMBIG_TEMPLATE_RE = re.compile(r"^{{(disambig.*|Disambig.*)}}$")
    _WIKI_LINK_RE = re.compile(r"\[\[(.*?)\]\]")
    _TITLE_PARENTHESIS_RE = re.compile(r"(.*)\s?\((.*)\)")
    # ``re.escape`` is required: without it the ``\]`` inside
    # ``string.punctuation`` is read as an escaped bracket and the backslash
    # character itself is never matched.
    _PUNCT_OR_SPACE_RE = re.compile(
        "[" + re.escape(string.punctuation + string.whitespace) + "]"
    )
    # Lower-case markers (used as regex patterns) that identify a page about a
    # given name / surname.
    _NAME_PART_MARKERS = (
        "===as surname===",
        "===as given name===",
        "===given names===",
        "==as a surname==",
        "==people with the surname==",
        "==family name and surname==",
        "category:given names",
        "{{given name}}",
    )

    def __init__(
        self,
        is_part_name: bool = False,
        is_disambiguation: bool = False,
        parenthesis: Set[str] = None,
        disambiguation_links: Set[str] = None,
        categories: Set[str] = None,
        aliases: Set[str] = None,
        be_comp: Set[str] = None,
        disambiguation_links_norm: Set[str] = None,
        categories_norm: Set[str] = None,
        aliases_norm: Set[str] = None,
        title_parenthesis_norm: Set[str] = None,
        be_comp_norm: Set[str] = None,
    ) -> None:
        """
        Object represent a Wikipedia Relations Schema

        Args:
            is_part_name (bool): Whether page title is part of a Name (ie-family name/given name..)
            is_disambiguation (bool): Whether page is a disambiguation page
            parenthesis (set): a set of all parenthesis links/titles
                (stored as ``title_parenthesis``)
            disambiguation_links (set): a set of all disambiguation links/titles
            categories (set): a set of all category links/titles
            aliases (set): a set of all aliases links/titles
            be_comp (set): a set of all "is a" links/titles
            disambiguation_links_norm (set): same as disambiguation_link just normalized
            categories_norm (set): same as categories just normalized, lower and clean
            aliases_norm (set): same as aliases just normalized, lower and clean
            title_parenthesis_norm (set): same as parenthesis just normalized, lower and clean
            be_comp_norm (set): same as be_comp just normalized, lower and clean
        """
        self.is_part_name = is_part_name
        self.is_disambiguation = is_disambiguation
        self.disambiguation_links = disambiguation_links
        self.title_parenthesis = parenthesis
        self.categories = categories
        self.aliases = aliases
        self.be_comp = be_comp

        self.disambiguation_links_norm = disambiguation_links_norm
        self.categories_norm = categories_norm
        self.aliases_norm = aliases_norm
        self.title_parenthesis_norm = title_parenthesis_norm
        self.be_comp_norm = be_comp_norm

    def extract_relations_from_text_v0(self, text):
        """
        Populate this object's relations from raw page text.

        Resets the disambiguation-link, category and title-parenthesis sets
        (raw and normalized) as well as ``be_comp_norm``, then scans ``text``
        line by line. Categories are always collected; links and title
        parentheses are stored only when ``is_disambiguation`` is already set.
        ``is_part_name`` is switched on (never off) when a line carries a name
        marker or one of ``PART_NAME_CATEGORIES``.

        Args:
            text (str): page text, lines separated by a newline character
        """
        self.disambiguation_links = set()
        self.categories = set()
        self.title_parenthesis = set()

        self.disambiguation_links_norm = set()
        self.categories_norm = set()
        self.title_parenthesis_norm = set()
        self.be_comp_norm = set()

        ext_links = set()
        title_parenthesis = set()

        for line in text.split("\n"):
            cat_links = self.extract_categories(line)
            if not self.is_part_name:
                # Either an explicit name-section marker or a name category.
                self.is_part_name = self.is_name_part(line) or any(
                    name_cat in cat_links for name_cat in PART_NAME_CATEGORIES
                )

            self.categories.update(cat_links)
            self.categories_norm.update(StringUtils.normalize_string_list(cat_links))

            links, parenthesis_links = self.extract_links_and_parenthesis(line)
            ext_links.update(links)
            title_parenthesis.update(parenthesis_links)

        if self.is_disambiguation:
            self.disambiguation_links = ext_links
            self.disambiguation_links_norm = StringUtils.normalize_string_list(ext_links)
            self.title_parenthesis = title_parenthesis
            self.title_parenthesis_norm = StringUtils.normalize_string_list(title_parenthesis)

    def __str__(self) -> str:
        return ", ".join(
            str(field)
            for field in (
                self.is_disambiguation,
                self.is_part_name,
                self.disambiguation_links,
                self.be_comp,
                self.title_parenthesis,
                self.categories,
            )
        )

    def toJson(self) -> Dict:
        """
        Build a JSON-serializable dictionary of the relations.

        ``isPartName`` and ``isDisambiguation`` are always present. Every
        collection is written as a list under its own key, and only when that
        collection is not None, so a partially populated object (for example
        ``be_comp_norm`` set while ``be_comp`` is still None) no longer raises
        ``TypeError``.

        Returns:
            Dict: mapping of camelCase field names to bools / lists
        """
        result_dict = {
            "isPartName": self.is_part_name,
            "isDisambiguation": self.is_disambiguation,
        }

        optional_fields = (
            ("disambiguationLinks", self.disambiguation_links),
            ("disambiguationLinksNorm", self.disambiguation_links_norm),
            ("categories", self.categories),
            ("categoriesNorm", self.categories_norm),
            ("aliases", self.aliases),
            ("titleParenthesis", self.title_parenthesis),
            ("titleParenthesisNorm", self.title_parenthesis_norm),
            ("beCompRelations", self.be_comp),
            ("beCompRelationsNorm", self.be_comp_norm),
        )
        for key, values in optional_fields:
            if values is not None:
                result_dict[key] = list(values)

        return result_dict

    @staticmethod
    def extract_categories(line: str) -> Set[str]:
        """
        Extract category names from one line of wiki markup.

        Two forms are recognized: ``[[Category:...]]`` links (any number per
        line, with the ``DISAMBIGUATION_TITLE`` marker removed from the name)
        and a line consisting solely of a ``{{disambig...}}`` /
        ``{{Disambig...}}`` template, whose ``|``-separated parts are each
        added as a category.

        Args:
            line (str): a single line of page text

        Returns:
            Set[str]: the category names found (empty when there are none)
        """
        cls = WikipediaPageExtractedRelations
        categories = set()
        for cat in cls._CATEGORY_LINK_RE.findall(line):
            if DISAMBIGUATION_TITLE in cat:
                cat = cat.replace(DISAMBIGUATION_TITLE, "")
            categories.add(cat)

        template = cls._DISAMBIG_TEMPLATE_RE.search(line)
        if template is not None:
            categories.update(template.group(1).split("|"))

        return categories

    @staticmethod
    def extract_links_and_parenthesis(line: str):
        """
        Extract ``[[...]]`` link texts and their parenthesized qualifiers.

        Every ``|``-separated part of every link is treated as a link text.
        A trailing ``(...)`` qualifier is split off; it is kept as a
        parenthesis entry unless it is the word "disambiguation". Punctuation
        and whitespace characters in both results are replaced by spaces and
        the ends are stripped.

        Args:
            line (str): a single line of page text

        Returns:
            tuple: ``(links, parenthesis_links)``, both sets of cleaned strings
        """
        cls = WikipediaPageExtractedRelations
        links = set()
        parenthesis_links = set()
        for link in cls._WIKI_LINK_RE.findall(line):
            for s_link in link.split("|"):
                parenthesis_clean = None
                matcher = cls._TITLE_PARENTHESIS_RE.match(s_link)
                if matcher:
                    # Keep only the part before the parenthesis as the link text.
                    s_link = matcher.group(1)
                    parenthesis_match = matcher.group(2)
                    if parenthesis_match.lower() != "disambiguation":
                        parenthesis_clean = cls._PUNCT_OR_SPACE_RE.sub(
                            " ", parenthesis_match
                        ).strip()

                if parenthesis_clean is not None and DISAMBIGUATION_TITLE not in parenthesis_clean:
                    parenthesis_links.add(parenthesis_clean)

                links.add(cls._PUNCT_OR_SPACE_RE.sub(" ", s_link).strip())

        return links, parenthesis_links

    @staticmethod
    def is_name_part(line: str) -> bool:
        """
        Tell whether a line marks the page as a given-name / surname page.

        The line is lower-cased and searched for any of the known section
        headers, category or template markers.

        Args:
            line (str): a single line of page text

        Returns:
            bool: True when at least one name marker occurs in the line
        """
        lowered = line.lower()
        return any(
            WikipediaPageExtractedRelations.find_in_line(lowered, marker)
            for marker in WikipediaPageExtractedRelations._NAME_PART_MARKERS
        )

    @staticmethod
    def find_in_line(text: str, pattern: str) -> bool:
        """
        Tell whether a regular expression matches anywhere in a text.

        Args:
            text (str): the text to search
            pattern (str): regular expression to look for

        Returns:
            bool: True when ``pattern`` matches at least once in ``text``
        """
        return bool(re.findall(pattern, text))