Source code for nlp_architect.data.cdc_resources.wikipedia.wiki_online

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

import logging
import os
import re
import sys

from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page import WikipediaPage
from nlp_architect.data.cdc_resources.data_types.wiki.wikipedia_page_extracted_relations import (
    WikipediaPageExtractedRelations,
)
from nlp_architect.data.cdc_resources.wikipedia.wiki_search_page_result import (
    WikipediaSearchPageResult,
)
from nlp_architect.utils.text import SpacyInstance

os.environ["PYWIKIBOT_NO_USER_CONFIG"] = "1"

DISAMBIGUATE_PAGE = ["wikimedia disambiguation page", "wikipedia disambiguation page"]
NAME_DESCRIPTIONS = ["given name", "first name", "family name"]

logger = logging.getLogger(__name__)


class WikiOnline(object):
    def __init__(self):
        try:
            import pywikibot
        except (AttributeError, ImportError):
            logger.error(
                "pywikibot is not installed, please install nlp_architect with [all] package. "
                + "for example: pip install nlp_architect[all]"
            )
            sys.exit()

        self.spacy = SpacyInstance()
        self.pywikibot = pywikibot
        self.cache = dict()
        self.site = pywikibot.Site("en", "wikipedia")  # The site we want to run our bot on
    def get_pages(self, phrase):
        """Search Wikipedia for a phrase, trying several capitalization variants,
        and return a set of WikipediaSearchPageResult objects (cached per phrase)."""
        if phrase in self.cache:
            return self.cache[phrase]

        ret_pages = set()
        word_clean = phrase.replace("-", " ")
        word_lower = word_clean.lower()
        word_upper = word_clean.upper()
        word_title = word_clean.title()
        words_set = {phrase, word_clean, word_lower, word_upper, word_title}
        for appr in words_set:
            try:
                page_result = self.get_page_redirect(appr)
                if page_result.pageid != 0:
                    full_page = self.get_wiki_page_with_items(phrase, page_result)
                    ret_pages.add(WikipediaSearchPageResult(appr, full_page))
            except Exception as e:
                logger.error(e)

        self.cache[phrase] = ret_pages
        return ret_pages
    # pylint: disable=protected-access
    def get_wiki_page_with_items(self, phrase, page):
        item = self.get_wiki_page_item(page)
        pageid = page.pageid
        aliases = self.get_aliases(item)
        description = self.get_description(item)
        text = page.text
        page_title = page._link._title

        relations = WikipediaPageExtractedRelations()
        relations.is_disambiguation = self.is_disambiguation_page(item)
        relations.is_part_name = self.is_name_description(text, item, relations.is_disambiguation)
        relations.aliases = aliases
        relations.be_comp, relations.be_comp_norm = self.extract_be_comp(text)
        relations.extract_relations_from_text_v0(text)

        ret_page = WikipediaPage(phrase, None, page_title, None, 0, pageid, description, relations)
        logger.debug("Page: {}. Extracted successfully".format(ret_page))

        return ret_page
    def get_wiki_page_item(self, page):
        if page is not None:
            try:
                item = self.pywikibot.ItemPage.fromPage(
                    page
                )  # this can be used for any page object
                item.get()  # need to call it to access any data.
                return item
            except (self.pywikibot.NoPage, AttributeError, TypeError, NameError):
                pass
        return None
    def get_page_redirect(self, word):
        page = self.pywikibot.Page(self.site, word)
        if page.pageid != 0 and page.isRedirectPage():
            return page.getRedirectTarget()
        return page
    @staticmethod
    def get_aliases(item):
        if item is not None and item.aliases is not None:
            if "en" in item.aliases:
                aliases = item.aliases["en"]
                return aliases
        return None
    @staticmethod
    def get_description(item):
        description = {}
        if item is not None:
            item_desc = item.get()
            if "descriptions" in item_desc and "en" in item_desc["descriptions"]:
                description["descriptions"] = dict([("en", item_desc["descriptions"]["en"])])
        return description
    @staticmethod
    def is_disambiguation_page(item):
        if item is not None:
            dic = item.get()
            if dic is not None and "descriptions" in dic:
                desc = dic["descriptions"]
                if desc is not None and "en" in desc:
                    return desc["en"].lower() in DISAMBIGUATE_PAGE
        return False
    @staticmethod
    def is_name_description(text, item, is_disambiguation):
        if item is not None:
            if is_disambiguation:
                if WikipediaPageExtractedRelations.is_name_part(text):
                    return True
            else:
                dic = item.get()
                if dic is not None and "descriptions" in dic:
                    desc = dic["descriptions"]
                    if desc is not None and "en" in desc:
                        if [s for s in NAME_DESCRIPTIONS if s in desc["en"].lower()]:
                            return True
        return False
    # pylint: disable=no-else-return
    def extract_be_comp(self, text):
        # Find the first bolded sentence ('''...''') and extract "be" complements
        # from its first paragraph; skip ahead if it has no copular phrase.
        first_sentence_start_index = text.find("'''")
        if first_sentence_start_index >= 0:
            last_temp_index = text.find("\n", first_sentence_start_index)
            if last_temp_index == -1:
                last_temp_index = len(text)
            first_paragraph = text[first_sentence_start_index:last_temp_index]

            if WikiOnline.extract_be_a_index(first_paragraph) == -1 and last_temp_index != len(
                text
            ):
                return self.extract_be_comp(text[last_temp_index:])
            elif last_temp_index == len(text):
                return None, None

            first_paragraph_clean = re.sub(r"\([^)]*\)", "", first_paragraph)
            first_paragraph_clean = re.sub(r"<[^>]*>", "", first_paragraph_clean)
            first_paragraph_clean = re.sub(r"{[^}]*}", "", first_paragraph_clean)
            first_paragraph_clean = re.sub(r"\[\[[^]]*\]\]", "", first_paragraph_clean)
            first_paragraph_clean = re.sub(r"[\']", "", first_paragraph_clean)
            first_paragraph_clean = re.sub(r"&nbsp;", " ", first_paragraph_clean)

            return self.extract_be_comp_relations(first_paragraph_clean)

        return None, None
    # pylint: disable=not-callable
    def extract_be_comp_relations(self, first_paragraph):
        be_comp = set()
        be_comp_norm = set()
        if first_paragraph:
            doc = self.spacy.parser(first_paragraph)
            for token in doc:
                target = token.text
                target_lemma = token.lemma_
                relation = token.dep_
                governor = token.head.text
                governor_lemma = token.head.lemma_

                if relation == "acl":
                    break
                if relation == "punct" and target == ".":
                    break
                elif relation == "cop":
                    be_comp.add(governor)
                    be_comp_norm.add(governor_lemma)
                elif relation == "nsubj":
                    be_comp.add(target)
                    be_comp_norm.add(target_lemma)
                elif relation == "dep":
                    be_comp.add(governor)
                    be_comp_norm.add(governor_lemma)
                elif relation == "compound":
                    be_comp.add(target + " " + governor)
                    be_comp_norm.add(target_lemma + " " + governor_lemma)
                elif relation == "amod":
                    be_comp.add(target + " " + governor)
                    be_comp_norm.add(target_lemma + " " + governor_lemma)
                elif relation in ["conj", "appos"]:
                    be_comp.add(target)
                    be_comp_norm.add(target_lemma)

        return be_comp, be_comp_norm
    @staticmethod
    def extract_be_a_index(sentence):
        # Return the index of the first copular phrase in the sentence, or -1 when
        # none is found (consistent with the "== -1" check in extract_be_comp).
        for phrase in (
            "is a",
            "are a",
            "was a",
            "were a",
            "be a",
            "is the",
            "are the",
            "was the",
            "were the",
            "be the",
        ):
            if phrase in sentence:
                return sentence.index(phrase)
        return -1
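
# Illustrative usage sketch (an assumption, not part of the original module): it
# requires network access to Wikipedia/Wikidata and assumes pywikibot and a spaCy
# English model are installed. The phrase "Intel" is only an example.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    wiki = WikiOnline()
    # get_pages() resolves several capitalization variants of the phrase and
    # returns a set of WikipediaSearchPageResult objects wrapping the extracted
    # WikipediaPage data (aliases, description, relations).
    for search_result in wiki.get_pages("Intel"):
        print(search_result)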