# Source code for nlp_architect.data.cdc_resources.relations.wordnet_relation_extraction

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

import logging
import os
from typing import Set, List

from nlp_architect.common.cdc.mention_data import MentionDataLight
from nlp_architect.data.cdc_resources.data_types.wn.wordnet_page import WordnetPage
from nlp_architect.data.cdc_resources.relations.relation_extraction import RelationExtraction
from nlp_architect.data.cdc_resources.relations.relation_types_enums import (
    RelationType,
    OnlineOROfflineMethod,
)
from nlp_architect.data.cdc_resources.wordnet.wordnet_offline import WordnetOffline
from nlp_architect.data.cdc_resources.wordnet.wordnet_online import WordnetOnline
from nlp_architect.utils.string_utils import StringUtils

logger = logging.getLogger(__name__)


class WordnetRelationExtraction(RelationExtraction):
    """Extract WordNet-based relations between two mentions.

    The relations this class can report are listed by
    :meth:`get_supported_relations`: same synset, partial synset match and
    derivationally related forms.
    """

    def __init__(
        self, method: OnlineOROfflineMethod = OnlineOROfflineMethod.ONLINE, wn_file: str = None
    ):
        """
        Load the WordNet backend used to look up mention pages.

        Args:
            method (required): OnlineOROfflineMethod.{ONLINE/OFFLINE} run against full wordnet or
                a sub-set of it (default = ONLINE)
            wn_file (required on OFFLINE mode): str Location of the wordnet subset resource;
                it must be an existing directory

        Raises:
            FileNotFoundError: in OFFLINE mode, when ``wn_file`` is None or is not an
                existing directory
        """
        logger.info("Loading Wordnet module")
        self.connectivity = method
        # NOTE(review): if `method` is neither ONLINE nor OFFLINE, `self.wordnet_impl` is
        # never assigned and later lookups will fail with AttributeError.
        if self.connectivity == OnlineOROfflineMethod.ONLINE:
            self.wordnet_impl = WordnetOnline()
        elif self.connectivity == OnlineOROfflineMethod.OFFLINE:
            if wn_file is not None and os.path.isdir(wn_file):
                self.wordnet_impl = WordnetOffline(wn_file)
            else:
                raise FileNotFoundError("WordNet resource directory not found or not in path")

        logger.info("Wordnet module lead successfully")
        super().__init__()

    def _lookup_pages(self, mention_x: MentionDataLight, mention_y: MentionDataLight):
        """
        Fetch the WordNet pages of both mentions, shared by the public extract methods.

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight

        Returns:
            A ``(page_x, page_y)`` tuple, or None when either mention is a pronoun
            (no lookup is performed in that case) or when either lookup result is falsy
        """
        mention_x_str = mention_x.tokens_str
        mention_y_str = mention_y.tokens_str
        # Pronouns are rejected before any WordNet lookup is attempted.
        if StringUtils.is_pronoun(mention_x_str.lower()) or StringUtils.is_pronoun(
            mention_y_str.lower()
        ):
            return None

        # Both lookups are always issued, even when the first one comes back empty.
        page_x = self.wordnet_impl.get_pages(mention_x)
        page_y = self.wordnet_impl.get_pages(mention_y)
        if page_x and page_y:
            return page_x, page_y
        return None

    def extract_all_relations(
        self, mention_x: MentionDataLight, mention_y: MentionDataLight
    ) -> Set[RelationType]:
        """
        Try to find if mentions has anyone or more of the relations this class support

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight

        Returns:
            Set[RelationType]: One or more of: RelationType.WORDNET_SAME_SYNSET,
                RelationType.WORDNET_PARTIAL_SYNSET_MATCH, RelationType.WORDNET_DERIVATIONALLY,
                or a set holding only RelationType.NO_RELATION_FOUND when none applies
        """
        pages = self._lookup_pages(mention_x, mention_y)
        if pages is None:
            return {RelationType.NO_RELATION_FOUND}

        page_x, page_y = pages
        # Every extractor is run; the set display evaluates them in the listed order.
        relations = {
            self.extract_derivation(page_x, page_y),
            self.extract_partial_synset_match(page_x, page_y),
            self.extract_same_synset_entity(page_x, page_y),
        }
        relations.discard(RelationType.NO_RELATION_FOUND)
        if not relations:
            relations.add(RelationType.NO_RELATION_FOUND)
        return relations

    def extract_sub_relations(
        self, mention_x: MentionDataLight, mention_y: MentionDataLight, relation: RelationType
    ) -> RelationType:
        """
        Check if input mentions has the given relation between them

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight
            relation: RelationType

        Returns:
            RelationType: relation in case mentions has given relation or
                RelationType.NO_RELATION_FOUND otherwise (including when ``relation`` is not
                one of the relations handled here)
        """
        pages = self._lookup_pages(mention_x, mention_y)
        if pages is None:
            return RelationType.NO_RELATION_FOUND

        page_x, page_y = pages
        if relation == RelationType.WORDNET_DERIVATIONALLY:
            return self.extract_derivation(page_x, page_y)
        if relation == RelationType.WORDNET_PARTIAL_SYNSET_MATCH:
            return self.extract_partial_synset_match(page_x, page_y)
        if relation == RelationType.WORDNET_SAME_SYNSET:
            return self.extract_same_synset_entity(page_x, page_y)

        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def extract_derivation(page_x: WordnetPage, page_y: WordnetPage) -> RelationType:
        """
        Check if input mentions has derivation relation

        The relation holds when the head or head lemma of one page appears among the
        derivational forms of the other page, or when the two pages share at least one
        derivational form.

        Args:
            page_x:WordnetPage
            page_y:WordnetPage

        Returns:
            RelationType.WORDNET_DERIVATIONALLY or RelationType.NO_RELATION_FOUND
        """
        x_head = page_x.head
        x_head_lemma = page_x.head_lemma
        y_head = page_y.head
        y_head_lemma = page_y.head_lemma

        # Pool the derivational forms of the head and of its lemma for each page.
        x_set = set(page_x.head_derivationally)
        x_set.update(page_x.head_lemma_derivationally)
        y_set = set(page_y.head_derivationally)
        y_set.update(page_y.head_lemma_derivationally)

        if (
            y_head in x_set
            or y_head_lemma in x_set
            or x_head in y_set
            or x_head_lemma in y_set
            or not x_set.isdisjoint(y_set)
        ):
            return RelationType.WORDNET_DERIVATIONALLY
        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def extract_partial_synset_match(page_x: WordnetPage, page_y: WordnetPage) -> RelationType:
        """
        Check if input mentions has partial synset relation

        Args:
            page_x:WordnetPage
            page_y:WordnetPage

        Returns:
            RelationType.WORDNET_PARTIAL_SYNSET_MATCH or RelationType.NO_RELATION_FOUND
        """
        x_words = page_x.clean_phrase.split()
        y_words = page_y.clean_phrase.split()
        if not x_words or not y_words:
            return RelationType.NO_RELATION_FOUND

        x_synonyms = page_x.all_clean_words_synonyms
        y_synonyms = page_y.all_clean_words_synonyms

        # One word - check whether there is intersection between synsets
        if len(x_synonyms) == 1 and len(y_synonyms) == 1 and x_synonyms[0] & y_synonyms[0]:
            return RelationType.WORDNET_PARTIAL_SYNSET_MATCH

        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def extract_same_synset_entity(page_x: WordnetPage, page_y: WordnetPage) -> RelationType:
        """
        Check if input mentions has same synset relation

        The relation holds when the head synonyms of the two pages share at least one entry.

        Args:
            page_x:WordnetPage
            page_y:WordnetPage

        Returns:
            RelationType.WORDNET_SAME_SYNSET or RelationType.NO_RELATION_FOUND
        """
        if page_x.head_synonyms & page_y.head_synonyms:
            return RelationType.WORDNET_SAME_SYNSET
        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def get_supported_relations() -> List[RelationType]:
        """
        Return all supported relations by this class

        Returns:
            List[RelationType]
        """
        return [
            RelationType.WORDNET_SAME_SYNSET,
            RelationType.WORDNET_PARTIAL_SYNSET_MATCH,
            RelationType.WORDNET_DERIVATIONALLY,
        ]