Source code for nlp_architect.data.cdc_resources.relations.wordnet_relation_extraction
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import logging
import os
from typing import Set, List
from nlp_architect.common.cdc.mention_data import MentionDataLight
from nlp_architect.data.cdc_resources.data_types.wn.wordnet_page import WordnetPage
from nlp_architect.data.cdc_resources.relations.relation_extraction import RelationExtraction
from nlp_architect.data.cdc_resources.relations.relation_types_enums import (
RelationType,
OnlineOROfflineMethod,
)
from nlp_architect.data.cdc_resources.wordnet.wordnet_offline import WordnetOffline
from nlp_architect.data.cdc_resources.wordnet.wordnet_online import WordnetOnline
from nlp_architect.utils.string_utils import StringUtils
logger = logging.getLogger(__name__)
class WordnetRelationExtraction(RelationExtraction):
    """
    Extract WordNet based relations between two mentions

    Mentions are resolved to Wordnet pages through ``WordnetOnline`` or
    ``WordnetOffline`` (chosen at construction time) and the two pages are then
    compared for derivational, partial synset and same synset relations.
    """

    def __init__(
        self, method: OnlineOROfflineMethod = OnlineOROfflineMethod.ONLINE, wn_file: str = None
    ):
        """
        Set up the WordNet backend used to look up mention pages

        Args:
            method: OnlineOROfflineMethod.{ONLINE/OFFLINE} run against full wordnet or
                a sub-set of it (default = ONLINE)
            wn_file (required on OFFLINE mode): str Location of the wordnet subset
                directory to work with

        Raises:
            FileNotFoundError: on OFFLINE mode, when wn_file is None or is not an
                existing directory
        """
        logger.info("Loading Wordnet module")
        self.connectivity = method
        if self.connectivity == OnlineOROfflineMethod.ONLINE:
            self.wordnet_impl = WordnetOnline()
        elif self.connectivity == OnlineOROfflineMethod.OFFLINE:
            if wn_file is None or not os.path.isdir(wn_file):
                raise FileNotFoundError("WordNet resource directory not found or not in path")
            self.wordnet_impl = WordnetOffline(wn_file)
        # NOTE(review): any other `method` value leaves `self.wordnet_impl` unset, so
        # the extract_* instance methods would fail on first use - confirm whether
        # this should be rejected here.

        logger.info("Wordnet module loaded successfully")
        super().__init__()

    def _lookup_pages(self, mention_x: MentionDataLight, mention_y: MentionDataLight):
        """
        Resolve both mentions to their Wordnet pages

        Shared by extract_all_relations and extract_sub_relations so the pronoun
        guard and the page lookup live in one place.

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight

        Returns:
            tuple (page_x, page_y), or None when either mention is a pronoun or
            either lookup result is falsy
        """
        mention_x_str = mention_x.tokens_str
        mention_y_str = mention_y.tokens_str
        if StringUtils.is_pronoun(mention_x_str.lower()) or StringUtils.is_pronoun(
            mention_y_str.lower()
        ):
            return None

        # Both lookups are always performed, even if the first one comes back empty.
        page_x = self.wordnet_impl.get_pages(mention_x)
        page_y = self.wordnet_impl.get_pages(mention_y)
        if page_x and page_y:
            return page_x, page_y
        return None

    def extract_all_relations(
        self, mention_x: MentionDataLight, mention_y: MentionDataLight
    ) -> Set[RelationType]:
        """
        Try to find if mentions has anyone or more of the relations this class support

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight

        Returns:
            Set[RelationType]: One or more of: RelationType.WORDNET_DERIVATIONALLY,
                RelationType.WORDNET_PARTIAL_SYNSET_MATCH, RelationType.WORDNET_SAME_SYNSET,
                or a set holding only RelationType.NO_RELATION_FOUND when none applies
        """
        pages = self._lookup_pages(mention_x, mention_y)
        if pages is None:
            return {RelationType.NO_RELATION_FOUND}

        page_x, page_y = pages
        candidates = (
            self.extract_derivation(page_x, page_y),
            self.extract_partial_synset_match(page_x, page_y),
            self.extract_same_synset_entity(page_x, page_y),
        )
        relations = {rel for rel in candidates if rel != RelationType.NO_RELATION_FOUND}
        if not relations:
            relations.add(RelationType.NO_RELATION_FOUND)
        return relations

    def extract_sub_relations(
        self, mention_x: MentionDataLight, mention_y: MentionDataLight, relation: RelationType
    ) -> RelationType:
        """
        Check if input mentions has the given relation between them

        Args:
            mention_x: MentionDataLight
            mention_y: MentionDataLight
            relation: RelationType

        Returns:
            RelationType: relation in case mentions has given relation or
                RelationType.NO_RELATION_FOUND otherwise
        """
        pages = self._lookup_pages(mention_x, mention_y)
        if pages is None:
            return RelationType.NO_RELATION_FOUND

        page_x, page_y = pages
        if relation == RelationType.WORDNET_DERIVATIONALLY:
            return self.extract_derivation(page_x, page_y)
        if relation == RelationType.WORDNET_PARTIAL_SYNSET_MATCH:
            return self.extract_partial_synset_match(page_x, page_y)
        if relation == RelationType.WORDNET_SAME_SYNSET:
            return self.extract_same_synset_entity(page_x, page_y)
        # Relation is not one this class knows how to check.
        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def extract_derivation(page_x: WordnetPage, page_y: WordnetPage) -> RelationType:
        """
        Check if input mentions has derivation relation

        The relation holds when the head (or head lemma) of one page appears among
        the derivationally related forms of the other page, or when the two pages
        share at least one derivationally related form.

        Args:
            page_x:WordnetPage
            page_y:WordnetPage

        Returns:
            RelationType.WORDNET_DERIVATIONALLY or RelationType.NO_RELATION_FOUND
        """
        x_derived = set(page_x.head_derivationally)
        x_derived.update(page_x.head_lemma_derivationally)
        y_derived = set(page_y.head_derivationally)
        y_derived.update(page_y.head_lemma_derivationally)

        if (
            page_y.head in x_derived
            or page_y.head_lemma in x_derived
            or page_x.head in y_derived
            or page_x.head_lemma in y_derived
            or not x_derived.isdisjoint(y_derived)
        ):
            return RelationType.WORDNET_DERIVATIONALLY
        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def extract_partial_synset_match(page_x: WordnetPage, page_y: WordnetPage) -> RelationType:
        """
        Check if input mentions has partial synset relation

        Args:
            page_x:WordnetPage
            page_y:WordnetPage

        Returns:
            RelationType.WORDNET_PARTIAL_SYNSET_MATCH or RelationType.NO_RELATION_FOUND
        """
        # Nothing to compare when either clean phrase has no words.
        if not page_x.clean_phrase.split() or not page_y.clean_phrase.split():
            return RelationType.NO_RELATION_FOUND

        x_synonyms = page_x.all_clean_words_synonyms
        y_synonyms = page_y.all_clean_words_synonyms
        # Only the single-entry case is handled: match when the two synonym
        # collections have a non-empty intersection.
        if len(x_synonyms) == 1 and len(y_synonyms) == 1 and (x_synonyms[0] & y_synonyms[0]):
            return RelationType.WORDNET_PARTIAL_SYNSET_MATCH
        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def extract_same_synset_entity(page_x: WordnetPage, page_y: WordnetPage) -> RelationType:
        """
        Check if input mentions has same synset relation

        The relation holds when the head synonyms of both pages share at least
        one element.

        Args:
            page_x:WordnetPage
            page_y:WordnetPage

        Returns:
            RelationType.WORDNET_SAME_SYNSET or RelationType.NO_RELATION_FOUND
        """
        if page_x.head_synonyms & page_y.head_synonyms:
            return RelationType.WORDNET_SAME_SYNSET
        return RelationType.NO_RELATION_FOUND

    @staticmethod
    def get_supported_relations() -> List[RelationType]:
        """
        Return all supported relations by this class

        Returns:
            List[RelationType]
        """
        return [
            RelationType.WORDNET_SAME_SYNSET,
            RelationType.WORDNET_PARTIAL_SYNSET_MATCH,
            RelationType.WORDNET_DERIVATIONALLY,
        ]