Source code for nlp_architect.utils.string_utils

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import os
import re
import string
from typing import List

from nlp_architect.utils.io import load_json_file
from nlp_architect.utils.text import SpacyInstance

CURRENT_DIR = os.path.dirname(os.path.realpath(__file__))

STOP_WORDS_FILE = os.path.join(CURRENT_DIR, "resources/stop_words_en.json")
PRONOUN_FILE = os.path.join(CURRENT_DIR, "resources/pronoun_en.json")
PREPOSITION_FILE = os.path.join(CURRENT_DIR, "resources/preposition_en.json")
DETERMINERS_FILE = os.path.join(CURRENT_DIR, "resources/determiners_en.json")

DISAMBIGUATION_CATEGORY = ["disambig", "disambiguation"]
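
# NOTE (illustrative, not from the original source): load_json_file's result is
# used below with list.extend() and "in" membership tests, so each resource
# file above is assumed to be a flat JSON array of lowercase words, e.g.
# stop_words_en.json might contain:
#     ["a", "an", "the", "and", "of", "to"]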


class StringUtils:
    spacy_no_parser = SpacyInstance(disable=["parser"])
    spacy_parser = SpacyInstance()
    stop_words = []
    pronouns = []
    preposition = []
    determiners = []

    def __init__(self):
        pass

    @staticmethod
    def is_stop(token: str) -> bool:
        # Lazily load the stop-word list on first use and extend it with the
        # disambiguation category markers.
        if not StringUtils.stop_words:
            StringUtils.stop_words = load_json_file(STOP_WORDS_FILE)
            StringUtils.stop_words.extend(DISAMBIGUATION_CATEGORY)
        return token in StringUtils.stop_words

    @staticmethod
    def normalize_str(in_str: str) -> str:
        # Replace punctuation and whitespace with single spaces and lower-case
        # the input (re.escape guards the character class against regex
        # metacharacters; the set of matched characters is unchanged).
        str_clean = (
            re.sub("[" + re.escape(string.punctuation + string.whitespace) + "]", " ", in_str)
            .strip()
            .lower()
        )
        parser = StringUtils.spacy_no_parser.parser
        doc = parser(str_clean)
        # Keep the lemmas of tokens that are neither pronouns nor stop words.
        ret_clean = []
        for token in doc:
            lemma = token.lemma_.strip()
            if not StringUtils.is_pronoun(lemma) and not StringUtils.is_stop(lemma):
                ret_clean.append(token.lemma_)
        return " ".join(ret_clean)

    @staticmethod
    def is_pronoun(in_str: str) -> bool:
        if not StringUtils.pronouns:
            StringUtils.pronouns = load_json_file(PRONOUN_FILE)
        tokens = in_str.split()
        return len(tokens) == 1 and tokens[0] in StringUtils.pronouns

    @staticmethod
    def is_determiner(in_str: str) -> bool:
        if not StringUtils.determiners:
            StringUtils.determiners = load_json_file(DETERMINERS_FILE)
        tokens = in_str.split()
        return len(tokens) == 1 and tokens[0] in StringUtils.determiners

    @staticmethod
    def is_preposition(in_str: str) -> bool:
        if not StringUtils.preposition:
            StringUtils.preposition = load_json_file(PREPOSITION_FILE)
        tokens = in_str.split()
        return len(tokens) == 1 and tokens[0] in StringUtils.preposition

    @staticmethod
    def normalize_string_list(str_list: List[str]) -> List[str]:
        # Normalize each string and drop entries that normalize to "".
        ret_list = []
        for _str in str_list:
            normalized = StringUtils.normalize_str(_str)
            if normalized != "":
                ret_list.append(normalized)
        return ret_list

    @staticmethod
    def find_head_lemma_pos_ner(x: str):
        """
        :param x: mention
        :return: the head word of the mention with its lemma, part of speech
            and NER type (or "UNK" where a value cannot be determined)
        """
        head = "UNK"
        lemma = "UNK"
        pos = "UNK"
        ner = "UNK"
        # pylint: disable=not-callable
        doc = StringUtils.spacy_parser.parser(x)
        for tok in doc:
            # the root of the dependency parse is its own head
            if tok.head == tok:
                head = tok.text
                lemma = tok.lemma_
                pos = tok.pos_
        for ent in doc.ents:
            if ent.root.text == head:
                ner = ent.label_
        return head, lemma, pos, ner
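

# Usage sketch (illustrative, not part of the original module). Assumes the
# spaCy English model behind SpacyInstance and the JSON resource files are
# installed; the values in the comments depend on the model and word lists,
# so they are plausible rather than guaranteed.
if __name__ == "__main__":
    print(StringUtils.is_stop("the"))  # True, if "the" is in the stop-word list
    print(StringUtils.is_pronoun("she"))  # True, if "she" is in the pronoun list
    print(StringUtils.normalize_str("The presidents of the United States"))
    # e.g. "president united state" (lemmatized, stop words dropped)
    print(StringUtils.find_head_lemma_pos_ner("the president of France"))
    # e.g. ("president", "president", "NOUN", "UNK")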