# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import csv
from enum import Enum
from os import PathLike
from nlp_architect.models.absa import TRAIN_LEXICONS
class OpinionTerm:
    """A sentiment-bearing term paired with its polarity.

    Attributes:
        terms (list): the words that make up the opinion term
        polarity (Polarity): polarity of the sentiment
    """

    def __init__(self, terms, polarity):
        """
        Args:
            terms (list): the words that make up the opinion term
            polarity (Polarity): polarity of the sentiment
        """
        self.polarity = polarity
        self.terms = terms

    def __str__(self):
        # The printable form is the words of the term separated by single spaces.
        separator = " "
        return separator.join(self.terms)
class AspectTerm(object):
    """Aspect term.

    Attributes:
        terms (list): list of terms
        lemmas (list): list of lemmas
        pos (list): list of pos
    """

    def __init__(self, terms, pos, lemmas):
        """
        Args:
            terms (list): list of terms
            pos (list): list of pos
            lemmas (list): list of lemmas
        """
        self.terms = terms
        self.lemmas = lemmas
        self.pos = pos

    def __str__(self):
        return " ".join(self.terms)

    def __eq__(self, other):
        """
        Override the default equals behavior.

        Two aspect terms are equal when their ``terms`` and ``pos`` match;
        ``lemmas`` does not take part in the comparison.
        """
        if not isinstance(other, AspectTerm):
            # Let Python fall back to its default handling instead of raising
            # AttributeError when compared with None or an unrelated object.
            return NotImplemented
        return self.terms == other.terms and self.pos == other.pos

    # Defining __eq__ already disables hashing; stated explicitly because the
    # compared attributes are mutable lists.
    __hash__ = None

    @staticmethod
    def from_token(token):
        """Build a single-word aspect term from ``token``.

        Args:
            token: object exposing ``text``, ``norm_pos`` and ``lemma`` attributes

        Returns:
            AspectTerm: term whose lists each hold the one value read from ``token``
        """
        return AspectTerm([token.text], [token.norm_pos], [token.lemma])
class CandidateTerm(object):
    """Candidate opinion term or aspect term.

    Attributes:
        term (list): single-element list holding the text of the candidate
        pos (list): single-element list holding the candidate's normalized pos
        lemma (list): single-element list holding the candidate's lemma
        source_term (list): single-element list holding the related anchor term text
        sentence (str): text of the sentence the term was found in
        term_polarity (Polarity): term polarity
    """

    def __init__(self, term_a, term_b, sent_text, candidate_term_polarity):
        """
        Args:
            term_a (DepRelationTerm): the candidate term itself
            term_b (DepRelationTerm): the anchor term the candidate is related to
            sent_text (str): sentence text
            candidate_term_polarity (Polarity): term polarity
        """
        # Everything describing the candidate comes from term_a ...
        self.term = [term_a.text]
        self.pos = [term_a.norm_pos]
        self.lemma = [term_a.lemma]
        # ... while term_b only contributes its text.
        self.source_term = [term_b.text]
        self.sentence = sent_text
        self.term_polarity = candidate_term_polarity

    def __str__(self):
        return " ".join(self.term)

    def __eq__(self, other):
        """Compare by ``term``, ``source_term`` and ``sentence`` (None-safe).

        ``pos``, ``lemma`` and ``term_polarity`` are not part of the comparison.
        """
        if other is None or self.__class__ != other.__class__:
            return False

        if self.term is None:
            term_mismatch = other.term is not None
        else:
            term_mismatch = self.term != other.term
        if term_mismatch:
            return False

        if self.source_term is None:
            source_mismatch = other.source_term is not None
        else:
            source_mismatch = self.source_term != other.source_term
        if source_mismatch:
            return False

        if self.sentence is None:
            return other.sentence is None
        return self.sentence == other.sentence

    def __ne__(self, other):
        are_equal = self == other
        return not are_equal
class DepRelation(object):
    """Generic Relation Entry contains the governor, its dependent and the relation between them.

    Attributes:
        gov (DepRelationTerm): governor
        dep (DepRelationTerm): dependent
        rel (str): relation type between governor and dependent (the part of the
            label before the first ``:``), or None when no label was given
        subtype (str): the part of the label between the first and second ``:``,
            or None when the label has no ``:``
    """

    def __init__(self, gov=None, dep=None, rel=None):
        """
        Args:
            gov (DepRelationTerm, optional): governor
            dep (DepRelationTerm, optional): dependent
            rel (str, optional): relation label, optionally of the form
                ``"type:subtype"``
        """
        self.gov = gov
        self.dep = dep
        if rel is None:
            # `rel` is optional in the signature, so the default must not crash
            # on `None.split`.
            self.rel = None
            self.subtype = None
        else:
            rel_split = rel.split(":")
            self.rel = rel_split[0]
            self.subtype = rel_split[1] if len(rel_split) > 1 else None
class RelCategory(Enum):
    """Groups of dependency relation labels; each member's value is a set of labels."""

    # Subject relations.
    SUBJ = {
        "nsubj",
        "nsubjpass",
        "csubj",
        "csubjpass",
    }
    # Modifier relations.
    MOD = {
        "amod",
        "acl",
        "advcl",
        "appos",
        "neg",
        "nmod",
    }
    # Object relations.
    OBJ = {
        "dobj",
        "iobj",
        "obj",
        "pobj",
    }
class DepRelationTerm(object):
    """A single token taking part in dependency relations.

    Attributes:
        text (str, optional): token text
        lemma (str, optional): token lemma
        pos (str, optional): token pos
        ner (str, optional): token ner
        idx (int, optional): token start index (within the sentence)
        dep_rel_list (list): starts out empty; presumably filled by the caller
            with this token's relations - confirm against callers
        gov: starts out as None; presumably set by the caller to the governing
            term - confirm against callers
    """

    def __init__(self, text=None, lemma=None, pos=None, ner=None, idx=None):
        # Token properties supplied by the caller.
        self.text, self.lemma, self.pos = text, lemma, pos
        self.ner, self.idx = ner, idx
        # Relation bookkeeping is not set through the constructor.
        self.dep_rel_list = []
        self.gov = None

    @property
    def norm_pos(self):
        """Result of ``normalize_pos`` applied to this token's text and pos."""
        word, tag = self.text, self.pos
        return normalize_pos(word, tag)
class QualifiedTerm(object):
    """Qualified term - a term that was accepted into the generated lexicon.

    Attributes:
        term (list): list of terms
        lemma (list): list of lemmas
        pos (list): list of pos.
        frequency (int): frequency of filtered term in corpus.
        term_polarity (Polarity): term polarity.
    """

    def __init__(self, term, lemma, pos, frequency, term_polarity):
        self.term, self.lemma, self.pos = term, lemma, pos
        self.frequency = frequency
        self.term_polarity = term_polarity

    def as_string_list(self):
        """Return ``[term text, frequency, polarity name]`` as strings."""
        term_text = " ".join(self.term)
        frequency_text = str(self.frequency)
        return [term_text, frequency_text, self.term_polarity.name]

    def as_string_list_aspect(self):
        """Return a one-element list holding the space-joined term."""
        term_text = " ".join(self.term)
        return [term_text]

    def as_string_list_aspect_debug(self):
        """Return ``[frequency, term text, lemma text]`` as strings."""
        frequency_text = str(self.frequency)
        term_text = " ".join(self.term)
        lemma_text = " ".join(self.lemma)
        return [frequency_text, term_text, lemma_text]
def load_lex_as_dict_from_csv(file_name: "str | PathLike"):
    """Read lexicon as dictionary, key = term, value = pos.

    The file must have a header row containing the columns ``Term`` and
    ``POS subtype``. The first row after the header is skipped.

    Args:
        file_name: the csv file name

    Returns:
        dict: maps each ``Term`` value to its ``POS subtype`` value; empty when
        the file has no rows beyond the header and the skipped row.

    Raises:
        KeyError: if a row lacks the ``Term`` or ``POS subtype`` column.
    """
    # newline="" lets the csv module do its own newline handling, as its
    # documentation requires for file objects.
    with open(file_name, encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f, skipinitialspace=True)
        # Skip the first data row (DictReader has already consumed the header).
        # NOTE(review): kept from the original implementation - confirm the
        # lexicon files really carry a throw-away first row.
        # The default stops StopIteration escaping on an empty/header-only file.
        next(reader, None)
        return {row["Term"]: row["POS subtype"] for row in reader}
class POS(Enum):
    """Part-of-speech labels."""

    ADJ = 1
    ADV = 2

    # Auxiliaries. AUX_PAST shares the value 3 with AUX, so Enum treats it as
    # an alias: POS.AUX_PAST is POS.AUX.
    AUX = 3
    AUX_PAST = 3

    CONJ = 4
    NUM = 5
    DET = 6
    EX = 7
    FW = 8
    IN = 9
    PREP = 10
    LS = 11

    # Modals.
    MD = 12
    MD_CERTAIN = 13

    # Nouns.
    NN = 14
    PROPER_NAME = 15

    POS = 16

    # Pronouns.
    PRON = 17
    PRON_1_S = 18
    PRON_1_P = 19
    PRON_2_S = 20
    PRON_3_S = 21
    PRON_3_P = 22
    PRON_4_S = 23

    # Possessive pronouns.
    POSSPRON_1_S = 24
    POSSPRON_1_P = 25
    POSSPRON_2_S = 26
    POSSPRON_2_P = 27
    POSSPRON_3_S = 28
    POSSPRON_3_P = 29
    POSSPRON_4_S = 30
    POSSPRON_4_P = 31

    RP = 32
    SYM = 33
    TO = 34
    INTERJ = 35

    # Verbs.
    VB = 36
    VB_PAST = 37
    VB_PRESENT = 38
    VBG = 39
    VBN = 40

    # Wh-words.
    WH_DET = 41
    WH_PROP = 42
    WH_ADV = 43

    PUNCT = 44
    OTHER = 45
# Pronoun lexicon, read once at import time from PronounsLex.csv under
# TRAIN_LEXICONS. Maps each "Term" to its "POS subtype" column; normalize_pos
# looks words up here in lower case and uses the value as a POS member name.
PRONOUNS_LIST = load_lex_as_dict_from_csv(TRAIN_LEXICONS / "PronounsLex.csv")
# Tag strings that must match exactly, mapped to the name of a POS member.
_EXACT_TAG_TO_POS_NAME = {
    "CC": "CONJ",
    "CD": "NUM",
    "DT": "DET",
    "EX": "EX",
    "FW": "FW",
    "IN": "PREP",
    "TO": "PREP",
    "LS": "LS",
    "MD": "MD",
    "PDT": "DET",
    "POS": "POS",
    "RP": "RP",
    "SYM": "SYM",
    "UH": "INTERJ",
    "WDT": "WH_DET",
    "WRB": "WH_ADV",
}

# Tag prefixes, mapped to the name of a POS member. No exact tag above starts
# with any of these prefixes, so the lookup order between the two tables does
# not affect the result.
_TAG_PREFIX_TO_POS_NAME = (
    ("JJ", "ADJ"),
    ("NN", "NN"),
    ("PR", "PRON"),
    ("RB", "ADV"),
    ("VB", "VB"),
    ("WP", "WH_PROP"),
)


def normalize_pos(word, in_pos):
    """Map a raw part-of-speech tag to a ``POS`` member.

    Args:
        word (str): token text; may be None, in which case the pronoun lexicon
            is not consulted
        in_pos (str): raw pos tag of the token; may be None

    Returns:
        POS: ``POS.OTHER`` when ``in_pos`` is None or unrecognized; the
        lexicon-specific pronoun member when ``in_pos`` starts with ``"PR"``
        and the lower-cased word is in ``PRONOUNS_LIST``; otherwise the member
        selected by the tag.

    Raises:
        KeyError: if the pronoun lexicon holds a value that is not the name of
            a ``POS`` member.
    """
    if in_pos is None:
        return POS.OTHER

    # Pronouns found in the lexicon get their specific subtype. Guard against a
    # missing word (DepRelationTerm allows text=None) instead of crashing on
    # `None.lower()`.
    if word is not None and in_pos.startswith("PR"):
        lowered = word.lower()
        if lowered in PRONOUNS_LIST:
            return POS[PRONOUNS_LIST[lowered]]

    exact_name = _EXACT_TAG_TO_POS_NAME.get(in_pos)
    if exact_name is not None:
        return POS[exact_name]

    for prefix, member_name in _TAG_PREFIX_TO_POS_NAME:
        if in_pos.startswith(prefix):
            return POS[member_name]

    return POS.OTHER
class LoadAspectStopLists(object):
    """Container for the generic and general lexicons used as aspect stop lists.

    A term is considered "in the stop list" when any of the stored lexicons
    contains it.

    Attributes:
        generic_opinion_lex (dict): generic opinion lexicon
        determiners_lex (dict): determiners lexicon
        general_adjectives_lex (dict): general adjectives lexicon
        generic_quantifiers_lex (dict): generic quantifiers lexicon
        geographical_adjectives_lex (dict): geographical adjectives lexicon
        intensifiers_lex (dict): intensifiers lexicon
        time_adjective_lex (dict): time adjective lexicon
        ordinal_numbers_lex (dict): ordinal numbers lexicon
        prepositions_lex (dict): prepositions lexicon
        pronouns_lex (dict): pronouns lexicon
        colors_lex (dict): colors lexicon
        negation_lex (dict): negation terms lexicon
        auxiliaries_lex (dict): auxiliaries lexicon
    """

    def __init__(
        self,
        generic_opinion_lex,
        determiners_lex,
        general_adjectives_lex,
        generic_quantifiers_lex,
        geographical_adjectives_lex,
        intensifiers_lex,
        time_adjective_lex,
        ordinal_numbers_lex,
        prepositions_lex,
        pronouns_lex,
        colors_lex,
        negation_lex,
        auxiliaries_lex,
    ):
        # Every instance attribute is a lexicon: is_in_stop_list scans all of
        # them, so nothing else may be stored on the instance.
        self.generic_opinion_lex = generic_opinion_lex
        self.determiners_lex = determiners_lex
        self.general_adjectives_lex = general_adjectives_lex
        self.generic_quantifiers_lex = generic_quantifiers_lex
        self.geographical_adjectives_lex = geographical_adjectives_lex
        self.intensifiers_lex = intensifiers_lex
        self.time_adjective_lex = time_adjective_lex
        self.ordinal_numbers_lex = ordinal_numbers_lex
        self.prepositions_lex = prepositions_lex
        self.pronouns_lex = pronouns_lex
        self.colors_lex = colors_lex
        self.negation_lex = negation_lex
        self.auxiliaries_lex = auxiliaries_lex

    def is_in_stop_list(self, term):
        """Return True if ``term`` is contained in at least one stored lexicon."""
        for lexicon in vars(self).values():
            if term in lexicon:
                return True
        return False
class LoadOpinionStopLists(object):
    """Container for the generic and general lexicons used as opinion stop lists.

    A term is considered "in the stop list" when any of the stored lexicons
    contains it.

    Attributes:
        determiners_lex (dict): determiners lexicon
        general_adjectives_lex (dict): general adjectives lexicon
        generic_quantifiers_lex (dict): generic quantifiers lexicon
        geographical_adjectives_lex (dict): geographical adjectives lexicon
        intensifiers_lex (dict): intensifiers lexicon
        time_adjective_lex (dict): time adjective lexicon
        ordinal_numbers_lex (dict): ordinal numbers lexicon
        prepositions_lex (dict): prepositions lexicon
        colors_lex (dict): colors lexicon
        negation_lex (dict): negation terms lexicon
    """

    def __init__(
        self,
        determiners_lex,
        general_adjectives_lex,
        generic_quantifiers_lex,
        geographical_adjectives_lex,
        intensifiers_lex,
        time_adjective_lex,
        ordinal_numbers_lex,
        prepositions_lex,
        colors_lex,
        negation_lex,
    ):
        # Every instance attribute is a lexicon: is_in_stop_list scans all of
        # them, so nothing else may be stored on the instance.
        self.determiners_lex = determiners_lex
        self.general_adjectives_lex = general_adjectives_lex
        self.generic_quantifiers_lex = generic_quantifiers_lex
        self.geographical_adjectives_lex = geographical_adjectives_lex
        self.intensifiers_lex = intensifiers_lex
        self.time_adjective_lex = time_adjective_lex
        self.ordinal_numbers_lex = ordinal_numbers_lex
        self.prepositions_lex = prepositions_lex
        self.colors_lex = colors_lex
        self.negation_lex = negation_lex

    def is_in_stop_list(self, term):
        """Return True if ``term`` is contained in at least one stored lexicon."""
        for lexicon in vars(self).values():
            if term in lexicon:
                return True
        return False