Source code for nlp_architect.models.absa.train.train

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
from pathlib import Path, PosixPath
from os import PathLike
from typing import Union

from nlp_architect.models.absa import TRAIN_OUT, LEXICONS_OUT
from nlp_architect.models.absa.train.acquire_terms import AcquireTerms
from nlp_architect.models.absa.train.rerank_terms import RerankTerms
from nlp_architect.models.absa.utils import (
    parse_docs,
    parse_docs_bist,
    _download_pretrained_rerank_model,
    _write_aspect_lex,
    _write_opinion_lex,
)
from nlp_architect.utils.io import download_unzip

# (base URL, archive file name) of the GloVe embeddings; unpacked as the leading
# positional arguments of ``download_unzip`` in ``TrainSentiment.__init__``.
EMBEDDING_URL = ("http://nlp.stanford.edu/data", "glove.840B.300d.zip")
# Path of the GloVe text file handed to ``download_unzip`` and ``RerankTerms``.
EMBEDDING_PATH = TRAIN_OUT / "word_emb_unzipped" / "glove.840B.300d.txt"
# Path passed to ``_download_pretrained_rerank_model`` when no model is supplied.
RERANK_MODEL_DEFAULT_PATH = TRAIN_OUT / "reranking_model" / "rerank_model.h5"
# Module-level alias bound to the very same path object as the constant above.
rerank_model_dir = RERANK_MODEL_DEFAULT_PATH


class TrainSentiment:
    """Acquire aspect/opinion lexicons from data and re-rank the opinion terms.

    The constructor wires together three collaborators: an ``AcquireTerms``
    instance, an optional dependency parser ("bist" or "spacy") and a
    ``RerankTerms`` instance backed by the GloVe embeddings at
    ``EMBEDDING_PATH``.

    Args:
        parse: When true, build the parser selected by ``parser``. When false,
            no parser is created and ``run`` needs ``parsed_data``.
        rerank_model: Re-ranking model to use. When falsy, the pre-trained
            model is obtained via ``_download_pretrained_rerank_model``.
        asp_thresh: Forwarded to ``AcquireTerms`` (first positional argument).
        op_thresh: Forwarded to ``AcquireTerms`` (second positional argument).
        max_iter: Forwarded to ``AcquireTerms`` (third positional argument).
        parser: Parser name, ``"spacy"`` or ``"bist"``. Any other value leaves
            the instance without a parser.
        spacy_model: Name of the spaCy model handed to the chosen parser.
    """

    def __init__(
        self,
        parse: bool = True,
        rerank_model: Union[str, PathLike, None] = None,
        asp_thresh: int = 3,
        op_thresh: int = 2,
        max_iter: int = 3,
        parser: str = "spacy",
        spacy_model: str = "en_core_web_sm",
    ):
        self.acquire_lexicon = AcquireTerms(asp_thresh, op_thresh, max_iter)
        self.parser_name = parser

        # Always define the attribute so that ``run`` can test it safely, even
        # when ``parse`` is false or ``parser`` names an unsupported backend.
        self.parser = None
        if parse:
            # Parser backends are imported lazily, only when actually requested.
            if parser == "bist":
                from nlp_architect.pipelines.spacy_bist import SpacyBISTParser

                self.parser = SpacyBISTParser(spacy_model=spacy_model)
            elif parser == "spacy":
                from nlp_architect.utils.text import SpacyInstance

                disable = [
                    "merge_noun_chunks",
                    "ner",
                    "entity_linker",
                    "textcat",
                    "entity_ruler",
                    "sentencizer",
                    "merge_entities",
                ]
                self.parser = SpacyInstance(model=spacy_model, disable=disable, ptb_pos=True)

        if not rerank_model:
            print("using pre-trained reranking model")
            rerank_model = _download_pretrained_rerank_model(RERANK_MODEL_DEFAULT_PATH)

        download_unzip(*EMBEDDING_URL, EMBEDDING_PATH, license_msg="Glove word embeddings.")
        self.rerank = RerankTerms(
            vector_cache=True, rerank_model=rerank_model, emb_model_path=EMBEDDING_PATH
        )

    def run(
        self,
        data: Union[str, PathLike] = None,
        parsed_data: Union[str, PathLike] = None,
        out_dir: Union[str, PathLike] = TRAIN_OUT,
    ):
        """Generate the aspect and opinion lexicons and write them out.

        Args:
            data: Raw input to parse. Required when ``parsed_data`` is not given.
            parsed_data: Already-parsed input. When truthy, parsing is skipped.
            out_dir: Base directory; parsed output goes to
                ``<out_dir>/parsed/<stem of data>``.

        Returns:
            A tuple ``(reranked_opinion_lexicon, aspect_lexicon)``.

        Raises:
            RuntimeError: If parsing is needed but no parser was initialized.
            TypeError: If neither ``data`` nor ``parsed_data`` is provided.
            ValueError: Propagated from ``parse_data``.
        """
        if not parsed_data:
            if not self.parser:
                raise RuntimeError("Parser not initialized (try parse=True at init)")
            if data is None:
                # Fail with a readable message instead of the opaque TypeError
                # that ``Path(None)`` would raise below.
                raise TypeError("Either 'data' or 'parsed_data' must be provided")
            parsed_dir = Path(out_dir) / "parsed" / Path(data).stem
            parsed_data = self.parse_data(data, parsed_dir)

        generated_aspect_lex = self.acquire_lexicon.acquire_lexicons(parsed_data)
        _write_aspect_lex(parsed_data, generated_aspect_lex, LEXICONS_OUT)

        generated_opinion_lex_reranked = self.rerank.predict(
            AcquireTerms.acquired_opinion_terms_path, AcquireTerms.generic_opinion_lex_path
        )
        _write_opinion_lex(parsed_data, generated_opinion_lex_reranked, LEXICONS_OUT)

        return generated_opinion_lex_reranked, generated_aspect_lex

    def parse_data(self, data: Union[str, PathLike], out_dir: Union[str, PathLike]):
        """Parse ``data`` with the configured parser and check its size.

        Args:
            data: Input handed to ``parse_docs`` / ``parse_docs_bist``.
            out_dir: Output directory for the parser; created (with parents)
                when truthy.

        Returns:
            ``out_dir``, unchanged.

        Raises:
            ValueError: If ``parser_name`` is not ``"bist"`` or ``"spacy"``, or
                if the parsed data has fewer than 1000 sentences.
        """
        # Reject unknown parsers before touching the file system; otherwise the
        # sentence count below would never be assigned.
        if self.parser_name not in ("bist", "spacy"):
            raise ValueError(
                "Unsupported parser {0!r}; expected 'bist' or 'spacy'.".format(self.parser_name)
            )

        if out_dir:
            Path(out_dir).mkdir(parents=True, exist_ok=True)

        if self.parser_name == "bist":
            _, data_size = parse_docs_bist(self.parser, data, out_dir=out_dir)
        else:
            parsed_docs = parse_docs(self.parser, data, out_dir=out_dir)
            data_size = sum(len(doc.sentences) for doc in parsed_docs)

        if data_size < 1000:
            raise ValueError(
                "The data contains only {0} sentences. A minimum of 1000 "
                "sentences is required for training.".format(data_size)
            )
        return out_dir