Source code for nlp_architect.api.intent_extraction_api

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import numpy as np
import pickle
from os import makedirs, path, sys

from nlp_architect.api.abstract_api import AbstractApi
from nlp_architect.models.intent_extraction import MultiTaskIntentModel, Seq2SeqIntentModel
from nlp_architect import LIBRARY_OUT
from nlp_architect.utils.generic import pad_sentences
from nlp_architect.utils.io import download_unlicensed_file
from nlp_architect.utils.text import SpacyInstance, bio_to_spans


class IntentExtractionApi(AbstractApi):
    """Inference API around the pre-trained intent extraction model.

    On construction the pre-trained model files are downloaded into
    ``model_dir`` if they are not already present. Call :meth:`load_model`
    to load them into memory (``inference`` does so lazily if needed).
    """

    # Locations of the pre-trained artifacts on local disk.
    model_dir = str(LIBRARY_OUT / "intent-pretrained")
    pretrained_model_info = path.join(model_dir, "model_info.dat")
    pretrained_model = path.join(model_dir, "model.h5")

    # Id substituted for any word/character missing from a vocabulary.
    _UNKNOWN_ID = 1

    def __init__(self, prompt=False):
        """Download the pre-trained files if missing and set up the tokenizer.

        Args:
            prompt (bool): when ``True``, ask the user for confirmation on
                stdin before downloading; the process exits if declined.
        """
        self.model = None
        self.model_type = None
        self.word_vocab = None
        self.tags_vocab = None
        self.char_vocab = None
        self.intent_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(disable=["tagger", "ner", "parser", "vectors", "textcat"])

    def process_text(self, text):
        """Collapse whitespace runs in ``text`` and tokenize the result.

        Args:
            text (str): raw input sentence.

        Returns:
            Whatever ``SpacyInstance.tokenize`` returns for the normalized
            text (presumably a list of token strings -- confirm against
            ``nlp_architect.utils.text``).
        """
        input_text = " ".join(text.strip().split())
        return self.nlp.tokenize(input_text)

    @staticmethod
    def _prompt():
        """Ask the user on stdin whether to download the model.

        Returns:
            bool: ``True`` if the (case-insensitive, stripped) answer is
            ``yes`` or ``y``, ``False`` otherwise.
        """
        model_name = "intent_extraction"
        response = input("\nTo download '{}', please enter YES: ".format(model_name))
        res = response.lower().strip()
        if res in ("yes", "y"):
            # The original message said "ner" (copy-paste from the NER API).
            print("Downloading {}...".format(model_name))
            return True
        print("Download declined. Response received {} != YES|Y. ".format(res))
        return False

    @staticmethod
    def _download_pretrained_model(prompt=True):
        """Download the pre-trained intent extraction model if non-existent.

        Nothing is done when both the model file and the model info file are
        already present in ``model_dir``.

        Args:
            prompt (bool): when ``True``, ask for confirmation first and call
                ``sys.exit(0)`` if the user declines.
        """
        model_info_exists = path.isfile(IntentExtractionApi.pretrained_model_info)
        model_exists = path.isfile(IntentExtractionApi.pretrained_model)
        if model_exists and model_info_exists:
            return

        print(
            "The pre-trained models to be downloaded for the intent extraction dataset "
            "are licensed under Apache 2.0. By downloading, you accept the terms "
            "and conditions provided by the license"
        )
        makedirs(IntentExtractionApi.model_dir, exist_ok=True)
        if prompt is True and not IntentExtractionApi._prompt():
            sys.exit(0)

        base_url = "https://d2zs9tzlek599f.cloudfront.net" "/models/intent/"
        download_unlicensed_file(
            base_url,
            "model_info.dat",
            IntentExtractionApi.pretrained_model_info,
        )
        download_unlicensed_file(
            base_url,
            "model.h5",
            IntentExtractionApi.pretrained_model,
        )
        print("Done.")

    @staticmethod
    def display_results(text_str, predictions, intent_type):
        """Package tokens and predicted tags into the display dictionary.

        Args:
            text_str: sequence of token strings.
            predictions: predicted tag per token, passed to ``bio_to_spans``.
            intent_type: detected intent label (or ``None``), used as title.

        Returns:
            dict: ``{"doc": {...}, "type": "high_level"}`` where ``doc`` holds
            ``doc_text``, ``annotation_set``, ``spans`` and ``title``.
        """
        ret = {"annotation_set": [], "doc_text": " ".join(text_str)}
        spans = []
        available_tags = set()
        for s, e, tag in bio_to_spans(text_str, predictions):
            spans.append({"start": s, "end": e, "type": tag})
            available_tags.add(tag)
        ret["annotation_set"] = list(available_tags)
        ret["spans"] = spans
        ret["title"] = intent_type
        return {"doc": ret, "type": "high_level"}

    @staticmethod
    def _lookup_id(table, key):
        """Return ``table[key]``, or the unknown id when ``key`` is absent."""
        return table[key] if key in table else IntentExtractionApi._UNKNOWN_ID

    def vectorize(self, doc, vocab, char_vocab=None):
        """Convert a tokenized sentence into model input arrays.

        Args:
            doc: iterable of token strings.
            vocab: mapping from lower-cased word to integer id.
            char_vocab: optional mapping from character to integer id.

        Returns:
            The word-id array reshaped to ``(1, n_tokens)`` when ``char_vocab``
            is ``None``; otherwise the list ``[words, sentence_chars]`` where
            ``sentence_chars`` is the output of ``pad_sentences`` (called with
            ``self.model.word_length``) with a leading axis of size 1 added.
        """
        word_ids = [self._lookup_id(vocab, w.lower()) for w in doc]
        words = np.asarray(word_ids).reshape(1, -1)
        if char_vocab is None:
            return words

        # Characters are looked up as-is (not lower-cased), unlike words.
        sentence_chars = [[self._lookup_id(char_vocab, c) for c in w] for w in doc]
        sentence_chars = np.expand_dims(
            pad_sentences(sentence_chars, self.model.word_length), axis=0
        )
        return [words, sentence_chars]

    def inference(self, doc):
        """Run intent extraction on a raw sentence.

        The detected intent (multi-task model only) and the tag predicted for
        each token are printed to stdout.

        Args:
            doc (str): raw input sentence.

        Returns:
            dict: the structure built by :meth:`display_results`.
        """
        if self.model is None:
            # Previously this failed with an opaque TypeError/AttributeError
            # when load_model() had not been called; load lazily instead.
            self.load_model()

        text_arr = self.process_text(doc)
        intent_type = None
        if self.model_type == "mtl":
            doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab)
            intent, tags = self.model.predict(doc_vec, batch_size=1)
            # A single sentence is predicted, so take the first (only) entry.
            # Indexing explicitly avoids int() on a 1-D array, which NumPy
            # deprecated in 1.25.
            intent = int(intent.argmax(1).flatten()[0])
            intent_type = self.intent_vocab.get(intent, None)
            print("Detected intent type: {}".format(intent_type))
        else:
            doc_vec = self.vectorize(text_arr, self.word_vocab, None)
            tags = self.model.predict(doc_vec, batch_size=1)
        tags = tags.argmax(2).flatten()
        tag_str = [self.tags_vocab.get(n, None) for n in tags]
        for t, n in zip(text_arr, tag_str):
            print("{}\t{}\t".format(t, n))
        return self.display_results(text_arr, tag_str, intent_type)

    def load_model(self):
        """Load the vocabularies and model weights from ``model_dir``.

        The model info file selects the architecture: type ``"mtl"`` loads a
        ``MultiTaskIntentModel`` (with character and intent vocabularies),
        anything else loads a ``Seq2SeqIntentModel``.
        """
        # NOTE(security): pickle.load can execute arbitrary code. The file is
        # fetched by _download_pretrained_model; only use trusted copies.
        with open(IntentExtractionApi.pretrained_model_info, "rb") as fp:
            model_info = pickle.load(fp)
        self.model_type = model_info["type"]
        self.word_vocab = model_info["word_vocab"]
        # Tag/intent vocabularies are inverted (id -> label) for decoding.
        self.tags_vocab = {v: k for k, v in model_info["tags_vocab"].items()}
        if self.model_type == "mtl":
            self.char_vocab = model_info["char_vocab"]
            self.intent_vocab = {v: k for k, v in model_info["intent_vocab"].items()}
            model = MultiTaskIntentModel()
        else:
            model = Seq2SeqIntentModel()
        model.load(self.pretrained_model)
        self.model = model