# Source code for nlp_architect.api.intent_extraction_api

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import numpy as np
import pickle
from os import makedirs, path, sys

from nlp_architect.api.abstract_api import AbstractApi
from nlp_architect.models.intent_extraction import MultiTaskIntentModel, Seq2SeqIntentModel
from nlp_architect import LIBRARY_OUT
from nlp_architect.utils.generic import pad_sentences
from nlp_architect.utils.io import download_unlicensed_file
from nlp_architect.utils.text import SpacyInstance, bio_to_spans


class IntentExtractionApi(AbstractApi):
    """Inference API for the pre-trained intent extraction model.

    Constructing an instance makes sure the pre-trained model files exist
    under ``model_dir`` (downloading them when missing) and creates the
    tokenizer. The model itself is read from disk by :meth:`load_model`;
    :meth:`inference` calls it on first use if it has not been called yet.
    """

    model_dir = str(LIBRARY_OUT / "intent-pretrained")
    pretrained_model_info = path.join(model_dir, "model_info.dat")
    pretrained_model = path.join(model_dir, "model.h5")

    def __init__(self, prompt=False):
        """Prepare the API.

        Args:
            prompt: when exactly ``True``, ask for confirmation on stdin
                before downloading missing model files.
        """
        self.model = None
        self.model_type = None
        self.word_vocab = None
        self.tags_vocab = None
        self.char_vocab = None
        self.intent_vocab = None
        self._download_pretrained_model(prompt)
        self.nlp = SpacyInstance(disable=["tagger", "ner", "parser", "vectors", "textcat"])

    def process_text(self, text):
        """Collapse whitespace runs in ``text`` to single spaces and tokenize it.

        Returns whatever ``SpacyInstance.tokenize`` returns; the rest of this
        class treats it as a sequence of token strings.
        """
        input_text = " ".join(text.strip().split())
        return self.nlp.tokenize(input_text)

    @staticmethod
    def _prompt():
        """Ask on stdin whether to download the model.

        Returns:
            ``True`` if the (case-insensitive, stripped) answer is ``yes`` or
            ``y``, otherwise ``False``.
        """
        response = input("\nTo download '{}', please enter YES: ".format("intent_extraction"))
        res = response.lower().strip()
        if res in ("yes", "y"):
            # The message used to name "ner" (copy-paste from another API).
            print("Downloading {}...".format("intent_extraction"))
            return True
        print("Download declined. Response received {} != YES|Y. ".format(res))
        return False

    @staticmethod
    def _download_pretrained_model(prompt=True):
        """Download the pre-trained intent extraction model files if missing.

        Nothing happens when both ``pretrained_model_info`` and
        ``pretrained_model`` already exist on disk.

        Args:
            prompt: when exactly ``True``, ask the user for confirmation first.
                If the user declines, the process exits via ``sys.exit(0)``.
        """
        model_info_exists = path.isfile(IntentExtractionApi.pretrained_model_info)
        model_exists = path.isfile(IntentExtractionApi.pretrained_model)
        if model_exists and model_info_exists:
            return

        print(
            "The pre-trained models to be downloaded for the intent extraction dataset "
            "are licensed under Apache 2.0. By downloading, you accept the terms "
            "and conditions provided by the license"
        )
        makedirs(IntentExtractionApi.model_dir, exist_ok=True)
        if prompt is True and not IntentExtractionApi._prompt():
            # NOTE(review): exiting the interpreter from library code is kept
            # for backward compatibility with existing callers.
            sys.exit(0)

        base_url = "https://d2zs9tzlek599f.cloudfront.net" "/models/intent/"
        # Same order as before: model info first, then the weights.
        downloads = (
            ("model_info.dat", IntentExtractionApi.pretrained_model_info),
            ("model.h5", IntentExtractionApi.pretrained_model),
        )
        for file_name, destination in downloads:
            download_unlicensed_file(base_url, file_name, destination)
        print("Done.")

    @staticmethod
    def display_results(text_str, predictions, intent_type):
        """Package tokens and predicted tags into the result dictionary.

        Args:
            text_str: sequence of token strings.
            predictions: per-token tags, passed to ``bio_to_spans`` together
                with ``text_str``.
            intent_type: value stored under the ``"title"`` key (may be None).

        Returns:
            ``{"doc": {...}, "type": "high_level"}`` where ``doc`` holds
            ``doc_text``, ``annotation_set``, ``spans`` and ``title``.
        """
        spans = [
            {"start": s, "end": e, "type": tag}
            for s, e, tag in bio_to_spans(text_str, predictions)
        ]
        # Unique span types in first-appearance order, so the output is
        # deterministic (a plain set has arbitrary iteration order).
        annotation_set = list(dict.fromkeys(span["type"] for span in spans))
        ret = {
            "annotation_set": annotation_set,
            "doc_text": " ".join(text_str),
            "spans": spans,
            "title": intent_type,
        }
        return {"doc": ret, "type": "high_level"}

    @staticmethod
    def _lookup(vocab, key):
        """Return ``vocab[key]``, or 1 when ``key`` is not in ``vocab``."""
        return vocab[key] if key in vocab else 1

    def vectorize(self, doc, vocab, char_vocab=None):
        """Convert tokens to index arrays for the model.

        Args:
            doc: sequence of token strings.
            vocab: mapping from lower-cased word to index.
            char_vocab: optional mapping from character to index.

        Returns:
            The word index array reshaped to ``(1, -1)``. When ``char_vocab``
            is given, a list ``[words, sentence_chars]`` where
            ``sentence_chars`` is the per-word character indices padded with
            ``pad_sentences`` to ``self.model.word_length`` and given a
            leading axis.
        """
        lookup = self._lookup
        words = np.asarray([lookup(vocab, w.lower()) for w in doc]).reshape(1, -1)
        if char_vocab is None:
            return words

        # Characters are looked up as-is (no lower-casing), unlike words.
        sentence_chars = [[lookup(char_vocab, c) for c in w] for w in doc]
        sentence_chars = np.expand_dims(
            pad_sentences(sentence_chars, self.model.word_length), axis=0
        )
        return [words, sentence_chars]

    def inference(self, doc):
        """Run the model on ``doc`` and return the formatted result.

        Loads the model first if :meth:`load_model` has not been called.
        Prints the detected intent (multi-task model only) and one
        ``token<TAB>tag<TAB>`` line per token.

        Args:
            doc: raw input text.

        Returns:
            The dictionary produced by :meth:`display_results`.
        """
        if self.model is None:
            # Previously this path failed with an opaque error on None.
            self.load_model()

        text_arr = self.process_text(doc)
        intent_type = None
        if self.model_type == "mtl":
            doc_vec = self.vectorize(text_arr, self.word_vocab, self.char_vocab)
            intent, tags = self.model.predict(doc_vec, batch_size=1)
            # .item() extracts the single predicted index; int() on a 1-d
            # array is deprecated in recent NumPy releases.
            intent = int(intent.argmax(1).item())
            intent_type = self.intent_vocab.get(intent, None)
            print("Detected intent type: {}".format(intent_type))
        else:
            doc_vec = self.vectorize(text_arr, self.word_vocab, None)
            tags = self.model.predict(doc_vec, batch_size=1)
        # Highest-scoring tag index along axis 2, flattened to one per token.
        tags = tags.argmax(2).flatten()
        tag_str = [self.tags_vocab.get(n, None) for n in tags]
        for t, n in zip(text_arr, tag_str):
            print("{}\t{}\t".format(t, n))
        return self.display_results(text_arr, tag_str, intent_type)

    def load_model(self):
        """Load vocabularies and model weights from the pre-trained files.

        Reads ``pretrained_model_info`` (a pickle) for the model type and
        vocabularies, inverts the tag (and intent) vocabularies so they map
        index -> label, then builds the matching model class and loads
        ``pretrained_model`` into it.
        """
        # SECURITY NOTE: pickle.load can execute arbitrary code. This file is
        # downloaded from a remote server; only use it from a trusted source.
        with open(IntentExtractionApi.pretrained_model_info, "rb") as fp:
            model_info = pickle.load(fp)
        self.model_type = model_info["type"]
        self.word_vocab = model_info["word_vocab"]
        self.tags_vocab = {v: k for k, v in model_info["tags_vocab"].items()}
        if self.model_type == "mtl":
            self.char_vocab = model_info["char_vocab"]
            self.intent_vocab = {v: k for k, v in model_info["intent_vocab"].items()}
            model = MultiTaskIntentModel()
        else:
            model = Seq2SeqIntentModel()
        model.load(self.pretrained_model)
        self.model = model