# Source code for nlp_architect.utils.generic

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
from __future__ import absolute_import, division, print_function, unicode_literals

import re

import numpy as np

# pylint: disable=invalid-unary-operand-type
def pad_sentences(
    sequences: np.ndarray, max_length: int = None, padding_value: int = 0, padding_style="post"
) -> np.ndarray:
    """
    Pad (or truncate) input sequences to a common length.

    Args:
        sequences (iter): an iterable of int sequences to pad - a 2D np.array
            or a list of (possibly different-length) lists
        max_length (int, optional): max length of resulting sequences;
            defaults to the length of the longest input sequence
        padding_value (int, optional): padding value
        padding_style (str, optional): add padding values as prefix (use with
            'pre') or postfix (use with 'post')

    Returns:
        np.ndarray: int32 matrix of shape (len(sequences), max_length)

    Raises:
        ValueError: if an explicit max_length < 1 is given
    """
    if len(sequences) < 1:
        return sequences
    if max_length is None:
        max_length = max(len(s) for s in sequences)
    elif max_length < 1:
        raise ValueError("max sequence length must be > 0")
    if max_length < 1:
        # every input sequence is empty - nothing to pad
        return sequences
    padded_sequences = np.full((len(sequences), max_length), padding_value, dtype=np.int32)
    for i, sent in enumerate(sequences):
        if padding_style == "post":
            # keep the tail of over-long sentences, values left-aligned
            trunc = sent[-max_length:]
            if len(trunc) > 0:
                padded_sequences[i, : len(trunc)] = trunc
        elif padding_style == "pre":
            # keep the head of over-long sentences, values right-aligned
            trunc = sent[:max_length]
            if len(trunc) > 0:
                # bug fix: the original sliced with `-trunc` (an array, which
                # raises TypeError); align values to the right of the row
                padded_sequences[i, -len(trunc):] = trunc
    return padded_sequences
def one_hot(mat: np.ndarray, num_classes: int) -> np.ndarray:
    """
    Convert a 1D matrix of ints into one-hot encoded vectors.

    Arguments:
        mat (numpy.ndarray): A 1D matrix of labels (int), each value in
            [0, num_classes)
        num_classes (int): Number of all possible classes

    Returns:
        numpy.ndarray: A 2D matrix of shape (len(mat), num_classes)
    """
    # bug fix: the original assert allowed `isinstance(mat.shape, int)`, which
    # is always False (ndarray.shape is a tuple); require a genuine 1D input
    assert mat.ndim == 1
    vec = np.zeros((mat.shape[0], num_classes))
    # vectorized scatter instead of a per-element Python loop
    vec[np.arange(mat.shape[0]), mat] = 1.0
    return vec
def one_hot_sentence(mat: np.ndarray, num_classes: int) -> np.ndarray:
    """
    Convert a 2D matrix of ints into a one-hot encoded 3D matrix.

    Arguments:
        mat (numpy.ndarray): A 2D matrix of labels (int)
        num_classes (int): Number of all possible classes

    Returns:
        numpy.ndarray: A 3D matrix
    """
    # encode each row independently, then stack the per-row 2D encodings
    return np.asarray([one_hot(row, num_classes) for row in mat])
def add_offset(mat: np.ndarray, offset: int = 1) -> np.ndarray:
    """
    Add `offset` to all values in matrix `mat`, in place.

    Arguments:
        mat (numpy.ndarray): A 2D matrix with int values
        offset (int): offset to add

    Returns:
        numpy.ndarray: the input matrix, modified in place
    """
    # scalar broadcasting is equivalent to the original's explicit
    # offset-filled array of the row's shape
    for row_idx in range(len(mat)):
        mat[row_idx] = mat[row_idx] + offset
    return mat
def license_prompt(model_name, model_website, dataset_dir=None):
    """
    Interactively ask the user to accept downloading a model/data set.

    Arguments:
        model_name (str): name of the missing model or data set
        model_website (str): URL the resource can be downloaded from
        dataset_dir (str, optional): local directory that was searched

    Returns:
        bool: True when the user answered YES/Y, False otherwise
    """
    if dataset_dir:
        print("\n\n***\n{} was not found in the directory: {}".format(model_name, dataset_dir))
    else:
        print("\n\n***\n\n{} was not found on local installation".format(model_name))
    print("{} can be downloaded from {}".format(model_name, model_website))
    print(
        "The terms and conditions of the data set license apply. Intel does not "
        "grant any rights to the data files or database\n"
    )
    answer = input(
        "To download '{}' from {}, please enter YES: ".format(model_name, model_website)
    )
    normalized = answer.lower().strip()
    # accept exactly "yes" or the single letter "y" (case-insensitive)
    if normalized in ("yes", "y"):
        print("Downloading {}...".format(model_name))
        return True
    print("Download declined. Response received {} != YES|Y. ".format(normalized))
    if dataset_dir:
        print(
            "Please download the model manually from {} and place in the directory: {}".format(
                model_website, dataset_dir
            )
        )
    else:
        print("Please download the model manually from {}".format(model_website))
    return False
# character vocabulary used by the Zhang & LeCun character-level CNN models
zhang_lecun_vocab = list("abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’/\\|_@#$%ˆ&*˜‘+=<>()[]{}")
# char -> index lookup table for the vocabulary above
vocab_hash = {char: index for index, char in enumerate(zhang_lecun_vocab)}
def normalize(
    txt,
    vocab=None,
    replace_char=" ",
    max_length=300,
    pad_out=True,
    to_lower=True,
    reverse=False,
    truncate_left=False,
    encoding=None,
):
    """
    Clean, truncate/pad and optionally re-encode a text string.

    Processing steps: strip URL tokens, strip basic punctuation ('.', ',', '!'),
    collapse whitespace, truncate to `max_length` (from the left when
    `truncate_left`), lower-case, reverse, map out-of-vocab chars to
    `replace_char`, pad to `max_length`, and finally encode.

    Arguments:
        txt (str): input text
        vocab (iterable, optional): allowed characters; others become replace_char
        replace_char (str): substitution/padding character
        max_length (int): maximum (and, with pad_out, exact) output length
        pad_out (bool): pad result out to max_length with replace_char
        to_lower (bool): lower-case the text
        reverse (bool): reverse the character order
        truncate_left (bool): keep the tail instead of the head when truncating
        encoding (str, optional): if given, encode the result to bytes
            (errors ignored)

    Returns:
        str or bytes: the normalized text (bytes when `encoding` is given)
    """
    # drop URL tokens; this keeps ordinary words and other symbols
    tokens = txt.split()
    tokens = [re.sub(r"http:.*", "", token) for token in tokens]
    tokens = [re.sub(r"https:.*", "", token) for token in tokens]
    txt = " ".join(tokens)
    # remove basic punctuation and collapse runs of whitespace
    txt = re.sub("[.,!]", " ", txt)
    txt = " ".join(txt.split())
    # store length for the padding decision below
    txt_len = len(txt)
    if truncate_left:
        txt = txt[-max_length:]
    else:
        txt = txt[:max_length]
    # change case
    if to_lower:
        txt = txt.lower()
    # reverse order
    if reverse:
        txt = txt[::-1]
    # replace out-of-vocabulary chars
    if vocab is not None:
        txt = "".join([c if c in vocab else replace_char for c in txt])
    # pad out if needed - bug fix: padding must happen BEFORE encoding,
    # otherwise bytes + str raised TypeError whenever `encoding` was set
    if pad_out and max_length > txt_len:
        txt = txt + replace_char * (max_length - txt_len)
    # re-encode text last
    if encoding is not None:
        txt = txt.encode(encoding, errors="ignore")
    return txt
def to_one_hot(txt, vocab=None):
    """
    One-hot encode a string at the character level.

    Arguments:
        txt (str): input text
        vocab (dict, optional): char -> index mapping; defaults to the
            module-level `vocab_hash` (Zhang & LeCun alphabet)

    Returns:
        numpy.ndarray: float32 matrix of shape (len(vocab) + 1, len(txt));
            columns of out-of-vocabulary characters stay all-zero
    """
    if vocab is None:
        # resolve the module default lazily; backward compatible with the old
        # `vocab=vocab_hash` default-argument binding
        vocab = vocab_hash
    vocab_size = len(vocab)
    one_hot_vec = np.zeros((vocab_size + 1, len(txt)), dtype=np.float32)
    # run through txt and "switch on" relevant positions in the one-hot matrix
    for idx, char in enumerate(txt):
        # bug fix: look characters up in the *given* vocab - the original
        # always consulted the global `vocab_hash`, ignoring the parameter
        if char in vocab:
            one_hot_vec[vocab[char], idx] = 1
        # out-of-vocabulary characters are simply skipped
    return one_hot_vec
def balance(df):
    """
    Down-sample a DataFrame so every class in its 'Sentiment' column has the
    same number of rows (the size of the rarest class).

    Arguments:
        df (pandas.DataFrame): frame with a 'Sentiment' column

    Returns:
        pandas.DataFrame: balanced frame, or None when df has no classes
    """
    # local import: this module does not import pandas at the top level
    import pandas as pd

    print("Balancing the classes")
    type_counts = df["Sentiment"].value_counts()
    min_count = min(type_counts.values)
    # bug fix: DataFrame.append was removed in pandas 2.0 - collect the
    # per-class samples and concatenate once (also avoids O(n^2) appends)
    samples = [
        df[df["Sentiment"] == key].sample(n=min_count, replace=False)
        for key in type_counts.keys()
    ]
    return pd.concat(samples) if samples else None