Source code for nlp_architect.data.sequential_tagging

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import os
from os import path
from typing import List

import numpy as np

from nlp_architect.data.utils import DataProcessor, InputExample, read_column_tagged_file
from nlp_architect.utils.generic import pad_sentences
from nlp_architect.utils.io import validate_existing_directory, validate_existing_filepath
from nlp_architect.utils.text import (
    character_vector_generator,
    read_sequential_tagging_file,
    word_vector_generator,
)
from nlp_architect.utils.text import Vocabulary

logger = logging.getLogger(__name__)


class SequentialTaggingDataset(object):
    """
    Sequential tagging dataset loader.
    Loads train/test files with tabular separation.

    Args:
        train_file (str): path to train file
        test_file (str): path to test file
        max_sentence_length (int, optional): max sentence length
        max_word_length (int, optional): max word length
        tag_field_no (int, optional): index of the column to use as y-samples
    """

    def __init__(
        self, train_file, test_file, max_sentence_length=30, max_word_length=20, tag_field_no=2
    ):
        self.files = {"train": train_file, "test": test_file}
        self.max_sent_len = max_sentence_length
        self.max_word_len = max_word_length
        self.tf = tag_field_no
        # token/char vocabs reserve 0=pad, 1=unk; tag vocab reserves 0=pad
        self.vocabs = {"token": None, "char": None, "tag": None}
        self.data = {}

        sentences = self._read_file(self.files["train"])
        train_size = len(sentences)
        sentences += self._read_file(self.files["test"])
        test_size = len(sentences) - train_size
        texts, tags = list(zip(*sentences))
        texts_mat, self.vocabs["token"] = word_vector_generator(texts, lower=True, start=2)
        tags_mat, self.vocabs["tag"] = word_vector_generator(tags, start=1)
        chars_mat, self.vocabs["char"] = character_vector_generator(texts, start=2)

        texts_mat = pad_sentences(texts_mat, max_length=self.max_sent_len)
        tags_mat = pad_sentences(tags_mat, max_length=self.max_sent_len)

        chars_mat = [pad_sentences(d, max_length=self.max_word_len) for d in chars_mat]
        zeros = np.zeros((len(chars_mat), self.max_sent_len, self.max_word_len))
        for idx, d in enumerate(chars_mat):
            d = d[: self.max_sent_len]
            zeros[idx, : d.shape[0]] = d
        chars_mat = zeros.astype(dtype=np.int32)

        self.data["train"] = texts_mat[:train_size], chars_mat[:train_size], tags_mat[:train_size]
        self.data["test"] = texts_mat[-test_size:], chars_mat[-test_size:], tags_mat[-test_size:]

    @property
    def y_labels(self):
        """return y labels"""
        return self.vocabs["tag"]

    @property
    def word_vocab(self):
        """words vocabulary"""
        return self.vocabs["token"]

    @property
    def char_vocab(self):
        """characters vocabulary"""
        return self.vocabs["char"]

    @property
    def word_vocab_size(self):
        """word vocabulary size"""
        return len(self.vocabs["token"]) + 2

    @property
    def char_vocab_size(self):
        """character vocabulary size"""
        return len(self.vocabs["char"]) + 2

    @property
    def train_set(self):
        """Get the train set"""
        return self.data["train"]

    @property
    def test_set(self):
        """Get the test set"""
        return self.data["test"]

    def _read_file(self, filepath):
        with open(filepath, encoding="utf-8") as fp:
            data = fp.readlines()
        data = [d.strip() for d in data]
        sentences = self._split_into_sentences(data)
        parsed_sentences = [self._parse_sentence(s) for s in sentences if len(s) > 0]
        return parsed_sentences

    def _parse_sentence(self, sentence):
        tokens = []
        tags = []
        for line in sentence:
            fields = line.split()
            assert len(fields) >= self.tf, "tag field exceeds number of fields"
            # mask tokens tagged as cardinal numbers (POS tag containing "CD") with "0"
            if "CD" in fields[1]:
                tokens.append("0")
            else:
                tokens.append(fields[0])
            tags.append(fields[self.tf - 1])
        return tokens, tags

    @staticmethod
    def _split_into_sentences(file_lines):
        sents = []
        s = []
        for line in file_lines:
            line = line.strip()
            if not line:
                sents.append(s)
                s = []
                continue
            s.append(line)
        sents.append(s)
        return sents
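
# Usage sketch (illustrative, not part of the module): loading a whitespace-
# separated, one-token-per-line corpus with blank lines between sentences.
# The file names below are hypothetical.
#
#     dataset = SequentialTaggingDataset(
#         train_file="train.txt",
#         test_file="test.txt",
#         max_sentence_length=30,
#         max_word_length=20,
#         tag_field_no=2,
#     )
#     train_words, train_chars, train_tags = dataset.train_set
#     num_y_labels = len(dataset.y_labels) + 1  # index 0 is reserved for padding
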
class CONLL2000(object):
    """
    CONLL 2000 POS/chunking task data set (numpy)

    Arguments:
        data_path (str): directory containing CONLL2000 files
        sentence_length (int, optional): number of time steps to embed the data.
            None value will not truncate vectors
        max_word_length (int, optional): max word length in characters.
            None value will not truncate vectors
        extract_chars (boolean, optional): Yield Char RNN features.
        lowercase (bool, optional): lower case sentence words
    """

    dataset_files = {"train": "train.txt", "test": "test.txt"}

    def __init__(
        self,
        data_path,
        sentence_length=None,
        max_word_length=None,
        extract_chars=False,
        lowercase=True,
    ):
        self._validate_paths(data_path)
        self.data_path = data_path
        self.sentence_length = sentence_length
        self.use_chars = extract_chars
        self.max_word_length = max_word_length
        self.lower = lowercase
        self.vocabs = {"word": None, "char": None, "pos": None, "chunk": None}
        self._data_dict = {}

    def _validate_paths(self, data_path):
        validate_existing_directory(data_path)
        for f in self.dataset_files:
            _f_path = path.join(data_path, self.dataset_files[f])
            validate_existing_filepath(_f_path)
            self.dataset_files[f] = _f_path

    def _load_data(self):
        """
        open files and parse
        return format: list of 3-tuples (word list, POS list, chunk list)
        """
        train_set = read_sequential_tagging_file(self.dataset_files["train"])
        test_set = read_sequential_tagging_file(self.dataset_files["test"])
        train_data = [list(zip(*x)) for x in train_set]
        test_data = [list(zip(*x)) for x in test_set]
        return train_data, test_data

    @property
    def train_set(self):
        """get the train set"""
        if self._data_dict.get("train", None) is None:
            self._gen_data()
        return self._data_dict.get("train")

    @property
    def test_set(self):
        """get the test set"""
        if self._data_dict.get("test", None) is None:
            self._gen_data()
        return self._data_dict.get("test")

    @staticmethod
    def _extract(x, y, n):
        return list(zip(*x))[n] + list(zip(*y))[n]

    @property
    def word_vocab(self):
        """word Vocabulary"""
        return self.vocabs["word"]

    @property
    def char_vocab(self):
        """character Vocabulary"""
        return self.vocabs["char"]

    @property
    def pos_vocab(self):
        """pos label Vocabulary"""
        return self.vocabs["pos"]

    @property
    def chunk_vocab(self):
        """chunk label Vocabulary"""
        return self.vocabs["chunk"]

    def _gen_data(self):
        train, test = self._load_data()
        train_size = len(train)
        test_size = len(test)
        sentences = self._extract(train, test, 0)
        pos_tags = self._extract(train, test, 1)
        chunk_tags = self._extract(train, test, 2)
        sentence_vecs, word_vocab = word_vector_generator(sentences, self.lower, 2)
        pos_vecs, pos_vocab = word_vector_generator(pos_tags, start=1)
        chunk_vecs, chunk_vocab = word_vector_generator(chunk_tags, start=1)
        self.vocabs = {
            "word": word_vocab,  # 0=pad, 1=unk
            "pos": pos_vocab,  # 0=pad, 1=unk
            "chunk": chunk_vocab,  # 0=pad
        }
        if self.sentence_length is not None:
            sentence_vecs = pad_sentences(sentence_vecs, max_length=self.sentence_length)
            chunk_vecs = pad_sentences(chunk_vecs, max_length=self.sentence_length)
            pos_vecs = pad_sentences(pos_vecs, max_length=self.sentence_length)
        self._data_dict["train"] = (
            sentence_vecs[:train_size],
            pos_vecs[:train_size],
            chunk_vecs[:train_size],
        )
        self._data_dict["test"] = (
            sentence_vecs[-test_size:],
            pos_vecs[-test_size:],
            chunk_vecs[-test_size:],
        )
        if self.use_chars:
            chars_vecs, char_vocab = character_vector_generator(sentences, start=2)
            self.vocabs.update({"char": char_vocab})  # 0=pad, 1=unk
            if self.max_word_length is not None:
                chars_vecs = [pad_sentences(d, max_length=self.max_word_length) for d in chars_vecs]
            # building the char tensor requires sentence_length and max_word_length to be set
            zeros = np.zeros((len(chars_vecs), self.sentence_length, self.max_word_length))
            for idx, d in enumerate(chars_vecs):
                d = d[: self.sentence_length]
                # right-align sentences shorter than sentence_length
                zeros[idx, -d.shape[0]:] = d
            chars_vecs = zeros.astype(dtype=np.int32)
            self._data_dict["train"] += (chars_vecs[:train_size],)
            self._data_dict["test"] += (chars_vecs[-test_size:],)
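
# Usage sketch (illustrative, not part of the module): "conll2000/" is a
# hypothetical directory holding train.txt and test.txt in the CONLL-2000
# shared-task column format.
#
#     conll = CONLL2000(
#         "conll2000/",
#         sentence_length=50,
#         max_word_length=12,
#         extract_chars=True,
#     )
#     words, pos, chunks, chars = conll.train_set  # 4-tuple when extract_chars=True
#     words, pos, chunks = conll.test_set[:3]      # first three fields are always present
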
class TokenClsInputExample(InputExample):
    """A single training/test example for simple sequence token classification."""

    def __init__(
        self,
        guid: str,
        text: str,
        tokens: List[str],
        shapes: List[int] = None,
        label: List[str] = None,
    ):
        """Constructs a TokenClsInputExample.

        Args:
            guid: Unique id for the example.
            text: string. The untokenized text of the sequence.
            tokens (List[str]): The list of tokens.
            shapes (List[int]): List of token shapes.
            label (List[str], optional): The tags of the tokens.
        """
        super(TokenClsInputExample, self).__init__(guid, text, label)
        self.tokens = tokens
        self.shapes = shapes
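
# Illustrative example of a single token-classification example (all values
# are made up); tokens, shapes and labels are aligned one-to-one.
#
#     example = TokenClsInputExample(
#         guid="train-0",
#         text="John lives in London",
#         tokens=["John", "lives", "in", "London"],
#         shapes=[2, 4, 4, 2],
#         label=["B-PER", "O", "O", "B-LOC"],
#     )
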
class TokenClsProcessor(DataProcessor):
    """Sequence token classification Processor dataset loader.
    Loads a directory with train.txt/test.txt/dev.txt files in tab separated format
    (one token per line - conll style). Label dictionary is given in labels.txt file.
    """

    def __init__(self, data_dir, tag_col: int = -1, ignore_token=None):
        if not os.path.exists(data_dir):
            raise FileNotFoundError
        self.data_dir = data_dir
        self.tag_col = tag_col
        self.labels = None
        self.ignore_token = ignore_token

    def _read_examples(self, data_dir, file_name, set_name):
        if not os.path.exists(data_dir + os.sep + file_name):
            logger.error(
                "Requested file {} in path {} for TokenClsProcessor not found".format(
                    file_name, data_dir
                )
            )
            return None
        return self._create_examples(
            read_column_tagged_file(
                os.path.join(data_dir, file_name),
                tag_col=self.tag_col,
                ignore_token=self.ignore_token,
            ),
            set_name,
        )
    def get_train_examples(self, filename="train.txt"):
        return self._read_examples(self.data_dir, filename, "train")
    def get_dev_examples(self, filename="dev.txt"):
        return self._read_examples(self.data_dir, filename, "dev")
    def get_test_examples(self, filename="test.txt"):
        return self._read_examples(self.data_dir, filename, "test")
    # pylint: disable=arguments-differ
    def get_labels(self):
        if self.labels is not None:
            return self.labels
        f_path = self.data_dir + os.sep + "labels.txt"
        if not os.path.exists(f_path):
            logger.error("Labels file (labels.txt) not found in {}".format(self.data_dir))
            raise FileNotFoundError
        self.labels = []
        with open(f_path, encoding="utf-8") as fp:
            self.labels = [line.strip() for line in fp.readlines()]
        return self.labels
    @staticmethod
    def get_labels_filename():
        return "labels.txt"
    @staticmethod
    def _get_shape(string):
        if all(c.isupper() for c in string):
            return 1  # "AA"
        if string[0].isupper():
            return 2  # "Aa"
        if any(c for c in string if c.isupper()):
            return 3  # "aAa"
        return 4  # "a"

    @classmethod
    def _create_examples(cls, lines, set_type):
        """See base class."""
        examples = []
        for i, (sentence, labels) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text = " ".join(sentence)
            shapes = [cls._get_shape(w) for w in sentence]
            examples.append(
                TokenClsInputExample(
                    guid=guid, text=text, tokens=sentence, label=labels, shapes=shapes
                )
            )
        return examples
    def get_vocabulary(self, examples: List[TokenClsInputExample] = None):
        vocab = Vocabulary(start=1)
        for e in examples:
            for t in e.tokens:
                vocab.add(t)
        return vocab
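
# Usage sketch (illustrative, not part of the module): "data/ner/" is a
# hypothetical directory containing train.txt/dev.txt/test.txt in column-tagged
# (CoNLL-style) format plus a labels.txt file with one tag per line.
#
#     processor = TokenClsProcessor("data/ner/", tag_col=-1)
#     train_examples = processor.get_train_examples()
#     label_list = processor.get_labels()
#     vocab = processor.get_vocabulary(train_examples)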