Source code for nlp_architect.common.core_nlp_doc

# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
import json
from nlp_architect.utils.io import validate


def merge_punct_tok(merged_punct_sentence, last_merged_punct_index, punct_text, is_traverse):
    # merge the text of the punct tok into its neighboring token
    if is_traverse:
        merged_punct_sentence[last_merged_punct_index]["text"] = (
            punct_text + merged_punct_sentence[last_merged_punct_index]["text"]
        )
    else:
        merged_punct_sentence[last_merged_punct_index]["text"] = (
            merged_punct_sentence[last_merged_punct_index]["text"] + punct_text
        )

def find_correct_index(orig_gov, merged_punct_sentence):
    for tok_index, tok in enumerate(merged_punct_sentence):
        if (
            tok["start"] == orig_gov["start"]
            and tok["len"] == orig_gov["len"]
            and tok["pos"] == orig_gov["pos"]
            and tok["text"] == orig_gov["text"]
        ):
            return tok_index
    return None

def fix_gov_indexes(merged_punct_sentence, sentence):
    for merged_token in merged_punct_sentence:
        tok_gov = merged_token["gov"]
        if tok_gov == -1:  # gov is root
            merged_token["gov"] = -1
        else:
            orig_gov = sentence[tok_gov]
            correct_index = find_correct_index(orig_gov, merged_punct_sentence)
            merged_token["gov"] = correct_index

def _spacy_pos_to_ptb(pos, text):
    """
    Converts a spaCy part-of-speech tag to a Penn Treebank part-of-speech tag.

    Args:
        pos (str): spaCy POS tag (`tok.tag_`).
        text (str): The token text.

    Returns:
        ptb_tag (str): Standard PTB POS tag.
    """
    validate((pos, str, 0, 30), (text, str, 0, 1000))
    ptb_tag = pos
    if text in ["...", "—"]:
        ptb_tag = ":"
    elif text == "*":
        ptb_tag = "SYM"
    elif pos == "AFX":
        ptb_tag = "JJ"
    elif pos == "ADD":
        ptb_tag = "NN"
    elif text != pos and text in [",", ".", ":", "``", "-RRB-", "-LRB-"]:
        ptb_tag = text
    elif pos in ["NFP", "HYPH", "XX"]:
        ptb_tag = "SYM"
    return ptb_tag

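# A doctest-style sketch of _spacy_pos_to_ptb's mapping (the token texts below
# are hypothetical examples, not taken from the original module):
#
#     >>> _spacy_pos_to_ptb("NFP", "~~~")  # residual symbol tags collapse to SYM
#     'SYM'
#     >>> _spacy_pos_to_ptb("AFX", "un")   # affix tokens are retagged as adjectives
#     'JJ'
#     >>> _spacy_pos_to_ptb("NN", "dog")   # ordinary tags pass through unchanged
#     'NN'
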
def merge_punctuation(sentence):
    merged_punct_sentence = []
    tmp_punct_text = None
    punct_text = None
    last_merged_punct_index = -1
    for tok_index, token in enumerate(sentence):
        if token["rel"] == "punct":
            punct_text = token["text"]
            if tok_index < 1:  # this is the first tok - append to the next token
                tmp_punct_text = punct_text
            else:  # append to the previous token
                merge_punct_tok(merged_punct_sentence, last_merged_punct_index, punct_text, False)
        else:
            merged_punct_sentence.append(token)
            last_merged_punct_index = last_merged_punct_index + 1
            if tmp_punct_text is not None:
                merge_punct_tok(merged_punct_sentence, last_merged_punct_index, punct_text, True)
                tmp_punct_text = None
    return merged_punct_sentence

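# Minimal sketch of merge_punctuation on a hypothetical two-token sentence
# (only the token fields the merge helpers touch are spelled out); a leading
# punctuation token is folded into the text of the token that follows it:
#
#     >>> sent = [
#     ...     {"text": '"', "rel": "punct", "gov": 1, "start": 0, "len": 1, "pos": "``"},
#     ...     {"text": "Hi", "rel": "root", "gov": -1, "start": 1, "len": 2, "pos": "UH"},
#     ... ]
#     >>> [tok["text"] for tok in merge_punctuation(sent)]
#     ['"Hi']
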
class CoreNLPDoc:
    """Object for core-components (POS, Dependency Relations, etc).

    Attributes:
        _doc_text: the doc text
        _sentences: list of sentences, each word in a sentence is represented
            by a dictionary, structured as follows:
            {'start': (int), 'len': (int), 'pos': (str), 'ner': (str),
            'lemma': (str), 'gov': (int), 'rel': (str)}
    """

    def __init__(self, doc_text: str = "", sentences: list = None):
        if sentences is None:
            sentences = []
        self._doc_text = doc_text
        self._sentences = sentences

    @property
    def doc_text(self):
        return self._doc_text

    @doc_text.setter
    def doc_text(self, val):
        self._doc_text = val

    @property
    def sentences(self):
        return self._sentences

    @sentences.setter
    def sentences(self, val):
        self._sentences = val

    @staticmethod
    def decoder(obj):
        if "_doc_text" in obj and "_sentences" in obj:
            return CoreNLPDoc(obj["_doc_text"], obj["_sentences"])
        return obj

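    # decoder is intended as an object_hook for json.loads, round-tripping the
    # output of json()/pretty_json() below (illustrative sketch):
    #
    #     >>> serialized = CoreNLPDoc("hi", [[]]).json()
    #     >>> doc = json.loads(serialized, object_hook=CoreNLPDoc.decoder)
    #     >>> isinstance(doc, CoreNLPDoc)
    #     True
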
    def __repr__(self):
        return self.pretty_json()

    def __str__(self):
        return self.__repr__()

    def __iter__(self):
        return self.sentences.__iter__()

    def __len__(self):
        return len(self.sentences)

    def json(self):
        """Returns a JSON representation of the object."""
        return json.dumps(self.__dict__)

    def pretty_json(self):
        """Returns an indented ("pretty") JSON representation of the object."""
        return json.dumps(self.__dict__, indent=4)

    def sent_text(self, i):
        parsed_sent = self.sentences[i]
        first_tok, last_tok = parsed_sent[0], parsed_sent[-1]
        return self.doc_text[first_tok["start"] : last_tok["start"] + last_tok["len"]]

    def sent_iter(self):
        for parsed_sent in self.sentences:
            first_tok, last_tok = parsed_sent[0], parsed_sent[-1]
            sent_text = self.doc_text[first_tok["start"] : last_tok["start"] + last_tok["len"]]
            yield sent_text, parsed_sent

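    # Usage sketch for sent_text()/sent_iter() on a hypothetical one-sentence
    # doc; the slicing relies only on each token's character offsets:
    #
    #     >>> toks = [{"start": 0, "len": 5}, {"start": 6, "len": 5}]
    #     >>> doc = CoreNLPDoc("Hello world", [toks])
    #     >>> doc.sent_text(0)
    #     'Hello world'
    #     >>> [text for text, _ in doc.sent_iter()]
    #     ['Hello world']
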
    def brat_doc(self):
        """Returns the doc adapted to BRAT's expected input."""
        doc = {"text": "", "entities": [], "relations": []}
        tok_count = 0
        rel_count = 1
        for sentence in self.sentences:
            sentence_start = sentence[0]["start"]
            sentence_end = sentence[-1]["start"] + sentence[-1]["len"]
            doc["text"] = doc["text"] + "\n" + self.doc_text[sentence_start:sentence_end]
            token_offset = tok_count
            for token in sentence:
                start = token["start"]
                end = start + token["len"]
                doc["entities"].append(["T" + str(tok_count), token["pos"], [[start, end]]])
                if token["gov"] != -1 and token["rel"] != "punct":
                    doc["relations"].append(
                        [
                            rel_count,
                            token["rel"],
                            [
                                ["", "T" + str(token_offset + token["gov"])],
                                ["", "T" + str(tok_count)],
                            ],
                        ]
                    )
                    rel_count += 1
                tok_count += 1
        doc["text"] = doc["text"][1:]
        return doc

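    # Shape of brat_doc()'s output on a hypothetical single-token doc: each
    # token becomes a "T<i>" entity and each non-punct, non-root dependency
    # becomes a relation (none here, since the only token is the root):
    #
    #     >>> toks = [{"start": 0, "len": 2, "pos": "UH", "gov": -1, "rel": "root"}]
    #     >>> CoreNLPDoc("Hi", [toks]).brat_doc()
    #     {'text': 'Hi', 'entities': [['T0', 'UH', [[0, 2]]]], 'relations': []}
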
    def displacy_doc(self):
        """Returns the doc adapted to displacyENT's expected input."""
        doc = []
        for sentence in self.sentences:
            sentence_doc = {"arcs": [], "words": []}
            # Merge punctuation:
            merged_punct_sentence = merge_punctuation(sentence)
            fix_gov_indexes(merged_punct_sentence, sentence)
            for tok_index, token in enumerate(merged_punct_sentence):
                sentence_doc["words"].append({"text": token["text"], "tag": token["pos"]})
                dep_tok = tok_index
                gov_tok = token["gov"]
                direction = "left"
                arc_start = dep_tok
                arc_end = gov_tok
                if dep_tok > gov_tok:
                    direction = "right"
                    arc_start = gov_tok
                    arc_end = dep_tok
                if token["gov"] != -1 and token["rel"] != "punct":
                    sentence_doc["arcs"].append(
                        {
                            "dir": direction,
                            "label": token["rel"],
                            "start": arc_start,
                            "end": arc_end,
                        }
                    )
            doc.append(sentence_doc)
        return doc

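    # Shape of displacy_doc()'s output on the same kind of hypothetical
    # single-token doc; the root token yields a word entry but no arc:
    #
    #     >>> toks = [{"text": "Hi", "pos": "UH", "gov": -1, "rel": "root"}]
    #     >>> CoreNLPDoc("Hi", [toks]).displacy_doc()
    #     [{'arcs': [], 'words': [{'text': 'Hi', 'tag': 'UH'}]}]
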
    @staticmethod
    def from_spacy(spacy_doc, show_tok=True, show_doc=True, ptb_pos=False):
        core_sents = []
        for spacy_sent in spacy_doc.sents:
            cur_sent = []
            for tok in spacy_sent:
                pos = _spacy_pos_to_ptb(tok.tag_, tok.text) if ptb_pos else tok.tag_
                core_tok = {
                    "start": tok.idx,
                    "len": len(tok),
                    "pos": pos,
                    "lemma": tok.lemma_,
                    "rel": tok.dep_.lower(),
                    "gov": -1 if tok.dep_ == "ROOT" else tok.head.i - spacy_sent.start,
                }
                if show_tok:
                    core_tok["text"] = tok.text
                cur_sent.append(core_tok)
            core_sents.append(cur_sent)
        core_doc = CoreNLPDoc(sentences=core_sents)
        if show_doc:
            core_doc.doc_text = spacy_doc.text
        return core_doc
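
# End-to-end usage sketch (assumes spaCy is installed; "en_core_web_sm" is an
# assumed model name, and any spaCy pipeline with a dependency parser works):
#
#     >>> import spacy
#     >>> nlp = spacy.load("en_core_web_sm")
#     >>> doc = CoreNLPDoc.from_spacy(nlp("The cat sat."), ptb_pos=True)
#     >>> len(doc)  # number of parsed sentences
#     1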