def apply_tagging_sentence(
    starts: List[int],
    ends: List[int],
    tags: List[str],
    entities: List[str],
    plain_text: str,
    replacement_dict: Dict[str, str],
) -> Tuple[str, str]:
    """Build the tagged and the pseudonymized versions of one sentence.

    The four lists are parallel: position ``i`` of each describes one entity
    found in ``plain_text``. Entities are processed in order of their start
    offset, so the input does not need to be pre-sorted. Spans are assumed
    not to overlap.

    Args:
        starts: Start offset (inclusive) of each entity in ``plain_text``.
        ends: End offset (exclusive) of each entity in ``plain_text``.
        tags: Tag name of each entity; it is emitted as ``<tag>...</tag>``
            and may be of any length.
        entities: Text of each entity, used as the lookup key in
            ``replacement_dict``.
        plain_text: The original sentence.
        replacement_dict: Mapping from entity text to its pseudonym.

    Returns:
        A tuple ``(tagged_sentence, pseudo_sentence)``: the sentence with
        every entity wrapped in an XML tag, and the sentence with every
        entity replaced by its pseudonym.

    Raises:
        AssertionError: If the four input lists differ in length.
        KeyError: If an entity text has no entry in ``replacement_dict``.
    """
    if not len(starts) == len(ends) == len(tags) == len(entities):
        # Raised explicitly instead of through ``assert`` so the check is not
        # stripped under ``python -O``; exception type and message are kept.
        raise AssertionError("Input lists mast be of the same length")

    # Walking the spans from left to right lets both outputs be assembled
    # from slices of the untouched original text, so no running offset has
    # to be maintained and the tag length never matters.
    spans = sorted(zip(starts, ends, tags, entities), key=lambda span: span[0])

    tagged_parts: List[str] = []
    pseudo_parts: List[str] = []
    cursor = 0  # first offset of plain_text not yet copied to the outputs
    for start, end, tag, entity in spans:
        untouched = plain_text[cursor:start]
        tagged_parts.extend(
            (untouched, f"<{tag}>", plain_text[start:end], f"</{tag}>")
        )
        pseudo_parts.extend((untouched, replacement_dict[entity]))
        cursor = end

    # Whatever follows the last entity is shared by both outputs.
    tail = plain_text[cursor:]
    tagged_parts.append(tail)
    pseudo_parts.append(tail)
    return "".join(tagged_parts), "".join(pseudo_parts)