def apply_tagging_sentence(
    starts: List[int],
    ends: List[int],
    tags: List[str],
    entities: List[str],
    plain_text: str,
    replacement_dict: Dict[str, str],
) -> Tuple[str, str]:
    """Build the tagged and the pseudonymized versions of one sentence.

    ``starts``, ``ends``, ``tags`` and ``entities`` are parallel lists:
    position ``i`` of each one describes the same entity. The offsets index
    into ``plain_text``. The function assumes the entities are given in
    ascending, non-overlapping order, because every edit shifts all later
    offsets by the amount of text inserted or removed before them.

    Args:
        starts: Start offset of each entity in ``plain_text`` (inclusive).
        ends: End offset of each entity in ``plain_text`` (exclusive).
        tags: Tag name written in front of each entity as ``<tag>``.
        entities: Entity texts, used as keys into ``replacement_dict``.
        plain_text: The original sentence.
        replacement_dict: Maps each entity text to its pseudonym.

    Returns:
        A ``(tagged_sentence, pseudo_sentence)`` pair: the sentence with a
        ``<tag>`` marker inserted before every entity, and the sentence with
        every entity replaced by its pseudonym.

    Raises:
        AssertionError: If the four entity lists differ in length.
        KeyError: If an entity has no entry in ``replacement_dict``.
    """
    assert (
        len(starts) == len(ends) == len(tags) == len(entities)
    ), "Input lists mast be of the same length"

    tagged_sentence, pseudo_sentence = plain_text, plain_text
    # Running difference between an offset in ``plain_text`` and the same
    # position in each output string, caused by the edits made so far.
    tag_shift = 0
    pseudo_shift = 0

    for start, end, tag, entity in zip(starts, ends, tags, entities):
        span_length = end - start

        replacement = replacement_dict[entity]
        pseudo_sentence = (
            pseudo_sentence[: start + pseudo_shift]
            + replacement
            + pseudo_sentence[end + pseudo_shift:]
        )
        pseudo_shift += len(replacement) - span_length

        # NOTE(review): the previous version also concatenated empty strings
        # around the entity and around the whole sentence, and budgeted a
        # fixed 18 characters of markup per entity. That hints that closing
        # or wrapper tags may have been intended - confirm the expected
        # output format. The shift below is derived from the text actually
        # inserted, so it stays correct whatever markup is used and whatever
        # the tag length is.
        marked_entity = f"<{tag}>" + plain_text[start:end]
        tagged_sentence = (
            tagged_sentence[: start + tag_shift]
            + marked_entity
            + tagged_sentence[end + tag_shift:]
        )
        tag_shift += len(marked_entity) - span_length

    return tagged_sentence, pseudo_sentence