# LiuFan/PrivacyScanData
							def apply_tagging_sentence(
        starts: List[int],
        ends: List[int],
        tags: List[str],
        entities: List[str],
        plain_text: str,
        replacement_dict: Dict[str, str],
) -> Tuple[str, str]:
    """
    Args:
        starts, ends, tags, entity texts of the entities found in the sentence + the text of the sentence + the prepared replacement dictionary for pseudo
    Returns:
        str, str: a text where the entities have a XML tag, and a text where entities have been pseudonymized
    """

    assert (
            len(starts) == len(ends) == len(tags) == len(entities)
    ), "Input lists mast be of the same length"
    shift_tags_start, shift_tags_end = 0, 0  # shift due to the add of tags
    shift_pseudo_start, shift_pseudo_end = 0, 0
    tagged_sentence, pseudo_sentence = plain_text, plain_text
    n_entities = len(starts)

    for i in range(n_entities):
        start, end, entity, tag = starts[i], ends[i], entities[i], tags[i]
        replacement = replacement_dict[entity]

        pseudo_sentence = (
                pseudo_sentence[: start + shift_pseudo_start]
                + replacement
                + pseudo_sentence[end + shift_pseudo_end:]
        )
        shift_pseudo_start += len(replacement) - (end - start)
        shift_pseudo_end += len(replacement) - (end - start)
        tagged_sentence = (
                tagged_sentence[: start + shift_tags_start]
                + "</a>"
                + f"<{tag}>"
                + plain_text[start:end]
                + f"</{tag}>"
                + "<a>"
                + tagged_sentence[end + shift_tags_end:]
        )
        shift_tags_start += (
                5 + 6 + 3 + 4
        )  # 5 characters for tag <PER> (or LOC or ORG) + 6 for </PER> + 3 for <a> and 4 for </a>
        shift_tags_end += (
                5 + 6 + 3 + 4
        )  # 5 characters for tag <PER> (or LOC or ORG) + 6 for </PER> + 3 for <a> and 4 for </a>
    tagged_sentence = "<a>" + tagged_sentence + "</a>"
    tagged_sentence = tagged_sentence.replace("<a></a>", "")
    return (
        f"<sentence>{tagged_sentence}</sentence>",
        pseudo_sentence,
    )