12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455 |
def apply_tagging_sentence(
    starts: List[int],
    ends: List[int],
    tags: List[str],
    entities: List[str],
    plain_text: str,
    replacement_dict: Dict[str, str],
) -> Tuple[str, str]:
    """Build the XML-tagged and the pseudonymized versions of one sentence.

    Each entity span ``plain_text[start:end]`` is wrapped in ``<TAG>...</TAG>``
    in the tagged output and replaced by ``replacement_dict[entity]`` in the
    pseudonymized output. Text outside entities is wrapped in ``<a>...</a>``
    in the tagged output.

    Args:
        starts: Start offset of each entity in ``plain_text``.
        ends: End offset (exclusive) of each entity in ``plain_text``.
        tags: Tag name of each entity (e.g. ``PER``, ``LOC``, ``ORG``); tags
            of any length are supported.
        entities: Entity texts, used as lookup keys in ``replacement_dict``.
        plain_text: The text of the sentence.
        replacement_dict: Maps each entity text to its pseudonym.

    Returns:
        A tuple ``(tagged, pseudonymized)``: the sentence with entities wrapped
        in XML tags, enclosed in ``<sentence>...</sentence>``, and the sentence
        with entities replaced by their pseudonyms.

    Raises:
        AssertionError: If the four entity lists differ in length.
        KeyError: If an entity text is missing from ``replacement_dict``.

    Note:
        The running offsets are applied to every later entity, so the result
        is only meaningful when entities are given in increasing offset order
        and do not overlap.
    """
    assert (
        len(starts) == len(ends) == len(tags) == len(entities)
    ), "Input lists mast be of the same length"

    # Offsets between positions in plain_text and positions in each output,
    # accumulated as earlier entities are rewritten.
    shift_tags = 0
    shift_pseudo = 0
    tagged_sentence, pseudo_sentence = plain_text, plain_text

    for start, end, tag, entity in zip(starts, ends, tags, entities):
        replacement = replacement_dict[entity]
        pseudo_sentence = (
            pseudo_sentence[: start + shift_pseudo]
            + replacement
            + pseudo_sentence[end + shift_pseudo:]
        )
        shift_pseudo += len(replacement) - (end - start)

        # Close the surrounding <a> before the entity and reopen it after.
        markup_before = f"</a><{tag}>"
        markup_after = f"</{tag}><a>"
        tagged_sentence = (
            tagged_sentence[: start + shift_tags]
            + markup_before
            + plain_text[start:end]
            + markup_after
            + tagged_sentence[end + shift_tags:]
        )
        # Measure the inserted markup instead of assuming 3-character tags,
        # otherwise longer/shorter tags misplace every following entity.
        shift_tags += len(markup_before) + len(markup_after)

    tagged_sentence = "<a>" + tagged_sentence + "</a>"
    # Drop empty <a></a> pairs left where an entity touches a sentence
    # boundary or another entity.
    tagged_sentence = tagged_sentence.replace("<a></a>", "")
    return (
        f"<sentence>{tagged_sentence}</sentence>",
        pseudo_sentence,
    )
|