main_22.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
  1. def apply_tagging_sentence(
  2. starts: List[int],
  3. ends: List[int],
  4. tags: List[str],
  5. entities: List[str],
  6. plain_text: str,
  7. replacement_dict: Dict[str, str],
  8. ) -> Tuple[str, str]:
  9. """
  10. Args:
  11. starts, ends, tags, entity texts of the entities found in the sentence + the text of the sentence + the prepared replacement dictionary for pseudo
  12. Returns:
  13. str, str: a text where the entities have a XML tag, and a text where entities have been pseudonymized
  14. """
  15. assert (
  16. len(starts) == len(ends) == len(tags) == len(entities)
  17. ), "Input lists mast be of the same length"
  18. shift_tags_start, shift_tags_end = 0, 0 # shift due to the add of tags
  19. shift_pseudo_start, shift_pseudo_end = 0, 0
  20. tagged_sentence, pseudo_sentence = plain_text, plain_text
  21. n_entities = len(starts)
  22. for i in range(n_entities):
  23. start, end, entity, tag = starts[i], ends[i], entities[i], tags[i]
  24. replacement = replacement_dict[entity]
  25. pseudo_sentence = (
  26. pseudo_sentence[: start + shift_pseudo_start]
  27. + replacement
  28. + pseudo_sentence[end + shift_pseudo_end:]
  29. )
  30. shift_pseudo_start += len(replacement) - (end - start)
  31. shift_pseudo_end += len(replacement) - (end - start)
  32. tagged_sentence = (
  33. tagged_sentence[: start + shift_tags_start]
  34. + "</a>"
  35. + f"<{tag}>"
  36. + plain_text[start:end]
  37. + f"</{tag}>"
  38. + "<a>"
  39. + tagged_sentence[end + shift_tags_end:]
  40. )
  41. shift_tags_start += (
  42. 5 + 6 + 3 + 4
  43. ) # 5 characters for tag <PER> (or LOC or ORG) + 6 for </PER> + 3 for <a> and 4 for </a>
  44. shift_tags_end += (
  45. 5 + 6 + 3 + 4
  46. ) # 5 characters for tag <PER> (or LOC or ORG) + 6 for </PER> + 3 for <a> and 4 for </a>
  47. tagged_sentence = "<a>" + tagged_sentence + "</a>"
  48. tagged_sentence = tagged_sentence.replace("<a></a>", "")
  49. return (
  50. f"<sentence>{tagged_sentence}</sentence>",
  51. pseudo_sentence,
  52. )