# process_file.py

import itertools
import re
from string import ascii_uppercase

import dash_html_components as html
import textract
from docx import Document
from flair.data import Sentence

import run.tokenization as tkz
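
# Note: run.tokenization is a project-local module. From its use below,
# tokenize_text(text, sent_tokenizer, word_tokenizer) is assumed to return,
# for each sentence, a list of (start, end) character spans into the text.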


def load_text(doc_path):
    """Extract raw text from the source document with textract."""
    return textract.process(doc_path, encoding='utf-8').decode("utf-8").replace("|", "\t")


def flair_predict_tags(text, tagger):
    """Predict NER tags on a text using a Flair tagger."""
    sentence = Sentence(text)
    tagger.predict(sentence)
    return sentence
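
# Minimal usage sketch of the Flair interface assumed above ('fr-ner' is the
# pretrained French model also loaded in __main__ below; output is illustrative):
#   from flair.models import SequenceTagger
#   tagger = SequenceTagger.load('fr-ner')
#   tagged = flair_predict_tags("Jean Dupont habite à Paris.", tagger)
#   tagged.get_spans('ner')  # e.g. spans "Jean Dupont" (PER) and "Paris" (LOC)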


def build_pseudonymisation_map_flair(sentences, pseudos, acceptance_score, tags="all"):
    """
    Gets all replacements to be made in the pseudonymized text, using Flair-tagged sentences.
    :param sentences: list of tuples (Flair-tagged sentence, original token spans)
    :param pseudos: list of pseudos to be used
    :param acceptance_score: minimum confidence score to accept a NER tag
    :param tags: list of entity tags to pseudonymize, or "all" to accept any tag
    :return: dict: keys are spans in the original text, values are pseudos
    """
    replacements = {}
    mapping = {}
    for sentence in sentences:
        for entity in sentence[0].get_spans('ner'):
            # 'O' is the "outside" tag of the BIO scheme, i.e. not an entity.
            if entity.score > acceptance_score and entity.tag != 'O' and (entity.tag in tags or tags == "all"):
                # TODO: add the score as a parameter
                # TODO: rework the handling of B- and I- tags
                for token in entity.tokens:
                    if token.text.lower() not in mapping:
                        mapping[token.text.lower()] = pseudos.pop(0)
                    # token.idx is 1-based in Flair, hence the -1.
                    replacements[sentence[1][token.idx - 1]] = mapping[token.text.lower()]
    return replacements
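
# Illustrative shape of the returned mapping (offsets are hypothetical):
#   {(12, 18): 'A...', (19, 25): 'B...', (51, 57): 'A...'}
# Keys are (start, end) character spans into the original text; because
# `mapping` is keyed on the lowercased token text, repeated occurrences of
# the same name receive the same pseudo throughout the document.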


def pseudonymize_text(replacements, text):
    """Create a new text with pseudos in place of the tagged entities."""
    index = 0
    pseudonymized_text = ''
    for key in sorted(replacements.keys()):
        # Copy the untouched text up to the entity span, then its pseudo.
        pseudonymized_text += text[index:key[0]]
        pseudonymized_text += replacements[key]
        index = key[1]
    pseudonymized_text += text[index:]
    return pseudonymized_text
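
# Worked example (hypothetical text and spans):
#   text = "Jean Dupont est venu."
#   replacements = {(0, 4): 'A...', (5, 11): 'B...'}
#   pseudonymize_text(replacements, text)  # -> 'A... B... est venu.'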


def pseudonymize_html_text(replacements, text):
    """Create HTML blocks with pseudos in place of the tagged entities, for the Dash tool."""
    index = 0
    pseudonymized_text = ''
    for key in sorted(replacements.keys()):
        pseudonymized_text += text[index:key[0]]
        # Wrap the pseudo so highlight_pseudo can find and style it below.
        pseudonymized_text += "<ano>" + replacements[key] + "</ano>"
        index = key[1]
    pseudonymized_text += text[index:]
    return pseudonymized_text


def write_docx_file(text, path):
    """Write the pseudonymized text to a DOCX file."""
    document = Document()
    document.add_paragraph(text)
    document.save(path)


def create_html_file(text, sent_tokenizer, word_tokenizer, tagger, acceptance_score=0.5):
    """Create HTML blocks for the Dash tool."""
    # Pseudos are "A...", "B...", ..., then two-letter combinations "AB...",
    # "AC...", etc.: 26 + 325 distinct placeholders in total.
    singles = ["{}...".format(letter) for letter in ascii_uppercase]
    doubles = ["{}{}...".format(d[0], d[1]) for d in itertools.combinations(ascii_uppercase, 2)]
    pseudos = singles + doubles
    sentences = tkz.tokenize_text(text, sent_tokenizer, word_tokenizer)
    tagged_sentences = []
    for sentence in sentences:
        # Rebuild each sentence from its (start, end) word spans before tagging.
        pseudo_sentence = " ".join([text[word[0]:word[1]] for word in sentence])
        tagged_sentence = flair_predict_tags(pseudo_sentence, tagger)
        tagged_sentences.append((tagged_sentence, sentence))
    replacements = build_pseudonymisation_map_flair(tagged_sentences, pseudos, acceptance_score)
    pseudonymized_text = pseudonymize_html_text(replacements, text)
    html_text = []
    for p in pseudonymized_text.split("\n"):
        html_text.append(highlight_pseudo(p))
    return html_text


def highlight_pseudo(paragraph):
    """Highlight pseudonymized text for the Dash tool."""
    index = 0
    new_str = []
    for change in re.finditer('<ano>(.*?)</ano>', paragraph):
        b = change.start(0)
        e = change.end(0)
        new_str.append(paragraph[index:b])
        new_str.append(html.Mark(change.group(1), style={'color': 'blue'}))
        index = e
    new_str.append(paragraph[index:])
    return html.P(new_str)
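
# Note: `html` here is dash_html_components, so html.Mark renders a <mark>
# element (styled blue above) and html.P wraps the pieces in a paragraph;
# each <ano>...</ano> region thus shows up highlighted in the Dash app.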


def create_CoNLL(tagged_sentences, path):
    """Write a CoNLL file: one token and tag per line, blank line between sentences."""
    with open(path, "w") as file:
        for sent in tagged_sentences:
            for token in sent[0]:
                file.write(f"{token.text}\t{token.get_tag('ner').value}\n")
            # CoNLL convention: sentences are separated by an empty line.
            file.write("\n")
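
# Illustrative output (tab-separated; tags follow the model's BIO scheme):
#   Jean    B-PER
#   Dupont  I-PER
#   est     O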


def process_file(path, sent_tokenizer, word_tokenizer, tagger, acceptance_score=0.5, docx_path=False, CoNLL_path=False, tags="all"):
    """
    Pseudonymization of a text file. Can create a CoNLL file, HTML, or DOCX. Only NLTK tokenizers are supported for the moment.
    :param path: original file path
    :param sent_tokenizer: sentence tokenizer
    :param word_tokenizer: word tokenizer
    :param tagger: a NER tagger
    :param acceptance_score: minimum confidence score to accept a NER tag
    :param docx_path: if a path is given, a DOCX file is written there
    :param CoNLL_path: if a path is given, a CoNLL file is written there
    :param tags: list of entity tags to pseudonymize, or "all"
    :return: pseudonymized text
    """
    text = load_text(path)
    singles = ["{}...".format(letter) for letter in ascii_uppercase]
    doubles = ["{}{}...".format(d[0], d[1]) for d in itertools.combinations(ascii_uppercase, 2)]
    pseudos = singles + doubles
    sentences = tkz.tokenize_text(text, sent_tokenizer, word_tokenizer)
    tagged_sentences = []
    for sentence in sentences:
        pseudo_sentence = " ".join([text[word[0]:word[1]] for word in sentence])
        tagged_sentence = flair_predict_tags(pseudo_sentence, tagger)
        tagged_sentences.append((tagged_sentence, sentence))
    if CoNLL_path:
        create_CoNLL(tagged_sentences, CoNLL_path)
    replacements = build_pseudonymisation_map_flair(tagged_sentences, pseudos, acceptance_score, tags)
    pseudonymized_text = pseudonymize_text(replacements, text)
    if docx_path:
        write_docx_file(pseudonymized_text, docx_path)
    return pseudonymized_text


if __name__ == '__main__':
    import nltk.data
    from nltk.tokenize import WordPunctTokenizer
    from flair.models import SequenceTagger

    word_tokenizer = WordPunctTokenizer()
    # Load the pickled French Punkt model; passing the path string straight to
    # PunktSentenceTokenizer would train a new tokenizer on that string.
    sent_tokenizer = nltk.data.load("file:nltk_data/tokenizers/punkt/french.pickle")
    tagger = SequenceTagger.load('fr-ner')
    path = "path_to_doc"
    process_file(path, sent_tokenizer, word_tokenizer, tagger, docx_path=False, CoNLL_path=False, tags=["PER"])
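    # process_file returns the pseudonymized text; with real output paths it
    # would also write the side files, e.g. (hypothetical paths):
    # process_file(path, sent_tokenizer, word_tokenizer, tagger,
    #              docx_path="pseudonymized.docx", CoNLL_path="tagged.conll",
    #              tags=["PER"])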