import itertools
import re
from string import ascii_uppercase

import textract
from docx import Document
from flair.data import Sentence
import dash_html_components as html

import run.tokenization as tkz


def load_text(doc_path):
    """Extract raw text from a document; pipe characters are mapped to tabs."""
    return textract.process(doc_path, encoding='utf-8').decode("utf-8").replace("|", "\t")
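
# Usage sketch (hypothetical path; textract picks the parser from the extension):
#   raw_text = load_text("data/decision_x.doc")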


def flair_predict_tags(text, tagger):
    """Predict NER tags for a piece of text using a flair tagger."""
    sentence = Sentence(text)
    tagger.predict(sentence)
    return sentence
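
# Usage sketch with flair's published French NER model (the one loaded in __main__):
#   from flair.models import SequenceTagger
#   tagger = SequenceTagger.load('fr-ner')
#   tagged = flair_predict_tags("Paul habite à Paris.", tagger)
#   for span in tagged.get_spans('ner'):
#       print(span.text, span.tag, span.score)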


def build_pseudonymisation_map_flair(sentences, pseudos, acceptance_score, tags="all"):
    """
    Gets all replacements to be made in the pseudonymized text, using flair-tagged sentences
    :param sentences: list of tuples (flair-tagged sentence, original token spans)
    :param pseudos: list of pseudos to be used
    :param acceptance_score: minimum confidence score to accept a NER tag
    :param tags: list of entity types to replace, or "all" to replace every type
    :return: dict: keys are (start, end) spans in the original text, values are pseudos
    """
    replacements = {}
    mapping = {}
    for sentence in sentences:
        for entity in sentence[0].get_spans('ner'):
            if entity.score > acceptance_score and entity.tag != 'O' and (entity.tag in tags or tags == "all"):
                # TODO: add the score as a parameter
                # TODO: redo the handling of B- and I- tags
                for token in entity.tokens:
                    # Reuse the same pseudo for repeated occurrences of a word
                    if token.text.lower() not in mapping:
                        mapping[token.text.lower()] = pseudos.pop(0)
                    # token.idx is 1-based: map it back to the original character span
                    replacements[sentence[1][token.idx - 1]] = mapping[token.text.lower()]
    return replacements
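
# Shape sketch (spans made up for illustration): keys are (start, end) character
# spans into the original text, values are the pseudos assigned to them, e.g.
#   {(9, 15): 'A...', (42, 48): 'A...', (60, 65): 'B...'}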


def pseudonymize_text(replacements, text):
    """Create the new text with pseudos in place of the named entities."""
    index = 0
    pseudonymized_text = ''
    for key in sorted(replacements.keys()):
        # Copy the untouched chunk up to the entity, then append its pseudo
        chunk = text[index:key[0]]
        pseudonymized_text += chunk
        pseudonymized_text += replacements[key]
        index = key[1]
    pseudonymized_text += text[index:]
    return pseudonymized_text
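
# Worked example (span made up for illustration):
#   pseudonymize_text({(0, 4): 'A...'}, "Paul habite à Paris.")
#   -> 'A... habite à Paris.'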


def pseudonymize_html_text(replacements, text):
    """Create HTML-marked text with pseudos in place of the named entities, for the Dash tool."""
    index = 0
    pseudonymized_text = ''
    for key in sorted(replacements.keys()):
        chunk = text[index:key[0]]
        pseudonymized_text += chunk
        # Wrap each pseudo in <b> markers so highlight_pseudo can locate it later
        pseudonymized_text += "<b>" + replacements[key] + "</b>"
        index = key[1]
    pseudonymized_text += text[index:]
    return pseudonymized_text
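
# Same walk as pseudonymize_text; the only difference is the <b>...</b> markers
# around each pseudo, which highlight_pseudo later turns into html.Mark blocks.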


def write_docx_file(text, path):
    """Write the pseudonymized text to a docx file."""
    document = Document()
    document.add_paragraph(text)
    document.save(path)


def create_html_file(text, sent_tokenizer, word_tokenizer, tagger, acceptance_score=0.5):
    """Create HTML components for the Dash tool."""
    # Build the pseudo inventory: "A..." to "Z...", then two-letter combinations
    singles = ["{}...".format(letter) for letter in ascii_uppercase]
    doubles = ["{}{}...".format(d[0], d[1]) for d in itertools.combinations(ascii_uppercase, 2)]
    pseudos = singles + doubles
    sentences = tkz.tokenize_text(text, sent_tokenizer, word_tokenizer)
    tagged_sentences = []
    for sentence in sentences:
        # Rebuild each sentence string from its (start, end) character spans
        pseudo_sentence = " ".join([text[word[0]:word[1]] for word in sentence])
        tagged_sentence = flair_predict_tags(pseudo_sentence, tagger)
        tagged_sentences.append((tagged_sentence, sentence))
    replacements = build_pseudonymisation_map_flair(tagged_sentences, pseudos, acceptance_score)
    pseudonymized_text = pseudonymize_html_text(replacements, text)
    html_text = []
    for p in pseudonymized_text.split("\n"):
        html_text.append(highlight_pseudo(p))
    return html_text
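
# Note on the pseudo inventory built above: 26 single-letter pseudos plus
# C(26, 2) = 325 two-letter ones, i.e. 351 placeholders in total, consumed in
# order by pseudos.pop(0) in build_pseudonymisation_map_flair.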


def highlight_pseudo(paragraph):
    """Highlight pseudonymized text for the Dash tool."""
    index = 0
    new_str = []
    # Find the <b>...</b> markers inserted by pseudonymize_html_text
    for change in re.finditer('<b>(.*?)</b>', paragraph):
        b = change.start(0)
        e = change.end(0)
        new_str.append(paragraph[index:b])
        new_str.append(html.Mark(change.group(1), style={'color': 'blue'}))
        index = e
    new_str.append(paragraph[index:])
    return html.P(new_str)
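
# Output sketch: the paragraph "Monsieur <b>A...</b> demande" becomes
#   html.P(["Monsieur ", html.Mark("A...", style={'color': 'blue'}), " demande"])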


def create_CoNLL(tagged_sentences, path):
    """Write the tagged tokens to a CoNLL-style file, one token/tag pair per line."""
    with open(path, "w") as file:
        for sent in tagged_sentences:
            for token in sent[0]:
                file.write(f"{token.text}\t{token.get_tag('ner').value}\n")
            # Blank line between sentences, per the usual CoNLL convention
            file.write("\n")


def process_file(path, sent_tokenizer, word_tokenizer, tagger, acceptance_score=0.5, docx_path=False, CoNLL_path=False, tags="all"):
    """
    Pseudonymization of a text file. Can create a CoNLL file, HTML or docx. Only NLTK tokenizers are supported for the moment
    :param path: original file path
    :param sent_tokenizer: Sentence tokenizer
    :param word_tokenizer: Word tokenizer
    :param tagger: A NER tagger
    :param acceptance_score: minimum confidence score to accept a NER tag
    :param docx_path: If a path is given, a DOCX file will be written there
    :param CoNLL_path: If a path is given, a CoNLL file will be written there
    :param tags: list of entity types to replace, or "all" to replace every type
    :return: Pseudonymized text
    """
    text = load_text(path)
    # Same pseudo inventory as in create_html_file
    singles = ["{}...".format(letter) for letter in ascii_uppercase]
    doubles = ["{}{}...".format(d[0], d[1]) for d in itertools.combinations(ascii_uppercase, 2)]
    pseudos = singles + doubles
    sentences = tkz.tokenize_text(text, sent_tokenizer, word_tokenizer)
    tagged_sentences = []
    for sentence in sentences:
        pseudo_sentence = " ".join([text[word[0]:word[1]] for word in sentence])
        tagged_sentence = flair_predict_tags(pseudo_sentence, tagger)
        tagged_sentences.append((tagged_sentence, sentence))
    if CoNLL_path:
        create_CoNLL(tagged_sentences, CoNLL_path)
    replacements = build_pseudonymisation_map_flair(tagged_sentences, pseudos, acceptance_score, tags)
    pseudonymized_text = pseudonymize_text(replacements, text)
    if docx_path:
        write_docx_file(pseudonymized_text, docx_path)
    return pseudonymized_text


if __name__ == '__main__':
    import nltk.data
    from nltk.tokenize import WordPunctTokenizer
    from flair.models import SequenceTagger

    word_tokenizer = WordPunctTokenizer()
    tagger = SequenceTagger.load('fr-ner')
    # Load the pre-trained French Punkt model (assumes the punkt data is on
    # NLTK's search path); passing the pickle path straight to
    # PunktSentenceTokenizer would treat it as training text, not a trained model
    sent_tokenizer = nltk.data.load("tokenizers/punkt/french.pickle")
    path = "path_to_doc"
    process_file(path, sent_tokenizer, word_tokenizer, tagger, docx_path=False, CoNLL_path=False, tags=["PER"])