# LiuFan/PrivacyScanData
"""
Pseudonymize a doc file. It takes as input a .doc file, converts it to txt, pseudonymizes it and outputs a
pseudonymized txt file.

Usage:
    doc2pseudo.py <input_file_path> <model_folder> [options]

Arguments:
    <input_file_path>       A required path parameter
    <model_folder>          A folder with a model inside
"""
from pathlib import Path

from argopt import argopt
from flair.models import SequenceTagger
from tqdm import tqdm

from data_ETL import pseudonymize


def doc2txt(doc_path: Path):
    """Return the text content of a ``.doc`` or ``.txt`` file.

    ``.doc`` files are converted with ``textract``, which is imported lazily
    so that plain-text input works without it being installed; every ``|``
    character is removed from the converted text. ``.txt`` files are read as
    UTF-8, the same encoding used to decode the ``.doc`` conversion output.

    The suffix comparison is case-insensitive, so ``.DOC`` and ``.TXT`` are
    accepted as well.

    Args:
        doc_path: Path of the file to read.

    Returns:
        The file content as a string.

    Raises:
        Exception: If ``textract`` is required but not installed, or if the
            file suffix is neither ``.doc`` nor ``.txt``.
    """
    suffix = doc_path.suffix.lower()

    if suffix == ".doc":
        try:
            import textract
        except ImportError as err:
            # Keep the original exception type/message, but preserve the cause.
            raise Exception("Textract is not installed. Cannot convert .doc file") from err
        raw = textract.process(doc_path.as_posix())
        return raw.decode("utf-8").replace("|", "")

    if suffix == ".txt":
        # Explicit encoding: do not depend on the platform's default locale.
        with open(doc_path.as_posix(), encoding="utf-8") as filo:
            return filo.read()

    raise Exception("File type not handled: either .doc or .txt")


def save_text_file(text: str, output_file: Path):
    """Write ``text`` to ``output_file`` as UTF-8, replacing any existing file.

    Args:
        text: The content to write.
        output_file: Destination path; it is opened in ``"w"`` mode, so an
            existing file at that path is truncated.
    """
    # Explicit encoding so non-ASCII text is written identically on every
    # platform instead of depending on the locale's default codec.
    with open(output_file.as_posix(), "w", encoding="utf-8") as out:
        out.write(text)


def run(doc_path: Path):
    """Pseudonymize one document, save the result and echo it to stdout.

    The input is read with ``doc2txt``, passed to ``pseudonymize`` together
    with the module-level ``TAGGER`` (which must have been set beforehand,
    as ``main`` does), and the pseudonymized text is written to
    ``<input stem>_anon.txt``. That output path is relative, so the file
    lands in the current working directory.

    Args:
        doc_path: Path of the document to pseudonymize.

    Returns:
        None. The pseudonymized text is saved to disk and printed.
    """
    source_text = doc2txt(doc_path=doc_path)
    # pseudonymize returns a pair; only the second element (the text) is used.
    _, pseudonymized = pseudonymize(text=source_text, tagger=TAGGER)
    destination = Path(f"{doc_path.stem}_anon.txt")
    save_text_file(pseudonymized, output_file=destination)
    print(pseudonymized)


def main(input_file_path: Path, model_folder: Path):
    """Load the tagger model and pseudonymize a single input file.

    The loaded model is stored in the module-level ``TAGGER`` global, which
    ``run`` reads when it calls ``pseudonymize``.

    Args:
        input_file_path: Path of the document to pseudonymize.
        model_folder: Location handed to ``SequenceTagger.load``.

    Returns:
        An empty list. Nothing is ever added to it; the result of ``run`` is
        not returned.
    """
    global TAGGER

    TAGGER = SequenceTagger.load(model_folder)
    tqdm.write(f"Converting file {input_file_path}")
    run(input_file_path)

    # The returned list is never populated.
    return []


# Script entry point: runs only when the file is executed directly.
if __name__ == "__main__":
    # The command-line interface is built from the module docstring (Usage/Arguments).
    parser = argopt(__doc__).parse_args()
    input_file_path = Path(parser.input_file_path)
    # Passed through as parsed (not wrapped in Path), unlike input_file_path.
    model_folder = parser.model_folder
    main(input_file_path=input_file_path, model_folder=model_folder)