doc2pseudo.py 1.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. """
  2. Pseudonymize a doc file. It takes as input a .doc file, converts it to txt, pseudonymizes it and outputs a
  3. pseudonymized txt file.
  4. Usage:
  5. doc2pseudo.py <input_file_path> <model_folder> [options]
  6. Arguments:
  7. <input_file_path> A required path parameter
  8. <model_folder> A folder with a model inside
  9. """
  10. from pathlib import Path
  11. from argopt import argopt
  12. from flair.models import SequenceTagger
  13. from tqdm import tqdm
  14. from data_ETL import pseudonymize
  15. def doc2txt(doc_path: Path):
  16. if doc_path.suffix == ".doc":
  17. try:
  18. import textract
  19. except ImportError:
  20. raise Exception("Textract is not installed. Cannot convert .doc file")
  21. text = textract.process(doc_path.as_posix()).decode("utf-8").replace("|", "")
  22. return text
  23. elif doc_path.suffix == ".txt":
  24. with open(doc_path.as_posix()) as filo:
  25. return filo.read()
  26. else:
  27. raise Exception("File type not handled: either .doc or .txt")
  28. def save_text_file(text: str, output_file: Path):
  29. with open(output_file.as_posix(), "w") as out:
  30. out.write(text)
  31. def run(doc_path: Path):
  32. text = doc2txt(doc_path=doc_path)
  33. output_text = Path(doc_path.stem + "_anon.txt")
  34. tags, pseudo = pseudonymize(text=text, tagger=TAGGER)
  35. save_text_file(pseudo, output_file=Path(output_text))
  36. print(pseudo)
  37. def main(input_file_path: Path, model_folder: Path):
  38. global TAGGER
  39. doc_paths = []
  40. TAGGER = SequenceTagger.load(model_folder)
  41. job_output = []
  42. tqdm.write(f"Converting file {input_file_path}")
  43. job_output.append(run(input_file_path))
  44. return doc_paths
  45. if __name__ == "__main__":
  46. parser = argopt(__doc__).parse_args()
  47. input_file_path = Path(parser.input_file_path)
  48. model_folder = parser.model_folder
  49. main(input_file_path=input_file_path, model_folder=model_folder)