pseudodepseudonimizer.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. '''
  2. Replaces the specified pseudonym string with the same string, but numbered according to its order of appearance in the input text
  3. Usage:
  4. txt2conll.py <s> <i> <o>
  5. Arguments:
  6. <s> The string to replace by its numbered version
  7. <i> An input file or directory (if dir it will convert all txt files inside).
  8. <o> An output directory.
  9. '''
  10. import re
  11. from argopt import argopt
  12. COUNTER = 0
  13. def dots_repl(matchobject):
  14. global COUNTER
  15. numbered_dots = "{0} PSEUDOPSEUDOPSEUDO{2} {3}".format(matchobject.groups()[0], matchobject.groups()[1], COUNTER,
  16. matchobject.groups()[2])
  17. COUNTER += 1
  18. return numbered_dots
  19. def dots2numberedDots(all_text, replace_string="..."):
  20. replace_string = re.escape(replace_string)
  21. dots_regex = re.compile(r"(\s)[A-Z]?\[?({})\]?(\s)".format(replace_string))
  22. all_text = dots_regex.sub(dots_repl, all_text)
  23. return all_text
  24. if __name__ == '__main__':
  25. parser = argopt(__doc__).parse_args()
  26. input_path = parser.i
  27. output_path = parser.o
  28. string_to_replace = parser.s
  29. with open(input_path, "r") as filo:
  30. all_text = filo.read()
  31. numbered_dots_text = dots2numberedDots(all_text, replace_string=string_to_replace)
  32. with open(output_path, "w") as filo:
  33. filo.write(numbered_dots_text)