dataset_pseudonymizer_2.py 1.2 KB

123456789101112131415161718192021222324252627282930313233
  1. def _pseudonymize_identifier_only(
  2. input_csv_path: str,
  3. unit_id_type: str,
  4. job_id: str
  5. ) -> str:
  6. unique_identifiers = set()
  7. with open(input_csv_path, newline='', encoding='utf8') as csv_file:
  8. for line in csv_file:
  9. unit_id = line.strip().split(';')[1]
  10. unique_identifiers.add(unit_id)
  11. identifier_to_pseudonym = pseudonym_service.pseudonymize(
  12. list(unique_identifiers), unit_id_type, job_id
  13. )
  14. output_csv_path = input_csv_path.replace('.csv', '_pseudonymized.csv')
  15. target_file = open(output_csv_path, 'w', newline='', encoding='utf-8')
  16. with open(input_csv_path, newline='', encoding='utf-8') as csv_file:
  17. for line in csv_file:
  18. row = line.strip().split(';')
  19. line_number: int = row[0]
  20. unit_id: str = row[1]
  21. value: str = row[2]
  22. start_date: str = row[3]
  23. stop_date: str = row[4]
  24. target_file.write(
  25. ';'.join([
  26. str(line_number),
  27. str(identifier_to_pseudonym[unit_id]),
  28. value,
  29. start_date, stop_date
  30. ]) + '\n'
  31. )
  32. target_file.close()
  33. return output_csv_path