1234567891011121314151617181920212223242526272829303132333435363738394041 |
def _pseudonymize_identifier_and_measure(
    input_csv_path: str,
    identifier_unit_id_type: str,
    measure_unit_id_type: str,
    job_id: str
) -> str:
    """Write a copy of a ';'-separated file with two columns pseudonymized.

    Each input line is stripped and split on ';'. Field 1 (the unit
    identifier) and field 2 (the measure value) are replaced by their
    pseudonyms; fields 0, 3 and 4 (line number, start date, stop date) are
    copied through. Any fields beyond the fifth are dropped.

    The file is read twice: once to collect the distinct identifiers and
    measure values, so that ``pseudonym_service.pseudonymize`` is called
    only once per column with de-duplicated input, and once to write the
    output rows.

    Args:
        input_csv_path: Path of the UTF-8 encoded input file.
        identifier_unit_id_type: Unit id type passed to the pseudonym
            service for the identifier column.
        measure_unit_id_type: Unit id type passed to the pseudonym
            service for the measure column.
        job_id: Job id forwarded to the pseudonym service.

    Returns:
        Path of the written file: the input path with a trailing '.csv'
        replaced by '_pseudonymized.csv' (or with '_pseudonymized.csv'
        appended when the input path does not end in '.csv').

    Raises:
        IndexError: If a line has too few ';'-separated fields.
    """
    csv_suffix = '.csv'
    output_suffix = '_pseudonymized.csv'

    unique_idents = set()
    unique_measure_values = set()
    with open(input_csv_path, newline='', encoding='utf-8') as csv_file:
        for line in csv_file:
            row = line.strip().split(';')
            unique_idents.add(row[1])
            unique_measure_values.add(row[2])

    # The results are indexed by original value below, so the service is
    # expected to return a mapping from original value to pseudonym.
    identifier_to_pseudonym = pseudonym_service.pseudonymize(
        list(unique_idents), identifier_unit_id_type, job_id
    )
    value_to_pseudonym = pseudonym_service.pseudonymize(
        list(unique_measure_values), measure_unit_id_type, job_id
    )

    # Only rewrite the trailing extension. A blanket str.replace would also
    # touch '.csv' inside directory names, and for a path without '.csv' it
    # would yield the input path itself, truncating the input on open('w').
    if input_csv_path.endswith(csv_suffix):
        output_csv_path = (
            input_csv_path[:-len(csv_suffix)] + output_suffix
        )
    else:
        output_csv_path = input_csv_path + output_suffix

    # Both files are managed by the same `with` so the output handle is
    # closed even if a lookup or write fails part-way through.
    with open(
        input_csv_path, newline='', encoding='utf-8'
    ) as csv_file, open(
        output_csv_path, 'w', newline='', encoding='utf-8'
    ) as target_file:
        for line in csv_file:
            row = line.strip().split(';')
            line_number: str = row[0]
            unit_id: str = row[1]
            value: str = row[2]
            start_date: str = row[3]
            stop_date: str = row[4]
            target_file.write(
                ';'.join([
                    line_number,
                    str(identifier_to_pseudonym[unit_id]),
                    str(value_to_pseudonym[value]),
                    start_date,
                    stop_date,
                ]) + '\n'
            )
    return output_csv_path
|