main_8.py 2.0 KB

123456789101112131415161718192021222324252627282930313233
  def pseudonymize_1(self, df, schema):
      """Build a pseudonymized dataframe and a companion lookup dataframe from ``df``.

      Both results start out as ``df``. ``schema`` is iterated as
      ``(col_name, dtype, op)`` triples (the original signature comment hints
      at ``list[list[str]]``); ``dtype`` is not used here. Each ``op`` is
      applied to the named column as follows:

      * ``"hash"`` / ``"h"``: in the pseudonymized dataframe the column is
        replaced by the SHA-256 of its value concatenated with ``self.salt``
        and renamed to ``<col_name>_pseudonym``. The lookup dataframe keeps
        the original column and gains ``<col_name>_pseudonym`` with the same
        hash expression.
      * ``"hash-no-lookup"`` / ``"hnl"``: the pseudonymized dataframe is
        treated as for ``"hash"``; the column is dropped from the lookup
        dataframe.
      * ``"mask"`` / ``"m"``: the column is replaced by the literal ``'*'``
        in the pseudonymized dataframe; the lookup dataframe is unchanged.
      * ``"partition-by"``: the column is left untouched in both dataframes.
      * ``"no-op"`` / ``"x"``: the column is dropped from the lookup
        dataframe; the pseudonymized dataframe is unchanged.
      * any other value: the column is left untouched in both dataframes and
        no error is raised.

      Example: for an entity called person, the first result holds hashed ids
      and masked fields, and the second holds the original ids next to their
      pseudonyms plus the unmasked values of the masked columns.

      Returns:
          A tuple ``(df_pseudo, df_lookup)``, each passed through
          ``self.fix_column_names`` before being returned.
      """
      # NOTE(review): F is presumably pyspark.sql.functions - confirm against the file's imports.

      def salted_digest(name):
          # Hash of the column value with the instance salt appended.
          return F.sha2(F.concat(F.col(name), F.lit(self.salt)), 256)

      pseudo = df
      lookup = df

      for column, _dtype, action in schema:
          if action in ("hash-no-lookup", "hnl"):
              # The lookup can be performed against a different table,
              # so this column is not needed in the lookup dataframe.
              hashed = pseudo.withColumn(column, salted_digest(column))
              pseudo = hashed.withColumnRenamed(column, column + "_pseudonym")
              lookup = lookup.drop(column)
          elif action in ("hash", "h"):
              hashed = pseudo.withColumn(column, salted_digest(column))
              pseudo = hashed.withColumnRenamed(column, column + "_pseudonym")
              lookup = lookup.withColumn(column + "_pseudonym", salted_digest(column))
          elif action in ("mask", "m"):
              pseudo = pseudo.withColumn(column, F.lit('*'))
          elif action == "partition-by":
              # Keep the column as-is in both dataframes so it can be used for partitioning.
              continue
          elif action in ("no-op", "x"):
              lookup = lookup.drop(column)

      return self.fix_column_names(pseudo), self.fix_column_names(lookup)