123456789101112131415161718192021222324252627282930313233 |
def pseudonymize_1(self, df, schema):
    """Build a pseudonymized dataframe and a matching lookup dataframe.

    Both results start out as ``df`` itself and are then adjusted column by
    column according to ``schema``.

    Args:
        df: The source dataframe. It must support ``withColumn``,
            ``withColumnRenamed`` and ``drop``.
        schema: Iterable whose entries each unpack into three items:
            ``(column name, dtype, operation)``. The dtype is not used here.
            Operations handled:

            * ``"hash-no-lookup"`` / ``"hnl"``: the pseudonymized dataframe
              gets the salted SHA-256 of the column, renamed to
              ``<column>_pseudonym``; the column is dropped from the lookup
              dataframe (the lookup is expected to live in another table).
            * ``"hash"`` / ``"h"``: same treatment for the pseudonymized
              dataframe; the lookup dataframe keeps the original column and
              gains ``<column>_pseudonym`` next to it.
            * ``"mask"`` / ``"m"``: the column is replaced by the literal
              ``'*'`` in the pseudonymized dataframe; the lookup dataframe
              keeps the original value.
            * ``"no-op"`` / ``"x"``: the column is dropped from the lookup
              dataframe and left as-is in the pseudonymized dataframe.
            * ``"partition-by"`` and any operation not listed above: the
              column is left unchanged in both dataframes.

    Returns:
        A ``(pseudonymized, lookup)`` tuple, each dataframe having been
        passed through ``self.fix_column_names``.
    """

    def salted_sha256(name):
        # SHA-256 of the column value concatenated with the instance salt.
        # Built on demand so self.salt is only read when a hash is needed.
        salted_value = F.concat(F.col(name), F.lit(self.salt))
        return F.sha2(salted_value, 256)

    def hash_in_place(frame, name):
        # Overwrite the column with its hash first, then rename it, so the
        # pseudonym keeps the original column's position.
        hashed = frame.withColumn(name, salted_sha256(name))
        return hashed.withColumnRenamed(name, name + "_pseudonym")

    pseudo_frame = df
    lookup_frame = df

    for column, _dtype, operation in schema:
        if operation in ("hash-no-lookup", "hnl"):
            pseudo_frame = hash_in_place(pseudo_frame, column)
            lookup_frame = lookup_frame.drop(column)
        elif operation in ("hash", "h"):
            pseudo_frame = hash_in_place(pseudo_frame, column)
            lookup_frame = lookup_frame.withColumn(
                column + "_pseudonym", salted_sha256(column)
            )
        elif operation in ("mask", "m"):
            pseudo_frame = pseudo_frame.withColumn(column, F.lit("*"))
        elif operation in ("no-op", "x"):
            lookup_frame = lookup_frame.drop(column)
        # "partition-by" (and anything unrecognized) falls through: the
        # column stays in both dataframes so it can be used for partitioning.

    pseudo_frame = self.fix_column_names(pseudo_frame)
    lookup_frame = self.fix_column_names(lookup_frame)
    return (pseudo_frame, lookup_frame)
|