def pseudonymize_1(self, df, schema):
    """Split ``df`` into a pseudonymized dataframe and a lookup dataframe.

    Both results start out as ``df`` itself and are then adjusted column by
    column according to ``schema``. Columns that ``schema`` does not mention
    are therefore present, unchanged, in both results.

    Args:
        df: The source dataframe. It is used through ``withColumn``,
            ``withColumnRenamed`` and ``drop`` (presumably a PySpark
            DataFrame - confirm against the callers).
        schema: Iterable of 3-item entries ``(col_name, dtype, op)``. Only
            ``col_name`` and ``op`` are used here. Recognized ``op`` values:

            * ``"hash-no-lookup"`` / ``"hnl"``: in the pseudonymized frame
              the column is replaced by the SHA-256 of its value
              concatenated with ``self.salt`` and renamed to
              ``<col_name>_pseudonym``; the column is dropped from the
              lookup frame (the lookup can be done against another table).
            * ``"hash"`` / ``"h"``: same change in the pseudonymized frame;
              the lookup frame keeps the original column and gains a
              ``<col_name>_pseudonym`` column holding the same hash.
            * ``"mask"`` / ``"m"``: the column is replaced by the literal
              ``'*'`` in the pseudonymized frame; the lookup frame keeps
              the original value.
            * ``"partition-by"``: the column is left unchanged in both
              frames so it can be used for partitioning.
            * ``"no-op"`` / ``"x"``: the column is left unchanged in the
              pseudonymized frame and dropped from the lookup frame.

            Any other ``op`` value leaves the column untouched in both
            frames.

    Returns:
        A tuple ``(df_pseudo, df_lookup)``; each frame is passed through
        ``self.fix_column_names`` before being returned.
    """

    def _salted_hash(column):
        # One place for the hashing recipe so the pseudonymized frame and the
        # lookup frame can never drift apart: SHA-256 of value + salt.
        return F.sha2(F.concat(F.col(column), F.lit(self.salt)), 256)

    df_pseudo = df_lookup = df

    for col_name, _dtype, op in schema:
        if op in ("hash-no-lookup", "hnl", "hash", "h"):
            pseudonym = col_name + "_pseudonym"
            # Hash in place first, then rename, so the column keeps its
            # position in the pseudonymized frame.
            df_pseudo = df_pseudo.withColumn(
                col_name, _salted_hash(col_name)
            ).withColumnRenamed(col_name, pseudonym)
            if op in ("hash", "h"):
                # Keep the original value next to its pseudonym for lookups.
                df_lookup = df_lookup.withColumn(pseudonym, _salted_hash(col_name))
            else:
                # The lookup can be performed against a different table, so
                # no lookup entry is needed for this column.
                df_lookup = df_lookup.drop(col_name)
        elif op in ("mask", "m"):
            df_pseudo = df_pseudo.withColumn(col_name, F.lit('*'))
        elif op in ("no-op", "x"):
            df_lookup = df_lookup.drop(col_name)
        # "partition-by" (and any unrecognized op): make no changes, so the
        # column stays in both dataframes and can be used for partitioning.

    df_pseudo = self.fix_column_names(df_pseudo)
    df_lookup = self.fix_column_names(df_lookup)

    return (df_pseudo, df_lookup)