LiuFan
/
PrivacyScanData


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233
							def pseudonymize_1(self, df, schema):
							    """Split ``df`` into a pseudonymized dataframe and a lookup dataframe.

							    Both results start out as ``df`` and are then adjusted column by
							    column. ``schema`` is iterated as ``(col_name, dtype, op)`` triples;
							    ``dtype`` is not consulted here. The effect of each ``op`` is:

							    * ``"hash-no-lookup"`` / ``"hnl"``: in the pseudonymized frame the
							      column is replaced by the SHA-256 of its value concatenated with
							      ``self.salt`` and renamed to ``<col_name>_pseudonym``; the column
							      is dropped from the lookup frame.
							    * ``"hash"`` / ``"h"``: same treatment for the pseudonymized frame;
							      the lookup frame keeps the original column and gains a
							      ``<col_name>_pseudonym`` column holding the same hash.
							    * ``"mask"`` / ``"m"``: the column is overwritten with the literal
							      ``'*'`` in the pseudonymized frame; the lookup frame is untouched.
							    * ``"partition-by"``: the column is left as-is in both frames.
							    * ``"no-op"`` / ``"x"``: the column is dropped from the lookup frame
							      and left as-is in the pseudonymized frame.

							    Any other ``op`` value leaves both frames unchanged for that column.

							    Both frames are passed through ``self.fix_column_names`` before
							    being returned as the tuple ``(pseudonymized, lookup)``.
							    """

							    def salted_hash(name):
							        # SHA-256 over the column value with the instance salt appended.
							        return F.sha2(F.concat(F.col(name), F.lit(self.salt)), 256)

							    hash_without_lookup_ops = ("hash-no-lookup", "hnl")
							    hash_with_lookup_ops = ("hash", 'h')
							    mask_ops = ("mask", 'm')
							    drop_from_lookup_ops = ("no-op", 'x')

							    pseudonymized = df
							    lookup = df

							    for col_name, _dtype, op in schema:
							        if op in hash_without_lookup_ops or op in hash_with_lookup_ops:
							            pseudonym_name = col_name + "_pseudonym"
							            hashed_in_place = pseudonymized.withColumn(col_name, salted_hash(col_name))
							            pseudonymized = hashed_in_place.withColumnRenamed(col_name, pseudonym_name)
							            if op in hash_without_lookup_ops:
							                # The lookup can be resolved against a different table,
							                # so this column is not kept in the lookup frame.
							                lookup = lookup.drop(col_name)
							            else:
							                lookup = lookup.withColumn(pseudonym_name, salted_hash(col_name))
							        elif op in mask_ops:
							            pseudonymized = pseudonymized.withColumn(col_name, F.lit('*'))
							        elif op == "partition-by":
							            # Left in both frames so it can be used for partitioning.
							            continue
							        elif op in drop_from_lookup_ops:
							            lookup = lookup.drop(col_name)

							    pseudonymized = self.fix_column_names(pseudonymized)
							    lookup = self.fix_column_names(lookup)
							    return (pseudonymized, lookup)