import pandas as pd
from pandas import DataFrame


def build_anonymized_dataframe(self, df, partitions) -> DataFrame:
    aggregations = {}
    sensitive_columns = self.sensitive_attribute_columns
    feature_columns = self.feature_columns

    # Pick the generalization used for each quasi-identifier column.
    for column in feature_columns:
        if self.format_to_str:
            aggregations[column] = self.__agg_column_str
        else:
            aggregations[column] = self.__agg_column_list

    rows = []
    for partition in partitions:
        dfp = df.loc[partition]
        # Generalize the partition: one aggregated value per feature column.
        grouped_columns = dfp.agg(aggregations)
        values = grouped_columns.to_dict()

        if self.avg_columns:
            # For the configured columns, report the partition mean instead
            # of (or in addition to) the generalized interval.
            for avg_col in self.avg_columns:
                if avg_col in feature_columns:
                    col_name = avg_col if self.AVG_OVERWRITE else avg_col + "_avg"
                    values[col_name] = dfp[avg_col].mean()

        # Emit one row per distinct combination of sensitive values, with
        # the count of records carrying that combination in the partition.
        for group_key, group_df in dfp.groupby(sensitive_columns, as_index=False):
            # Grouping by a list yields tuple keys (a 1-tuple for a single
            # column in recent pandas); normalize so both cases look alike.
            if not isinstance(group_key, tuple):
                group_key = (group_key,)
            count = len(group_df)
            for sensitive_column, sensitive_value in zip(sensitive_columns, group_key):
                values.update(
                    {
                        sensitive_column: sensitive_value,
                        sensitive_column + "_count": count,
                    }
                )
            rows.append(values.copy())

    return pd.DataFrame(rows)
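The private helpers __agg_column_str and __agg_column_list, like the format_to_str, avg_columns, and AVG_OVERWRITE attributes, are referenced above but defined elsewhere in the class. As a rough sketch of the Mondrian-style generalization such aggregators usually perform, assuming numeric columns collapse to a min-max span and other columns to their distinct-value set (the names and behavior below are illustrative assumptions, not the class's actual implementation):

import pandas as pd

# Hypothetical stand-ins for the class's __agg_column_str/__agg_column_list
# helpers; the real methods may generalize differently.
def agg_column_str(series: pd.Series) -> str:
    # 'min-max' span for numeric data, a comma-joined value set otherwise.
    if pd.api.types.is_numeric_dtype(series):
        return f"{series.min()}-{series.max()}"
    return ",".join(sorted(map(str, series.unique())))

def agg_column_list(series: pd.Series) -> list:
    # Sorted list of the distinct values seen in the partition.
    return sorted(map(str, series.unique()))

# Toy partition of three records:
df = pd.DataFrame({
    "age": [31, 39, 34],
    "zipcode": ["12345", "12377", "12345"],
    "disease": ["flu", "flu", "cold"],
})
print(df.loc[[0, 1, 2]].agg({"age": agg_column_str, "zipcode": agg_column_str}))
# age            31-39
# zipcode  12345,12377

With aggregators like these, each partition reduces to a single generalized record, which build_anonymized_dataframe then fans out into one row per sensitive-value combination together with its count.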