def build_anonymized_dataframe(self, df, partitions) -> DataFrame:
    """Build the anonymized table from a partitioning of ``df``.

    For every partition the feature columns are collapsed into a single
    aggregated value, and one output row is emitted per distinct
    combination of sensitive-attribute values found in that partition,
    together with the number of partition rows carrying that combination.

    Args:
        df: Source frame. Each entry of ``partitions`` is used as a row
            selector via ``df.loc[...]``.
        partitions: Iterable of row selectors, one per partition.

    Returns:
        A new ``DataFrame`` with, per row: the aggregated feature columns,
        the optional average columns, and for every sensitive column both
        its value and a ``<column>_count`` entry.
    """
    sensitive_columns = self.sensitive_attribute_columns
    feature_columns = self.feature_columns
    sa_len = len(sensitive_columns)

    # The aggregator depends only on the output format, so pick it once.
    if self.format_to_str:
        aggregator = self.__agg_column_str
    else:
        aggregator = self.__agg_column_list
    aggregations = {column: aggregator for column in feature_columns}

    rows = []
    for partition in partitions:
        dfp = df.loc[partition]
        # NOTE(review): pandas hands extra keyword arguments of ``agg`` on to
        # the aggregation callables; presumably the helpers accept
        # ``squeeze`` -- confirm against their definitions.
        grouped_columns = dfp.agg(aggregations, squeeze=False)
        values = grouped_columns.to_dict()

        if self.avg_columns:
            # Store the partition mean either next to the aggregated value
            # ("<col>_avg") or in place of it when AVG_OVERWRITE is set.
            for avg_col in self.avg_columns:
                col_name = avg_col + '_avg' if not self.AVG_OVERWRITE else avg_col
                if avg_col in feature_columns:
                    values[col_name] = dfp[avg_col].mean()

        grouped_sensitive_columns = dfp.groupby(sensitive_columns, as_index=False)
        for group_key, group_rows in grouped_sensitive_columns:
            # Older pandas yields a bare scalar key when grouping by a list
            # with a single column, pandas >= 2.0 yields a 1-tuple. Normalise
            # to "one key element per sensitive column" so that the output
            # always holds the plain value rather than a wrapped tuple.
            if not isinstance(group_key, tuple) or len(group_key) != sa_len:
                group_key = (group_key,)

            count = len(group_rows)
            row = dict(values)
            for sensitive_column, sensitive_value in zip(sensitive_columns, group_key):
                row[sensitive_column] = sensitive_value
                row[sensitive_column + "_count"] = count
            rows.append(row)

    return pd.DataFrame(rows)