dataFrameAnonymizer_7.py

import pandas as pd
from pandas import DataFrame


# Method of the anonymizer class; the surrounding class body (including
# the private __agg_column_str / __agg_column_list helpers it calls) is
# not part of this excerpt.
def build_anonymized_dataframe(self, df, partitions) -> DataFrame:
    # Pick an aggregator per feature column: a string summary when
    # format_to_str is set, otherwise a list of the partition's values.
    aggregations = {}
    sensitive_columns = self.sensitive_attribute_columns
    feature_columns = self.feature_columns
    sa_len = len(sensitive_columns)
    for column in feature_columns:
        if self.format_to_str:
            aggregations[column] = self.__agg_column_str
        else:
            aggregations[column] = self.__agg_column_list

    rows = []
    for partition in partitions:
        # Restrict the frame to this partition's rows and collapse every
        # feature column to one generalized value. The squeeze=False
        # keyword is forwarded by pandas.DataFrame.agg to the aggregator
        # callables.
        dfp = df.loc[partition]
        grouped_columns = dfp.agg(aggregations, squeeze=False)
        values = grouped_columns.to_dict()

        if self.avg_columns:
            # For average columns, publish the partition mean: either
            # overwriting the generalized column (AVG_OVERWRITE) or as an
            # extra "<column>_avg" column.
            for avg_col in self.avg_columns:
                col_name = avg_col + '_avg' if not self.AVG_OVERWRITE else avg_col
                if avg_col in feature_columns:
                    avg_val = dfp[avg_col].mean()
                    values.update({col_name: avg_val})

        # Emit one output row per distinct sensitive-value combination in
        # the partition, together with its frequency.
        grouped_sensitive_columns = dfp.groupby(sensitive_columns, as_index=False)
        for grouped_sensitive_value in grouped_sensitive_columns:
            for sensitive_column in sensitive_columns:
                if sa_len > 1:
                    # The group key is a tuple when grouping by several columns.
                    sensitive_value = grouped_sensitive_value[0][sensitive_columns.index(sensitive_column)]
                else:
                    sensitive_value = grouped_sensitive_value[0]
                count = len(grouped_sensitive_value[1])
                values.update(
                    {
                        sensitive_column: sensitive_value,
                        sensitive_column + "_count": count,
                    }
                )
            rows.append(values.copy())
    return pd.DataFrame(rows)
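
# ----------------------------------------------------------------------
# Usage sketch (illustrative, not from the original file). The class
# name DataFrameAnonymizer, its constructor, and the toy partitioning
# below are assumptions; `partitions` is assumed to be an iterable of
# row-index collections, as produced by a Mondrian-style partitioning
# step over the quasi-identifier (feature) columns.
#
#   df = pd.DataFrame({
#       "age": [25, 31, 38, 44, 52, 57],
#       "zip": [10001, 10002, 10001, 10003, 10002, 10003],
#       "disease": ["flu", "flu", "cold", "cold", "flu", "cold"],
#   })
#   anonymizer = DataFrameAnonymizer(...)  # hypothetical constructor
#   anonymizer.feature_columns = ["age", "zip"]
#   anonymizer.sensitive_attribute_columns = ["disease"]
#   partitions = [df.index[:3], df.index[3:]]  # toy two-way split
#   anon_df = anonymizer.build_anonymized_dataframe(df, partitions)
#   # -> one row per (partition, sensitive value), with generalized
#   #    feature columns and a per-value "<column>_count".
# ----------------------------------------------------------------------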