dataset.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. """
  2. DataSet: data structure for potentially mixed-type Attribute.
  3. """
  4. from pandas import DataFrame, Series
  5. from ds4ml.attribute import Attribute
  6. class DataSetPattern:
  7. """
  8. A helper class of ``DataSet`` to store its patterns.
  9. """
  10. # DataSet's pattern data has following members:
  11. _network = None
  12. _cond_prs = None
  13. _attrs = None
  14. _records = None
  15. # Options of DataSet constructor to preset some properties:
  16. _categories = [] # categorical columns setting from command lines
  17. _config = None # configurations for data-pattern command
  18. _pattern_generated = False
class DataSet(DataSetPattern, DataFrame):
    """
    A ``DataFrame`` subclass that carries extra pattern information: a
    bayesian network structure, conditional probabilities on that network,
    and per-column pattern data (see ``DataSetPattern``).
    """

    def __init__(self, *args, **kwargs):
        """
        An improved DataFrame with extra patterns information, e.g. its bayesian
        network structure, conditional probabilities on the network, and pattern
        information of all its columns.

        The ``DataSet`` class has two modes:

        - it has raw data, and then can calculate its pattern from the data;
        - it doesn't have raw data, and only have the pattern from customer.

        Parameters
        ----------
        categories : list of columns (optional)
            Column names whose values are categorical.
        pattern : dict (optional)
            Pre-computed pattern; used instead of raw data when it contains
            all of the keys 'network', 'prs', 'attrs' and 'records'.
        """
        categories = kwargs.pop("categories", [])
        self._categories = [] if categories is None else categories
        pattern = kwargs.pop('pattern', None)
        # Pop custom kwargs before DataFrame.__init__ sees them.
        super(DataSet, self).__init__(*args, **kwargs)
        self.separator = '_'  # joins column name and bin label in encode()
        if pattern is not None and all(k in pattern for k in
                                       ['network', 'prs', 'attrs', 'records']):
            # NOTE(review): _set_pattern also reads pattern['config'], which is
            # not in this key check — a pattern lacking it raises KeyError.
            # to_pattern() always writes 'config', so files it produced are OK.
            self._set_pattern(pattern)
        else:
            # Raw-data mode: the record count comes from the data itself.
            self._records = self.shape[0]

    @property
    def _constructor(self):
        # pandas uses this to build new objects from operations on this one,
        # so slices/copies stay DataSet instances.
        return DataSet

    # disable _constructor_sliced method for single column slicing. Try to
    # use __getitem__ method.
    # @property
    # def _constructor_sliced(self):
    #     return Attribute

    def __getitem__(self, key):
        """
        Column access that returns an ``Attribute`` (a Series subclass) with
        its pattern attached, instead of a plain Series.
        """
        result = super(DataSet, self).__getitem__(key)
        if isinstance(result, Series):
            # Re-class the Series in place rather than copying it into a new
            # Attribute object; then attach the column's pattern, if any.
            result.__class__ = Attribute
            if self._attrs is not None:
                result.set_pattern(self._attrs.get(key),
                                   categorical=key in self._categories)
            else:
                result.set_pattern(categorical=key in self._categories)
        return result

    @classmethod
    def from_pattern(cls, filename):
        """
        Alternate constructor to create a ``DataSet`` from a pattern file.

        Parameters
        ----------
        filename : str
            Path of a JSON pattern file (as written by ``to_pattern``).
        """
        import json
        with open(filename) as f:
            pattern = json.load(f)
        # set columns to DataSet, which will set column name to each Attribute.
        columns = pattern['attrs'].keys()
        dataset = DataSet(columns=columns, pattern=pattern)
        return dataset

    def _set_pattern(self, pattern=None):
        """ Set pattern data for the DataSet (only once per instance). """
        if not self._pattern_generated:
            self._network = pattern['network']
            self._cond_prs = pattern['prs']
            self._attrs = pattern['attrs']
            self._config = pattern['config']
            self._records = pattern['records']
            self._pattern_generated = True

    def mi(self):
        """ Return mutual information of pairwise attributes. """
        from ds4ml.metrics import pairwise_mutual_information
        return pairwise_mutual_information(self)

    def encode(self, data=None):
        """
        Transform data set to values by kinds of encoders.
        If data is set, use this data set's encoders to transform it;
        otherwise encode this DataSet's own columns.

        Returns
        -------
        DataFrame
            Categorical columns expand to one column per bin label, named
            '<col><separator><label>'; non-categorical, non-string columns
            keep their name; string non-categorical columns are skipped.
        """
        # If the data to encode is None, each attribute encodes itself.
        frame = DataFrame()
        for col in self.columns:
            attr = self[col]
            if data is not None and col not in data:
                continue
            # when attribute is string-typed but not categorical, ignore its
            # encode method.
            if attr.categorical:
                subs = attr.encode(None if data is None else data[col])
                for label in attr.bins:
                    frame[col + self.separator + str(label)] = subs[label]
            elif attr.type != 'string':
                frame[col] = attr.encode(None if data is None else data[col])
        return frame

    def _sampling_dataset(self, network, cond_prs, n):
        """
        Returns a sampling dataset (n rows) based on bayesian network and
        conditional probability. Values are bin indexes (ints), not raw data.
        """
        from numpy import random
        # The first network entry is (child, parents); its first parent is
        # the root attribute, sampled directly from its distribution.
        root_col = network[0][1][0]
        root_prs = cond_prs[root_col]
        columns = [root_col]  # columns from bayesian network
        for node, _ in network:
            columns.append(node)
        frame = DataFrame(columns=columns)  # encoded DataFrame
        frame[root_col] = random.choice(len(root_prs), size=n, p=root_prs)
        for child, parents in network:
            child_cond_prs = cond_prs[child]
            for indexes in child_cond_prs.keys():
                prs = child_cond_prs[indexes]
                # Keys are string-serialized parent-value tuples (JSON keys
                # must be strings); eval turns one back into a sequence.
                # NOTE(review): eval on pattern-file content — a hand-crafted
                # pattern file could execute arbitrary code; confirm patterns
                # are trusted input.
                indexes = list(eval(indexes))
                # Build a boolean mask selecting rows whose parents match
                # this conditional-probability entry.
                filters = ''
                for parent, value in zip(parents, indexes):
                    filters += f"(frame['{parent}']=={value})&"
                filters = eval(filters[:-1])  # drop trailing '&' then combine
                size = frame[filters].shape[0]
                if size:
                    frame.loc[filters, child] = random.choice(len(prs),
                                                              size=size,
                                                              p=prs)
            # Rows whose parent combination had no conditional entry fall
            # back to the child's marginal distribution.
            child_prs = self[child].prs
            frame.loc[frame[child].isnull(), child] = random.choice(
                len(child_prs), size=frame[child].isnull().sum(), p=child_prs)
        frame[frame.columns] = frame[frame.columns].astype(int)
        return frame

    def _construct_bayesian_network(self, epsilon=0.1, degree=2,
                                    pseudonyms=None, deletes=None,
                                    retains=None):
        """
        Construct bayesian network of the DataSet.

        Returns
        -------
        tuple
            (network, cond_prs) as produced by greedy_bayes and
            noisy_conditionals.

        Raises
        ------
        Exception
            If fewer than 2 attributes are eligible as network nodes.
        """
        deletes = deletes or []
        pseudonyms = pseudonyms or []
        retains = retains or []
        columns = [col for col in self.columns.values if col not in deletes]
        # nodes for bayesian networks, which does not include pseudonym columns
        # or non-categorical string columns.
        nodes = set()
        for col in columns:
            if col in pseudonyms or (
                    self[col].type == 'string' and not self[col].categorical):
                continue
            nodes.add(col)
        # main steps of private bayesian network for synthesis
        # encode dataset into bin indexes for bayesian network
        indexes = DataFrame()
        for col in nodes:
            indexes[col] = self[col].bin_indexes()
        if indexes.shape[1] < 2:
            raise Exception('If infer bayesian network, it requires at least 2 '
                            'attributes in dataset.')
        # Bayesian network is defined as a set of AP (attribute-parent) pairs.
        # e.g. [(x1, p1), (x2, p2), ...], and pi is the parents of xi.
        #
        # The algorithm follows the composability property of differential
        # privacy, so the privacy budget is split to two parts.
        from ds4ml.synthesizer import greedy_bayes, noisy_conditionals
        network = greedy_bayes(indexes, epsilon / 2, degree=degree,
                               retains=retains)
        cond_prs = noisy_conditionals(network, indexes, epsilon / 2)
        return network, cond_prs

    def to_pattern(self, path, epsilon=0.1, degree=2, pseudonyms=None,
                   deletes=None, retains=None) -> None:
        """
        Serialize this dataset's patterns into a json file.

        Parameters
        ----------
        path : str
            Output file path for the JSON pattern.
        """
        import json
        network, cond_prs = self._construct_bayesian_network(
            epsilon, degree=degree, pseudonyms=pseudonyms, deletes=deletes,
            retains=retains)
        pattern = dict({
            "attrs": {col: self[col].to_pattern() for col in self.columns
                      if col not in (deletes or [])},
            "config": {"pseudonyms": pseudonyms},
            "network": network,
            "prs": cond_prs,
            "records": self._records
        })
        with open(path, 'w') as fp:
            json.dump(pattern, fp, indent=2)

    def synthesize(self, epsilon=0.1, degree=2,
                   pseudonyms=None, deletes=None, retains=None, records=None):
        """
        Synthesize data set by a bayesian network to infer attributes'
        dependence relationship and differential privacy to keep differentially
        private.

        Parameters
        ----------
        records : int (optional)
            Number of rows to synthesize; defaults to this dataset's record
            count.
        """
        deletes = deletes or []
        # NOTE(review): assumes a non-None _config always has a 'pseudonyms'
        # key (to_pattern writes it) — otherwise this raises KeyError; verify
        # for patterns from other sources.
        pseudonyms = pseudonyms or (
            self._config is not None and self._config['pseudonyms']) or []
        retains = retains or []
        if self._network is None and self._cond_prs is None:
            # No precomputed pattern: learn the network from raw data.
            self._network, self._cond_prs = self._construct_bayesian_network(
                epsilon, degree=degree, pseudonyms=pseudonyms, deletes=deletes,
                retains=retains)
        columns = [col for col in self.columns.values if col not in deletes]
        records = records if records is not None else self._records
        sampling = self._sampling_dataset(self._network, self._cond_prs,
                                          records)
        frame = DataFrame(columns=columns)
        for col in self.columns:
            attr = self[col]
            if col in deletes:
                continue
            if col in pseudonyms:  # pseudonym column is not in bayesian network
                frame[col] = attr.pseudonymize(size=records)
                continue
            if col in retains:
                frame[col] = attr.retain(records)
                continue
            if col in sampling:
                # Decode sampled bin indexes back into attribute values.
                frame[col] = attr.choice(indexes=sampling[col])
                continue
            # Column was not a network node (e.g. non-categorical string):
            # sample it independently from its own pattern.
            if not attr.categorical:
                frame[col] = attr.random()
            else:
                frame[col] = attr.choice()
        return frame