123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258 |
- """
- DataSet: data structure for potentially mixed-type Attribute.
- """
- from pandas import DataFrame, Series
- from ds4ml.attribute import Attribute
class DataSetPattern:
    """
    A helper class of ``DataSet`` to store its patterns.
    """
    # DataSet's pattern data has the following members:
    _network = None    # bayesian network: iterated elsewhere as (child, parents) pairs
    _cond_prs = None   # conditional probabilities on the network (keyed by column)
    _attrs = None      # per-column pattern information (dict keyed by column name)
    _records = None    # number of records in the data set

    # Options of DataSet constructor to preset some properties:
    # NOTE(review): mutable class-level default — it is shared by all
    # instances until ``DataSet.__init__`` reassigns ``self._categories``;
    # avoid mutating it in place.
    _categories = []   # categorical columns setting from command lines
    _config = None     # configurations for data-pattern command
    _pattern_generated = False  # guard so pattern data is only set once
class DataSet(DataSetPattern, DataFrame):

    def __init__(self, *args, **kwargs):
        """
        An improved DataFrame carrying extra pattern information: a bayesian
        network structure, the conditional probabilities on that network, and
        per-column pattern data.

        A ``DataSet`` works in one of two modes:

        - it holds raw data, from which its pattern can be calculated;
        - it holds no raw data, only a pattern supplied by the caller.

        Parameters
        ----------
        categories : list of columns (optional)
            Column names whose values are categorical.
        pattern : dict (optional)
            Pre-computed pattern; used when it provides at least the
            'network', 'prs', 'attrs' and 'records' entries.
        """
        categories = kwargs.pop("categories", [])
        self._categories = categories if categories is not None else []
        pattern = kwargs.pop('pattern', None)
        super().__init__(*args, **kwargs)
        self.separator = '_'
        required = ('network', 'prs', 'attrs', 'records')
        if pattern is not None and all(key in pattern for key in required):
            self._set_pattern(pattern)
        else:
            # no (complete) pattern given: record the raw data's row count
            self._records = self.shape[0]
    @property
    def _constructor(self):
        # pandas hook: operations that return a DataFrame (slicing, copying,
        # arithmetic, ...) produce a DataSet again instead of a plain DataFrame.
        return DataSet

    # ``_constructor_sliced`` (the pandas hook for single-column slicing) is
    # deliberately left disabled; ``__getitem__`` below converts a sliced
    # Series into an ``Attribute`` and attaches its pattern instead.
    # @property
    # def _constructor_sliced(self):
    #     return Attribute
- def __getitem__(self, key):
- result = super(DataSet, self).__getitem__(key)
- if isinstance(result, Series):
- result.__class__ = Attribute
- if self._attrs is not None:
- result.set_pattern(self._attrs.get(key),
- categorical=key in self._categories)
- else:
- result.set_pattern(categorical=key in self._categories)
- return result
- @classmethod
- def from_pattern(cls, filename):
- """
- Alternate constructor to create a ``DataSet`` from a pattern file.
- """
- import json
- with open(filename) as f:
- pattern = json.load(f)
- # set columns to DataSet, which will set column name to each Attribute.
- columns = pattern['attrs'].keys()
- dataset = DataSet(columns=columns, pattern=pattern)
- return dataset
- def _set_pattern(self, pattern=None):
- """ Set pattern data for the DataSet. """
- if not self._pattern_generated:
- self._network = pattern['network']
- self._cond_prs = pattern['prs']
- self._attrs = pattern['attrs']
- self._config = pattern['config']
- self._records = pattern['records']
- self._pattern_generated = True
- def mi(self):
- """ Return mutual information of pairwise attributes. """
- from ds4ml.metrics import pairwise_mutual_information
- return pairwise_mutual_information(self)
- def encode(self, data=None):
- """
- Transform data set to values by kinds of encoders.
- If data is set, use this data set's encoders to transform.
- """
- # If the data to encode is None, then transform source data _data;
- frame = DataFrame()
- for col in self.columns:
- attr = self[col]
- if data is not None and col not in data:
- continue
- # when attribute is string-typed but not categorical, ignore its
- # encode method.
- if attr.categorical:
- subs = attr.encode(None if data is None else data[col])
- for label in attr.bins:
- frame[col + self.separator + str(label)] = subs[label]
- elif attr.type != 'string':
- frame[col] = attr.encode(None if data is None else data[col])
- return frame
- def _sampling_dataset(self, network, cond_prs, n):
- """
- Returns a sampling dataset (n rows) based on bayesian network and
- conditional probability.
- """
- from numpy import random
- root_col = network[0][1][0]
- root_prs = cond_prs[root_col]
- columns = [root_col] # columns from bayesian network
- for node, _ in network:
- columns.append(node)
- frame = DataFrame(columns=columns) # encoded DataFrame
- frame[root_col] = random.choice(len(root_prs), size=n, p=root_prs)
- for child, parents in network:
- child_cond_prs = cond_prs[child]
- for indexes in child_cond_prs.keys():
- prs = child_cond_prs[indexes]
- indexes = list(eval(indexes))
- filters = ''
- for parent, value in zip(parents, indexes):
- filters += f"(frame['{parent}']=={value})&"
- filters = eval(filters[:-1])
- size = frame[filters].shape[0]
- if size:
- frame.loc[filters, child] = random.choice(len(prs),
- size=size,
- p=prs)
- child_prs = self[child].prs
- frame.loc[frame[child].isnull(), child] = random.choice(
- len(child_prs), size=frame[child].isnull().sum(), p=child_prs)
- frame[frame.columns] = frame[frame.columns].astype(int)
- return frame
- def _construct_bayesian_network(self, epsilon=0.1, degree=2,
- pseudonyms=None, deletes=None, retains=None):
- """
- Construct bayesian network of the DataSet.
- """
- deletes = deletes or []
- pseudonyms = pseudonyms or []
- retains = retains or []
- columns = [col for col in self.columns.values if col not in deletes]
- # nodes for bayesian networks, which does not include pseudonym columns
- # or non-categorical string columns.
- nodes = set()
- for col in columns:
- if col in pseudonyms or (
- self[col].type == 'string' and not self[col].categorical):
- continue
- nodes.add(col)
- # main steps of private bayesian network for synthesis
- # encode dataset into bin indexes for bayesian network
- indexes = DataFrame()
- for col in nodes:
- indexes[col] = self[col].bin_indexes()
- if indexes.shape[1] < 2:
- raise Exception('If infer bayesian network, it requires at least 2 '
- 'attributes in dataset.')
- # Bayesian network is defined as a set of AP (attribute-parent) pairs.
- # e.g. [(x1, p1), (x2, p2), ...], and pi is the parents of xi.
- #
- # The algorithm follows the composability property of differential
- # privacy, so the privacy budget is split to two parts.
- from ds4ml.synthesizer import greedy_bayes, noisy_conditionals
- network = greedy_bayes(indexes, epsilon / 2, degree=degree,
- retains=retains)
- cond_prs = noisy_conditionals(network, indexes, epsilon / 2)
- return network, cond_prs
- def to_pattern(self, path, epsilon=0.1, degree=2, pseudonyms=None,
- deletes=None, retains=None) -> None:
- """
- Serialize this dataset's patterns into a json file.
- """
- import json
- network, cond_prs = self._construct_bayesian_network(
- epsilon, degree=degree, pseudonyms=pseudonyms, deletes=deletes,
- retains=retains)
- pattern = dict({
- "attrs": {col: self[col].to_pattern() for col in self.columns
- if col not in (deletes or [])},
- "config": {"pseudonyms": pseudonyms},
- "network": network,
- "prs": cond_prs,
- "records": self._records
- })
- with open(path, 'w') as fp:
- json.dump(pattern, fp, indent=2)
- def synthesize(self, epsilon=0.1, degree=2,
- pseudonyms=None, deletes=None, retains=None, records=None):
- """
- Synthesize data set by a bayesian network to infer attributes'
- dependence relationship and differential privacy to keep differentially
- private.
- """
- deletes = deletes or []
- pseudonyms = pseudonyms or (
- self._config is not None and self._config['pseudonyms']) or []
- retains = retains or []
- if self._network is None and self._cond_prs is None:
- self._network, self._cond_prs = self._construct_bayesian_network(
- epsilon, degree=degree, pseudonyms=pseudonyms, deletes=deletes,
- retains=retains)
- columns = [col for col in self.columns.values if col not in deletes]
- records = records if records is not None else self._records
- sampling = self._sampling_dataset(self._network, self._cond_prs, records)
- frame = DataFrame(columns=columns)
- for col in self.columns:
- attr = self[col]
- if col in deletes:
- continue
- if col in pseudonyms: # pseudonym column is not in bayesian network
- frame[col] = attr.pseudonymize(size=records)
- continue
- if col in retains:
- frame[col] = attr.retain(records)
- continue
- if col in sampling:
- frame[col] = attr.choice(indexes=sampling[col])
- continue
- if not attr.categorical:
- frame[col] = attr.random()
- else:
- frame[col] = attr.choice()
- return frame
|