attribute.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449
  1. """
Attribute: data structure for 1-dimensional cross-sectional data.

This class only handles integer, float, string, and datetime columns, and a
column can additionally be labeled as categorical.
  5. """
  6. from bisect import bisect_right
  7. from random import uniform
  8. from pandas import Series, DataFrame
  9. from dateutil.parser import parse
  10. from datetime import datetime, timedelta
  11. import numpy as np
  12. from ds4ml import utils
# Default environment variables for data processing and analysis
# Number of histogram bins; used as the default ``_bin_size`` of an attribute
# pattern and passed as ``bins`` to ``np.histogram``.
DEFAULT_BIN_SIZE = 20
  15. class AttributePattern:
  16. """
  17. A helper class of ``Attribute`` to store its patterns.
  18. """
  19. # _type: date type for handle different kinds of attributes in data
  20. # synthesis, only support: integer, float, string, datetime.
  21. _type = None
  22. categorical = False
  23. # min, max has been defined as member function of pandas.Series
  24. min_ = None
  25. max_ = None
  26. _decimals = None
  27. # probability distribution (pr)
  28. bins = None
  29. prs = None
  30. _counts = None
  31. _pattern_generated = False
  32. # Here _bin_size is int-typed (to show the size of histogram bins), which
  33. # is different from bins in np.histogram.
  34. _bin_size = DEFAULT_BIN_SIZE
  35. @property
  36. def type(self):
  37. return self._type
  38. class Attribute(AttributePattern, Series):
  39. _epoch = datetime(1970, 1, 1) # for datetime handling
  40. def __init__(self, *args, **kwargs):
  41. """
  42. An improved Series with extra pattern information, e.g. categorical,
  43. min/max value, and probability distribution.
  44. The ``Attribute`` class has two modes:
  45. - it has raw data, and then can calculate its pattern from the data;
  46. - it doesn't have raw data, and only have the pattern from customer.
  47. Parameters
  48. ----------
  49. categorical : bool
  50. set categorical label for attribute. If categorical, this attribute
  51. takes on a limited and fixed number of possible values. Examples:
  52. blood type, gender.
  53. """
  54. categorical = kwargs.pop('categorical', False)
  55. super().__init__(*args, **kwargs)
  56. self.set_pattern(categorical=categorical)
  57. def _calculate_pattern(self):
  58. from pandas.api.types import infer_dtype
  59. self._type = infer_dtype(self, skipna=True)
  60. if self._type == 'integer':
  61. pass
  62. elif self._type == 'floating' or self._type == 'mixed-integer-float':
  63. self._type = 'float'
  64. elif self._type in ['string', 'mixed-integer', 'mixed']:
  65. self._type = 'string'
  66. if all(map(utils.is_datetime, self._values)):
  67. self._type = 'datetime'
  68. # fill the missing values with the most frequent value
  69. if self.hasnans:
  70. self.fillna(self.mode()[0], inplace=True)
  71. # for datetime attribute is converted to seconds since Unix epoch time
  72. if self.type == 'datetime':
  73. self.update(self.map(self._to_seconds))
  74. if self.type == 'float':
  75. self._decimals = self.decimals()
  76. # The `categorical` option can be set to true when the attribute is
  77. # string-typed and all values are not unique, and its value can be
  78. # overrode by user.
  79. self.categorical = self.categorical or (
  80. self.type == 'string' and not self.is_unique)
  81. self._set_domain()
  82. self._set_distribution()
  83. # handling functions for datetime attribute
  84. def _to_seconds(self, timestr):
  85. return int((parse(timestr) - self._epoch).total_seconds())
  86. def _date_formatter(self, seconds):
  87. date = self._epoch + timedelta(seconds=seconds)
  88. return '%d/%d/%d' % (date.month, date.day, date.year)
  89. # Take pandas.Series as manipulation result.
    @property
    def _constructor(self):
        # Manipulation results are plain pandas.Series objects rather than
        # Attribute instances.
        return Series
    @property
    def _constructor_expanddim(self):
        # Class returned for results that gain a dimension.
        # NOTE(review): the import is local, presumably to avoid a circular
        # import between ds4ml.dataset and this module -- confirm.
        from ds4ml.dataset import DataSet
        return DataSet
  97. def set_pattern(self, pattern=None, **kwargs):
  98. """
  99. Set an attribute's pattern, including its type, min/max value, and
  100. probability distributions.
  101. If patter is None, then calculation its pattern from its data.
  102. """
  103. if not self._pattern_generated:
  104. self.categorical = kwargs.pop("categorical", False)
  105. if pattern is None:
  106. # to calculate the pattern use its data
  107. self._calculate_pattern()
  108. else:
  109. self._type = pattern['type']
  110. if self.type == 'float':
  111. self._decimals = pattern['decimals']
  112. self.categorical = pattern['categorical']
  113. self.min_ = pattern['min']
  114. self.max_ = pattern['max']
  115. self.bins = np.array(pattern['bins'])
  116. self.prs = np.array(pattern['prs'])
  117. self._pattern_generated = True
  118. @property
  119. def is_numerical(self):
  120. return self._type == 'integer' or self._type == 'float'
  121. @property
  122. def domain(self):
  123. """
  124. Return attribute's domain, which can be a list of values for categorical
  125. attribute, and an interval with min/max value for non-categorical
  126. attribute.
  127. """
  128. if self.categorical:
  129. return self.bins
  130. return [self.min_, self.max_]
  131. def _step(self):
  132. """ Return step for numerical or datetime attribute. """
  133. return (self.max_ - self.min_) / self._bin_size
  134. @domain.setter
  135. def domain(self, domain: list):
  136. """
  137. Set attribute's domain, includes min, max, frequency, or distribution.
  138. Generally, the domain of one attribute can be calculated automatically.
  139. This method can be manually called for specific purposes, e.g. compare
  140. two same attributes based on same domain.
  141. Parameters
  142. ----------
  143. domain : list
  144. domain of one attribute. For numerical or datetime attributes, it
  145. should be a list of two elements [min, max]; For categorical
  146. attributes, it should a list of potential values of this attribute.
  147. """
  148. # if a attribute is numerical and categorical and domain's length is
  149. # bigger than 2, take it as categorical. e.g. zip code.
  150. if self.type == 'datetime':
  151. domain = list(map(self._to_seconds, domain))
  152. if (self.is_numerical and self.categorical and len(domain) > 2) or (
  153. self.categorical):
  154. self.min_, self.max_ = min(domain), max(domain)
  155. self.bins = np.array(domain)
  156. elif self.is_numerical:
  157. self.min_, self.max_ = domain
  158. self.bins = np.array([self.min_, self.max_])
  159. elif self._type == 'string':
  160. lengths = [len(str(i)) for i in domain]
  161. self.min_, self.max_ = min(lengths), max(lengths)
  162. self.bins = np.array(domain)
  163. self._set_distribution()
  164. def _set_domain(self):
  165. """
  166. Compute domain (min, max, distribution bins) from input data
  167. """
  168. if self.categorical:
  169. self.bins = self.unique()
  170. if self._type == 'string':
  171. items = self.astype(str).map(len)
  172. self.min_ = int(items.min())
  173. self.max_ = int(items.max())
  174. if not self.categorical:
  175. self.bins = np.array([self.min_, self.max_])
  176. elif self._type == 'datetime':
  177. if not self.categorical:
  178. self.min_ = float(self.min())
  179. self.max_ = float(self.max())
  180. self.bins = np.array([self.min_, self.max_])
  181. else:
  182. self.min_ = float(self.min())
  183. self.max_ = float(self.max())
  184. if not self.categorical:
  185. self.bins = np.array([self.min_, self.max_])
  186. def _set_distribution(self):
  187. if self.categorical:
  188. counts = self.value_counts()
  189. for value in set(self.bins) - set(counts.index):
  190. counts[value] = 0
  191. counts.sort_index(inplace=True)
  192. if self.type == 'datetime':
  193. counts.index = list(map(self._date_formatter, counts.index))
  194. self._counts = counts.values
  195. self.prs = utils.normalize_distribution(counts)
  196. self.bins = np.array(counts.index)
  197. else:
  198. # Note: hist, edges = numpy.histogram(), all but the last bin
  199. # is half-open. If bins is 20, then len(hist)=20, len(edges)=21
  200. if self.type == 'string':
  201. hist, edges = np.histogram(self.astype(str).map(len),
  202. bins=self._bin_size)
  203. else:
  204. hist, edges = np.histogram(self, bins=self._bin_size,
  205. range=(self.min_, self.max_))
  206. self.bins = edges[:-1] # Remove the last bin edge
  207. self._counts = hist
  208. self.prs = utils.normalize_distribution(hist)
  209. if self.type == 'integer':
  210. self.min_ = int(self.min_)
  211. self.max_ = int(self.max_)
  212. def counts(self, bins=None, normalize=True):
  213. """
  214. Return an array of counts (or normalized density) of unique values.
  215. This function works with `attribute.bins`. Combination of both are
  216. like `Series.value_counts`. The parameter `bins` can be none, or a list.
  217. """
  218. if bins is None:
  219. return self._counts
  220. if self.categorical:
  221. if self.type == 'datetime':
  222. bins = list(map(self._to_seconds, bins))
  223. counts = self.value_counts()
  224. for value in set(bins) - set(counts.index):
  225. counts[value] = 0
  226. if normalize:
  227. return np.array([round(counts.get(b)/sum(counts) * 100, 2)
  228. for b in bins])
  229. return np.array([counts.get(b) for b in bins])
  230. if len(bins) == 1:
  231. return np.array([self.size])
  232. hist, _ = np.histogram(self, bins=bins)
  233. if normalize:
  234. return (hist / hist.sum() * 100).round(2)
  235. return hist
  236. def bin_indexes(self):
  237. """
  238. Encode values into bin indexes for Bayesian Network.
  239. """
  240. if self.categorical:
  241. mapping = {value: idx for idx, value in enumerate(self.bins)}
  242. indexes = self.map(lambda x: mapping[x], na_action='ignore')
  243. else:
  244. indexes = self.map(lambda x: bisect_right(self.bins, x) - 1,
  245. na_action='ignore')
  246. indexes.fillna(len(self.bins), inplace=True)
  247. return indexes.astype(int, copy=False)
  248. def to_pattern(self):
  249. """
  250. Return attribution's metadata information in JSON format or Python
  251. dictionary. Usually used in debug and testing.
  252. """
  253. return {
  254. 'name': self.name,
  255. 'type': self._type,
  256. 'categorical': self.categorical,
  257. 'min': self.min_,
  258. 'max': self.max_,
  259. 'decimals': self._decimals if self.type == 'float' else None,
  260. 'bins': self.bins.tolist(),
  261. 'prs': self.prs.tolist()
  262. }
  263. def decimals(self):
  264. """
  265. Returns number of decimals places for floating attribute. Used for
  266. generated dataset to keep consistent decimal places for float attribute.
  267. """
  268. def decimals_of(value: float):
  269. value = str(value)
  270. return len(value) - value.rindex('.') - 1
  271. counts = self.map(decimals_of).value_counts()
  272. slot = 0
  273. for i in range(len(counts)):
  274. if sum(counts.head(i + 1)) / sum(counts) > 0.8:
  275. slot = i + 1
  276. break
  277. return max(counts.index[:slot])
  278. def pseudonymize(self, size=None):
  279. """
  280. Return pseudonymized values for this attribute, which is used to
  281. substitute identifiable data with a reversible, consistent value.
  282. """
  283. size = size or self.size
  284. if size != self.size:
  285. attr = Series(np.random.choice(self.bins, size=size, p=self.prs))
  286. else:
  287. attr = self
  288. if self.categorical:
  289. mapping = {b: utils.pseudonymise_string(b) for b in self.bins}
  290. return attr.map(lambda x: mapping[x])
  291. if self.type == 'string':
  292. return attr.map(utils.pseudonymise_string)
  293. elif self.is_numerical or self.type == 'datetime':
  294. return attr.map(str).map(utils.pseudonymise_string)
  295. def random(self, size=None):
  296. """
  297. Return an random array with same length (usually used for
  298. non-categorical attribute).
  299. """
  300. size = size or self.size
  301. if self.min_ == self.max_:
  302. rands = np.ones(size) * self.min_
  303. else:
  304. rands = np.arange(self.min_, self.max_, (self.max_-self.min_)/size)
  305. np.random.shuffle(rands)
  306. if self.type == 'string':
  307. if self.min_ == self.max_:
  308. length = self.min_
  309. else:
  310. length = np.random.randint(self.min_, self.max_)
  311. vectorized = np.vectorize(lambda x: utils.randomize_string(length))
  312. rands = vectorized(rands)
  313. elif self.type == 'integer':
  314. rands = list(map(int, rands))
  315. elif self.type == 'datetime':
  316. rands = list(map(self._date_formatter, rands))
  317. return Series(rands)
  318. def retain(self, size=None):
  319. """ Return retained attribute with the size """
  320. size = size or self.size
  321. if size < self.size:
  322. return self.head(size)
  323. if size == self.size:
  324. return self
  325. copies = size // self.size
  326. remainder = size - (copies * self.size)
  327. return Series(self.tolist() * copies + self.head(remainder).tolist())
  328. def _random_sample_at(self, index: int):
  329. """ Sample a value from distribution bins at position 'index'"""
  330. if self.categorical:
  331. return self.bins[index]
  332. length = len(self.bins)
  333. if index < length - 1:
  334. return uniform(self.bins[index], self.bins[index + 1])
  335. return uniform(self.bins[-1], self.max_)
  336. def choice(self, size=None, indexes=None):
  337. """
  338. Return a random sample based on this attribute's probability and
  339. distribution bins (default value is base random distribution bins based
  340. on its probability).
  341. Parameters
  342. ----------
  343. size : int
  344. size of random sample
  345. indexes : array-like
  346. array of indexes in distribution bins
  347. """
  348. if indexes is None:
  349. size = size or self.size
  350. indexes = Series(np.random.choice(len(self.prs),
  351. size=size, p=self.prs))
  352. column = indexes.map(self._random_sample_at)
  353. if self.type == 'datetime':
  354. if not self.categorical:
  355. column = column.map(self._date_formatter)
  356. elif self.type == 'float':
  357. column = column.round(self._decimals)
  358. elif self.type == 'integer':
  359. column = column.round().astype(int)
  360. elif self.type == 'string':
  361. if not self.categorical:
  362. column = column.map(lambda x: utils.randomize_string(int(x)))
  363. return column
  364. def encode(self, data=None):
  365. """
  366. Encode labels to normalized encoding.
  367. Parameters
  368. ----------
  369. data : array-like
  370. target values
  371. """
  372. if data is None:
  373. data = self.copy()
  374. else:
  375. if self.type == 'datetime':
  376. if all(map(utils.is_datetime, data)):
  377. data = data.map(self._to_seconds)
  378. else:
  379. data = data.map(int)
  380. if self.categorical:
  381. frame = DataFrame()
  382. for col in self.bins:
  383. frame[col] = data.apply(lambda v: 1 if v == col else 0)
  384. return frame
  385. if self.type != 'string':
  386. step = self._step()
  387. return data.apply(lambda v: # 1e-8 is a small delta
  388. int((v - self.min_) / (step + 1e-8))
  389. / self._bin_size)
  390. raise ValueError('Can\'t encode Non-categorical attribute.')