  1. """
  2. data hash pandas / numpy objects
  3. """
  4. import itertools
  5. from typing import Optional
  6. import numpy as np
  7. import pandas._libs.hashing as hashing
  8. from pandas.core.dtypes.common import (
  9. is_categorical_dtype,
  10. is_extension_array_dtype,
  11. is_list_like,
  12. )
  13. from pandas.core.dtypes.generic import (
  14. ABCDataFrame,
  15. ABCIndexClass,
  16. ABCMultiIndex,
  17. ABCSeries,
  18. )
  19. # 16 byte long hashing key
  20. _default_hash_key = "0123456789123456"
  21. def _combine_hash_arrays(arrays, num_items: int):
  22. """
  23. Parameters
  24. ----------
  25. arrays : generator
  26. num_items : int
  27. Should be the same as CPython's tupleobject.c
  28. """
  29. try:
  30. first = next(arrays)
  31. except StopIteration:
  32. return np.array([], dtype=np.uint64)
  33. arrays = itertools.chain([first], arrays)
  34. mult = np.uint64(1000003)
  35. out = np.zeros_like(first) + np.uint64(0x345678)
  36. for i, a in enumerate(arrays):
  37. inverse_i = num_items - i
  38. out ^= a
  39. out *= mult
  40. mult += np.uint64(82520 + inverse_i + inverse_i)
  41. assert i + 1 == num_items, "Fed in wrong num_items"
  42. out += np.uint64(97531)
  43. return out
  44. def hash_pandas_object(
  45. obj,
  46. index: bool = True,
  47. encoding: str = "utf8",
  48. hash_key: Optional[str] = _default_hash_key,
  49. categorize: bool = True,
  50. ):
  51. """
  52. Return a data hash of the Index/Series/DataFrame.
  53. Parameters
  54. ----------
  55. index : bool, default True
  56. Include the index in the hash (if Series/DataFrame).
  57. encoding : str, default 'utf8'
  58. Encoding for data & key when strings.
  59. hash_key : str, default _default_hash_key
  60. Hash_key for string key to encode.
  61. categorize : bool, default True
  62. Whether to first categorize object arrays before hashing. This is more
  63. efficient when the array contains duplicate values.
  64. Returns
  65. -------
  66. Series of uint64, same length as the object
  67. """
  68. from pandas import Series
  69. if hash_key is None:
  70. hash_key = _default_hash_key
  71. if isinstance(obj, ABCMultiIndex):
  72. return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)
  73. elif isinstance(obj, ABCIndexClass):
  74. h = hash_array(obj._values, encoding, hash_key, categorize).astype(
  75. "uint64", copy=False
  76. )
  77. h = Series(h, index=obj, dtype="uint64", copy=False)
  78. elif isinstance(obj, ABCSeries):
  79. h = hash_array(obj._values, encoding, hash_key, categorize).astype(
  80. "uint64", copy=False
  81. )
  82. if index:
  83. index_iter = (
  84. hash_pandas_object(
  85. obj.index,
  86. index=False,
  87. encoding=encoding,
  88. hash_key=hash_key,
  89. categorize=categorize,
  90. )._values
  91. for _ in [None]
  92. )
  93. arrays = itertools.chain([h], index_iter)
  94. h = _combine_hash_arrays(arrays, 2)
  95. h = Series(h, index=obj.index, dtype="uint64", copy=False)
  96. elif isinstance(obj, ABCDataFrame):
  97. hashes = (hash_array(series._values) for _, series in obj.items())
  98. num_items = len(obj.columns)
  99. if index:
  100. index_hash_generator = (
  101. hash_pandas_object(
  102. obj.index,
  103. index=False,
  104. encoding=encoding,
  105. hash_key=hash_key,
  106. categorize=categorize,
  107. )._values
  108. for _ in [None]
  109. )
  110. num_items += 1
  111. # keep `hashes` specifically a generator to keep mypy happy
  112. _hashes = itertools.chain(hashes, index_hash_generator)
  113. hashes = (x for x in _hashes)
  114. h = _combine_hash_arrays(hashes, num_items)
  115. h = Series(h, index=obj.index, dtype="uint64", copy=False)
  116. else:
  117. raise TypeError(f"Unexpected type for hashing {type(obj)}")
  118. return h
  119. def hash_tuples(vals, encoding="utf8", hash_key: str = _default_hash_key):
  120. """
  121. Hash an MultiIndex / list-of-tuples efficiently
  122. Parameters
  123. ----------
  124. vals : MultiIndex, list-of-tuples, or single tuple
  125. encoding : str, default 'utf8'
  126. hash_key : str, default _default_hash_key
  127. Returns
  128. -------
  129. ndarray of hashed values array
  130. """
  131. is_tuple = False
  132. if isinstance(vals, tuple):
  133. vals = [vals]
  134. is_tuple = True
  135. elif not is_list_like(vals):
  136. raise TypeError("must be convertible to a list-of-tuples")
  137. from pandas import Categorical, MultiIndex
  138. if not isinstance(vals, ABCMultiIndex):
  139. vals = MultiIndex.from_tuples(vals)
  140. # create a list-of-Categoricals
  141. vals = [
  142. Categorical(vals.codes[level], vals.levels[level], ordered=False, fastpath=True)
  143. for level in range(vals.nlevels)
  144. ]
  145. # hash the list-of-ndarrays
  146. hashes = (
  147. _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in vals
  148. )
  149. h = _combine_hash_arrays(hashes, len(vals))
  150. if is_tuple:
  151. h = h[0]
  152. return h
  153. def _hash_categorical(c, encoding: str, hash_key: str):
  154. """
  155. Hash a Categorical by hashing its categories, and then mapping the codes
  156. to the hashes
  157. Parameters
  158. ----------
  159. c : Categorical
  160. encoding : str
  161. hash_key : str
  162. Returns
  163. -------
  164. ndarray of hashed values array, same size as len(c)
  165. """
  166. # Convert ExtensionArrays to ndarrays
  167. values = np.asarray(c.categories._values)
  168. hashed = hash_array(values, encoding, hash_key, categorize=False)
  169. # we have uint64, as we don't directly support missing values
  170. # we don't want to use take_nd which will coerce to float
  171. # instead, directly construct the result with a
  172. # max(np.uint64) as the missing value indicator
  173. #
  174. # TODO: GH 15362
  175. mask = c.isna()
  176. if len(hashed):
  177. result = hashed.take(c.codes)
  178. else:
  179. result = np.zeros(len(mask), dtype="uint64")
  180. if mask.any():
  181. result[mask] = np.iinfo(np.uint64).max
  182. return result
  183. def hash_array(
  184. vals,
  185. encoding: str = "utf8",
  186. hash_key: str = _default_hash_key,
  187. categorize: bool = True,
  188. ):
  189. """
  190. Given a 1d array, return an array of deterministic integers.
  191. Parameters
  192. ----------
  193. vals : ndarray, Categorical
  194. encoding : str, default 'utf8'
  195. Encoding for data & key when strings.
  196. hash_key : str, default _default_hash_key
  197. Hash_key for string key to encode.
  198. categorize : bool, default True
  199. Whether to first categorize object arrays before hashing. This is more
  200. efficient when the array contains duplicate values.
  201. Returns
  202. -------
  203. 1d uint64 numpy array of hash values, same length as the vals
  204. """
  205. if not hasattr(vals, "dtype"):
  206. raise TypeError("must pass a ndarray-like")
  207. dtype = vals.dtype
  208. # For categoricals, we hash the categories, then remap the codes to the
  209. # hash values. (This check is above the complex check so that we don't ask
  210. # numpy if categorical is a subdtype of complex, as it will choke).
  211. if is_categorical_dtype(dtype):
  212. return _hash_categorical(vals, encoding, hash_key)
  213. elif is_extension_array_dtype(dtype):
  214. vals, _ = vals._values_for_factorize()
  215. dtype = vals.dtype
  216. # we'll be working with everything as 64-bit values, so handle this
  217. # 128-bit value early
  218. if np.issubdtype(dtype, np.complex128):
  219. return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))
  220. # First, turn whatever array this is into unsigned 64-bit ints, if we can
  221. # manage it.
  222. elif isinstance(dtype, bool):
  223. vals = vals.astype("u8")
  224. elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
  225. vals = vals.view("i8").astype("u8", copy=False)
  226. elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
  227. vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
  228. else:
  229. # With repeated values, its MUCH faster to categorize object dtypes,
  230. # then hash and rename categories. We allow skipping the categorization
  231. # when the values are known/likely to be unique.
  232. if categorize:
  233. from pandas import Categorical, Index, factorize
  234. codes, categories = factorize(vals, sort=False)
  235. cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
  236. return _hash_categorical(cat, encoding, hash_key)
  237. try:
  238. vals = hashing.hash_object_array(vals, hash_key, encoding)
  239. except TypeError:
  240. # we have mixed types
  241. vals = hashing.hash_object_array(
  242. vals.astype(str).astype(object), hash_key, encoding
  243. )
  244. # Then, redistribute these 64-bit ints within the space of 64-bit ints
  245. vals ^= vals >> 30
  246. vals *= np.uint64(0xBF58476D1CE4E5B9)
  247. vals ^= vals >> 27
  248. vals *= np.uint64(0x94D049BB133111EB)
  249. vals ^= vals >> 31
  250. return vals