Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
"""
md5: Hashing can be used to generate reproducible pseudo-randomization. This can be useful in
contexts where the user does not want to store a fixed seed to ensure replicability of test
randomization.

Examples
--------
>>> # numpy randomization with fixed seed
>>> random_state = np.random.RandomState(42)
>>> random_state.choice(["Lauren", "Sam", "Ben"], size=1)
>>> # md5 random sample (no seed required)
>>> md5shuffle(["Lauren", "Sam", "Ben"])[0]

"""
16import logging
17from typing import Union, List
19import numpy as np
20from numpy import ndarray, vectorize, asarray
21from hashlib import md5
# Set up logging for this module.
# NOTE(review): logging.basicConfig() configures the *root* logger as an import-time side effect.
# It is kept as-is because code importing this module may depend on INFO output being enabled.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Public functions of this module: the names exported by a star-import.
__all__ = [
    "md5shuffle",
    "draw_percentile",
]
33####################################################################################################
# Size of an MD5 digest in bytes, as reported by hashlib.
_hash_size = md5().digest_size
# Number of distinct MD5 values, 2 ** (8 bits * digest bytes), stored as a float.
# Dividing an integer hash by this value maps it into the interval [0, 1).
_hash_max_length = 2.0 ** (8.0 * _hash_size)
def _str_to_md5_hexidec(s: str) -> str:
    """Return the MD5 hash of ``s`` as a hexadecimal string.

    ``s`` is encoded with ``str.encode()`` (UTF-8 by default) before hashing.
    """
    return md5(s.encode()).hexdigest()


# Rebind the name to an element-wise version that accepts arrays of strings.
# ``otypes`` is given explicitly so that size-0 inputs are accepted: without it, np.vectorize
# must call the function on the first element to discover the output type and raises on
# empty arrays.
_str_to_md5_hexidec = vectorize(
    pyfunc=_str_to_md5_hexidec,
    otypes=[str],
    doc="Utility for converting an array of strings into an array of MD5 hexidecimal hashs",
)
def _hash_to_int(s: str) -> int:
    """Convert a hexadecimal MD5 digest string into its integer value.

    The digest is parsed as base 16. The radix is written out explicitly instead of reusing the
    digest byte count, which only happens to equal 16 for MD5 and is unrelated to the radix.
    """
    return int(s, 16)
# Element-wise version of _hash_to_int for arrays of hex digests.
# ``otypes=[object]`` keeps the results as arbitrary-precision Python ints (a 128-bit MD5 value
# does not fit in a fixed-width NumPy integer) instead of inferring the dtype from the first
# output, and it also lets the function accept size-0 inputs.
_vhash_to_int = vectorize(
    pyfunc=_hash_to_int,
    otypes=[object],
    doc="utility to transform an array of md5 hashs to an array of integers",
)
65# def _find_hash_space(s):
66# """Utility to convert first 6 hex digits to an int"""
67# return int(s[:6], 16)
70# _vfind_hash_space = vectorize(_find_hash_space)
72####################################################################################################
def md5shuffle(arr: Union[List, ndarray], salt: Union[str, None] = None) -> ndarray:
    """
    md5shuffle

    Will shuffle the input array pseudo-randomly in a deterministic manner using MD5 hashing.

    Every element is converted to a string, optionally suffixed with ``salt``, and hashed with
    MD5. The elements are then returned in the order given by sorting their hexadecimal digests.

    Parameters
    ----------
    arr: list, ndarray
        The array of values that you want shuffled
    salt: str
        A string to append to sample ids to avoid collisions across experiments testing on the
        same population. If None, then no salt is applied.

    Returns
    -------
    ndarray
        the input values, converted to strings, in a shuffled order (the salt is not part of the
        returned values). An empty input yields an empty array.

    Examples
    --------
    >>> md5shuffle(
    ...     arr=[i for i in range(1000)],
    ...     salt="whale"
    ... )

    """
    arr = asarray(arr).astype(str)
    if arr.size == 0:
        # Nothing to shuffle; returning here also avoids calling the vectorized hash helper on
        # a size-0 array, which np.vectorize rejects when no output type is configured.
        return arr

    # np.char.add is the public spelling of element-wise string concatenation
    # (np.core.defchararray is a private path that is deprecated in NumPy 2).
    arr_salted = arr if salt is None else np.char.add(arr, asarray([salt]))

    # Sorting by digest gives an order that depends only on the (salted) values themselves.
    return arr[_str_to_md5_hexidec(arr_salted).argsort()]
def draw_percentile(arr: Union[List, ndarray], lb: float = 0.25, ub: float = 0.75,
                    salt: Union[str, None] = None) -> ndarray:
    """
    draw_percentile

    Draw array values that fall within a certain percentile of the hash space.

    Every element is converted to a string, optionally suffixed with ``salt``, and hashed with
    MD5. The hash is scaled to a fraction ``p`` of the hash space, and the element is kept when
    ``lb <= p < ub``.

    Parameters
    ----------
    arr: list, ndarray
        An array of objects that you want to sample from
    lb: float, optional
        The lower bound of the percentile; must be between 0 and 1
    ub: float, optional
        The upper bound of the percentile; must be between 0 and 1; must be greater than lb
    salt: str
        A string to append to sample ids to avoid collisions across experiments testing on the
        same population. If None, then no salt is applied.

    Returns
    -------
    ndarray
        an array of values from arr (converted to strings, without the salt) that fall within
        the specified percentile of the hash space. An empty input yields an empty array.

    Raises
    ------
    AssertionError
        If ub is not greater than lb, or if either bound lies outside [0.0, 1.0].

    Examples
    --------
    >>> draw_percentile([i for i in range(1000)], lb=0.25, ub=0.75)  # sample 50% of inputs

    """
    # Raised explicitly (not via ``assert``) so the checks still run under ``python -O``; the
    # exception type, messages and check order are the same as the original assertions.
    if not ub > lb:
        raise AssertionError("Input ub must be greater than input lb.")
    if not 0.0 <= ub <= 1.0:
        raise AssertionError("Input ub must be between 0.0 and 1.0!")
    if not 0.0 <= lb <= 1.0:
        raise AssertionError("Input lb must be between 0.0 and 1.0!")

    arr = asarray(arr).astype(str)
    if arr.size == 0:
        # Nothing to sample; returning here also avoids calling the vectorized hash helpers on
        # a size-0 array, which np.vectorize rejects when no output type is configured.
        return arr

    # np.char.add is the public spelling of element-wise string concatenation
    # (np.core.defchararray is a private path that is deprecated in NumPy 2).
    arr_salted = arr if salt is None else np.char.add(arr, asarray([salt]))

    # Hash the *salted* ids. Hashing the unsalted values would silently ignore ``salt`` and
    # select the same elements in every experiment.
    hash_arr = _str_to_md5_hexidec(arr_salted)

    # Scale each integer hash into [0, 1) and keep the elements inside [lb, ub).
    percentiles = _vhash_to_int(hash_arr).astype(float) / _hash_max_length
    return arr[(lb <= percentiles) & (percentiles < ub)]