Coverage for lind/design/randomization/md5.py: 97%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

1"""

2md5: Hashing can be used to generate reproducible pseudo-randomization. This can be useful in

3contexts where the user does not want to store a fixed seed to ensure replicability of test

4randomization.

6Examples

7--------

8>>> # numpy randomization with fixed seed

9>>> random_state = np.random.RandomState(42)

10>>> random_state.choice(["Lauren", "Sam", "Ben"], size=1)

11>>> # md5 random sample (no seed required)

12>>> md5shuffle(["Lauren", "Sam", "Ben"])[0]

14"""

16import logging

17from typing import Union, List

19import numpy as np

20from numpy import ndarray, vectorize, asarray

21from hashlib import md5

# module logger, plus root logging configured at INFO level
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# names exported as this module's public API
__all__ = ["md5shuffle", "draw_percentile"]

33####################################################################################################

# width of an MD5 digest in bytes
_hash_size = md5().digest_size
# number of distinct digest values (2 ** number_of_bits), stored as a float so that
# dividing a digest's integer value by it yields a fraction of the hash space
_hash_max_length = float(2 ** (8 * _hash_size))

def _str_to_md5_hexidec(s: str) -> str:
    """
    Utility for converting a string into an MD5 hexadecimal hash.

    Parameters
    ----------
    s: str
        The string to hash; it is encoded to bytes with ``str.encode()`` before hashing.

    Returns
    -------
    str
        The hexadecimal MD5 digest of ``s``.
    """
    return md5(s.encode()).hexdigest()


# Element-wise version of the function above (the name is deliberately rebound).
# ``otypes`` is given explicitly: without it numpy.vectorize has to call the function on
# the first element to infer the output type and raises ValueError on empty input.
_str_to_md5_hexidec = vectorize(
    pyfunc=_str_to_md5_hexidec,
    otypes=[str],
    doc="Utility for converting an array of strings into an array of MD5 hexadecimal hashes"
)

def _hash_to_int(s: str) -> int:
    """
    Utility to transform an MD5 hexadecimal hash into an integer.

    Parameters
    ----------
    s: str
        A hexadecimal digest string.

    Returns
    -------
    int
        The integer value of the digest.
    """
    # The radix is 16 because the digest is hex-encoded; it is unrelated to the digest's
    # byte width (which merely happens to be 16 for MD5 as well).
    return int(s, 16)


# Element-wise version of _hash_to_int. ``otypes=[object]`` keeps the results as Python
# ints: digest values exceed the range of fixed-width integer dtypes, and letting
# numpy.vectorize infer the dtype from the first element fails on empty input and
# overflows whenever the first value happens to fit a machine integer.
_vhash_to_int = vectorize(
    pyfunc=_hash_to_int,
    otypes=[object],
    doc="utility to transform an array of md5 hashes to an array of integers"
)

65# def _find_hash_space(s):

66# """Utility to convert first 6 hex digits to an int"""

67# return int(s[:6], 16)

70# _vfind_hash_space = vectorize(_find_hash_space)

72####################################################################################################

def md5shuffle(arr: Union[List, ndarray], salt: Union[str, None] = None) -> ndarray:
    """
    md5shuffle

    Will shuffle the input array pseudo-randomly in a deterministic manner using MD5 hashing.
    Each value is cast to a string, optionally suffixed with the salt, hashed, and the values
    are then ordered by their hexadecimal digests.

    Parameters
    ----------
    arr: list, ndarray
        The array of values that you want shuffled
    salt: str
        A string to append to sample ids to avoid collisions across experiments testing on the
        same population. If None, then no salt is applied.

    Returns
    -------
    ndarray
        the input values, cast to strings, in a shuffled order

    Examples
    --------
    >>> md5shuffle(
    ...     arr=[i for i in range(1000)],
    ...     salt="whale"
    ... )

    """
    arr_salted = arr = asarray(arr).astype(str)
    # nothing to shuffle; also avoids handing an empty array to the vectorized hasher
    if arr.size == 0:
        return arr
    if salt is not None:
        # np.char is the public home of the string routines (np.core is a private path)
        arr_salted = np.char.add(arr, asarray([salt]))
    # order by the salted digests, but return the unsalted values
    return arr[_str_to_md5_hexidec(arr_salted).argsort()]

109

110

def draw_percentile(arr: Union[List, ndarray], lb: float = 0.25, ub: float = 0.75,
                    salt: Union[str, None] = None) -> ndarray:
    """
    draw_percentile

    Draw array values that fall within a certain percentile of the hash space. Each value is
    cast to a string, optionally suffixed with the salt, hashed with MD5, and the digest is
    scaled to a fraction of the hash space; values whose fraction lies in [lb, ub) are kept.

    Parameters
    ----------
    arr: list, ndarray
        An array of objects that you want to sample from
    lb: float, optional
        The lower bound of the percentile (inclusive); must be between 0 and 1
    ub: float, optional
        The upper bound of the percentile (exclusive); must be between 0 and 1; must be
        greater than lb
    salt: str
        A string to append to sample ids to avoid collisions across experiments testing on the
        same population. If None, then no salt is applied.

    Returns
    -------
    ndarray
        an array of values from arr (cast to strings) that fall within the specified
        percentile of the hash space

    Raises
    ------
    AssertionError
        If the bounds are out of range or ub is not greater than lb.

    Examples
    --------
    >>> draw_percentile([i for i in range(1000)], lb=0.25, ub=0.75)  # sample 50% of inputs

    """
    # NOTE: kept as asserts so the exception type seen by callers is unchanged; be aware
    # that these checks are skipped when Python runs with -O.
    assert ub > lb, "Input ub must be greater than input lb."
    assert 0.0 <= ub <= 1.0, "Input ub must be between 0.0 and 1.0!"
    assert 0.0 <= lb <= 1.0, "Input lb must be between 0.0 and 1.0!"

    arr_salted = arr = asarray(arr).astype(str)
    # nothing to sample; also avoids handing an empty array to the vectorized helpers
    if arr.size == 0:
        return arr
    if salt is not None:
        # np.char is the public home of the string routines (np.core is a private path)
        arr_salted = np.char.add(arr, asarray([salt]))

    # hash the *salted* ids so that the salt actually changes which values are drawn
    hash_arr = _str_to_md5_hexidec(arr_salted)

    # scale every digest to its fraction of the hash space and keep those within bounds
    percentiles = _vhash_to_int(hash_arr).astype(float) / _hash_max_length
    return arr[(lb <= percentiles) & (percentiles < ub)]

Coverage for lind/design/randomization/md5.py : 97%

34 statements 33 run 1 missing 0 excluded