Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2md5: Hashing can be used to generate reproducible pseudo-randomization. This can be useful in 

3contexts where the user does not want to store a fixed seed to ensure replicability of test 

4randomization. 

5 

6Examples 

7-------- 

8>>> # numpy randomization with fixed seed 

9>>> random_state = np.random.RandomState(42) 

10>>> random_state.choice(["Lauren", "Sam", "Ben"], size=1) 

11>>> # md5 random sample (no seed required) 

12>>> md5shuffle(["Lauren", "Sam", "Ben"])[0] 

13 

14""" 

15 

16import logging 

17from typing import Union, List 

18 

19import numpy as np 

20from numpy import ndarray, vectorize, asarray 

21from hashlib import md5 

22 

# Module-level logging: configure the root logger at INFO level and create a
# logger named after this module.
# NOTE(review): basicConfig at import time also affects the importing
# application's logging setup; kept as-is because importers may rely on it.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(name=__name__)

26 

# Public API: the names exported by a star-import of this module.
__all__ = ["md5shuffle", "draw_percentile"]

32 

33#################################################################################################### 

34 

35 

# Size of an MD5 digest in bytes.
_hash_size = md5().digest_size
# Number of distinct digest values, 2 ** (bits in a digest), as a float.
# Dividing an integer hash by this maps it into the unit interval.
_hash_max_length = float(2 ** (8 * _hash_size))

38 

39 

def _str_to_md5_hexidec(s: str) -> str:
    """Return the MD5 digest of ``s`` as a hexadecimal string.

    The string is encoded with ``str.encode()`` (UTF-8 by default) before
    being hashed.
    """
    return md5(s.encode()).hexdigest()


# Rebind the name to an element-wise version that accepts arrays of strings.
# ``otypes`` is given explicitly so that empty inputs work: without it,
# ``numpy.vectorize`` must call the function on the first element to infer the
# output dtype and raises ``ValueError`` when there is no first element.
_str_to_md5_hexidec = vectorize(
    pyfunc=_str_to_md5_hexidec,
    otypes=[str],
    doc="Utility for converting an array of strings into an array of MD5 hexidecimal hashs",
)

52 

53 

def _hash_to_int(s: str) -> int:
    """Convert a hexadecimal hash string to the integer it represents.

    Parameters
    ----------
    s: str
        A string of hexadecimal digits, e.g. the output of ``hexdigest()``.

    Returns
    -------
    int
        The value of ``s`` read as a base-16 number.
    """
    # Hex digits are base 16. This is deliberately a literal rather than the
    # digest-size constant, which merely happens to be 16 as well for MD5.
    return int(s, 16)

57 

58 

# Element-wise version of ``_hash_to_int`` for arrays of hex strings.
# ``otypes=[object]`` keeps the results as arbitrary-precision Python ints.
# Without it, numpy infers the output dtype from the first element only: a
# small first value selects a fixed-width integer dtype that later 128-bit
# values overflow, and an empty input raises ``ValueError``.
_vhash_to_int = vectorize(
    pyfunc=_hash_to_int,
    otypes=[object],
    doc="utility to transform an array of md5 hashs to an array of integers",
)

63 

64 

65# def _find_hash_space(s): 

66# """Utility to convert first 6 hex digits to an int""" 

67# return int(s[:6], 16) 

68 

69 

70# _vfind_hash_space = vectorize(_find_hash_space) 

71 

72#################################################################################################### 

73 

74 

def md5shuffle(arr: Union[List, ndarray], salt: Union[str, None] = None) -> ndarray:
    """
    md5shuffle

    Will shuffle the input array pseudo-randomly in a deterministic manner using MD5 hashing.

    Each value is converted to a string, optionally suffixed with ``salt``, and hashed; the
    values are then returned ordered by their hexadecimal hash.

    Parameters
    ----------
    arr: list, ndarray
        The array of values that you want shuffled
    salt: str, optional
        A string to append to sample ids to avoid collisions across experiments testing on the
        same population. If None, then no salt is applied.

    Returns
    -------
    ndarray
        the input values, converted to strings, in a shuffled order

    Examples
    --------
    >>> md5shuffle(
    ...     arr=[i for i in range(1000)],
    ...     salt="whale"
    ... )

    """
    # Values are compared and returned as strings, whatever their input type.
    arr = asarray(arr).astype(str)

    # The salt only changes the hashing key, never the values that are returned.
    # ``np.char.add`` is the public name of the element-wise string concatenation.
    keys = arr if salt is None else np.char.add(arr, asarray([salt]))

    # Sorting by hash yields an order that is fixed for a given (values, salt) pair.
    order = _str_to_md5_hexidec(keys).argsort()
    return arr[order]

109 

110 

def draw_percentile(arr: Union[List, ndarray], lb: float = 0.25, ub: float = 0.75,
                    salt: Union[str, None] = None) -> ndarray:
    """
    draw_percentile

    Draw array values that fall within a certain percentile of the hash space.

    Each value is converted to a string, optionally suffixed with ``salt``, hashed with MD5 and
    scaled to a fraction of the hash space. Values whose fraction lies in the half-open interval
    ``[lb, ub)`` are returned.

    Parameters
    ----------
    arr: list, ndarray
        An array of objects that you want to sample from
    lb: float, optional
        The lower bound of the percentile; must be between 0 and 1
    ub: float, optional
        The upper bound of the percentile; must be between 0 and 1; must be greater than lb
    salt: str, optional
        A string to append to sample ids to avoid collisions across experiments testing on the
        same population. If None, then no salt is applied.

    Returns
    -------
    ndarray
        an array of values from arr (converted to strings) that fall within the specified
        percentile of the hash space

    Raises
    ------
    AssertionError
        If ``ub`` is not greater than ``lb``, or either bound lies outside ``[0, 1]``.

    Examples
    --------
    >>> draw_percentile([i for i in range(1000)], lb=0.25, ub=0.75)  # sample 50% of inputs

    """
    # Raised explicitly (rather than via ``assert``) so the checks survive ``python -O``;
    # the exception type and messages are unchanged.
    if not ub > lb:
        raise AssertionError("Input ub must be greater than input lb.")
    if not 0.0 <= ub <= 1.0:
        raise AssertionError("Input ub must be between 0.0 and 1.0!")
    if not 0.0 <= lb <= 1.0:
        raise AssertionError("Input lb must be between 0.0 and 1.0!")

    # Values are hashed and returned as strings, whatever their input type.
    arr = asarray(arr).astype(str)

    # Hash the salted keys so that different salts select different subsets;
    # the salt never appears in the returned values.
    keys = arr if salt is None else np.char.add(arr, asarray([salt]))

    # Scale each integer hash by the size of the hash space to get a fraction.
    hashes = _vhash_to_int(_str_to_md5_hexidec(keys)).astype(float)
    percentiles = hashes / _hash_max_length
    return arr[(lb <= percentiles) & (percentiles < ub)]