| 1 | """ |
|---|
| 2 | Statistical utilities. |
|---|
| 3 | |
|---|
| 4 | Ported to Python 3. |
|---|
| 5 | """ |
|---|
| 6 | # Copyright (c) 2009 Shawn Willden |
|---|
| 7 | # mailto:shawn@willden.org |
|---|
| 8 | # I hereby license all patches I have contributed or will contribute to the |
|---|
| 9 | # Allmydata Tahoe-LAFS project, including the file 'statistics.py', under |
|---|
| 10 | # either the GNU General Public License, version 2 or later, or under the |
|---|
| 11 | # Transitive Grace Period Public License, version 1 or later. |
|---|
| 12 | |
|---|
| 13 | |
|---|
| 14 | from allmydata.util.mathutil import round_sigfigs |
|---|
| 15 | import math |
|---|
| 16 | from functools import reduce |
|---|
| 17 | import sys |
|---|
| 18 | |
|---|
| 19 | def pr_file_loss(p_list, k): |
|---|
| 20 | """ |
|---|
| 21 | Probability of single-file loss for shares with reliabilities in |
|---|
| 22 | p_list. |
|---|
| 23 | |
|---|
| 24 | Computes the probability that a single file will become |
|---|
| 25 | unrecoverable, based on the individual share survival |
|---|
| 26 | probabilities and and k (number of shares needed for recovery). |
|---|
| 27 | |
|---|
| 28 | Example: pr_file_loss([.9] * 5 + [.99] * 5, 3) returns the |
|---|
| 29 | probability that a file with k=3, N=10 and stored on five servers |
|---|
| 30 | with reliability .9 and five servers with reliability .99 is lost. |
|---|
| 31 | |
|---|
| 32 | See survival_pmf docstring for important statistical assumptions. |
|---|
| 33 | |
|---|
| 34 | """ |
|---|
| 35 | assert 0 < k <= len(p_list) |
|---|
| 36 | assert valid_probability_list(p_list) |
|---|
| 37 | |
|---|
| 38 | # Sum elements 0 through k-1 of the share set PMF to get the |
|---|
| 39 | # probability that less than k shares survived. |
|---|
| 40 | return sum(survival_pmf(p_list)[0:k]) |
|---|
| 41 | |
|---|
| 42 | def survival_pmf(p_list): |
|---|
| 43 | """ |
|---|
| 44 | Return the collective PMF of share survival count for a set of |
|---|
| 45 | shares with the individual survival probabilities in p_list. |
|---|
| 46 | |
|---|
| 47 | Example: survival_pmf([.99] * 10 + [.8] * 6) returns the |
|---|
| 48 | probability mass function for the number of shares that will |
|---|
| 49 | survive from an initial set of 16, 10 with p=0.99 and 6 with |
|---|
| 50 | p=0.8. The ith element of the resulting list is the probability |
|---|
| 51 | that exactly i shares will survive. |
|---|
| 52 | |
|---|
| 53 | This calculation makes the following assumptions: |
|---|
| 54 | |
|---|
| 55 | 1. p_list[i] is the probability that any individual share will |
|---|
| 56 | will survive during the time period in question (whatever that may |
|---|
| 57 | be). |
|---|
| 58 | |
|---|
| 59 | 2. The share failures are "independent", in the statistical |
|---|
| 60 | sense. Note that if a group of shares are stored on the same |
|---|
| 61 | machine or even in the same data center, they are NOT independent |
|---|
| 62 | and this calculation is therefore wrong. |
|---|
| 63 | """ |
|---|
| 64 | assert valid_probability_list(p_list) |
|---|
| 65 | |
|---|
| 66 | pmf = survival_pmf_via_conv(p_list) |
|---|
| 67 | |
|---|
| 68 | assert valid_pmf(pmf) |
|---|
| 69 | return pmf |
|---|
| 70 | |
|---|
| 71 | def survival_pmf_via_bd(p_list): |
|---|
| 72 | """ |
|---|
| 73 | Compute share survival PMF using the binomial distribution PMF as |
|---|
| 74 | much as possible. |
|---|
| 75 | |
|---|
| 76 | This is more efficient than the convolution method below, but |
|---|
| 77 | doesn't work for large numbers of shares because the |
|---|
| 78 | binomial_coeff calculation blows up. Since the efficiency gains |
|---|
| 79 | only matter in the case of large numbers of shares, it's pretty |
|---|
| 80 | much useless except for testing the convolution methond. |
|---|
| 81 | |
|---|
| 82 | Note that this function does little to no error checking and is |
|---|
| 83 | intended for internal use and testing only. |
|---|
| 84 | """ |
|---|
| 85 | pmf_list = [ binomial_distribution_pmf(p_list.count(p), p) |
|---|
| 86 | for p in set(p_list) ] |
|---|
| 87 | return list(reduce(convolve, pmf_list)) |
|---|
| 88 | |
|---|
| 89 | def survival_pmf_via_conv(p_list): |
|---|
| 90 | """ |
|---|
| 91 | Compute share survival PMF using iterated convolution of trivial |
|---|
| 92 | PMFs. |
|---|
| 93 | |
|---|
| 94 | Note that this function does little to no error checking and is |
|---|
| 95 | intended for internal use and testing only. |
|---|
| 96 | """ |
|---|
| 97 | pmf_list = [ [1 - p, p] for p in p_list ]; |
|---|
| 98 | return list(reduce(convolve, pmf_list)) |
|---|
| 99 | |
|---|
| 100 | def print_pmf(pmf, n=4, out=sys.stdout): |
|---|
| 101 | """ |
|---|
| 102 | Print a PMF in a readable form, with values rounded to n |
|---|
| 103 | significant digits. |
|---|
| 104 | """ |
|---|
| 105 | for k, p in enumerate(pmf): |
|---|
| 106 | print("i=" + str(k) + ":", round_sigfigs(p, n), file=out) |
|---|
| 107 | |
|---|
| 108 | def pr_backup_file_loss(p_list, backup_p, k): |
|---|
| 109 | """ |
|---|
| 110 | Probability of single-file loss in a backup context |
|---|
| 111 | |
|---|
| 112 | Same as pr_file_loss, except it factors in the probability of |
|---|
| 113 | survival of the original source, specified as backup_p. Because |
|---|
| 114 | that's a precondition to caring about the availability of the |
|---|
| 115 | backup, it's an independent event. |
|---|
| 116 | """ |
|---|
| 117 | assert valid_probability_list(p_list) |
|---|
| 118 | assert 0 < backup_p <= 1 |
|---|
| 119 | assert 0 < k <= len(p_list) |
|---|
| 120 | |
|---|
| 121 | return pr_file_loss(p_list, k) * (1 - backup_p) |
|---|
| 122 | |
|---|
| 123 | |
|---|
| 124 | def find_k(p_list, target_loss_prob): |
|---|
| 125 | """ |
|---|
| 126 | Find the highest k value that achieves the targeted loss |
|---|
| 127 | probability, given the share reliabilities given in p_list. |
|---|
| 128 | """ |
|---|
| 129 | assert valid_probability_list(p_list) |
|---|
| 130 | assert 0 < target_loss_prob < 1 |
|---|
| 131 | |
|---|
| 132 | pmf = survival_pmf(p_list) |
|---|
| 133 | return find_k_from_pmf(pmf, target_loss_prob) |
|---|
| 134 | |
|---|
| 135 | def find_k_from_pmf(pmf, target_loss_prob): |
|---|
| 136 | """ |
|---|
| 137 | Find the highest k value that achieves the targeted loss |
|---|
| 138 | probability, given the share survival PMF given in pmf. |
|---|
| 139 | """ |
|---|
| 140 | assert valid_pmf(pmf) |
|---|
| 141 | assert 0 < target_loss_prob < 1 |
|---|
| 142 | |
|---|
| 143 | loss_prob = 0.0 |
|---|
| 144 | for k, p_k in enumerate(pmf): |
|---|
| 145 | loss_prob += p_k |
|---|
| 146 | if loss_prob > target_loss_prob: |
|---|
| 147 | return k |
|---|
| 148 | |
|---|
| 149 | # we shouldn't be able to get here, since sum(pmf)==1.0 |
|---|
| 150 | k = len(pmf) - 1 |
|---|
| 151 | return k |
|---|
| 152 | |
|---|
| 153 | def repair_count_pmf(survival_pmf, k): |
|---|
| 154 | """ |
|---|
| 155 | Return Pr[D=d], where D represents the number of shares that have |
|---|
| 156 | to be repaired at the end of an interval, starting with a full |
|---|
| 157 | set and subject to losses described in survival_pmf. |
|---|
| 158 | """ |
|---|
| 159 | n = len(survival_pmf) - 1 |
|---|
| 160 | |
|---|
| 161 | # Probability of 0 to repair is the probability of all shares |
|---|
| 162 | # surviving plus the probability of less than k surviving. |
|---|
| 163 | pmf = [ survival_pmf[n] + sum(survival_pmf[0:k]) ] |
|---|
| 164 | |
|---|
| 165 | # Probability of more than 0, up to N-k to repair |
|---|
| 166 | for i in range(1, n-k+1): |
|---|
| 167 | pmf.append(survival_pmf[n-i]) |
|---|
| 168 | |
|---|
| 169 | # Probability of more than N-k to repair is 0, because that means |
|---|
| 170 | # there are less than k available and the file is irreparable. |
|---|
| 171 | for i in range(n-k+1, n+1): |
|---|
| 172 | pmf.append(0.0) |
|---|
| 173 | |
|---|
| 174 | assert(valid_pmf(pmf)) |
|---|
| 175 | return pmf |
|---|
| 176 | |
|---|
| 177 | def bandwidth_cost_function(file_size, shares, k, ul_dl_ratio): |
|---|
| 178 | return file_size + float(file_size) / k * shares * ul_dl_ratio |
|---|
| 179 | |
|---|
| 180 | def mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio): |
|---|
| 181 | """ |
|---|
| 182 | Return the expected cost for a repair run on a file with the given |
|---|
| 183 | survival_pmf and requiring k shares, in which upload cost is |
|---|
| 184 | 'ul_dl_ratio' times download cost. |
|---|
| 185 | """ |
|---|
| 186 | repair_pmf = repair_count_pmf(survival_pmf, k) |
|---|
| 187 | expected_cost = sum([cost_function(file_size, new_shares, k, ul_dl_ratio) |
|---|
| 188 | * repair_pmf[new_shares] |
|---|
| 189 | for new_shares in range(1, len(repair_pmf))]) |
|---|
| 190 | return expected_cost |
|---|
| 191 | |
|---|
| 192 | def eternal_repair_cost(cost_function, file_size, survival_pmf, k, |
|---|
| 193 | discount_rate=0, ul_dl_ratio=1.0): |
|---|
| 194 | """ |
|---|
| 195 | Calculate the eternal repair cost for a file that is aggressively |
|---|
| 196 | repaired, i.e. the sum of repair costs until the file is dead. |
|---|
| 197 | """ |
|---|
| 198 | c = mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio) |
|---|
| 199 | f = 1 - sum(survival_pmf[0:k]) |
|---|
| 200 | r = float(discount_rate) |
|---|
| 201 | |
|---|
| 202 | return (c * (1-r)) / (1 - (1-r) * f) |
|---|
| 203 | |
|---|
| 204 | def valid_pmf(pmf): |
|---|
| 205 | """ |
|---|
| 206 | Validate that pmf looks like a proper discrete probability mass |
|---|
| 207 | function in list form. |
|---|
| 208 | |
|---|
| 209 | Returns true if the elements of pmf sum to 1. |
|---|
| 210 | """ |
|---|
| 211 | return round(sum(pmf),5) == 1.0 |
|---|
| 212 | |
|---|
| 213 | def valid_probability_list(p_list): |
|---|
| 214 | """ |
|---|
| 215 | Validate that p_list is a list of probibilities |
|---|
| 216 | """ |
|---|
| 217 | for p in p_list: |
|---|
| 218 | if p < 0 or p > 1: |
|---|
| 219 | return False |
|---|
| 220 | |
|---|
| 221 | return True |
|---|
| 222 | |
|---|
| 223 | def convolve(list_a, list_b): |
|---|
| 224 | """ |
|---|
| 225 | Returns the discrete convolution of two lists. |
|---|
| 226 | |
|---|
| 227 | Given two random variables X and Y, the convolution of their |
|---|
| 228 | probability mass functions Pr(X) and Pr(Y) is equal to the |
|---|
| 229 | Pr(X+Y). |
|---|
| 230 | """ |
|---|
| 231 | n = len(list_a) |
|---|
| 232 | m = len(list_b) |
|---|
| 233 | |
|---|
| 234 | result = [] |
|---|
| 235 | for i in range(n + m - 1): |
|---|
| 236 | sum = 0.0 |
|---|
| 237 | |
|---|
| 238 | lower = max(0, i - n + 1) |
|---|
| 239 | upper = min(m - 1, i) |
|---|
| 240 | |
|---|
| 241 | for j in range(lower, upper+1): |
|---|
| 242 | sum += list_a[i-j] * list_b[j] |
|---|
| 243 | |
|---|
| 244 | result.append(sum) |
|---|
| 245 | |
|---|
| 246 | return result |
|---|
| 247 | |
|---|
| 248 | def binomial_distribution_pmf(n, p): |
|---|
| 249 | """ |
|---|
| 250 | Returns Pr(K), where K ~ B(n,p), as a list of values. |
|---|
| 251 | |
|---|
| 252 | Returns the full probability mass function of a B(n, p) as a list |
|---|
| 253 | of values, where the kth element is Pr(K=k), or, in the Tahoe |
|---|
| 254 | context, the probability that exactly k copies of a file share |
|---|
| 255 | survive, when placed on n independent servers with survival |
|---|
| 256 | probability p. |
|---|
| 257 | """ |
|---|
| 258 | assert p >= 0 and p <= 1, 'p=%s must be in the range [0,1]'%p |
|---|
| 259 | assert n > 0 |
|---|
| 260 | |
|---|
| 261 | result = [] |
|---|
| 262 | for k in range(n+1): |
|---|
| 263 | result.append(math.pow(p , k ) * |
|---|
| 264 | math.pow(1 - p, n - k) * |
|---|
| 265 | binomial_coeff(n, k)) |
|---|
| 266 | |
|---|
| 267 | assert valid_pmf(result) |
|---|
| 268 | return result; |
|---|
| 269 | |
|---|
| 270 | def binomial_coeff(n, k): |
|---|
| 271 | """ |
|---|
| 272 | Returns the number of ways that k items can be chosen from a set |
|---|
| 273 | of n. |
|---|
| 274 | """ |
|---|
| 275 | assert n >= k |
|---|
| 276 | |
|---|
| 277 | if k > n/2: |
|---|
| 278 | k = n - k |
|---|
| 279 | |
|---|
| 280 | accum = 1.0 |
|---|
| 281 | for i in range(1, k+1): |
|---|
| 282 | accum = accum * (n - k + i) // i; |
|---|
| 283 | |
|---|
| 284 | return int(accum + 0.5) |
|---|