Context Navigation

source: trunk/src/allmydata/util/statistics.py

Visit:

Last change on this file was 1cfe843d, checked in by Alexandre Detiste <alexandre.detiste@…>, at 2024-02-22T23:40:25Z
more python2 removal
Property mode set to `100644`
File size: 8.7 KB

Line
1	"""
2	Statistical utilities.
3
4	Ported to Python 3.
5	"""
6	# Copyright (c) 2009 Shawn Willden
7	# mailto:shawn@willden.org
8	# I hereby license all patches I have contributed or will contribute to the
9	# Allmydata Tahoe-LAFS project, including the file 'statistics.py', under
10	# either the GNU General Public License, version 2 or later, or under the
11	# Transitive Grace Period Public License, version 1 or later.
12
13
14	from allmydata.util.mathutil import round_sigfigs
15	import math
16	from functools import reduce
17	import sys
18
19	def pr_file_loss(p_list, k):
20	"""
21	Probability of single-file loss for shares with reliabilities in
22	p_list.
23
24	Computes the probability that a single file will become
25	unrecoverable, based on the individual share survival
26	probabilities and and k (number of shares needed for recovery).
27
28	Example: pr_file_loss([.9] * 5 + [.99] * 5, 3) returns the
29	probability that a file with k=3, N=10 and stored on five servers
30	with reliability .9 and five servers with reliability .99 is lost.
31
32	See survival_pmf docstring for important statistical assumptions.
33
34	"""
35	assert 0 < k <= len(p_list)
36	assert valid_probability_list(p_list)
37
38	# Sum elements 0 through k-1 of the share set PMF to get the
39	# probability that less than k shares survived.
40	return sum(survival_pmf(p_list)[0:k])
41
42	def survival_pmf(p_list):
43	"""
44	Return the collective PMF of share survival count for a set of
45	shares with the individual survival probabilities in p_list.
46
47	Example: survival_pmf([.99] * 10 + [.8] * 6) returns the
48	probability mass function for the number of shares that will
49	survive from an initial set of 16, 10 with p=0.99 and 6 with
50	p=0.8. The ith element of the resulting list is the probability
51	that exactly i shares will survive.
52
53	This calculation makes the following assumptions:
54
55	1. p_list[i] is the probability that any individual share will
56	will survive during the time period in question (whatever that may
57	be).
58
59	2. The share failures are "independent", in the statistical
60	sense. Note that if a group of shares are stored on the same
61	machine or even in the same data center, they are NOT independent
62	and this calculation is therefore wrong.
63	"""
64	assert valid_probability_list(p_list)
65
66	pmf = survival_pmf_via_conv(p_list)
67
68	assert valid_pmf(pmf)
69	return pmf
70
71	def survival_pmf_via_bd(p_list):
72	"""
73	Compute share survival PMF using the binomial distribution PMF as
74	much as possible.
75
76	This is more efficient than the convolution method below, but
77	doesn't work for large numbers of shares because the
78	binomial_coeff calculation blows up. Since the efficiency gains
79	only matter in the case of large numbers of shares, it's pretty
80	much useless except for testing the convolution methond.
81
82	Note that this function does little to no error checking and is
83	intended for internal use and testing only.
84	"""
85	pmf_list = [ binomial_distribution_pmf(p_list.count(p), p)
86	for p in set(p_list) ]
87	return list(reduce(convolve, pmf_list))
88
89	def survival_pmf_via_conv(p_list):
90	"""
91	Compute share survival PMF using iterated convolution of trivial
92	PMFs.
93
94	Note that this function does little to no error checking and is
95	intended for internal use and testing only.
96	"""
97	pmf_list = [ [1 - p, p] for p in p_list ];
98	return list(reduce(convolve, pmf_list))
99
100	def print_pmf(pmf, n=4, out=sys.stdout):
101	"""
102	Print a PMF in a readable form, with values rounded to n
103	significant digits.
104	"""
105	for k, p in enumerate(pmf):
106	print("i=" + str(k) + ":", round_sigfigs(p, n), file=out)
107
108	def pr_backup_file_loss(p_list, backup_p, k):
109	"""
110	Probability of single-file loss in a backup context
111
112	Same as pr_file_loss, except it factors in the probability of
113	survival of the original source, specified as backup_p. Because
114	that's a precondition to caring about the availability of the
115	backup, it's an independent event.
116	"""
117	assert valid_probability_list(p_list)
118	assert 0 < backup_p <= 1
119	assert 0 < k <= len(p_list)
120
121	return pr_file_loss(p_list, k) * (1 - backup_p)
122
123
124	def find_k(p_list, target_loss_prob):
125	"""
126	Find the highest k value that achieves the targeted loss
127	probability, given the share reliabilities given in p_list.
128	"""
129	assert valid_probability_list(p_list)
130	assert 0 < target_loss_prob < 1
131
132	pmf = survival_pmf(p_list)
133	return find_k_from_pmf(pmf, target_loss_prob)
134
135	def find_k_from_pmf(pmf, target_loss_prob):
136	"""
137	Find the highest k value that achieves the targeted loss
138	probability, given the share survival PMF given in pmf.
139	"""
140	assert valid_pmf(pmf)
141	assert 0 < target_loss_prob < 1
142
143	loss_prob = 0.0
144	for k, p_k in enumerate(pmf):
145	loss_prob += p_k
146	if loss_prob > target_loss_prob:
147	return k
148
149	# we shouldn't be able to get here, since sum(pmf)==1.0
150	k = len(pmf) - 1
151	return k
152
153	def repair_count_pmf(survival_pmf, k):
154	"""
155	Return Pr[D=d], where D represents the number of shares that have
156	to be repaired at the end of an interval, starting with a full
157	set and subject to losses described in survival_pmf.
158	"""
159	n = len(survival_pmf) - 1
160
161	# Probability of 0 to repair is the probability of all shares
162	# surviving plus the probability of less than k surviving.
163	pmf = [ survival_pmf[n] + sum(survival_pmf[0:k]) ]
164
165	# Probability of more than 0, up to N-k to repair
166	for i in range(1, n-k+1):
167	pmf.append(survival_pmf[n-i])
168
169	# Probability of more than N-k to repair is 0, because that means
170	# there are less than k available and the file is irreparable.
171	for i in range(n-k+1, n+1):
172	pmf.append(0.0)
173
174	assert(valid_pmf(pmf))
175	return pmf
176
177	def bandwidth_cost_function(file_size, shares, k, ul_dl_ratio):
178	return file_size + float(file_size) / k * shares * ul_dl_ratio
179
180	def mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio):
181	"""
182	Return the expected cost for a repair run on a file with the given
183	survival_pmf and requiring k shares, in which upload cost is
184	'ul_dl_ratio' times download cost.
185	"""
186	repair_pmf = repair_count_pmf(survival_pmf, k)
187	expected_cost = sum([cost_function(file_size, new_shares, k, ul_dl_ratio)
188	* repair_pmf[new_shares]
189	for new_shares in range(1, len(repair_pmf))])
190	return expected_cost
191
192	def eternal_repair_cost(cost_function, file_size, survival_pmf, k,
193	discount_rate=0, ul_dl_ratio=1.0):
194	"""
195	Calculate the eternal repair cost for a file that is aggressively
196	repaired, i.e. the sum of repair costs until the file is dead.
197	"""
198	c = mean_repair_cost(cost_function, file_size, survival_pmf, k, ul_dl_ratio)
199	f = 1 - sum(survival_pmf[0:k])
200	r = float(discount_rate)
201
202	return (c * (1-r)) / (1 - (1-r) * f)
203
204	def valid_pmf(pmf):
205	"""
206	Validate that pmf looks like a proper discrete probability mass
207	function in list form.
208
209	Returns true if the elements of pmf sum to 1.
210	"""
211	return round(sum(pmf),5) == 1.0
212
213	def valid_probability_list(p_list):
214	"""
215	Validate that p_list is a list of probibilities
216	"""
217	for p in p_list:
218	if p < 0 or p > 1:
219	return False
220
221	return True
222
223	def convolve(list_a, list_b):
224	"""
225	Returns the discrete convolution of two lists.
226
227	Given two random variables X and Y, the convolution of their
228	probability mass functions Pr(X) and Pr(Y) is equal to the
229	Pr(X+Y).
230	"""
231	n = len(list_a)
232	m = len(list_b)
233
234	result = []
235	for i in range(n + m - 1):
236	sum = 0.0
237
238	lower = max(0, i - n + 1)
239	upper = min(m - 1, i)
240
241	for j in range(lower, upper+1):
242	sum += list_a[i-j] * list_b[j]
243
244	result.append(sum)
245
246	return result
247
248	def binomial_distribution_pmf(n, p):
249	"""
250	Returns Pr(K), where K ~ B(n,p), as a list of values.
251
252	Returns the full probability mass function of a B(n, p) as a list
253	of values, where the kth element is Pr(K=k), or, in the Tahoe
254	context, the probability that exactly k copies of a file share
255	survive, when placed on n independent servers with survival
256	probability p.
257	"""
258	assert p >= 0 and p <= 1, 'p=%s must be in the range [0,1]'%p
259	assert n > 0
260
261	result = []
262	for k in range(n+1):
263	result.append(math.pow(p , k ) *
264	math.pow(1 - p, n - k) *
265	binomial_coeff(n, k))
266
267	assert valid_pmf(result)
268	return result;
269
270	def binomial_coeff(n, k):
271	"""
272	Returns the number of ways that k items can be chosen from a set
273	of n.
274	"""
275	assert n >= k
276
277	if k > n/2:
278	k = n - k
279
280	accum = 1.0
281	for i in range(1, k+1):
282	accum = accum * (n - k + i) // i;
283
284	return int(accum + 0.5)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: