| 1 | #! /usr/bin/python |
|---|
| 2 | |
|---|
| 3 | # used to discuss ticket #302: "stop permuting peerlist?" |
|---|
| 4 | |
|---|
| 5 | import time |
|---|
| 6 | import math |
|---|
| 7 | from hashlib import sha1, md5, sha256 |
|---|
| 8 | myhash = md5 |
|---|
| 9 | # md5: 1520 "uploads" per second |
|---|
| 10 | # sha1: 1350 ups |
|---|
| 11 | # sha256: 930 ups |
|---|
| 12 | from itertools import count |
|---|
| 13 | from twisted.python import usage |
|---|
| 14 | |
|---|
| 15 | def abbreviate_space(s, SI=True): |
|---|
| 16 | if s is None: |
|---|
| 17 | return "unknown" |
|---|
| 18 | if SI: |
|---|
| 19 | U = 1000.0 |
|---|
| 20 | isuffix = "B" |
|---|
| 21 | else: |
|---|
| 22 | U = 1024.0 |
|---|
| 23 | isuffix = "iB" |
|---|
| 24 | def r(count, suffix): |
|---|
| 25 | return "%.2f %s%s" % (count, suffix, isuffix) |
|---|
| 26 | |
|---|
| 27 | if s < 1024: # 1000-1023 get emitted as bytes, even in SI mode |
|---|
| 28 | return "%d B" % s |
|---|
| 29 | if s < U*U: |
|---|
| 30 | return r(s/U, "k") |
|---|
| 31 | if s < U*U*U: |
|---|
| 32 | return r(s/(U*U), "M") |
|---|
| 33 | if s < U*U*U*U: |
|---|
| 34 | return r(s/(U*U*U), "G") |
|---|
| 35 | if s < U*U*U*U*U: |
|---|
| 36 | return r(s/(U*U*U*U), "T") |
|---|
| 37 | return r(s/(U*U*U*U*U), "P") |
|---|
| 38 | |
|---|
| 39 | def make_up_a_file_size(seed): |
|---|
| 40 | h = int(myhash(seed).hexdigest(),16) |
|---|
| 41 | max=2**31 |
|---|
| 42 | if 1: # exponential distribution |
|---|
| 43 | e = 8 + (h % (31-8)) |
|---|
| 44 | return 2 ** e |
|---|
| 45 | # uniform distribution |
|---|
| 46 | return h % max # avg 1GB |
|---|
| 47 | |
|---|
| 48 | sizes = [make_up_a_file_size(str(i)) for i in range(10000)] |
|---|
| 49 | avg_filesize = sum(sizes)/len(sizes) |
|---|
| 50 | print "average file size:", abbreviate_space(avg_filesize) |
|---|
| 51 | |
|---|
| 52 | SERVER_CAPACITY = 10**12 |
|---|
| 53 | |
|---|
| 54 | class Server: |
|---|
| 55 | def __init__(self, nodeid, capacity): |
|---|
| 56 | self.nodeid = nodeid |
|---|
| 57 | self.used = 0 |
|---|
| 58 | self.capacity = capacity |
|---|
| 59 | self.numshares = 0 |
|---|
| 60 | self.full_at_tick = None |
|---|
| 61 | |
|---|
| 62 | def upload(self, sharesize): |
|---|
| 63 | if self.used + sharesize < self.capacity: |
|---|
| 64 | self.used += sharesize |
|---|
| 65 | self.numshares += 1 |
|---|
| 66 | return True |
|---|
| 67 | return False |
|---|
| 68 | |
|---|
| 69 | def __repr__(self): |
|---|
| 70 | if self.full_at_tick is not None: |
|---|
| 71 | return "<%s %s full at %d>" % (self.__class__.__name__, self.nodeid, self.full_at_tick) |
|---|
| 72 | else: |
|---|
| 73 | return "<%s %s>" % (self.__class__.__name__, self.nodeid) |
|---|
| 74 | |
|---|
| 75 | class Ring: |
|---|
| 76 | SHOW_MINMAX = False |
|---|
| 77 | def __init__(self, numservers, seed, permute): |
|---|
| 78 | self.servers = [] |
|---|
| 79 | for i in range(numservers): |
|---|
| 80 | nodeid = myhash(str(seed)+str(i)).hexdigest() |
|---|
| 81 | capacity = SERVER_CAPACITY |
|---|
| 82 | s = Server(nodeid, capacity) |
|---|
| 83 | self.servers.append(s) |
|---|
| 84 | self.servers.sort(key=lambda s: s.nodeid) |
|---|
| 85 | self.permute = permute |
|---|
| 86 | #self.list_servers() |
|---|
| 87 | |
|---|
| 88 | def list_servers(self): |
|---|
| 89 | for i in range(len(self.servers)): |
|---|
| 90 | s = self.servers[i] |
|---|
| 91 | next_s = self.servers[(i+1)%len(self.servers)] |
|---|
| 92 | diff = "%032x" % (int(next_s.nodeid,16) - int(s.nodeid,16)) |
|---|
| 93 | s.next_diff = diff |
|---|
| 94 | prev_s = self.servers[(i-1)%len(self.servers)] |
|---|
| 95 | diff = "%032x" % (int(s.nodeid,16) - int(prev_s.nodeid,16)) |
|---|
| 96 | s.prev_diff = diff |
|---|
| 97 | print s, s.prev_diff |
|---|
| 98 | |
|---|
| 99 | print "sorted by delta" |
|---|
| 100 | for s in sorted(self.servers, key=lambda s:s.prev_diff): |
|---|
| 101 | print s, s.prev_diff |
|---|
| 102 | |
|---|
| 103 | def servers_for_si(self, si): |
|---|
| 104 | if self.permute: |
|---|
| 105 | def sortkey(s): |
|---|
| 106 | return myhash(s.nodeid+si).digest() |
|---|
| 107 | return sorted(self.servers, key=sortkey) |
|---|
| 108 | for i in range(len(self.servers)): |
|---|
| 109 | if self.servers[i].nodeid >= si: |
|---|
| 110 | return self.servers[i:] + self.servers[:i] |
|---|
| 111 | return list(self.servers) |
|---|
| 112 | |
|---|
| 113 | def show_servers(self, picked): |
|---|
| 114 | bits = [] |
|---|
| 115 | for s in self.servers: |
|---|
| 116 | if s in picked: |
|---|
| 117 | bits.append("1") |
|---|
| 118 | else: |
|---|
| 119 | bits.append("0") |
|---|
| 120 | #d = [s in picked and "1" or "0" for s in self.servers] |
|---|
| 121 | return "".join(bits) |
|---|
| 122 | |
|---|
| 123 | def dump_usage(self, numfiles, avg_space_per_file): |
|---|
| 124 | print "uploaded", numfiles |
|---|
| 125 | # avg_space_per_file measures expected grid-wide ciphertext per file |
|---|
| 126 | used = list(reversed(sorted([s.used for s in self.servers]))) |
|---|
| 127 | # used is actual per-server ciphertext |
|---|
| 128 | usedpf = [1.0*u/numfiles for u in used] |
|---|
| 129 | # usedpf is actual per-server-per-file ciphertext |
|---|
| 130 | #print "min/max usage: %s/%s" % (abbreviate_space(used[-1]), |
|---|
| 131 | # abbreviate_space(used[0])) |
|---|
| 132 | avg_usage_per_file = avg_space_per_file/len(self.servers) |
|---|
| 133 | # avg_usage_per_file is expected per-server-per-file ciphertext |
|---|
| 134 | spreadpf = usedpf[0] - usedpf[-1] |
|---|
| 135 | average_usagepf = sum(usedpf) / len(usedpf) |
|---|
| 136 | variance = sum([(u-average_usagepf)**2 for u in usedpf])/(len(usedpf)-1) |
|---|
| 137 | std_deviation = math.sqrt(variance) |
|---|
| 138 | sd_of_total = std_deviation / avg_usage_per_file |
|---|
| 139 | |
|---|
| 140 | print "min/max/(exp) usage-pf-ps %s/%s/(%s):" % ( |
|---|
| 141 | abbreviate_space(usedpf[-1]), |
|---|
| 142 | abbreviate_space(usedpf[0]), |
|---|
| 143 | abbreviate_space(avg_usage_per_file) ), |
|---|
| 144 | print "spread-pf: %s (%.2f%%)" % ( |
|---|
| 145 | abbreviate_space(spreadpf), 100.0*spreadpf/avg_usage_per_file), |
|---|
| 146 | #print "average_usage:", abbreviate_space(average_usagepf) |
|---|
| 147 | print "stddev: %s (%.2f%%)" % (abbreviate_space(std_deviation), |
|---|
| 148 | 100.0*sd_of_total) |
|---|
| 149 | if self.SHOW_MINMAX: |
|---|
| 150 | s2 = sorted(self.servers, key=lambda s: s.used) |
|---|
| 151 | print "least:", s2[0].nodeid |
|---|
| 152 | print "most:", s2[-1].nodeid |
|---|
| 153 | |
|---|
| 154 | |
|---|
| 155 | class Options(usage.Options): |
|---|
| 156 | optParameters = [ |
|---|
| 157 | ("k", "k", 3, "required shares", int), |
|---|
| 158 | ("N", "N", 10, "total shares", int), |
|---|
| 159 | ("servers", None, 100, "number of servers", int), |
|---|
| 160 | ("seed", None, None, "seed to use for creating ring"), |
|---|
| 161 | ("fileseed", None, "blah", "seed to use for creating files"), |
|---|
| 162 | ("permute", "p", 1, "1 to permute, 0 to use flat ring", int), |
|---|
| 163 | ] |
|---|
| 164 | def postOptions(self): |
|---|
| 165 | assert self["seed"] |
|---|
| 166 | |
|---|
| 167 | |
|---|
| 168 | def do_run(ring, opts): |
|---|
| 169 | avg_space_per_file = avg_filesize * opts["N"] / opts["k"] |
|---|
| 170 | fileseed = opts["fileseed"] |
|---|
| 171 | start = time.time() |
|---|
| 172 | all_servers_have_room = True |
|---|
| 173 | no_files_have_wrapped = True |
|---|
| 174 | for filenum in count(0): |
|---|
| 175 | #used = list(reversed(sorted([s.used for s in ring.servers]))) |
|---|
| 176 | #used = [s.used for s in ring.servers] |
|---|
| 177 | #print used |
|---|
| 178 | si = myhash(fileseed+str(filenum)).hexdigest() |
|---|
| 179 | filesize = make_up_a_file_size(si) |
|---|
| 180 | sharesize = filesize / opts["k"] |
|---|
| 181 | if filenum%4000==0 and filenum > 1: |
|---|
| 182 | ring.dump_usage(filenum, avg_space_per_file) |
|---|
| 183 | servers = ring.servers_for_si(si) |
|---|
| 184 | #print ring.show_servers(servers[:opts["N"]]) |
|---|
| 185 | remaining_shares = opts["N"] |
|---|
| 186 | index = 0 |
|---|
| 187 | server_was_full = False |
|---|
| 188 | file_was_wrapped = False |
|---|
| 189 | remaining_servers = set(servers) |
|---|
| 190 | while remaining_shares: |
|---|
| 191 | if index >= len(servers): |
|---|
| 192 | index = 0 |
|---|
| 193 | file_was_wrapped = True |
|---|
| 194 | s = servers[index] |
|---|
| 195 | accepted = s.upload(sharesize) |
|---|
| 196 | if not accepted: |
|---|
| 197 | server_was_full = True |
|---|
| 198 | remaining_servers.discard(s) |
|---|
| 199 | if not remaining_servers: |
|---|
| 200 | print "-- GRID IS FULL" |
|---|
| 201 | ring.dump_usage(filenum, avg_space_per_file) |
|---|
| 202 | return filenum |
|---|
| 203 | index += 1 |
|---|
| 204 | continue |
|---|
| 205 | remaining_shares -= 1 |
|---|
| 206 | index += 1 |
|---|
| 207 | # file is done being uploaded |
|---|
| 208 | |
|---|
| 209 | if server_was_full and all_servers_have_room: |
|---|
| 210 | all_servers_have_room = False |
|---|
| 211 | print "-- FIRST SERVER FULL" |
|---|
| 212 | ring.dump_usage(filenum, avg_space_per_file) |
|---|
| 213 | if file_was_wrapped and no_files_have_wrapped: |
|---|
| 214 | no_files_have_wrapped = False |
|---|
| 215 | print "-- FIRST FILE WRAPPED" |
|---|
| 216 | ring.dump_usage(filenum, avg_space_per_file) |
|---|
| 217 | |
|---|
| 218 | |
|---|
| 219 | def do_ring(opts): |
|---|
| 220 | total_capacity = opts["servers"]*SERVER_CAPACITY |
|---|
| 221 | avg_space_per_file = avg_filesize * opts["N"] / opts["k"] |
|---|
| 222 | avg_files = total_capacity / avg_space_per_file |
|---|
| 223 | print "expected number of uploads:", avg_files |
|---|
| 224 | if opts["permute"]: |
|---|
| 225 | print " PERMUTED" |
|---|
| 226 | else: |
|---|
| 227 | print " LINEAR" |
|---|
| 228 | seed = opts["seed"] |
|---|
| 229 | |
|---|
| 230 | ring = Ring(opts["servers"], seed, opts["permute"]) |
|---|
| 231 | num_files = do_run(ring, opts) |
|---|
| 232 | |
|---|
| 233 | def run(opts): |
|---|
| 234 | do_ring(opts) |
|---|
| 235 | |
|---|
| 236 | if __name__ == "__main__": |
|---|
| 237 | opts = Options() |
|---|
| 238 | opts.parseOptions() |
|---|
| 239 | run(opts) |
|---|