| 1 | |
|---|
| 2 | |
|---|
| 3 | class Spans: |
|---|
| 4 | """I represent a compressed list of booleans, one per index (an integer). |
|---|
| 5 | Typically, each index represents an offset into a large string, pointing |
|---|
| 6 | to a specific byte of a share. In this context, True means that byte has |
|---|
| 7 | been received, or has been requested. |
|---|
| 8 | |
|---|
| 9 | Another way to look at this is maintaining a set of integers, optimized |
|---|
| 10 | for operations on spans like 'add range to set' and 'is range in set?'. |
|---|
| 11 | |
|---|
| 12 | This is a python equivalent of perl's Set::IntSpan module, frequently |
|---|
| 13 | used to represent .newsrc contents. |
|---|
| 14 | |
|---|
| 15 | Rather than storing an actual (large) list or dictionary, I represent my |
|---|
| 16 | internal state as a sorted list of spans, each with a start and a length. |
|---|
| 17 | My API is presented in terms of start+length pairs. I provide set |
|---|
| 18 | arithmetic operators, to efficiently answer questions like 'I want bytes |
|---|
| 19 | XYZ, I already requested bytes ABC, and I've already received bytes DEF: |
|---|
| 20 | what bytes should I request now?'. |
|---|
| 21 | |
|---|
| 22 | The new downloader will use it to keep track of which bytes we've requested |
|---|
| 23 | or received already. |
|---|
| 24 | """ |
|---|
| 25 | |
|---|
| 26 | def __init__(self, _span_or_start=None, length=None): |
|---|
| 27 | self._spans = list() |
|---|
| 28 | if length is not None: |
|---|
| 29 | self._spans.append( (_span_or_start, length) ) |
|---|
| 30 | elif _span_or_start: |
|---|
| 31 | for (start,length) in _span_or_start: |
|---|
| 32 | self.add(start, length) |
|---|
| 33 | self._check() |
|---|
| 34 | |
|---|
| 35 | def _check(self): |
|---|
| 36 | assert sorted(self._spans) == self._spans |
|---|
| 37 | prev_end = None |
|---|
| 38 | try: |
|---|
| 39 | for (start,length) in self._spans: |
|---|
| 40 | if prev_end is not None: |
|---|
| 41 | assert start > prev_end |
|---|
| 42 | prev_end = start+length |
|---|
| 43 | except AssertionError: |
|---|
| 44 | print("BAD:", self.dump()) |
|---|
| 45 | raise |
|---|
| 46 | |
|---|
| 47 | def add(self, start, length): |
|---|
| 48 | assert start >= 0 |
|---|
| 49 | assert length > 0 |
|---|
| 50 | #print(" ADD [%d+%d -%d) to %s" % (start, length, start+length, self.dump())) |
|---|
| 51 | first_overlap = last_overlap = None |
|---|
| 52 | for i,(s_start,s_length) in enumerate(self._spans): |
|---|
| 53 | #print(" (%d+%d)-> overlap=%s adjacent=%s" % (s_start,s_length, overlap(s_start, s_length, start, length), adjacent(s_start, s_length, start, length))) |
|---|
| 54 | if (overlap(s_start, s_length, start, length) |
|---|
| 55 | or adjacent(s_start, s_length, start, length)): |
|---|
| 56 | last_overlap = i |
|---|
| 57 | if first_overlap is None: |
|---|
| 58 | first_overlap = i |
|---|
| 59 | continue |
|---|
| 60 | # no overlap |
|---|
| 61 | if first_overlap is not None: |
|---|
| 62 | break |
|---|
| 63 | #print(" first_overlap", first_overlap, last_overlap) |
|---|
| 64 | if first_overlap is None: |
|---|
| 65 | # no overlap, so just insert the span and sort by starting |
|---|
| 66 | # position. |
|---|
| 67 | self._spans.insert(0, (start,length)) |
|---|
| 68 | self._spans.sort() |
|---|
| 69 | else: |
|---|
| 70 | # everything from [first_overlap] to [last_overlap] overlapped |
|---|
| 71 | first_start,first_length = self._spans[first_overlap] |
|---|
| 72 | last_start,last_length = self._spans[last_overlap] |
|---|
| 73 | newspan_start = min(start, first_start) |
|---|
| 74 | newspan_end = max(start+length, last_start+last_length) |
|---|
| 75 | newspan_length = newspan_end - newspan_start |
|---|
| 76 | newspan = (newspan_start, newspan_length) |
|---|
| 77 | self._spans[first_overlap:last_overlap+1] = [newspan] |
|---|
| 78 | #print(" ADD done: %s" % self.dump()) |
|---|
| 79 | self._check() |
|---|
| 80 | |
|---|
| 81 | return self |
|---|
| 82 | |
|---|
| 83 | def remove(self, start, length): |
|---|
| 84 | assert start >= 0 |
|---|
| 85 | assert length > 0 |
|---|
| 86 | #print(" REMOVE [%d+%d -%d) from %s" % (start, length, start+length, self.dump())) |
|---|
| 87 | first_complete_overlap = last_complete_overlap = None |
|---|
| 88 | for i,(s_start,s_length) in enumerate(self._spans): |
|---|
| 89 | s_end = s_start + s_length |
|---|
| 90 | o = overlap(s_start, s_length, start, length) |
|---|
| 91 | if o: |
|---|
| 92 | o_start, o_length = o |
|---|
| 93 | o_end = o_start+o_length |
|---|
| 94 | if o_start == s_start and o_end == s_end: |
|---|
| 95 | # delete this span altogether |
|---|
| 96 | if first_complete_overlap is None: |
|---|
| 97 | first_complete_overlap = i |
|---|
| 98 | last_complete_overlap = i |
|---|
| 99 | elif o_start == s_start: |
|---|
| 100 | # we only overlap the left side, so trim the start |
|---|
| 101 | # 1111 |
|---|
| 102 | # rrrr |
|---|
| 103 | # oo |
|---|
| 104 | # -> 11 |
|---|
| 105 | new_start = o_end |
|---|
| 106 | new_end = s_end |
|---|
| 107 | assert new_start > s_start |
|---|
| 108 | new_length = new_end - new_start |
|---|
| 109 | self._spans[i] = (new_start, new_length) |
|---|
| 110 | elif o_end == s_end: |
|---|
| 111 | # we only overlap the right side |
|---|
| 112 | # 1111 |
|---|
| 113 | # rrrr |
|---|
| 114 | # oo |
|---|
| 115 | # -> 11 |
|---|
| 116 | new_start = s_start |
|---|
| 117 | new_end = o_start |
|---|
| 118 | assert new_end < s_end |
|---|
| 119 | new_length = new_end - new_start |
|---|
| 120 | self._spans[i] = (new_start, new_length) |
|---|
| 121 | else: |
|---|
| 122 | # we overlap the middle, so create a new span. No need to |
|---|
| 123 | # examine any other spans. |
|---|
| 124 | # 111111 |
|---|
| 125 | # rr |
|---|
| 126 | # LL RR |
|---|
| 127 | left_start = s_start |
|---|
| 128 | left_end = o_start |
|---|
| 129 | left_length = left_end - left_start |
|---|
| 130 | right_start = o_end |
|---|
| 131 | right_end = s_end |
|---|
| 132 | right_length = right_end - right_start |
|---|
| 133 | self._spans[i] = (left_start, left_length) |
|---|
| 134 | self._spans.append( (right_start, right_length) ) |
|---|
| 135 | self._spans.sort() |
|---|
| 136 | break |
|---|
| 137 | if first_complete_overlap is not None: |
|---|
| 138 | del self._spans[first_complete_overlap:last_complete_overlap+1] |
|---|
| 139 | #print(" REMOVE done: %s" % self.dump()) |
|---|
| 140 | self._check() |
|---|
| 141 | return self |
|---|
| 142 | |
|---|
| 143 | def dump(self): |
|---|
| 144 | return "len=%d: %s" % (self.len(), |
|---|
| 145 | ",".join(["[%d-%d]" % (start,start+l-1) |
|---|
| 146 | for (start,l) in self._spans]) ) |
|---|
| 147 | |
|---|
| 148 | def each(self): |
|---|
| 149 | for start, length in self._spans: |
|---|
| 150 | for i in range(start, start+length): |
|---|
| 151 | yield i |
|---|
| 152 | |
|---|
| 153 | def __iter__(self): |
|---|
| 154 | for s in self._spans: |
|---|
| 155 | yield s |
|---|
| 156 | |
|---|
| 157 | def __bool__(self): # this gets us bool() |
|---|
| 158 | return bool(self.len()) |
|---|
| 159 | |
|---|
| 160 | #__nonzero__ = __bool__ # Python 2 backwards compatibility |
|---|
| 161 | |
|---|
| 162 | def len(self): |
|---|
| 163 | # guess what! python doesn't allow __len__ to return a long, only an |
|---|
| 164 | # int. So we stop using len(spans), use spans.len() instead. |
|---|
| 165 | return sum([length for start,length in self._spans]) |
|---|
| 166 | |
|---|
| 167 | def __add__(self, other): |
|---|
| 168 | s = self.__class__(self) |
|---|
| 169 | for (start, length) in other: |
|---|
| 170 | s.add(start, length) |
|---|
| 171 | return s |
|---|
| 172 | |
|---|
| 173 | def __sub__(self, other): |
|---|
| 174 | s = self.__class__(self) |
|---|
| 175 | for (start, length) in other: |
|---|
| 176 | s.remove(start, length) |
|---|
| 177 | return s |
|---|
| 178 | |
|---|
| 179 | def __iadd__(self, other): |
|---|
| 180 | for (start, length) in other: |
|---|
| 181 | self.add(start, length) |
|---|
| 182 | return self |
|---|
| 183 | |
|---|
| 184 | def __isub__(self, other): |
|---|
| 185 | for (start, length) in other: |
|---|
| 186 | self.remove(start, length) |
|---|
| 187 | return self |
|---|
| 188 | |
|---|
| 189 | def __and__(self, other): |
|---|
| 190 | if not self._spans: |
|---|
| 191 | return self.__class__() |
|---|
| 192 | bounds = self.__class__(self._spans[0][0], |
|---|
| 193 | self._spans[-1][0]+self._spans[-1][1]) |
|---|
| 194 | not_other = bounds - other |
|---|
| 195 | return self - not_other |
|---|
| 196 | |
|---|
| 197 | def __contains__(self, start_and_length): |
|---|
| 198 | (start, length) = start_and_length |
|---|
| 199 | for span_start,span_length in self._spans: |
|---|
| 200 | o = overlap(start, length, span_start, span_length) |
|---|
| 201 | if o: |
|---|
| 202 | o_start,o_length = o |
|---|
| 203 | if o_start == start and o_length == length: |
|---|
| 204 | return True |
|---|
| 205 | return False |
|---|
| 206 | |
|---|
| 207 | def overlap(start0, length0, start1, length1): |
|---|
| 208 | # return start2,length2 of the overlapping region, or None |
|---|
| 209 | # 00 00 000 0000 00 00 000 00 00 00 00 |
|---|
| 210 | # 11 11 11 11 111 11 11 1111 111 11 11 |
|---|
| 211 | left = max(start0, start1) |
|---|
| 212 | right = min(start0+length0, start1+length1) |
|---|
| 213 | # if there is overlap, 'left' will be its start, and right-1 will |
|---|
| 214 | # be the end' |
|---|
| 215 | if left < right: |
|---|
| 216 | return (left, right-left) |
|---|
| 217 | return None |
|---|
| 218 | |
|---|
| 219 | def adjacent(start0, length0, start1, length1): |
|---|
| 220 | if (start0 < start1) and start0+length0 == start1: |
|---|
| 221 | return True |
|---|
| 222 | elif (start1 < start0) and start1+length1 == start0: |
|---|
| 223 | return True |
|---|
| 224 | return False |
|---|
| 225 | |
|---|
| 226 | class DataSpans: |
|---|
| 227 | """I represent portions of a large string. Equivalently, I can be said to |
|---|
| 228 | maintain a large array of characters (with gaps of empty elements). I can |
|---|
| 229 | be used to manage access to a remote share, where some pieces have been |
|---|
| 230 | retrieved, some have been requested, and others have not been read. |
|---|
| 231 | """ |
|---|
| 232 | |
|---|
| 233 | def __init__(self, other=None): |
|---|
| 234 | self.spans = [] # (start, data) tuples, non-overlapping, merged |
|---|
| 235 | if other: |
|---|
| 236 | for (start, data) in other.get_chunks(): |
|---|
| 237 | self.add(start, data) |
|---|
| 238 | |
|---|
| 239 | def __bool__(self): # this gets us bool() |
|---|
| 240 | return bool(self.len()) |
|---|
| 241 | |
|---|
| 242 | def len(self): |
|---|
| 243 | # return number of bytes we're holding |
|---|
| 244 | return sum([len(data) for (start,data) in self.spans]) |
|---|
| 245 | |
|---|
| 246 | def _dump(self): |
|---|
| 247 | # return iterator of sorted list of offsets, one per byte |
|---|
| 248 | for (start,data) in self.spans: |
|---|
| 249 | for i in range(start, start+len(data)): |
|---|
| 250 | yield i |
|---|
| 251 | |
|---|
| 252 | def dump(self): |
|---|
| 253 | return "len=%d: %s" % (self.len(), |
|---|
| 254 | ",".join(["[%d-%d]" % (start,start+len(data)-1) |
|---|
| 255 | for (start,data) in self.spans]) ) |
|---|
| 256 | |
|---|
| 257 | def get_chunks(self): |
|---|
| 258 | return list(self.spans) |
|---|
| 259 | |
|---|
| 260 | def get_spans(self): |
|---|
| 261 | """Return a Spans object with a bit set for each byte I hold""" |
|---|
| 262 | return Spans([(start, len(data)) for (start,data) in self.spans]) |
|---|
| 263 | |
|---|
| 264 | def assert_invariants(self): |
|---|
| 265 | if not self.spans: |
|---|
| 266 | return |
|---|
| 267 | prev_start = self.spans[0][0] |
|---|
| 268 | prev_end = prev_start + len(self.spans[0][1]) |
|---|
| 269 | for start, data in self.spans[1:]: |
|---|
| 270 | if not start > prev_end: |
|---|
| 271 | # adjacent or overlapping: bad |
|---|
| 272 | print("ASSERTION FAILED", self.spans) |
|---|
| 273 | raise AssertionError |
|---|
| 274 | |
|---|
| 275 | def get(self, start, length): |
|---|
| 276 | # returns a string of LENGTH, or None |
|---|
| 277 | #print("get", start, length, self.spans) |
|---|
| 278 | end = start+length |
|---|
| 279 | for (s_start,s_data) in self.spans: |
|---|
| 280 | s_end = s_start+len(s_data) |
|---|
| 281 | #print(" ",s_start,s_end) |
|---|
| 282 | if s_start <= start < s_end: |
|---|
| 283 | # we want some data from this span. Because we maintain |
|---|
| 284 | # strictly merged and non-overlapping spans, everything we |
|---|
| 285 | # want must be in this span. |
|---|
| 286 | offset = start - s_start |
|---|
| 287 | if offset + length > len(s_data): |
|---|
| 288 | #print(" None, span falls short") |
|---|
| 289 | return None # span falls short |
|---|
| 290 | #print(" some", s_data[offset:offset+length]) |
|---|
| 291 | return s_data[offset:offset+length] |
|---|
| 292 | if s_start >= end: |
|---|
| 293 | # we've gone too far: no further spans will overlap |
|---|
| 294 | #print(" None, gone too far") |
|---|
| 295 | return None |
|---|
| 296 | #print(" None, ran out of spans") |
|---|
| 297 | return None |
|---|
| 298 | |
|---|
| 299 | def add(self, start, data): |
|---|
| 300 | # first: walk through existing spans, find overlap, modify-in-place |
|---|
| 301 | # create list of new spans |
|---|
| 302 | # add new spans |
|---|
| 303 | # sort |
|---|
| 304 | # merge adjacent spans |
|---|
| 305 | #print("add", start, data, self.spans) |
|---|
| 306 | end = start + len(data) |
|---|
| 307 | i = 0 |
|---|
| 308 | while len(data): |
|---|
| 309 | #print(" loop", start, data, i, len(self.spans), self.spans) |
|---|
| 310 | if i >= len(self.spans): |
|---|
| 311 | #print(" append and done") |
|---|
| 312 | # append a last span |
|---|
| 313 | self.spans.append( (start, data) ) |
|---|
| 314 | break |
|---|
| 315 | (s_start,s_data) = self.spans[i] |
|---|
| 316 | # five basic cases: |
|---|
| 317 | # a: OLD b:OLDD c1:OLD c2:OLD d1:OLDD d2:OLD e: OLLDD |
|---|
| 318 | # NEW NEW NEW NEWW NEW NEW NEW |
|---|
| 319 | # |
|---|
| 320 | # we handle A by inserting a new segment (with "N") and looping, |
|---|
| 321 | # turning it into B or C. We handle B by replacing a prefix and |
|---|
| 322 | # terminating. We handle C (both c1 and c2) by replacing the |
|---|
| 323 | # segment (and, for c2, looping, turning it into A). We handle D |
|---|
| 324 | # by replacing a suffix (and, for d2, looping, turning it into |
|---|
| 325 | # A). We handle E by replacing the middle and terminating. |
|---|
| 326 | if start < s_start: |
|---|
| 327 | # case A: insert a new span, then loop with the remainder |
|---|
| 328 | #print(" insert new span") |
|---|
| 329 | s_len = s_start-start |
|---|
| 330 | self.spans.insert(i, (start, data[:s_len])) |
|---|
| 331 | i += 1 |
|---|
| 332 | start = s_start |
|---|
| 333 | data = data[s_len:] |
|---|
| 334 | continue |
|---|
| 335 | s_len = len(s_data) |
|---|
| 336 | s_end = s_start+s_len |
|---|
| 337 | if s_start <= start < s_end: |
|---|
| 338 | #print(" modify this span", s_start, start, s_end) |
|---|
| 339 | # we want to modify some data in this span: a prefix, a |
|---|
| 340 | # suffix, or the whole thing |
|---|
| 341 | if s_start == start: |
|---|
| 342 | if s_end <= end: |
|---|
| 343 | #print(" replace whole segment") |
|---|
| 344 | # case C: replace this segment |
|---|
| 345 | self.spans[i] = (s_start, data[:s_len]) |
|---|
| 346 | i += 1 |
|---|
| 347 | start += s_len |
|---|
| 348 | data = data[s_len:] |
|---|
| 349 | # C2 is where len(data)>0 |
|---|
| 350 | continue |
|---|
| 351 | # case B: modify the prefix, retain the suffix |
|---|
| 352 | #print(" modify prefix") |
|---|
| 353 | self.spans[i] = (s_start, data + s_data[len(data):]) |
|---|
| 354 | break |
|---|
| 355 | if start > s_start and end < s_end: |
|---|
| 356 | # case E: modify the middle |
|---|
| 357 | #print(" modify middle") |
|---|
| 358 | prefix_len = start - s_start # we retain this much |
|---|
| 359 | suffix_len = s_end - end # and retain this much |
|---|
| 360 | newdata = s_data[:prefix_len] + data + s_data[-suffix_len:] |
|---|
| 361 | self.spans[i] = (s_start, newdata) |
|---|
| 362 | break |
|---|
| 363 | # case D: retain the prefix, modify the suffix |
|---|
| 364 | #print(" modify suffix") |
|---|
| 365 | prefix_len = start - s_start # we retain this much |
|---|
| 366 | suffix_len = s_len - prefix_len # we replace this much |
|---|
| 367 | #print(" ", s_data, prefix_len, suffix_len, s_len, data) |
|---|
| 368 | self.spans[i] = (s_start, |
|---|
| 369 | s_data[:prefix_len] + data[:suffix_len]) |
|---|
| 370 | i += 1 |
|---|
| 371 | start += suffix_len |
|---|
| 372 | data = data[suffix_len:] |
|---|
| 373 | #print(" now", start, data) |
|---|
| 374 | # D2 is where len(data)>0 |
|---|
| 375 | continue |
|---|
| 376 | # else we're not there yet |
|---|
| 377 | #print(" still looking") |
|---|
| 378 | i += 1 |
|---|
| 379 | continue |
|---|
| 380 | # now merge adjacent spans |
|---|
| 381 | #print(" merging", self.spans) |
|---|
| 382 | newspans = [] |
|---|
| 383 | for (s_start,s_data) in self.spans: |
|---|
| 384 | if newspans and adjacent(newspans[-1][0], len(newspans[-1][1]), |
|---|
| 385 | s_start, len(s_data)): |
|---|
| 386 | newspans[-1] = (newspans[-1][0], newspans[-1][1] + s_data) |
|---|
| 387 | else: |
|---|
| 388 | newspans.append( (s_start, s_data) ) |
|---|
| 389 | self.spans = newspans |
|---|
| 390 | self.assert_invariants() |
|---|
| 391 | #print(" done", self.spans) |
|---|
| 392 | |
|---|
| 393 | def remove(self, start, length): |
|---|
| 394 | i = 0 |
|---|
| 395 | end = start + length |
|---|
| 396 | #print("remove", start, length, self.spans) |
|---|
| 397 | while i < len(self.spans): |
|---|
| 398 | (s_start,s_data) = self.spans[i] |
|---|
| 399 | if s_start >= end: |
|---|
| 400 | # this segment is entirely right of the removed region, and |
|---|
| 401 | # all further segments are even further right. We're done. |
|---|
| 402 | break |
|---|
| 403 | s_len = len(s_data) |
|---|
| 404 | s_end = s_start + s_len |
|---|
| 405 | o = overlap(start, length, s_start, s_len) |
|---|
| 406 | if not o: |
|---|
| 407 | i += 1 |
|---|
| 408 | continue |
|---|
| 409 | o_start, o_len = o |
|---|
| 410 | o_end = o_start + o_len |
|---|
| 411 | if o_len == s_len: |
|---|
| 412 | # remove the whole segment |
|---|
| 413 | del self.spans[i] |
|---|
| 414 | continue |
|---|
| 415 | if o_start == s_start: |
|---|
| 416 | # remove a prefix, leaving the suffix from o_end to s_end |
|---|
| 417 | prefix_len = o_end - o_start |
|---|
| 418 | self.spans[i] = (o_end, s_data[prefix_len:]) |
|---|
| 419 | i += 1 |
|---|
| 420 | continue |
|---|
| 421 | elif o_end == s_end: |
|---|
| 422 | # remove a suffix, leaving the prefix from s_start to o_start |
|---|
| 423 | prefix_len = o_start - s_start |
|---|
| 424 | self.spans[i] = (s_start, s_data[:prefix_len]) |
|---|
| 425 | i += 1 |
|---|
| 426 | continue |
|---|
| 427 | # remove the middle, creating a new segment |
|---|
| 428 | # left is s_start:o_start, right is o_end:s_end |
|---|
| 429 | left_len = o_start - s_start |
|---|
| 430 | left = s_data[:left_len] |
|---|
| 431 | right_len = s_end - o_end |
|---|
| 432 | right = s_data[-right_len:] |
|---|
| 433 | self.spans[i] = (s_start, left) |
|---|
| 434 | self.spans.insert(i+1, (o_end, right)) |
|---|
| 435 | break |
|---|
| 436 | #print(" done", self.spans) |
|---|
| 437 | |
|---|
| 438 | def pop(self, start, length): |
|---|
| 439 | data = self.get(start, length) |
|---|
| 440 | if data: |
|---|
| 441 | self.remove(start, length) |
|---|
| 442 | return data |
|---|