| 1 | """ |
|---|
| 2 | Base62 encoding. |
|---|
| 3 | |
|---|
| 4 | Ported to Python 3. |
|---|
| 5 | """ |
|---|
| 6 | |
|---|
| 7 | maketrans = bytes.maketrans |
|---|
| 8 | translate = bytes.translate |
|---|
| 9 | |
|---|
| 10 | from allmydata.util.mathutil import log_ceil, log_floor |
|---|
| 11 | |
|---|
| 12 | chars = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" |
|---|
| 13 | |
|---|
| 14 | BASE62CHAR = b'[' + chars + b']' |
|---|
| 15 | |
|---|
| 16 | def byteschr(x): |
|---|
| 17 | return bytes([x]) |
|---|
| 18 | |
|---|
| 19 | vals = b''.join([byteschr(i) for i in range(62)]) |
|---|
| 20 | c2vtranstable = maketrans(chars, vals) |
|---|
| 21 | v2ctranstable = maketrans(vals, chars) |
|---|
| 22 | identitytranstable = maketrans(chars, chars) |
|---|
| 23 | |
|---|
| 24 | def b2a(os): |
|---|
| 25 | """ |
|---|
| 26 | @param os the data to be encoded (as bytes) |
|---|
| 27 | |
|---|
| 28 | @return the contents of os in base-62 encoded form, as bytes |
|---|
| 29 | """ |
|---|
| 30 | cs = b2a_l(os, len(os)*8) |
|---|
| 31 | assert num_octets_that_encode_to_this_many_chars(len(cs)) == len(os), "%s != %s, numchars: %s" % (num_octets_that_encode_to_this_many_chars(len(cs)), len(os), len(cs)) |
|---|
| 32 | return cs |
|---|
| 33 | |
|---|
| 34 | def b2a_l(os, lengthinbits): |
|---|
| 35 | """ |
|---|
| 36 | @param os the data to be encoded (as bytes) |
|---|
| 37 | @param lengthinbits the number of bits of data in os to be encoded |
|---|
| 38 | |
|---|
| 39 | b2a_l() will generate a base-62 encoded string big enough to encode |
|---|
| 40 | lengthinbits bits. So for example if os is 3 bytes long and lengthinbits is |
|---|
| 41 | 17, then b2a_l() will generate a 3-character- long base-62 encoded string |
|---|
| 42 | (since 3 chars is sufficient to encode more than 2^17 values). If os is 3 |
|---|
| 43 | bytes long and lengthinbits is 18 (or None), then b2a_l() will generate a |
|---|
| 44 | 4-character string (since 4 chars are required to hold 2^18 values). Note |
|---|
| 45 | that if os is 3 bytes long and lengthinbits is 17, the least significant 7 |
|---|
| 46 | bits of os are ignored. |
|---|
| 47 | |
|---|
| 48 | Warning: if you generate a base-62 encoded string with b2a_l(), and then someone else tries to |
|---|
| 49 | decode it by calling a2b() instead of a2b_l(), then they will (potentially) get a different |
|---|
| 50 | string than the one you encoded! So use b2a_l() only when you are sure that the encoding and |
|---|
| 51 | decoding sides know exactly which lengthinbits to use. If you do not have a way for the |
|---|
| 52 | encoder and the decoder to agree upon the lengthinbits, then it is best to use b2a() and |
|---|
| 53 | a2b(). The only drawback to using b2a() over b2a_l() is that when you have a number of |
|---|
| 54 | bits to encode that is not a multiple of 8, b2a() can sometimes generate a base-62 encoded |
|---|
| 55 | string that is one or two characters longer than necessary. |
|---|
| 56 | |
|---|
| 57 | @return the contents of os in base-62 encoded form, as bytes |
|---|
| 58 | """ |
|---|
| 59 | # We call bytes() again for Python 2, to ensure literals are using future's |
|---|
| 60 | # Python 3-compatible variant. |
|---|
| 61 | os = [o for o in reversed(bytes(os))] # treat os as big-endian -- and we want to process the least-significant o first |
|---|
| 62 | |
|---|
| 63 | value = 0 |
|---|
| 64 | numvalues = 1 # the number of possible values that value could be |
|---|
| 65 | for o in os: |
|---|
| 66 | o *= numvalues |
|---|
| 67 | value += o |
|---|
| 68 | numvalues *= 256 |
|---|
| 69 | |
|---|
| 70 | chars = [] |
|---|
| 71 | while numvalues > 0: |
|---|
| 72 | chars.append(value % 62) |
|---|
| 73 | value //= 62 |
|---|
| 74 | numvalues //= 62 |
|---|
| 75 | |
|---|
| 76 | return translate(bytes([c for c in reversed(chars)]), v2ctranstable) # make it big-endian |
|---|
| 77 | |
|---|
| 78 | def num_octets_that_encode_to_this_many_chars(numcs): |
|---|
| 79 | return log_floor(62**numcs, 256) |
|---|
| 80 | |
|---|
| 81 | def num_chars_that_this_many_octets_encode_to(numos): |
|---|
| 82 | return log_ceil(256**numos, 62) |
|---|
| 83 | |
|---|
| 84 | def a2b(cs): |
|---|
| 85 | """ |
|---|
| 86 | @param cs the base-62 encoded data (a string) |
|---|
| 87 | """ |
|---|
| 88 | return a2b_l(cs, num_octets_that_encode_to_this_many_chars(len(cs))*8) |
|---|
| 89 | |
|---|
| 90 | def a2b_l(cs, lengthinbits): |
|---|
| 91 | """ |
|---|
| 92 | @param lengthinbits the number of bits of data in encoded into cs |
|---|
| 93 | |
|---|
| 94 | a2b_l() will return a result just big enough to hold lengthinbits bits. So |
|---|
| 95 | for example if cs is 2 characters long (encoding between 5 and 12 bits worth |
|---|
| 96 | of data) and lengthinbits is 8, then a2b_l() will return a string of length |
|---|
| 97 | 1 (since 1 byte is sufficient to store 8 bits), but if lengthinbits is 9, |
|---|
| 98 | then a2b_l() will return a string of length 2. |
|---|
| 99 | |
|---|
| 100 | Please see the warning in the docstring of b2a_l() regarding the use of |
|---|
| 101 | b2a() versus b2a_l(). |
|---|
| 102 | |
|---|
| 103 | @return the data encoded in cs, as bytes |
|---|
| 104 | """ |
|---|
| 105 | # We call bytes() again for Python 2, to ensure literals are using future's |
|---|
| 106 | # Python 3-compatible variant. |
|---|
| 107 | cs = [c for c in reversed(bytes(translate(cs, c2vtranstable)))] # treat cs as big-endian -- and we want to process the least-significant c first |
|---|
| 108 | |
|---|
| 109 | value = 0 |
|---|
| 110 | numvalues = 1 # the number of possible values that value could be |
|---|
| 111 | for c in cs: |
|---|
| 112 | c *= numvalues |
|---|
| 113 | value += c |
|---|
| 114 | numvalues *= 62 |
|---|
| 115 | |
|---|
| 116 | numvalues = 2**lengthinbits |
|---|
| 117 | result_bytes = [] |
|---|
| 118 | while numvalues > 1: |
|---|
| 119 | result_bytes.append(value % 256) |
|---|
| 120 | value //= 256 |
|---|
| 121 | numvalues //= 256 |
|---|
| 122 | |
|---|
| 123 | return bytes([b for b in reversed(result_bytes)]) # make it big-endian |
|---|