| 1 | """ |
|---|
| 2 | Functions used to convert inputs from whatever encoding is used in the system to |
|---|
| 3 | unicode and back. |
|---|
| 4 | |
|---|
| 5 | Ported to Python 3. |
|---|
| 6 | |
|---|
| 7 | Once Python 2 support is dropped, most of this module will be obsolete, since |
|---|
| 8 | Unicode is the default everywhere in Python 3. |
|---|
| 9 | """ |
|---|
| 10 | |
|---|
| 11 | from six import ensure_str |
|---|
| 12 | |
|---|
| 13 | import sys, os, re |
|---|
| 14 | import unicodedata |
|---|
| 15 | import warnings |
|---|
| 16 | |
|---|
| 17 | from allmydata.util.assertutil import precondition, _assert |
|---|
| 18 | from twisted.python import usage |
|---|
| 19 | from twisted.python.filepath import FilePath |
|---|
| 20 | from allmydata.util import log |
|---|
| 21 | from allmydata.util.fileutil import abspath_expanduser_unicode |
|---|
| 22 | |
|---|
| 23 | NoneType = type(None) |
|---|
| 24 | |
|---|
| 25 | |
|---|
def canonical_encoding(encoding):
    """
    Return the lower-case, canonical spelling of an encoding name.

    ``None`` is reported through ``log.msg`` and treated as UTF-8.
    ``cp65001`` is mapped to ``utf-8``; ``us-ascii``, ``646`` and
    ``ansi_x3.4-1968`` are mapped to ``ascii``.  Any other name is simply
    lower-cased.
    """
    if encoding is None:
        log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
        return 'utf-8'

    lowered = encoding.lower()
    if lowered == "cp65001":
        return 'utf-8'
    if lowered in ("us-ascii", "646", "ansi_x3.4-1968"):
        return 'ascii'
    return lowered
|---|
| 37 | |
|---|
def check_encoding(encoding):
    """
    Fail early when ``encoding`` cannot be used for conversion.

    Python sometimes reports an encoding name that it does not actually
    support, so a short string is encoded as a probe.

    Raises ``AssertionError`` if the probe fails with ``LookupError`` or
    ``AttributeError``.
    """
    probe = u"test"
    try:
        probe.encode(encoding)
    except (LookupError, AttributeError):
        message = "The character encoding '%s' is not supported for conversion." % (encoding,)
        raise AssertionError(message)
|---|
| 47 | |
|---|
# Encoding used for stdout, stderr and sys.argv.  It is always UTF-8:
#  * on Windows, UTF-8 stream wrappers are installed for sys.stdout and
#    sys.stderr, and the arguments are re-encoded as UTF-8 (see
#    scripts/runner.py);
#  * on POSIX, the locale is ignored in favour of UTF-8 everywhere.
io_encoding = "utf-8"

# Placeholder; _reload() replaces it with the canonical filesystem encoding.
filesystem_encoding = None
|---|
| 55 | |
|---|
def _reload():
    """
    Recompute the module-level ``filesystem_encoding``.

    The value reported by ``sys.getfilesystemencoding()`` is canonicalized,
    stored in the global, and then validated with ``check_encoding`` (which
    raises ``AssertionError`` for an unusable encoding).
    """
    global filesystem_encoding
    detected = canonical_encoding(sys.getfilesystemencoding())
    filesystem_encoding = detected
    check_encoding(detected)


# Populate filesystem_encoding as soon as the module is imported.
_reload()
|---|
| 62 | |
|---|
| 63 | |
|---|
def get_filesystem_encoding():
    """
    Return the encoding expected for local filenames.

    This is the current value of the module-level ``filesystem_encoding``.
    """
    return filesystem_encoding
|---|
| 69 | |
|---|
def get_io_encoding():
    """
    Return the encoding expected for text written to stdout or stderr and
    for the arguments found in sys.argv.

    This is the current value of the module-level ``io_encoding``.
    """
    return io_encoding
|---|
| 75 | |
|---|
def argv_to_unicode(s):
    """
    Return the argv element ``s`` as a unicode string.

    Text is returned untouched; bytes are decoded using ``io_encoding``.
    Raises ``usage.UsageError`` if the bytes cannot be decoded.

    This is the inverse of ``unicode_to_argv``.
    """
    if not isinstance(s, str):
        precondition(isinstance(s, bytes), s)
        try:
            return str(s, io_encoding)
        except UnicodeDecodeError:
            raise usage.UsageError("Argument %s cannot be decoded as %s." %
                                   (quote_output(s), io_encoding))
    return s
|---|
| 92 | |
|---|
def argv_to_abspath(s, **kwargs):
    """
    Convenience function to decode an argv element to an absolute path, with ~ expanded.

    ``s`` may be text or bytes; it is decoded with ``argv_to_unicode``.  Any
    keyword arguments are forwarded to ``abspath_expanduser_unicode``.

    Raises ``usage.UsageError`` if the argument cannot be decoded or if the
    decoded path starts with '-'.
    """
    decoded = argv_to_unicode(s)
    if decoded.startswith(u'-'):
        # Build the suggestion from the decoded text: joining the str '.'
        # with a bytes argument would raise TypeError and mask the UsageError.
        suggestion = os.path.join(u'.', decoded)
        raise usage.UsageError("Path argument %s cannot start with '-'.\nUse %s if you intended to refer to a file."
                               % (quote_output(s), quote_output(suggestion)))
    return abspath_expanduser_unicode(decoded, **kwargs)
|---|
| 103 | |
|---|
| 104 | |
|---|
def unicode_to_argv(s):
    """
    Make the given unicode string suitable for use in an argv list.

    On Python 2 on POSIX this used to encode with UTF-8; on Python 3 and on
    Windows the input comes back unmodified.  A ``DeprecationWarning`` is
    emitted on every call.
    """
    precondition(isinstance(s, str), s)
    warnings.warn("This is unnecessary.", DeprecationWarning)
    return s if sys.platform == "win32" else ensure_str(s)
|---|
| 117 | |
|---|
| 118 | |
|---|
# The expected type for args to a subprocess.  According to unicode_to_argv
# above, the expected type for cli args depends on the platform; this tuple
# captures that expectation.
argv_type = (str,)
|---|
| 125 | |
|---|
| 126 | |
|---|
def unicode_to_url(s):
    """
    Encode a unicode object used in a URL to bytes.

    The work is delegated to ``to_bytes``, so ``None`` and bytes pass
    through unchanged and anything else is encoded as UTF-8.
    """
    # According to RFC 2718, non-ascii characters in URLs must be UTF-8 encoded.

    # FIXME: the intended implementation insists on unicode input and then
    # calls s.encode('utf-8'); for now to_bytes is used instead.
    encoded = to_bytes(s)
    return encoded
|---|
| 137 | |
|---|
def to_bytes(s):
    """Return ``s`` as bytes, encoding unicode with UTF-8.

    ``None`` and bytes are handed back unchanged.
    """
    passthrough = s is None or isinstance(s, bytes)
    return s if passthrough else s.encode('utf-8')
|---|
| 146 | |
|---|
def from_utf8_or_none(s):
    """
    Decode UTF-8 bytes to unicode, passing ``None`` through unchanged.

    ``s`` must be bytes or ``None``; this is enforced with ``precondition``.
    """
    precondition(s is None or isinstance(s, bytes), s)
    return None if s is None else s.decode('utf-8')
|---|
| 152 | |
|---|
# Byte strings consisting only of \n, \r and the bytes 0x20-0x7E.
PRINTABLE_ASCII = re.compile(br'^[\n\r\x20-\x7E]*$', re.DOTALL)
# As above, additionally allowing every byte in the range 0x80-0xFF.
PRINTABLE_8BIT = re.compile(br'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)


def is_printable_ascii(s):
    """
    Return True if the byte string ``s`` matches ``PRINTABLE_ASCII``, i.e.
    contains nothing but newlines, carriage returns and bytes 0x20-0x7E.
    """
    found = PRINTABLE_ASCII.search(s)
    return found is not None
|---|
| 158 | |
|---|
def unicode_to_output(s):
    """
    Prepare a unicode object for representation on stdout or stderr.

    On Python 3 the string is handed back as-is: stdout and stderr expect
    Unicode by default and take care of the encoding themselves.  A
    ``DeprecationWarning`` is emitted on every call.
    """
    precondition(isinstance(s, str), s)
    warnings.warn("This is unnecessary.", DeprecationWarning)
    return s
|---|
| 169 | |
|---|
def _unicode_escape(m, quote_newlines):
    """
    Takes a re match on unicode, the result is the escaped form of group(0).

    ``"``, ``$``, a backtick and a backslash are prefixed with a backslash.
    A newline is kept literally unless ``quote_newlines`` is true.  Anything
    else becomes a ``\\xNN``, ``\\uNNNN`` or ``\\UNNNNNNNN`` escape; a
    two-character match is treated as a surrogate pair and combined into a
    single code point first.
    """
    matched = m.group(0)
    if matched in (u'"', u'$', u'`', u'\\'):
        return u'\\' + matched
    if matched == u'\n' and not quote_newlines:
        return matched

    if len(matched) == 2:
        # Combine the high and low surrogate into one code point.
        high, low = matched
        codepoint = (ord(high) - 0xD800) * 0x400 + ord(low) - 0xDC00 + 0x10000
    else:
        codepoint = ord(matched)

    # Use the shortest escape form that can hold the code point.
    if codepoint > 0xFFFF:
        template = u'\\U%08x'
    elif codepoint > 0xFF:
        template = u'\\u%04x'
    else:
        template = u'\\x%02x'
    return template % (codepoint,)
|---|
| 186 | |
|---|
def _bytes_escape(m, quote_newlines):
    """
    Takes a re match on bytes, the result is escaped bytes of group(0).

    ``"``, ``$``, a backtick and a backslash are prefixed with a backslash.
    A newline is kept literally unless ``quote_newlines`` is true.  Every
    other byte is rendered as a ``\\xNN`` escape.
    """
    byte = m.group(0)
    if byte in (b'"', b'$', b'`', b'\\'):
        return b'\\' + byte
    if byte == b'\n' and not quote_newlines:
        return byte
    return b'\\x%02x' % (ord(byte),)
|---|
| 198 | |
|---|
| 199 | MUST_DOUBLE_QUOTE_NL = re.compile(u'[^\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) |
|---|
| 200 | MUST_DOUBLE_QUOTE = re.compile(u'[^\\n\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) |
|---|
| 201 | |
|---|
| 202 | # if we must double-quote, then we have to escape ", $ and `, but need not escape ' |
|---|
| 203 | ESCAPABLE_UNICODE = re.compile(u'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs |
|---|
| 204 | u'[^ !#\\x25-\\x5B\\x5D-\\x5F\\x61-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', |
|---|
| 205 | re.DOTALL) |
|---|
| 206 | |
|---|
| 207 | ESCAPABLE_8BIT = re.compile( br'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL) |
|---|
| 208 | |
|---|
def quote_output_u(*args, **kwargs):
    """
    Like ``quote_output`` but always return ``unicode``.

    All arguments are forwarded to ``quote_output``.  If its result is bytes,
    it is decoded with the ``encoding`` keyword argument when given,
    otherwise with stdout's encoding, otherwise with ``io_encoding``.
    """
    result = quote_output(*args, **kwargs)
    if isinstance(result, str):
        return result
    # Since we're quoting, the assumption is this will be read by a human, and
    # therefore printed, so stdout's encoding is the plausible one. io_encoding
    # is now always utf-8.
    #
    # getattr needs a default: sys.stdout may be None or a replacement object
    # without an ``encoding`` attribute, and the io_encoding fallback must
    # still be reached in that case.
    encoding = (kwargs.get("encoding", None) or
                getattr(sys.stdout, "encoding", None) or
                io_encoding)
    return result.decode(encoding)
|---|
| 221 | |
|---|
| 222 | |
|---|
def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
    """
    Encode either a Unicode string or a UTF-8-encoded bytestring for representation
    on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
    always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
    control bytes in the output. (Newlines are counted as control bytes iff
    quote_newlines is True.)

    Quoting may use either single or double quotes. Within single quotes, all
    characters stand for themselves, and ' will not appear. Within double quotes,
    Python-compatible backslash escaping is used.

    If not explicitly given, quote_newlines is True when quotemarks is True.

    If 'encoding' is not given, stdout's encoding is used when available,
    otherwise io_encoding. Bytes that are not valid UTF-8 are rendered as
    b"..." with backslash escapes.

    On Python 3, returns Unicode strings.
    """
    precondition(isinstance(s, (bytes, str)), s)
    # Since we're quoting, the assumption is this will be read by a human, and
    # therefore printed, so stdout's encoding is the plausible one. io_encoding
    # is now always utf-8.
    #
    # getattr needs a default: sys.stdout may be None or a replacement object
    # without an ``encoding`` attribute, and the io_encoding fallback must
    # still be reached in that case.
    encoding = encoding or getattr(sys.stdout, "encoding", None) or io_encoding

    if quote_newlines is None:
        quote_newlines = quotemarks

    def _encode(s):
        if isinstance(s, bytes):
            try:
                s = s.decode("utf-8")
            except UnicodeDecodeError:
                # Not valid UTF-8: show the raw bytes, escaped, as b"...".
                return b'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _bytes_escape(m, quote_newlines), s),)

        must_double_quote = MUST_DOUBLE_QUOTE_NL if quote_newlines else MUST_DOUBLE_QUOTE
        if must_double_quote.search(s) is None:
            try:
                out = s.encode(encoding)
            except (UnicodeDecodeError, UnicodeEncodeError):
                # Not representable in the target encoding; fall through to
                # the escaped, double-quoted form below.
                pass
            else:
                # A bare string starting with " would look double-quoted, so
                # it is single-quoted even when quotemarks is False.
                if quotemarks or out.startswith(b'"'):
                    return b"'%s'" % (out,)
                return out

        escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s)
        return b'"%s"' % (escaped.encode(encoding, 'backslashreplace'),)

    return _encode(s).decode(encoding)
|---|
| 272 | |
|---|
| 273 | |
|---|
def quote_path(path, quotemarks=True):
    """
    Quote a path given as a sequence of segments.

    Each segment is converted with ``to_bytes``, the segments are joined with
    ``/``, and the result is passed to ``quote_output`` with newlines always
    quoted.
    """
    joined = b"/".join(to_bytes(segment) for segment in path)
    return quote_output(joined, quotemarks=quotemarks, quote_newlines=True)
|---|
| 276 | |
|---|
def quote_local_unicode_path(path, quotemarks=True):
    """
    Quote a local path, given as a unicode string, for display.

    On win32 a leading ``\\\\?\\`` is removed first, and a following
    ``UNC\\`` is turned back into a leading double backslash.  Newlines are
    always quoted.
    """
    precondition(isinstance(path, str), path)

    long_prefix = u"\\\\?\\"
    unc_prefix = u"UNC\\"
    if sys.platform == "win32" and path.startswith(long_prefix):
        path = path[len(long_prefix):]
        if path.startswith(unc_prefix):
            path = u"\\\\" + path[len(unc_prefix):]

    return quote_output(path, quotemarks=quotemarks, quote_newlines=True)
|---|
| 286 | |
|---|
def quote_filepath(path, quotemarks=True):
    """
    Quote a ``FilePath`` for display: convert it to unicode with
    ``unicode_from_filepath`` and quote that with
    ``quote_local_unicode_path``.
    """
    as_text = unicode_from_filepath(path)
    return quote_local_unicode_path(as_text, quotemarks=quotemarks)
|---|
| 289 | |
|---|
def extend_filepath(fp, segments):
    """
    Descend from ``fp`` through each of ``segments`` in order by calling
    ``child()`` repeatedly, and return the final object.  With no segments,
    ``fp`` itself is returned.
    """
    # We cannot use FilePath.preauthChild, because
    # * it has the security flaw described in <https://twistedmatrix.com/trac/ticket/6527>;
    # * it may return a FilePath in the wrong mode.
    current = fp
    for name in segments:
        current = current.child(name)
    return current
|---|
| 299 | |
|---|
def to_filepath(path):
    """
    Build a ``FilePath`` from a unicode path string.

    On win32, a path that starts with ``\\\\?\\`` and is longer than that
    prefix has its trailing backslashes removed first.
    """
    precondition(isinstance(path, str), path=path)

    on_windows = sys.platform == "win32"
    if on_windows:
        _assert(isinstance(path, str), path=path)
        is_long_path = path.startswith(u"\\\\?\\") and len(path) > 4
        if is_long_path:
            # FilePath normally strips trailing path separators, but not in this case.
            path = path.rstrip(u"\\")

    return FilePath(path)
|---|
| 310 | |
|---|
def _decode(s):
    """
    Return ``s`` as unicode: bytes are decoded with the module-level
    ``filesystem_encoding``, text is returned unchanged.
    """
    precondition(isinstance(s, (bytes, str)), s=s)
    return s.decode(filesystem_encoding) if isinstance(s, bytes) else s
|---|
| 318 | |
|---|
def unicode_from_filepath(fp):
    """
    Return the ``path`` attribute of the ``FilePath`` ``fp`` as unicode,
    decoding it with ``_decode`` if it is bytes.
    """
    precondition(isinstance(fp, FilePath), fp=fp)
    raw_path = fp.path
    return _decode(raw_path)
|---|
| 322 | |
|---|
def unicode_segments_from(base_fp, ancestor_fp):
    """
    Return the result of ``segmentsFrom`` for ``base_fp`` relative to
    ``ancestor_fp``, after converting both ``FilePath`` objects with
    ``asTextMode()``.
    """
    precondition(isinstance(base_fp, FilePath), base_fp=base_fp)
    precondition(isinstance(ancestor_fp, FilePath), ancestor_fp=ancestor_fp)

    base_text = base_fp.asTextMode()
    ancestor_text = ancestor_fp.asTextMode()
    return base_text.segmentsFrom(ancestor_text)
|---|
| 328 | |
|---|
def unicode_platform():
    """
    Does the current platform handle Unicode filenames natively?

    Always answers True.
    """
    return True
|---|
| 334 | |
|---|
# NOTE(review): the docstring below contains a %s placeholder, so it looks
# like it may be used as a message template at runtime; its text is kept
# verbatim -- confirm before rewording it.
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
|---|
| 341 | |
|---|
def listdir_unicode(path):
    """
    Wrapper around listdir() which provides safe access to the convenient
    Unicode API even under platforms that don't provide one natively.

    ``path`` must be a unicode string; the result is whatever
    ``os.listdir(path)`` returns.
    """
    precondition(isinstance(path, str), path)
    entries = os.listdir(path)
    return entries
|---|
| 349 | |
|---|
def listdir_filepath(fp):
    """
    List the directory named by the ``FilePath`` ``fp``, by passing its
    unicode path to ``listdir_unicode``.
    """
    directory = unicode_from_filepath(fp)
    return listdir_unicode(directory)
|---|
| 352 | |
|---|
| 353 | |
|---|
| 354 | # 'x' at the end of a variable name indicates that it holds a Unicode string that may not |
|---|
| 355 | # be NFC-normalized. |
|---|
def normalize(namex):
    """Return ``namex`` converted to Unicode normalization form NFC."""
    nfc_form = unicodedata.normalize('NFC', namex)
    return nfc_form
|---|