Context Navigation

source: trunk/src/allmydata/util/encodingutil.py

Visit:

Last change on this file was 1504bec, checked in by Alexandre Detiste <alexandre.detiste@…>, at 2024-03-11T20:57:36Z
drop dead code
Property mode set to `100644`
File size: 11.4 KB

Line
1	"""
2	Functions used to convert inputs from whatever encoding used in the system to
3	unicode and back.
4
5	Ported to Python 3.
6
7	Once Python 2 support is dropped, most of this module will be obsolete, since
8	Unicode is the default everywhere in Python 3.
9	"""
10
11	from six import ensure_str
12
13	import sys, os, re
14	import unicodedata
15	import warnings
16
17	from allmydata.util.assertutil import precondition, _assert
18	from twisted.python import usage
19	from twisted.python.filepath import FilePath
20	from allmydata.util import log
21	from allmydata.util.fileutil import abspath_expanduser_unicode
22
# The type of ``None``, bound to a module-level name.
NoneType = type(None)
24
25
def canonical_encoding(encoding):
    """
    Return a lower-cased, canonical spelling of an encoding name.

    ``None`` is replaced by ``'utf-8'`` (after logging a warning). The alias
    ``cp65001`` maps to ``'utf-8'``; ``us-ascii``, ``646`` and
    ``ansi_x3.4-1968`` map to ``'ascii'``. Any other name is returned
    lower-cased and otherwise unchanged.
    """
    if encoding is None:
        log.msg("Warning: falling back to UTF-8 encoding.", level=log.WEIRD)
        encoding = 'utf-8'

    lowered = encoding.lower()
    if lowered == "cp65001":
        return 'utf-8'
    if lowered in ("us-ascii", "646", "ansi_x3.4-1968"):
        return 'ascii'
    return lowered
37
def check_encoding(encoding):
    """
    Fail early if ``encoding`` cannot actually be used for conversion.

    Python sometimes reports an encoding name that it does not support, so a
    short string is encoded as a probe.

    Raises AssertionError if the probe raises LookupError or AttributeError.
    """
    try:
        "test".encode(encoding)
    except (LookupError, AttributeError):
        message = "The character encoding '%s' is not supported for conversion." % (encoding,)
        raise AssertionError(message)
47
# On Windows we install UTF-8 stream wrappers for sys.stdout and sys.stderr,
# and reencode the arguments as UTF-8 (see scripts/runner.py).
#
# On POSIX, we are moving towards UTF-8 everywhere and ignore the locale.
io_encoding = "utf-8"

# Canonical name of the filesystem encoding; populated by _reload() below.
filesystem_encoding = None


def _reload():
    """
    Re-read the interpreter's filesystem encoding into ``filesystem_encoding``.

    The canonicalized name is stored in the module global first and validated
    afterwards, so ``check_encoding`` may raise AssertionError with the global
    already updated.
    """
    global filesystem_encoding
    filesystem_encoding = canonical_encoding(sys.getfilesystemencoding())
    check_encoding(filesystem_encoding)


# Initialise the module state at import time.
_reload()
62
63
def get_filesystem_encoding():
    """
    Return the encoding expected for local filenames.

    This is the current value of the module-level ``filesystem_encoding``.
    """
    return filesystem_encoding
69
def get_io_encoding():
    """
    Return the encoding expected for writing to stdout or stderr, and for
    arguments in sys.argv.

    This is the current value of the module-level ``io_encoding``.
    """
    return io_encoding
75
def argv_to_unicode(s):
    """
    Decode an argv element to unicode.

    ``str`` input is returned unchanged; ``bytes`` input is decoded using
    ``io_encoding``. This is the inverse of ``unicode_to_argv``.

    Raises usage.UsageError if the bytes cannot be decoded.
    """
    if not isinstance(s, str):
        precondition(isinstance(s, bytes), s)
        try:
            s = str(s, io_encoding)
        except UnicodeDecodeError:
            # ``s`` is still the original bytes here: the assignment above
            # never completed.
            raise usage.UsageError("Argument %s cannot be decoded as %s." %
                                   (quote_output(s), io_encoding))
    return s
92
def argv_to_abspath(s, **kwargs):
    """
    Decode an argv element to an absolute path, with ~ expanded.

    ``s`` may be ``str`` or ``bytes``; it is decoded with ``argv_to_unicode``.
    Extra keyword arguments are forwarded to ``abspath_expanduser_unicode``.

    Raises usage.UsageError if ``s`` cannot be decoded, or if the decoded path
    starts with '-' (which would be confused with a command-line option).
    """
    decoded = argv_to_unicode(s)
    if decoded.startswith(u'-'):
        # Build the suggestion from the decoded text: os.path.join() refuses
        # to mix the str '.' with a bytes ``s`` and would raise TypeError
        # instead of the intended UsageError.
        suggestion = os.path.join('.', decoded)
        raise usage.UsageError("Path argument %s cannot start with '-'.\nUse %s if you intended to refer to a file."
                               % (quote_output(s), quote_output(suggestion)))
    return abspath_expanduser_unicode(decoded, **kwargs)
103
104
def unicode_to_argv(s):
    """
    Make the given unicode string suitable for use in an argv list.

    Deprecated: a DeprecationWarning is emitted on every call. On Windows the
    input is returned unmodified; elsewhere it is passed through
    ``six.ensure_str`` (presumably a no-op for ``str`` on Python 3 -- confirm
    against the six documentation).
    """
    precondition(isinstance(s, str), s)
    warnings.warn("This is unnecessary.", DeprecationWarning)
    return s if sys.platform == "win32" else ensure_str(s)
117
118
# Captures the type that command-line arguments are expected to have (see
# unicode_to_argv above). It is a tuple so that it can be passed straight to
# isinstance().
argv_type = (str,)
"""
The expected type for args to a subprocess
"""
125
126
def unicode_to_url(s):
    """
    Encode a unicode object used in a URL to bytes.

    According to RFC 2718, non-ASCII characters in URLs must be UTF-8 encoded.
    """
    # FIXME: this used to require a unicode argument and encode it as UTF-8
    # directly; it now delegates to to_bytes(), which also lets None and bytes
    # through unchanged.
    encoded = to_bytes(s)
    return encoded
137
def to_bytes(s):
    """Convert unicode to UTF-8 encoded bytes.

    ``None`` and ``bytes`` values are returned as-is.
    """
    if s is not None and not isinstance(s, bytes):
        return s.encode('utf-8')
    return s
146
def from_utf8_or_none(s):
    """
    Decode UTF-8 bytes to unicode, passing ``None`` through unchanged.

    The argument must be ``bytes`` or ``None`` (checked with ``precondition``).
    """
    precondition(isinstance(s, bytes) or s is None, s)
    return None if s is None else s.decode('utf-8')
152
# Byte strings made up only of LF, CR and the printable ASCII range 0x20-0x7E.
PRINTABLE_ASCII = re.compile(br'^[\n\r\x20-\x7E]*$', re.DOTALL)
# As above, but bytes 0x80-0xFF are accepted as well.
PRINTABLE_8BIT = re.compile(br'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL)


def is_printable_ascii(s):
    """
    Return True if the byte string ``s`` matches ``PRINTABLE_ASCII``, i.e. it
    contains only LF, CR and bytes in the range 0x20-0x7E.
    """
    match = PRINTABLE_ASCII.search(s)
    return match is not None
158
def unicode_to_output(s):
    """
    Prepare a unicode object for representation on stdout or stderr.

    On Python 3 the string is returned unchanged: stdout/stderr expect Unicode
    and take care of encoding themselves. Deprecated: a DeprecationWarning is
    emitted on every call.
    """
    precondition(isinstance(s, str), s)
    warnings.warn("This is unnecessary.", DeprecationWarning)
    return s
169
170	def _unicode_escape(m, quote_newlines):
171	u = m.group(0)
172	if u == u'"' or u == u'$' or u == u'`' or u == u'\\':
173	return u'\\' + u
174	elif u == u'\n' and not quote_newlines:
175	return u
176	if len(u) == 2:
177	codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000
178	else:
179	codepoint = ord(u)
180	if codepoint > 0xFFFF:
181	return u'\\U%08x' % (codepoint,)
182	elif codepoint > 0xFF:
183	return u'\\u%04x' % (codepoint,)
184	else:
185	return u'\\x%02x' % (codepoint,)
186
187	def _bytes_escape(m, quote_newlines):
188	"""
189	Takes a re match on bytes, the result is escaped bytes of group(0).
190	"""
191	c = m.group(0)
192	if c == b'"' or c == b'$' or c == b'`' or c == b'\\':
193	return b'\\' + c
194	elif c == b'\n' and not quote_newlines:
195	return c
196	else:
197	return b'\\x%02x' % (ord(c),)
198
# Characters that force double-quoting. The *_NL variant also treats a newline
# as such a character; the plain variant lets newlines through. In both, the
# single quote (0x27) is outside the allowed set.
MUST_DOUBLE_QUOTE_NL = re.compile(u'[^\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)
MUST_DOUBLE_QUOTE = re.compile(u'[^\\n\\x20-\\x26\\x28-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL)

# if we must double-quote, then we have to escape ", $ and `, but need not escape '
#
# The two alternatives are joined with a plain '|' (regex alternation). An
# escaped '\|' would be a literal pipe, and the pattern would then never match
# the characters that need escaping.
ESCAPABLE_UNICODE = re.compile(u'([\uD800-\uDBFF][\uDC00-\uDFFF])|'  # valid surrogate pairs
                               u'[^ !#\\x25-\\x5B\\x5D-\\x5F\\x61-\\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]',
                               re.DOTALL)

# Bytes counterpart, used for byte strings that are not valid UTF-8.
ESCAPABLE_8BIT = re.compile(br'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL)
208
def quote_output_u(*args, **kwargs):
    """
    Like ``quote_output`` but always return ``unicode``.

    All positional and keyword arguments are forwarded to ``quote_output``
    unchanged. If the result is already ``str`` it is returned directly;
    otherwise it is decoded using the ``encoding`` keyword argument if given,
    else stdout's encoding, else ``io_encoding``.
    """
    result = quote_output(*args, **kwargs)
    if isinstance(result, str):
        return result
    # Since we're quoting, the assumption is this will be read by a human, and
    # therefore printed, so stdout's encoding is the plausible one. io_encoding
    # is now always utf-8.
    #
    # getattr() has a default so that a missing or None sys.stdout falls
    # through to io_encoding instead of raising AttributeError.
    return result.decode(kwargs.get("encoding", None) or
                         getattr(sys.stdout, "encoding", None) or io_encoding)
221
222
def quote_output(s, quotemarks=True, quote_newlines=None, encoding=None):
    """
    Encode either a Unicode string or a UTF-8-encoded bytestring for representation
    on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is
    always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or
    control bytes in the output. (Newlines are counted as control bytes iff
    quote_newlines is True.)

    Quoting may use either single or double quotes. Within single quotes, all
    characters stand for themselves, and ' will not appear. Within double quotes,
    Python-compatible backslash escaping is used.

    If not explicitly given, quote_newlines is True when quotemarks is True.

    If not explicitly given, encoding is stdout's encoding when available,
    otherwise io_encoding.

    Bytes that are not valid UTF-8 are rendered as b"..." with \\xNN escapes.

    On Python 3, returns Unicode strings.
    """
    precondition(isinstance(s, (bytes, str)), s)
    # Since we're quoting, the assumption is this will be read by a human, and
    # therefore printed, so stdout's encoding is the plausible one. io_encoding
    # is now always utf-8.
    #
    # getattr() has a default so that a missing or None sys.stdout (or a
    # replacement stream without an ``encoding`` attribute) falls through to
    # io_encoding instead of raising AttributeError.
    encoding = encoding or getattr(sys.stdout, "encoding", None) or io_encoding

    if quote_newlines is None:
        quote_newlines = quotemarks

    def _encode(s):
        # Returns the quoted form as bytes in ``encoding``.
        if isinstance(s, bytes):
            try:
                s = s.decode("utf-8")
            except UnicodeDecodeError:
                # Not UTF-8: show the raw bytes with byte-level escaping.
                return b'b"%s"' % (ESCAPABLE_8BIT.sub(lambda m: _bytes_escape(m, quote_newlines), s),)

        must_double_quote = MUST_DOUBLE_QUOTE_NL if quote_newlines else MUST_DOUBLE_QUOTE
        if must_double_quote.search(s) is None:
            try:
                out = s.encode(encoding)
                if quotemarks or out.startswith(b'"'):
                    return b"'%s'" % (out,)
                else:
                    return out
            except (UnicodeDecodeError, UnicodeEncodeError):
                # Not representable in ``encoding``: fall through to the
                # double-quoted, backslash-escaped form below.
                pass

        escaped = ESCAPABLE_UNICODE.sub(lambda m: _unicode_escape(m, quote_newlines), s)
        return b'"%s"' % (escaped.encode(encoding, 'backslashreplace'),)

    result = _encode(s)
    result = result.decode(encoding)
    return result
272
273
def quote_path(path, quotemarks=True):
    """
    Quote a path given as a sequence of segments.

    Each segment is converted with ``to_bytes``, the segments are joined with
    b"/", and the result is passed to ``quote_output`` with newlines quoted.
    """
    joined = b"/".join(to_bytes(segment) for segment in path)
    return quote_output(joined, quotemarks=quotemarks, quote_newlines=True)
276
def quote_local_unicode_path(path, quotemarks=True):
    """
    Quote a local path (a unicode string) for display, with newlines quoted.

    On Windows, a leading ``\\\\?\\`` is stripped first, and a ``UNC\\`` that
    follows it is replaced by ``\\\\``.
    """
    precondition(isinstance(path, str), path)

    if sys.platform == "win32":
        if path.startswith(u"\\\\?\\"):
            remainder = path[4:]
            if remainder.startswith(u"UNC\\"):
                remainder = u"\\\\" + remainder[4:]
            path = remainder

    return quote_output(path, quotemarks=quotemarks, quote_newlines=True)
286
def quote_filepath(path, quotemarks=True):
    """
    Quote a FilePath for display: convert it with ``unicode_from_filepath``
    and quote the result with ``quote_local_unicode_path``.
    """
    as_text = unicode_from_filepath(path)
    return quote_local_unicode_path(as_text, quotemarks=quotemarks)
289
def extend_filepath(fp, segments):
    """
    Return ``fp`` extended by each of ``segments`` in order, by calling
    ``child()`` once per segment. With no segments, ``fp`` itself is returned.
    """
    # We cannot use FilePath.preauthChild, because
    # * it has the security flaw described in <https://twistedmatrix.com/trac/ticket/6527>;
    # * it may return a FilePath in the wrong mode.
    extended = fp
    for name in segments:
        extended = extended.child(name)
    return extended
299
def to_filepath(path):
    """
    Build a FilePath from a unicode path string.

    On Windows, a path that starts with ``\\\\?\\`` and is longer than that
    prefix has its trailing backslashes stripped first.
    """
    precondition(isinstance(path, str), path=path)

    on_windows = sys.platform == "win32"
    if on_windows:
        _assert(isinstance(path, str), path=path)
        has_prefix = path.startswith(u"\\\\?\\") and len(path) > 4
        if has_prefix:
            # FilePath normally strips trailing path separators, but not in this case.
            path = path.rstrip(u"\\")

    return FilePath(path)
310
def _decode(s):
    """
    Return ``s`` as unicode: ``bytes`` are decoded using the module's
    ``filesystem_encoding``; ``str`` is returned unchanged.
    """
    precondition(isinstance(s, (bytes, str)), s=s)

    if not isinstance(s, bytes):
        return s
    return s.decode(filesystem_encoding)
318
def unicode_from_filepath(fp):
    """
    Return the ``path`` attribute of a FilePath as unicode, decoding it with
    ``_decode`` if necessary.
    """
    precondition(isinstance(fp, FilePath), fp=fp)
    raw_path = fp.path
    return _decode(raw_path)
322
def unicode_segments_from(base_fp, ancestor_fp):
    """
    Return the result of ``segmentsFrom`` between the text-mode versions of
    ``base_fp`` and ``ancestor_fp``. Both arguments must be FilePath instances.
    """
    precondition(isinstance(base_fp, FilePath), base_fp=base_fp)
    precondition(isinstance(ancestor_fp, FilePath), ancestor_fp=ancestor_fp)

    base_text = base_fp.asTextMode()
    ancestor_text = ancestor_fp.asTextMode()
    return base_text.segmentsFrom(ancestor_text)
328
def unicode_platform():
    """
    Report whether the current platform handles Unicode filenames natively.

    This always returns True.
    """
    return True
334
# NOTE: the docstring below contains a '%s' placeholder, so its text is left
# untouched (presumably it gets formatted with the encoding name somewhere --
# verify against callers before rewording).
class FilenameEncodingError(Exception):
    """
    Filename cannot be encoded using the current encoding of your filesystem
    (%s). Please configure your locale correctly or rename this file.
    """
341
def listdir_unicode(path):
    """
    Wrapper around listdir() which provides safe access to the convenient
    Unicode API even under platforms that don't provide one natively.

    ``path`` must be a unicode string; the result is whatever
    ``os.listdir(path)`` returns.
    """
    precondition(isinstance(path, str), path)
    entries = os.listdir(path)
    return entries
349
def listdir_filepath(fp):
    """
    List the directory named by a FilePath: convert it with
    ``unicode_from_filepath`` and pass the result to ``listdir_unicode``.
    """
    directory = unicode_from_filepath(fp)
    return listdir_unicode(directory)
352
353
# 'x' at the end of a variable name indicates that it holds a Unicode string that may not
# be NFC-normalized.
def normalize(namex):
    """
    Return the NFC-normalized form of the unicode string ``namex``.
    """
    nfc_name = unicodedata.normalize('NFC', namex)
    return nfc_name

Note: See TracBrowser for help on using the repository browser.

Download in other formats: