| 1 | Fri Jul 23 08:53:14 GMT Daylight Time 2010 david-sarah@jacaranda.org |
|---|
| 2 | * util.encodingutil: change quote_output to do less unnecessary escaping, and to use double-quotes more consistently when needed. This version avoids u-escaping for characters that are representable in the output encoding, when double quotes are used, and includes tests. fixes #1135 |
|---|
| 3 | |
|---|
| 4 | New patches: |
|---|
| 5 | |
|---|
| 6 | [util.encodingutil: change quote_output to do less unnecessary escaping, and to use double-quotes more consistently when needed. This version avoids u-escaping for characters that are representable in the output encoding, when double quotes are used, and includes tests. fixes #1135 |
|---|
| 7 | david-sarah@jacaranda.org**20100723075314 |
|---|
| 8 | Ignore-this: b82205834d17db61612dd16436b7c5a2 |
|---|
| 9 | ] { |
|---|
| 10 | hunk ./src/allmydata/test/test_encodingutil.py 60 |
|---|
| 11 | |
|---|
| 12 | from allmydata.test.common_util import ReallyEqualMixin |
|---|
| 13 | from allmydata.util.encodingutil import argv_to_unicode, unicode_to_url, \ |
|---|
| 14 | - unicode_to_output, unicode_platform, listdir_unicode, FilenameEncodingError, \ |
|---|
| 15 | - get_output_encoding, get_filesystem_encoding, _reload |
|---|
| 16 | + unicode_to_output, quote_output, unicode_platform, listdir_unicode, \ |
|---|
| 17 | + FilenameEncodingError, get_output_encoding, get_filesystem_encoding, _reload |
|---|
| 18 | from allmydata.dirnode import normalize |
|---|
| 19 | |
|---|
| 20 | from twisted.python import usage |
|---|
| 21 | hunk ./src/allmydata/test/test_encodingutil.py 289 |
|---|
| 22 | self.failUnlessRaises(UnicodeEncodeError, open, fn, 'wb') |
|---|
| 23 | |
|---|
| 24 | |
|---|
| 25 | +class QuoteOutput(ReallyEqualMixin, unittest.TestCase): |
|---|
| 26 | + def _check(self, inp, out, enc, optional_quotes): |
|---|
| 27 | + out2 = out |
|---|
| 28 | + if optional_quotes: |
|---|
| 29 | + out2 = out2[1:-1] |
|---|
| 30 | + self.failUnlessReallyEqual(quote_output(inp, encoding=enc), out) |
|---|
| 31 | + self.failUnlessReallyEqual(quote_output(inp, encoding=enc, quotemarks=False), out2) |
|---|
| 32 | + if out[0:2] != 'b"': |
|---|
| 33 | + if isinstance(inp, str): |
|---|
| 34 | + self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc), out) |
|---|
| 35 | + self.failUnlessReallyEqual(quote_output(unicode(inp), encoding=enc, quotemarks=False), out2) |
|---|
| 36 | + else: |
|---|
| 37 | + self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc), out) |
|---|
| 38 | + self.failUnlessReallyEqual(quote_output(inp.encode('utf-8'), encoding=enc, quotemarks=False), out2) |
|---|
| 39 | + |
|---|
| 40 | + def _test_quote_output_all(self, enc): |
|---|
| 41 | + def check(inp, out, optional_quotes=False): |
|---|
| 42 | + self._check(inp, out, enc, optional_quotes) |
|---|
| 43 | + |
|---|
| 44 | + # optional single quotes |
|---|
| 45 | + check("foo", "'foo'", True) |
|---|
| 46 | + check("\\", "'\\'", True) |
|---|
| 47 | + check("$\"`", "'$\"`'", True) |
|---|
| 48 | + |
|---|
| 49 | + # mandatory single quotes |
|---|
| 50 | + check("\"", "'\"'") |
|---|
| 51 | + |
|---|
| 52 | + # double quotes |
|---|
| 53 | + check("'", "\"'\"") |
|---|
| 54 | + check("\n", "\"\\x0a\"") |
|---|
| 55 | + check("\x00", "\"\\x00\"") |
|---|
| 56 | + |
|---|
| 57 | + # invalid Unicode and astral planes |
|---|
| 58 | + check(u"\uFDD0\uFDEF", "\"\\ufdd0\\ufdef\"") |
|---|
| 59 | + check(u"\uDC00\uD800", "\"\\udc00\\ud800\"") |
|---|
| 60 | + check(u"\uDC00\uD800\uDC00", "\"\\udc00\\U00010000\"") |
|---|
| 61 | + check(u"\uD800\uDC00", "\"\\U00010000\"") |
|---|
| 62 | + check(u"\uD800\uDC01", "\"\\U00010001\"") |
|---|
| 63 | + check(u"\uD801\uDC00", "\"\\U00010400\"") |
|---|
| 64 | + check(u"\uDBFF\uDFFF", "\"\\U0010ffff\"") |
|---|
| 65 | + check(u"'\uDBFF\uDFFF", "\"'\\U0010ffff\"") |
|---|
| 66 | + check(u"\"\uDBFF\uDFFF", "\"\\\"\\U0010ffff\"") |
|---|
| 67 | + |
|---|
| 68 | + # invalid UTF-8 |
|---|
| 69 | + check("\xFF", "b\"\\xff\"") |
|---|
| 70 | + check("\x00\"$\\`\x80\xFF", "b\"\\x00\\\"\\$\\\\\\`\\x80\\xff\"") |
|---|
| 71 | + |
|---|
| 72 | + def test_quote_output_ascii(self, enc='ascii'): |
|---|
| 73 | + def check(inp, out, optional_quotes=False): |
|---|
| 74 | + self._check(inp, out, enc, optional_quotes) |
|---|
| 75 | + |
|---|
| 76 | + self._test_quote_output_all(enc) |
|---|
| 77 | + check(u"\u00D7", "\"\\xd7\"") |
|---|
| 78 | + check(u"'\u00D7", "\"'\\xd7\"") |
|---|
| 79 | + check(u"\"\u00D7", "\"\\\"\\xd7\"") |
|---|
| 80 | + check(u"\u2621", "\"\\u2621\"") |
|---|
| 81 | + check(u"'\u2621", "\"'\\u2621\"") |
|---|
| 82 | + check(u"\"\u2621", "\"\\\"\\u2621\"") |
|---|
| 83 | + |
|---|
| 84 | + def test_quote_output_latin1(self, enc='latin1'): |
|---|
| 85 | + def check(inp, out, optional_quotes=False): |
|---|
| 86 | + self._check(inp, out.encode('latin1'), enc, optional_quotes) |
|---|
| 87 | + |
|---|
| 88 | + self._test_quote_output_all(enc) |
|---|
| 89 | + check(u"\u00D7", u"'\u00D7'", True) |
|---|
| 90 | + check(u"'\u00D7", u"\"'\u00D7\"") |
|---|
| 91 | + check(u"\"\u00D7", u"'\"\u00D7'") |
|---|
| 92 | + check(u"\u00D7\"", u"'\u00D7\"'", True) |
|---|
| 93 | + check(u"\u2621", u"\"\\u2621\"") |
|---|
| 94 | + check(u"'\u2621", u"\"'\\u2621\"") |
|---|
| 95 | + check(u"\"\u2621", u"\"\\\"\\u2621\"") |
|---|
| 96 | + |
|---|
| 97 | + def test_quote_output_utf8(self, enc='utf-8'): |
|---|
| 98 | + def check(inp, out, optional_quotes=False): |
|---|
| 99 | + self._check(inp, out.encode('utf-8'), enc, optional_quotes) |
|---|
| 100 | + |
|---|
| 101 | + self._test_quote_output_all(enc) |
|---|
| 102 | + check(u"\u2621", u"'\u2621'", True) |
|---|
| 103 | + check(u"'\u2621", u"\"'\u2621\"") |
|---|
| 104 | + check(u"\"\u2621", u"'\"\u2621'") |
|---|
| 105 | + check(u"\u2621\"", u"'\u2621\"'", True) |
|---|
| 106 | + |
|---|
| 107 | + @patch('sys.stdout') |
|---|
| 108 | + def test_quote_output_mock(self, mock_stdout): |
|---|
| 109 | + mock_stdout.encoding = 'ascii' |
|---|
| 110 | + _reload() |
|---|
| 111 | + self.test_quote_output_ascii(None) |
|---|
| 112 | + |
|---|
| 113 | + mock_stdout.encoding = 'latin1' |
|---|
| 114 | + _reload() |
|---|
| 115 | + self.test_quote_output_latin1(None) |
|---|
| 116 | + |
|---|
| 117 | + mock_stdout.encoding = 'utf-8' |
|---|
| 118 | + _reload() |
|---|
| 119 | + self.test_quote_output_utf8(None) |
|---|
| 120 | + |
|---|
| 121 | + |
|---|
| 122 | class UbuntuKarmicUTF8(EncodingUtil, unittest.TestCase): |
|---|
| 123 | uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64' |
|---|
| 124 | output = 'lumi\xc3\xa8re' |
|---|
| 125 | hunk ./src/allmydata/util/encodingutil.py 115 |
|---|
| 126 | return s |
|---|
| 127 | return s.encode(argv_encoding) |
|---|
| 128 | |
|---|
| 129 | -PRINTABLE_ASCII = re.compile(r'^[ -~\n\r]*$', re.DOTALL) |
|---|
| 130 | -PRINTABLE_8BIT = re.compile(r'^[ -&(-~\n\r\x80-\xFF]*$', re.DOTALL) |
|---|
| 131 | +PRINTABLE_ASCII = re.compile(r'^[\n\r\x20-\x7E]*$', re.DOTALL) |
|---|
| 132 | +PRINTABLE_8BIT = re.compile(r'^[\n\r\x20-\x7E\x80-\xFF]*$', re.DOTALL) |
|---|
| 133 | |
|---|
| 134 | def is_printable_ascii(s): |
|---|
| 135 | return PRINTABLE_ASCII.search(s) is not None |
|---|
| 136 | hunk ./src/allmydata/util/encodingutil.py 140 |
|---|
| 137 | (output_encoding, repr(s))) |
|---|
| 138 | return out |
|---|
| 139 | |
|---|
| 140 | + |
|---|
| 141 | +def _unicode_escape(m): |
|---|
| 142 | + u = m.group(0) |
|---|
| 143 | + if u == '"' or u == '$' or u == '`' or u == '\\': |
|---|
| 144 | + return u'\\' + u |
|---|
| 145 | + if len(u) == 2: |
|---|
| 146 | + codepoint = (ord(u[0])-0xD800)*0x400 + ord(u[1])-0xDC00 + 0x10000 |
|---|
| 147 | + else: |
|---|
| 148 | + codepoint = ord(u) |
|---|
| 149 | + if codepoint > 0xFFFF: |
|---|
| 150 | + return u'\\U%08x' % (codepoint,) |
|---|
| 151 | + elif codepoint > 0xFF: |
|---|
| 152 | + return u'\\u%04x' % (codepoint,) |
|---|
| 153 | + else: |
|---|
| 154 | + return u'\\x%02x' % (codepoint,) |
|---|
| 155 | + |
|---|
| 156 | +def _str_escape(m): |
|---|
| 157 | + c = m.group(0) |
|---|
| 158 | + if c == '"' or c == '$' or c == '`' or c == '\\': |
|---|
| 159 | + return '\\' + c |
|---|
| 160 | + else: |
|---|
| 161 | + return '\\x%02x' % (ord(c),) |
|---|
| 162 | + |
|---|
| 163 | +MUST_DOUBLE_QUOTE = re.compile(ur'[^\x20-\x26\x28-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', re.DOTALL) |
|---|
| 164 | + |
|---|
| 165 | +# if we must double-quote, then we have to escape ", $ and `, but need not escape ' |
|---|
| 166 | +ESCAPABLE_UNICODE = re.compile(ur'([\uD800-\uDBFF][\uDC00-\uDFFF])|' # valid surrogate pairs |
|---|
| 167 | + ur'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E\u00A0-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFC]', |
|---|
| 168 | + re.DOTALL) |
|---|
| 169 | + |
|---|
| 170 | +ESCAPABLE_8BIT = re.compile( r'[^ !#\x25-\x5B\x5D-\x5F\x61-\x7E]', re.DOTALL) |
|---|
| 171 | + |
|---|
| 172 | def quote_output(s, quotemarks=True, encoding=None): |
|---|
| 173 | """ |
|---|
| 174 | Encode either a Unicode string or a UTF-8-encoded bytestring for representation |
|---|
| 175 | hunk ./src/allmydata/util/encodingutil.py 176 |
|---|
| 176 | on stdout or stderr, tolerating errors. If 'quotemarks' is True, the string is |
|---|
| 177 | - always surrounded by single quotes; otherwise, it is quoted only if necessary to |
|---|
| 178 | - avoid ambiguity or control bytes in the output. |
|---|
| 179 | + always quoted; otherwise, it is quoted only if necessary to avoid ambiguity or |
|---|
| 180 | + control bytes in the output. |
|---|
| 181 | + Quoting may use either single or double quotes. Within single quotes, all |
|---|
| 182 | + characters stand for themselves, and ' will not appear. Within double quotes, |
|---|
| 183 | + Python-compatible backslash escaping is used. |
|---|
| 184 | """ |
|---|
| 185 | precondition(isinstance(s, (str, unicode)), s) |
|---|
| 186 | |
|---|
| 187 | hunk ./src/allmydata/util/encodingutil.py 188 |
|---|
| 188 | try: |
|---|
| 189 | s = s.decode('utf-8') |
|---|
| 190 | except UnicodeDecodeError: |
|---|
| 191 | - return 'b' + repr(s) |
|---|
| 192 | - |
|---|
| 193 | - try: |
|---|
| 194 | - out = s.encode(encoding or output_encoding) |
|---|
| 195 | - except (UnicodeEncodeError, UnicodeDecodeError): |
|---|
| 196 | - return repr(s) |
|---|
| 197 | + return 'b"%s"' % (ESCAPABLE_8BIT.sub(_str_escape, s),) |
|---|
| 198 | |
|---|
| 199 | hunk ./src/allmydata/util/encodingutil.py 190 |
|---|
| 200 | - if PRINTABLE_8BIT.search(out) is None: |
|---|
| 201 | - return repr(out) |
|---|
| 202 | + if MUST_DOUBLE_QUOTE.search(s) is None: |
|---|
| 203 | + try: |
|---|
| 204 | + out = s.encode(encoding or output_encoding) |
|---|
| 205 | + if quotemarks or out.startswith('"'): |
|---|
| 206 | + return "'%s'" % (out,) |
|---|
| 207 | + else: |
|---|
| 208 | + return out |
|---|
| 209 | + except (UnicodeDecodeError, UnicodeEncodeError): |
|---|
| 210 | + pass |
|---|
| 211 | |
|---|
| 212 | hunk ./src/allmydata/util/encodingutil.py 200 |
|---|
| 213 | - if quotemarks: |
|---|
| 214 | - return "'" + out.replace("\\", "\\\\").replace("'", "\'") + "'" |
|---|
| 215 | - else: |
|---|
| 216 | - return out |
|---|
| 217 | + escaped = ESCAPABLE_UNICODE.sub(_unicode_escape, s) |
|---|
| 218 | + return '"%s"' % (escaped.encode(encoding or output_encoding, 'backslashreplace'),) |
|---|
| 219 | |
|---|
| 220 | def quote_path(path, quotemarks=True): |
|---|
| 221 | return quote_output("/".join(map(to_str, path)), quotemarks=quotemarks) |
|---|
| 222 | } |
|---|
| 223 | |
|---|
| 224 | Context: |
|---|
| 225 | |
|---|
| 226 | [docs/specifications/dirnodes.txt: 'mesh'->'grid'. |
|---|
| 227 | david-sarah@jacaranda.org**20100723061616 |
|---|
| 228 | Ignore-this: 887bcf921ef00afba8e05e9239035bca |
|---|
| 229 | ] |
|---|
| 230 | [docs: use current cap to Zooko's wiki page in example text |
|---|
| 231 | zooko@zooko.com**20100721010543 |
|---|
| 232 | Ignore-this: 4f36f36758f9fdbaf9eb73eac23b6652 |
|---|
| 233 | fixes #1134 |
|---|
| 234 | ] |
|---|
| 235 | [docs/specifications/dirnodes.txt: bring layer terminology up-to-date with architecture.txt, and a few other updates (e.g. note that the MAC is no longer verified, and that URIs can be unknown). Also 'Tahoe'->'Tahoe-LAFS'. |
|---|
| 236 | david-sarah@jacaranda.org**20100723054703 |
|---|
| 237 | Ignore-this: f3b98183e7d0a0f391225b8b93ac6c37 |
|---|
| 238 | ] |
|---|
| 239 | [__init__.py: silence DeprecationWarning about BaseException.message globally. fixes #1129 |
|---|
| 240 | david-sarah@jacaranda.org**20100720011939 |
|---|
| 241 | Ignore-this: 38808986ba79cb2786b010504a22f89 |
|---|
| 242 | ] |
|---|
| 243 | [test_runner: test that 'tahoe --version' outputs no noise (e.g. DeprecationWarnings). |
|---|
| 244 | david-sarah@jacaranda.org**20100720011345 |
|---|
| 245 | Ignore-this: dd358b7b2e5d57282cbe133e8069702e |
|---|
| 246 | ] |
|---|
| 247 | [TAG allmydata-tahoe-1.7.1 |
|---|
| 248 | zooko@zooko.com**20100719131352 |
|---|
| 249 | Ignore-this: 6942056548433dc653a746703819ad8c |
|---|
| 250 | ] |
|---|
| 251 | Patch bundle hash: |
|---|
| 252 | d4aa6ac35c5dba44996999385ca90717c2525a3e |
|---|