Ticket #534: unicode-helper-functions-v4.diff

File unicode-helper-functions-v4.diff, 13.6 KB (added by francois, at 2010-05-20T00:54:48Z)
  • src/allmydata/test/test_stringutils.py

    Thu May 20 02:41:05 CEST 2010  Francois Deppierraz <francois@ctrlaltdel.ch>
      * stringutils.py: Unicode helper functions + associated tests
      
      This file contains a bunch of helper functions which convert
      unicode strings to and from argv, filenames and stdout.
    diff -rN -u old-tahoe-534/src/allmydata/test/test_stringutils.py new-tahoe-534/src/allmydata/test/test_stringutils.py
    old new  
     1# coding=utf-8
     2
     3TEST_FILENAMES = (
     4  u'Ärtonwall.mp3',
     5  u'test_file',
     6  u'Blah blah.txt',
     7)
     8
     9# The following main helps to generate a test class for other operating
     10# systems.
     11
     12if __name__ == "__main__":
     13    import sys, os
     14    import tempfile
     15    import shutil
     16    import platform
     17   
     18    if len(sys.argv) != 2:
     19        print "Usage: %s lumière" % sys.argv[0]
     20        sys.exit(1)
     21   
     22    print
     23    print "class MyWeirdOS(StringUtils, unittest.TestCase):"
     24    print "    uname = '%s'" % ' '.join(platform.uname())
     25    print "    argv = %s" % repr(sys.argv[1])
     26    print "    platform = '%s'" % sys.platform
     27    print "    filesystemencoding = '%s'" % sys.getfilesystemencoding()
     28    print "    stdoutencoding = '%s'" % sys.stdout.encoding
     29
     30    try:
     31        tmpdir = tempfile.mkdtemp()
     32        for fname in TEST_FILENAMES:
     33            open(os.path.join(tmpdir, fname), 'w').close()
     34
     35        # Use Unicode API under Windows or MacOS X
     36        if sys.platform in ('win32', 'darwin'):
     37            dirlist = os.listdir(unicode(tmpdir))
     38        else:
     39            dirlist = os.listdir(tmpdir)
     40
     41        print "    dirlist = %s" % repr(dirlist)
     42    except:
     43        print "    # Oops, I cannot write filenames containing non-ascii characters"
     44    print
     45
     46    shutil.rmtree(tmpdir)
     47    sys.exit(0)
     48
     49from twisted.trial import unittest
     50from mock import patch
     51import sys
     52
     53from allmydata.util.stringutils import argv_to_unicode, unicode_to_url, \
     54    unicode_to_stdout, unicode_platform, listdir_unicode, open_unicode, \
     55    FilenameEncodingError, get_term_encoding
     56from twisted.python import usage
     57
     58class StringUtilsErrors(unittest.TestCase):
     59    @patch('sys.stdout')
     60    def test_get_term_encoding(self, mock):
     61        mock.encoding = None
     62       
     63        self.failUnlessEqual(get_term_encoding(), 'ascii')
     64
     65    @patch('sys.stdout')
     66    def test_argv_to_unicode(self, mock):
     67        mock.encoding = 'utf-8'
     68
     69        self.failUnlessRaises(usage.UsageError,
     70                              argv_to_unicode,
     71                              u'lumière'.encode('latin1'))
     72
     73    def test_unicode_to_url(self):
     74        pass
     75
     76    @patch('sys.stdout')
     77    def test_unicode_to_stdout(self, mock):
     78        # Encoding koi8-r cannot represent 'è'
     79        mock.encoding = 'koi8-r'
     80        self.failUnlessEqual(unicode_to_stdout(u'lumière'), 'lumi?re')
     81
     82    @patch('os.listdir')
     83    def test_unicode_normalization(self, mock):
     84        # Pretend to run on an Unicode platform such as Windows
     85        orig_platform = sys.platform
     86        sys.platform = 'win32'
     87
     88        mock.return_value = [u'A\u0308rtonwall.mp3']
     89        self.failUnlessEqual(listdir_unicode(u'/dummy'), [u'\xc4rtonwall.mp3'])
     90
     91        sys.platform = orig_platform
     92
     93# The following tests applies only to platforms which don't store filenames as
     94# Unicode entities on the filesystem.
     95class StringUtilsNonUnicodePlatform(unittest.TestCase):
     96    def setUp(self):
     97        # Mock sys.platform because unicode_platform() uses it
     98        self.original_platform = sys.platform
     99        sys.platform = 'linux'
     100
     101    def tearDown(self):
     102        sys.platform = self.original_platform
     103
     104    @patch('sys.getfilesystemencoding')
     105    @patch('os.listdir')
     106    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
     107        # What happen if a latin1-encoded filenames is encountered on an UTF-8
     108        # filesystem?
     109        mock_listdir.return_value = [
     110            u'lumière'.encode('utf-8'),
     111            u'lumière'.encode('latin1')]
     112
     113        mock_getfilesystemencoding.return_value = 'utf-8'
     114       
     115        self.failUnlessRaises(FilenameEncodingError,
     116                              listdir_unicode,
     117                              u'/dummy')
     118       
     119        # We're trying to list a directory whose name cannot be represented in
     120        # the filesystem encoding.  This should fail.
     121        mock_getfilesystemencoding.return_value = 'ascii'
     122        self.failUnlessRaises(FilenameEncodingError,
     123                              listdir_unicode,
     124                              u'/lumière')
     125
     126    @patch('sys.getfilesystemencoding')
     127    def test_open_unicode(self, mock):
     128        mock.return_value = 'ascii'
     129
     130        self.failUnlessRaises(FilenameEncodingError,
     131                              open_unicode,
     132                              u'lumière')
     133
     134class StringUtils():
     135    def setUp(self):
     136        # Mock sys.platform because unicode_platform() uses it
     137        self.original_platform = sys.platform
     138        sys.platform = self.platform
     139
     140    def tearDown(self):
     141        sys.platform = self.original_platform
     142
     143    @patch('sys.stdout')
     144    def test_argv_to_unicode(self, mock):
     145        if 'argv' not in dir(self):
     146            raise unittest.SkipTest("There's no way to pass non-ASCII arguments in CLI on this (mocked) platform")
     147
     148        mock.encoding = self.stdoutencoding
     149
     150        argu = u'lumière'
     151        argv = self.argv
     152
     153        self.failUnlessEqual(argv_to_unicode(argv), argu)
     154
     155    def test_unicode_to_url(self):
     156        self.failUnless(unicode_to_url(u'lumière'), u'lumière'.encode('utf-8'))
     157
     158    @patch('sys.stdout')
     159    def test_unicode_to_stdout(self, mock):
     160        if 'argv' not in dir(self):
     161            raise unittest.SkipTest("There's no way to pass non-ASCII arguments in CLI on this (mocked) platform")
     162
     163        mock.encoding = self.stdoutencoding
     164        self.failUnlessEqual(unicode_to_stdout(u'lumière'), self.argv)
     165
     166    def test_unicode_platform(self):
     167        matrix = {
     168          'linux2': False,
     169          'win32':  True,
     170          'darwin': True,
     171        }
     172
     173        self.failUnlessEqual(unicode_platform(), matrix[self.platform])
     174 
     175    @patch('sys.getfilesystemencoding')
     176    @patch('os.listdir')
     177    def test_listdir_unicode(self, mock_listdir, mock_getfilesystemencoding):
     178
     179        mock_listdir.return_value = self.dirlist
     180        mock_getfilesystemencoding.return_value = self.filesystemencoding
     181       
     182        filenames = listdir_unicode(u'/dummy')
     183
     184        for fname in TEST_FILENAMES:
     185            self.failUnless(isinstance(fname, unicode))
     186
     187            if fname not in filenames:
     188                self.fail("Cannot find %r in %r" % (fname, filenames))
     189
     190    @patch('os.open')
     191    def test_open_unicode(self, mock):
     192
     193        self.failUnlessRaises(IOError,
     194                              open_unicode,
     195                              u'/dummy_directory/lumière.txt')
     196
     197
     198class UbuntuKarmicUTF8(StringUtils, unittest.TestCase):
     199    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
     200    argv = 'lumi\xc3\xa8re'
     201    platform = 'linux2'
     202    filesystemencoding = 'UTF-8'
     203    stdoutencoding = 'UTF-8'
     204    dirlist = ['test_file', '\xc3\x84rtonwall.mp3', 'Blah blah.txt']
     205
     206
     207class UbuntuKarmicLatin1(StringUtils, unittest.TestCase):
     208    uname = 'Linux korn 2.6.31-14-generic #48-Ubuntu SMP Fri Oct 16 14:05:01 UTC 2009 x86_64'
     209    argv = 'lumi\xe8re'
     210    platform = 'linux2'
     211    filesystemencoding = 'ISO-8859-1'
     212    stdoutencoding = 'ISO-8859-1'
     213    dirlist = ['test_file', 'Blah blah.txt', '\xc4rtonwall.mp3']
     214
     215class WindowsXP(StringUtils, unittest.TestCase):
     216    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
     217    argv = 'lumi\xe8re'
     218    platform = 'win32'
     219    filesystemencoding = 'mbcs'
     220    stdoutencoding = 'cp850'
     221    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
     222
     223    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
     224
     225class WindowsXP_UTF8(StringUtils, unittest.TestCase):
     226    uname = 'Windows XP 5.1.2600 x86 x86 Family 15 Model 75 Step ping 2, AuthenticAMD'
     227    argv = 'lumi\xe8re'
     228    platform = 'win32'
     229    filesystemencoding = 'mbcs'
     230    stdoutencoding = 'cp65001'
     231    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
     232
     233    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
     234
     235class WindowsVista(StringUtils, unittest.TestCase):
     236    uname = 'Windows Vista 6.0.6000 x86 x86 Family 6 Model 15 Stepping 11, GenuineIntel'
     237    argv = 'lumi\xe8re'
     238    platform = 'win32'
     239    filesystemencoding = 'mbcs'
     240    stdoutencoding = 'cp850'
     241    dirlist = [u'Blah blah.txt', u'test_file', u'\xc4rtonwall.mp3']
     242
     243    todo = "Unicode arguments on the command-line is not yet supported under Windows, see bug #565."
     244
     245class MacOSXLeopard(StringUtils, unittest.TestCase):
     246    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
     247    argv = 'lumi\xc3\xa8re'
     248    platform = 'darwin'
     249    filesystemencoding = 'utf-8'
     250    stdoutencoding = 'UTF-8'
     251    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
     252
     253class MacOSXLeopard7bit(StringUtils, unittest.TestCase):
     254    uname = 'Darwin g5.local 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul 15 16:57:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_PPC Power Macintosh powerpc'
     255    #argv = 'lumiere'
     256    platform = 'darwin'
     257    filesystemencoding = 'utf-8'
     258    stdoutencoding = 'US-ASCII'
     259    dirlist = [u'A\u0308rtonwall.mp3', u'Blah blah.txt', u'test_file']
  • src/allmydata/util/stringutils.py

    diff -rN -u old-tahoe-534/src/allmydata/util/stringutils.py new-tahoe-534/src/allmydata/util/stringutils.py
    old new  
     1"""
     2Functions used to convert inputs from whatever encoding used in the system to
     3unicode and back.
     4"""
     5
     6import sys
     7import os
     8import unicodedata
     9from allmydata.util.assertutil import precondition
     10from twisted.python import usage
     11
     12def get_term_encoding():
     13    """
     14    Returns expected encoding for writing to the terminal and reading
     15    arguments from the command-line.
     16    """
     17
     18    if sys.stdout.encoding == None:
     19        return 'ascii'
     20    else:
     21        return sys.stdout.encoding
     22
     23def argv_to_unicode(s):
     24    """
     25    Decode given argv element to unicode.
     26    """
     27    # Try to decode the command-line argument with the encoding returned by
     28    # get_term_encoding(), if this fails print an error message to the user.
     29
     30    precondition(isinstance(s, str), s)
     31
     32    try:
     33        return unicode(s, get_term_encoding())
     34    except UnicodeDecodeError:
     35        raise usage.UsageError("Argument '%s' cannot be decoded as %s." %
     36                               (s, get_term_encoding()))
     37
     38def unicode_to_url(s):
     39    """
     40    Encode an unicode object used in an URL.
     41    """
     42    # According to RFC 2718, non-ascii characters in url's must be UTF-8 encoded.
     43
     44    precondition(isinstance(s, unicode), s)
     45    return s.encode('utf-8')
     46
     47def unicode_to_stdout(s):
     48    """
     49    Encode an unicode object for representation on stdout.
     50    """
     51
     52    precondition(isinstance(s, unicode), s)
     53    return s.encode(get_term_encoding(), 'replace')
     54
     55def unicode_platform():
     56    """
     57    Does the current platform handle Unicode filenames natively ?
     58    """
     59
     60    return sys.platform in ('win32', 'darwin')
     61
     62class FilenameEncodingError(Exception):
     63    """
     64    Filename cannot be encoded using the current encoding of your filesystem
     65    (%s). Please configure your locale correctly or rename this file.
     66    """
     67
     68    pass
     69
     70def listdir_unicode_unix(path):
     71    """
     72    This function emulates an Unicode API under Unix similar to one available
     73    under Windows or MacOS X.
     74
     75    If badly encoded filenames are encountered, an exception is raised.
     76    """
     77    precondition(isinstance(path, unicode), path)
     78
     79    encoding = sys.getfilesystemencoding()
     80    try:
     81        byte_path = path.encode(encoding)
     82    except UnicodeEncodeError:
     83        raise FilenameEncodingError(path)
     84
     85    try:
     86        return [unicode(fn, encoding) for fn in os.listdir(byte_path)]
     87    except UnicodeDecodeError:
     88        raise FilenameEncodingError(fn)
     89
     90def listdir_unicode(path, encoding = None):
     91    """
     92    Wrapper around listdir() which provides safe access to the convenient
     93    Unicode API even under Unix.
     94    """
     95
     96    precondition(isinstance(path, unicode), path)
     97
     98    # On Windows and MacOS X, the Unicode API is used
     99    if unicode_platform():
     100        dirlist = os.listdir(path)
     101
     102    # On other platforms (ie. Unix systems), the byte-level API is used
     103    else:
     104        dirlist = listdir_unicode_unix(path)
     105
     106    # Normalize the resulting unicode filenames
     107    #
     108    # This prevents different OS from generating non-equal unicode strings for
     109    # the same filename representation
     110    return [unicodedata.normalize('NFC', fname) for fname in dirlist]
     111
     112def open_unicode(path, mode='r'):
     113    """
     114    Wrapper around open() which provides safe access to the convenient Unicode
     115    API even under Unix.
     116    """
     117
     118    precondition(isinstance(path, unicode), path)
     119
     120    if unicode_platform():
     121        return open(path, mode)
     122    else:
     123        encoding = sys.getfilesystemencoding()
     124
     125        try:
     126            return open(path.encode(encoding), mode)
     127        except UnicodeEncodeError:
     128            raise FilenameEncodingError(path)