__init__.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. """ Standard "encodings" Package
  2. Standard Python encoding modules are stored in this package
  3. directory.
  4. Codec modules must have names corresponding to normalized encoding
  5. names as defined in the normalize_encoding() function below, e.g.
  6. 'utf-8' must be implemented by the module 'utf_8.py'.
  7. Each codec module must export the following interface:
  8. * getregentry() -> codecs.CodecInfo object
  9. The getregentry() API must return a CodecInfo object with encoder, decoder,
  10. incrementalencoder, incrementaldecoder, streamwriter and streamreader
  11. attributes which adhere to the Python Codec Interface Standard.
  12. In addition, a module may optionally also define the following
  13. APIs which are then used by the package's codec search function:
  14. * getaliases() -> sequence of encoding name strings to use as aliases
  15. Alias names returned by getaliases() must be normalized encoding
  16. names as defined by normalize_encoding().
  17. Written by Marc-Andre Lemburg (mal@lemburg.com).
  18. (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
  19. """#"
  20. import codecs
  21. from encodings import aliases
  22. import __builtin__
  23. _cache = {}
  24. _unknown = '--unknown--'
  25. _import_tail = ['*']
  26. _norm_encoding_map = (' . '
  27. '0123456789 ABCDEFGHIJKLMNOPQRSTUVWXYZ '
  28. ' abcdefghijklmnopqrstuvwxyz '
  29. ' '
  30. ' '
  31. ' ')
  32. _aliases = aliases.aliases
  33. class CodecRegistryError(LookupError, SystemError):
  34. pass
  35. def normalize_encoding(encoding):
  36. """ Normalize an encoding name.
  37. Normalization works as follows: all non-alphanumeric
  38. characters except the dot used for Python package names are
  39. collapsed and replaced with a single underscore, e.g. ' -;#'
  40. becomes '_'. Leading and trailing underscores are removed.
  41. Note that encoding names should be ASCII only; if they do use
  42. non-ASCII characters, these must be Latin-1 compatible.
  43. """
  44. # Make sure we have an 8-bit string, because .translate() works
  45. # differently for Unicode strings.
  46. if hasattr(__builtin__, "unicode") and isinstance(encoding, unicode):
  47. # Note that .encode('latin-1') does *not* use the codec
  48. # registry, so this call doesn't recurse. (See unicodeobject.c
  49. # PyUnicode_AsEncodedString() for details)
  50. encoding = encoding.encode('latin-1')
  51. return '_'.join(encoding.translate(_norm_encoding_map).split())
  52. def search_function(encoding):
  53. # Cache lookup
  54. entry = _cache.get(encoding, _unknown)
  55. if entry is not _unknown:
  56. return entry
  57. # Import the module:
  58. #
  59. # First try to find an alias for the normalized encoding
  60. # name and lookup the module using the aliased name, then try to
  61. # lookup the module using the standard import scheme, i.e. first
  62. # try in the encodings package, then at top-level.
  63. #
  64. norm_encoding = normalize_encoding(encoding)
  65. aliased_encoding = _aliases.get(norm_encoding) or \
  66. _aliases.get(norm_encoding.replace('.', '_'))
  67. if aliased_encoding is not None:
  68. modnames = [aliased_encoding,
  69. norm_encoding]
  70. else:
  71. modnames = [norm_encoding]
  72. for modname in modnames:
  73. if not modname or '.' in modname:
  74. continue
  75. try:
  76. # Import is absolute to prevent the possibly malicious import of a
  77. # module with side-effects that is not in the 'encodings' package.
  78. mod = __import__('encodings.' + modname, fromlist=_import_tail,
  79. level=0)
  80. except ImportError:
  81. pass
  82. else:
  83. break
  84. else:
  85. mod = None
  86. try:
  87. getregentry = mod.getregentry
  88. except AttributeError:
  89. # Not a codec module
  90. mod = None
  91. if mod is None:
  92. # Cache misses
  93. _cache[encoding] = None
  94. return None
  95. # Now ask the module for the registry entry
  96. entry = getregentry()
  97. if not isinstance(entry, codecs.CodecInfo):
  98. if not 4 <= len(entry) <= 7:
  99. raise CodecRegistryError,\
  100. 'module "%s" (%s) failed to register' % \
  101. (mod.__name__, mod.__file__)
  102. if not hasattr(entry[0], '__call__') or \
  103. not hasattr(entry[1], '__call__') or \
  104. (entry[2] is not None and not hasattr(entry[2], '__call__')) or \
  105. (entry[3] is not None and not hasattr(entry[3], '__call__')) or \
  106. (len(entry) > 4 and entry[4] is not None and not hasattr(entry[4], '__call__')) or \
  107. (len(entry) > 5 and entry[5] is not None and not hasattr(entry[5], '__call__')):
  108. raise CodecRegistryError,\
  109. 'incompatible codecs in module "%s" (%s)' % \
  110. (mod.__name__, mod.__file__)
  111. if len(entry)<7 or entry[6] is None:
  112. entry += (None,)*(6-len(entry)) + (mod.__name__.split(".", 1)[1],)
  113. entry = codecs.CodecInfo(*entry)
  114. # Cache the codec registry entry
  115. _cache[encoding] = entry
  116. # Register its aliases (without overwriting previously registered
  117. # aliases)
  118. try:
  119. codecaliases = mod.getaliases()
  120. except AttributeError:
  121. pass
  122. else:
  123. for alias in codecaliases:
  124. if alias not in _aliases:
  125. _aliases[alias] = modname
  126. # Return the registry entry
  127. return entry
# Register the search_function in the Python codec registry.  This runs
# as a side effect of importing this module.
codecs.register(search_function)