utf_8_sig.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. """ Python 'utf-8-sig' Codec
  2. This works similarly to UTF-8, with the following changes:
  3. * On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
  4. first three bytes.
  5. * On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
  6. bytes will be skipped.
  7. """
  8. import codecs
  9. ### Codec APIs
  10. def encode(input, errors='strict'):
  11. return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
  12. def decode(input, errors='strict'):
  13. prefix = 0
  14. if input[:3] == codecs.BOM_UTF8:
  15. input = input[3:]
  16. prefix = 3
  17. (output, consumed) = codecs.utf_8_decode(input, errors, True)
  18. return (output, consumed+prefix)
  19. class IncrementalEncoder(codecs.IncrementalEncoder):
  20. def __init__(self, errors='strict'):
  21. codecs.IncrementalEncoder.__init__(self, errors)
  22. self.first = 1
  23. def encode(self, input, final=False):
  24. if self.first:
  25. self.first = 0
  26. return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
  27. else:
  28. return codecs.utf_8_encode(input, self.errors)[0]
  29. def reset(self):
  30. codecs.IncrementalEncoder.reset(self)
  31. self.first = 1
  32. def getstate(self):
  33. return self.first
  34. def setstate(self, state):
  35. self.first = state
  36. class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
  37. def __init__(self, errors='strict'):
  38. codecs.BufferedIncrementalDecoder.__init__(self, errors)
  39. self.first = True
  40. def _buffer_decode(self, input, errors, final):
  41. if self.first:
  42. if len(input) < 3:
  43. if codecs.BOM_UTF8.startswith(input):
  44. # not enough data to decide if this really is a BOM
  45. # => try again on the next call
  46. return (u"", 0)
  47. else:
  48. self.first = None
  49. else:
  50. self.first = None
  51. if input[:3] == codecs.BOM_UTF8:
  52. (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
  53. return (output, consumed+3)
  54. return codecs.utf_8_decode(input, errors, final)
  55. def reset(self):
  56. codecs.BufferedIncrementalDecoder.reset(self)
  57. self.first = True
  58. class StreamWriter(codecs.StreamWriter):
  59. def reset(self):
  60. codecs.StreamWriter.reset(self)
  61. try:
  62. del self.encode
  63. except AttributeError:
  64. pass
  65. def encode(self, input, errors='strict'):
  66. self.encode = codecs.utf_8_encode
  67. return encode(input, errors)
  68. class StreamReader(codecs.StreamReader):
  69. def reset(self):
  70. codecs.StreamReader.reset(self)
  71. try:
  72. del self.decode
  73. except AttributeError:
  74. pass
  75. def decode(self, input, errors='strict'):
  76. if len(input) < 3:
  77. if codecs.BOM_UTF8.startswith(input):
  78. # not enough data to decide if this is a BOM
  79. # => try again on the next call
  80. return (u"", 0)
  81. elif input[:3] == codecs.BOM_UTF8:
  82. self.decode = codecs.utf_8_decode
  83. (output, consumed) = codecs.utf_8_decode(input[3:],errors)
  84. return (output, consumed+3)
  85. # (else) no BOM present
  86. self.decode = codecs.utf_8_decode
  87. return codecs.utf_8_decode(input, errors)
  88. ### encodings module API
  89. def getregentry():
  90. return codecs.CodecInfo(
  91. name='utf-8-sig',
  92. encode=encode,
  93. decode=decode,
  94. incrementalencoder=IncrementalEncoder,
  95. incrementaldecoder=IncrementalDecoder,
  96. streamreader=StreamReader,
  97. streamwriter=StreamWriter,
  98. )