encoding.py 1002 B

12345678910111213141516171819202122232425262728293031
  1. import codecs
  2. import locale
  3. import re
  4. BOMS = [
  5. (codecs.BOM_UTF8, 'utf8'),
  6. (codecs.BOM_UTF16, 'utf16'),
  7. (codecs.BOM_UTF16_BE, 'utf16-be'),
  8. (codecs.BOM_UTF16_LE, 'utf16-le'),
  9. (codecs.BOM_UTF32, 'utf32'),
  10. (codecs.BOM_UTF32_BE, 'utf32-be'),
  11. (codecs.BOM_UTF32_LE, 'utf32-le'),
  12. ]
  13. ENCODING_RE = re.compile(b'coding[:=]\s*([-\w.]+)')
  14. def auto_decode(data):
  15. """Check a bytes string for a BOM to correctly detect the encoding
  16. Fallback to locale.getpreferredencoding(False) like open() on Python3"""
  17. for bom, encoding in BOMS:
  18. if data.startswith(bom):
  19. return data[len(bom):].decode(encoding)
  20. # Lets check the first two lines as in PEP263
  21. for line in data.split(b'\n')[:2]:
  22. if line[0:1] == b'#' and ENCODING_RE.search(line):
  23. encoding = ENCODING_RE.search(line).groups()[0].decode('ascii')
  24. return data.decode(encoding)
  25. return data.decode(locale.getpreferredencoding(False))