12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113 |
- """ codecs -- Python Codec Registry, API and helpers.
- Written by Marc-Andre Lemburg (mal@lemburg.com).
- (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
- """#"
- import __builtin__, sys
- ### Registry and builtin stateless codec functions
- try:
- from _codecs import *
- except ImportError, why:
- raise SystemError('Failed to load the builtin codecs: %s' % why)
# Names exported by "from codecs import *".
# NOTE(review): several entries (e.g. "register", "lookup", "encode",
# "decode", "register_error", "lookup_error") are not defined in the
# visible part of this module; presumably they are provided by the
# "from _codecs import *" star-import above -- confirm.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF) spelled out
# as the byte strings it takes in UTF-8, UTF-16 and UTF-32, for both
# byte orders.
#

# UTF-8 has a single byte order
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16: little endian, then big endian
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'

# UTF-32: little endian, then big endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Short aliases for the UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# Native-endianness variants: anything that is not 'little' is treated
# as big endian.
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE
BOM = BOM_UTF16

# Old broken names (don't use in new code): the "32" names hold the
# UTF-16 marks and the "64" names hold the UTF-32 marks.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
- ### Codec base classes (defining the API)
class CodecInfo(tuple):
    """ Codec details as handed out by the codec registry.

        An instance is the 4-tuple (encode, decode, streamreader,
        streamwriter) and additionally carries those four entries, the
        incremental encoder/decoder factories and the codec name as
        attributes.
    """

    # Private API that lets Python blacklist the known non-Unicode
    # codecs of the standard library.  A more general mechanism to
    # reliably distinguish text encodings from other codecs will
    # hopefully be defined for Python 3.5.
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True  # codecs count as text encodings by default

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None,
                _is_text_encoding=None):
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        # stateless API
        info.encode = encode
        info.decode = decode
        # incremental API
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        # stream API
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        # Only shadow the class-level default when the caller was explicit
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() take an errors argument selecting
        the error handling scheme.  The predefined values are:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input; return the tuple
            (output object, length consumed).

            errors selects the error handling and defaults to 'strict'.

            Implementations may not keep state in the Codec instance;
            codecs that need state to encode efficiently should use
            StreamWriter.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decode the object input; return the tuple
            (output object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot.  Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling and defaults to 'strict'.

            Implementations may not keep state in the Codec instance;
            codecs that need state to decode efficiently should use
            StreamReader.

            Zero length input must be accepted and must produce an
            empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError
class IncrementalEncoder(object):
    """
    Encoder that processes its input in several steps.

    The input is handed to encode() piece by piece; the encoder keeps
    the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme; see the module
        docstring for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to the initial state (no-op in this base class).
        """

    def getstate(self):
        """
        Return the current state of the encoder (always 0 in this base
        class).
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder; state must have been
        returned by getstate().  This base implementation ignores it.
        """
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that have to hold back part of
    their input between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input handed to encode() that has not been encoded yet
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend whatever the previous call left over
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # Anything not consumed is retried on the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as 0
        return self.buffer or 0

    def setstate(self, state):
        # A false state (e.g. 0) stands for an empty buffer
        self.buffer = state or ""
class IncrementalDecoder(object):
    """
    Decoder that processes its input in several steps.

    The input is handed to decode() piece by piece; the decoder keeps
    the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme; see the module
        docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to the initial state (no-op in this base class).
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        The state is a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object holding the bytes that
        were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the decoder state WITHOUT the contents of buffered_input having
        been processed.  In the initial state and after reset(),
        getstate() must return (b"", 0) -- which is what this base
        implementation always returns.
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  Calling
        setstate((b"", 0)) must have the same effect as reset().
        This base implementation ignores state.
        """
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that have to cope with
    incomplete byte sequences at the end of a chunk.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input handed to decode() that has not been decoded yet
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend whatever the previous call left over
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # Anything not consumed is retried on the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # The second item (additional state info) is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # Only the buffered input is restored; the second item is ignored
        self.buffer = state[0]
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Create a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme.  The predefined
            values are:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.encode() and write the result to
            self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the given strings and pass the result to .write()
            in a single call.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flush and reset the codec buffers used for keeping state.

            Calling this method should ensure that the data on the
            output is put into a clean state, that allows appending
            of new fresh data without having to rescan the whole
            stream to recover state.

            This base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream; the codec state is reset
            only when seeking to the very start (offset 0, whence 0).
        """
        self.stream.seek(offset, whence)
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every attribute not found on the writer on the
            underlying stream.
        """
        stream = self.stream
        return getattr(stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # Leaving the "with" block closes the wrapped stream
        self.stream.close()
- ###
- class StreamReader(Codec):
- def __init__(self, stream, errors='strict'):
- """ Creates a StreamReader instance.
- stream must be a file-like object open for reading
- (binary) data.
- The StreamReader may use different error handling
- schemes by providing the errors keyword argument. These
- parameters are predefined:
- 'strict' - raise a ValueError (or a subclass)
- 'ignore' - ignore the character and continue with the next
- 'replace'- replace with a suitable replacement character;
- The set of allowed parameter values can be extended via
- register_error.
- """
- self.stream = stream
- self.errors = errors
- self.bytebuffer = ""
- # For str->str decoding this will stay a str
- # For str->unicode decoding the first read will promote it to unicode
- self.charbuffer = ""
- self.linebuffer = None
- def decode(self, input, errors='strict'):
- raise NotImplementedError
- def read(self, size=-1, chars=-1, firstline=False):
- """ Decodes data from the stream self.stream and returns the
- resulting object.
- chars indicates the number of characters to read from the
- stream. read() will never return more than chars
- characters, but it might return less, if there are not enough
- characters available.
- size indicates the approximate maximum number of bytes to
- read from the stream for decoding purposes. The decoder
- can modify this setting as appropriate. The default value
- -1 indicates to read and decode as much as possible. size
- is intended to prevent having to decode huge files in one
- step.
- If firstline is true, and a UnicodeDecodeError happens
- after the first line terminator in the input only the first line
- will be returned, the rest of the input will be kept until the
- next call to read().
- The method should use a greedy read strategy meaning that
- it should read as much data as is allowed within the
- definition of the encoding and the given size, e.g. if
- optional encoding endings or state markers are available
- on the stream, these should be read too.
- """
- # If we have lines cached, first merge them back into characters
- if self.linebuffer:
- self.charbuffer = "".join(self.linebuffer)
- self.linebuffer = None
- # read until we get the required number of characters (if available)
- while True:
- # can the request be satisfied from the character buffer?
- if chars >= 0:
- if len(self.charbuffer) >= chars:
- break
- elif size >= 0:
- if len(self.charbuffer) >= size:
- break
- # we need more data
- if size < 0:
- newdata = self.stream.read()
- else:
- newdata = self.stream.read(size)
- # decode bytes (those remaining from the last call included)
- data = self.bytebuffer + newdata
- try:
- newchars, decodedbytes = self.decode(data, self.errors)
- except UnicodeDecodeError, exc:
- if firstline:
- newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
- lines = newchars.splitlines(True)
- if len(lines)<=1:
- raise
- else:
- raise
- # keep undecoded bytes until the next call
- self.bytebuffer = data[decodedbytes:]
- # put new characters in the character buffer
- self.charbuffer += newchars
- # there was no data available
- if not newdata:
- break
- if chars < 0:
- # Return everything we've got
- result = self.charbuffer
- self.charbuffer = ""
- else:
- # Return the first chars characters
- result = self.charbuffer[:chars]
- self.charbuffer = self.charbuffer[chars:]
- return result
- def readline(self, size=None, keepends=True):
- """ Read one line from the input stream and return the
- decoded data.
- size, if given, is passed as size argument to the
- read() method.
- """
- # If we have lines cached from an earlier read, return
- # them unconditionally
- if self.linebuffer:
- line = self.linebuffer[0]
- del self.linebuffer[0]
- if len(self.linebuffer) == 1:
- # revert to charbuffer mode; we might need more data
- # next time
- self.charbuffer = self.linebuffer[0]
- self.linebuffer = None
- if not keepends:
- line = line.splitlines(False)[0]
- return line
- readsize = size or 72
- line = ""
- # If size is given, we call read() only once
- while True:
- data = self.read(readsize, firstline=True)
- if data:
- # If we're at a "\r" read one extra character (which might
- # be a "\n") to get a proper line ending. If the stream is
- # temporarily exhausted we return the wrong line ending.
- if data.endswith("\r"):
- data += self.read(size=1, chars=1)
- line += data
- lines = line.splitlines(True)
- if lines:
- if len(lines) > 1:
- # More than one line result; the first line is a full line
- # to return
- line = lines[0]
- del lines[0]
- if len(lines) > 1:
- # cache the remaining lines
- lines[-1] += self.charbuffer
- self.linebuffer = lines
- self.charbuffer = None
- else:
- # only one remaining line, put it back into charbuffer
- self.charbuffer = lines[0] + self.charbuffer
- if not keepends:
- line = line.splitlines(False)[0]
- break
- line0withend = lines[0]
- line0withoutend = lines[0].splitlines(False)[0]
- if line0withend != line0withoutend: # We really have a line end
- # Put the rest back together and keep it until the next call
- self.charbuffer = "".join(lines[1:]) + self.charbuffer
- if keepends:
- line = line0withend
- else:
- line = line0withoutend
- break
- # we didn't get anything or this was our only try
- if not data or size is not None:
- if line and not keepends:
- line = line.splitlines(False)[0]
- break
- if readsize<8000:
- readsize *= 2
- return line
- def readlines(self, sizehint=None, keepends=True):
- """ Read all lines available on the input stream
- and return them as list of lines.
- Line breaks are implemented using the codec's decoder
- method and are included in the list entries.
- sizehint, if given, is ignored since there is no efficient
- way to finding the true end-of-line.
- """
- data = self.read()
- return data.splitlines(keepends)
- def reset(self):
- """ Resets the codec buffers used for keeping state.
- Note that no stream repositioning should take place.
- This method is primarily intended to be able to recover
- from decoding errors.
- """
- self.bytebuffer = ""
- self.charbuffer = u""
- self.linebuffer = None
- def seek(self, offset, whence=0):
- """ Set the input stream's current position.
- Resets the codec buffers used for keeping state.
- """
- self.stream.seek(offset, whence)
- self.reset()
- def next(self):
- """ Return the next decoded line from the input stream."""
- line = self.readline()
- if line:
- return line
- raise StopIteration
- def __iter__(self):
- return self
- def __getattr__(self, name,
- getattr=getattr):
- """ Inherit all other methods from the underlying stream.
- """
- return getattr(self.stream, name)
- def __enter__(self):
- return self
- def __exit__(self, type, value, tb):
- self.stream.close()
- ###
- class StreamReaderWriter:
- """ StreamReaderWriter instances allow wrapping streams which
- work in both read and write modes.
- The design is such that one can use the factory functions
- returned by the codec.lookup() function to construct the
- instance.
- """
- # Optional attributes set by the file wrappers below
- encoding = 'unknown'
- def __init__(self, stream, Reader, Writer, errors='strict'):
- """ Creates a StreamReaderWriter instance.
- stream must be a Stream-like object.
- Reader, Writer must be factory functions or classes
- providing the StreamReader, StreamWriter interface resp.
- Error handling is done in the same way as defined for the
- StreamWriter/Readers.
- """
- self.stream = stream
- self.reader = Reader(stream, errors)
- self.writer = Writer(stream, errors)
- self.errors = errors
- def read(self, size=-1):
- return self.reader.read(size)
- def readline(self, size=None):
- return self.reader.readline(size)
- def readlines(self, sizehint=None):
- return self.reader.readlines(sizehint)
- def next(self):
- """ Return the next decoded line from the input stream."""
- return self.reader.next()
- def __iter__(self):
- return self
- def write(self, data):
- return self.writer.write(data)
- def writelines(self, list):
- return self.writer.writelines(list)
- def reset(self):
- self.reader.reset()
- self.writer.reset()
- def seek(self, offset, whence=0):
- self.stream.seek(offset, whence)
- self.reader.reset()
- if whence == 0 and offset == 0:
- self.writer.reset()
- def __getattr__(self, name,
- getattr=getattr):
- """ Inherit all other methods from the underlying stream.
- """
- return getattr(self.stream, name)
- # these are needed to make "with codecs.open(...)" work properly
- def __enter__(self):
- return self
- def __exit__(self, type, value, tb):
- self.stream.close()
- ###
- class StreamRecoder:
- """ StreamRecoder instances provide a frontend - backend
- view of encoding data.
- They use the complete set of APIs returned by the
- codecs.lookup() function to implement their task.
- Data written to the stream is first decoded into an
- intermediate format (which is dependent on the given codec
- combination) and then written to the stream using an instance
- of the provided Writer class.
- In the other direction, data is read from the stream using a
- Reader instance and then return encoded data to the caller.
- """
- # Optional attributes set by the file wrappers below
- data_encoding = 'unknown'
- file_encoding = 'unknown'
- def __init__(self, stream, encode, decode, Reader, Writer,
- errors='strict'):
- """ Creates a StreamRecoder instance which implements a two-way
- conversion: encode and decode work on the frontend (the
- input to .read() and output of .write()) while
- Reader and Writer work on the backend (reading and
- writing to the stream).
- You can use these objects to do transparent direct
- recodings from e.g. latin-1 to utf-8 and back.
- stream must be a file-like object.
- encode, decode must adhere to the Codec interface, Reader,
- Writer must be factory functions or classes providing the
- StreamReader, StreamWriter interface resp.
- encode and decode are needed for the frontend translation,
- Reader and Writer for the backend translation. Unicode is
- used as intermediate encoding.
- Error handling is done in the same way as defined for the
- StreamWriter/Readers.
- """
- self.stream = stream
- self.encode = encode
- self.decode = decode
- self.reader = Reader(stream, errors)
- self.writer = Writer(stream, errors)
- self.errors = errors
- def read(self, size=-1):
- data = self.reader.read(size)
- data, bytesencoded = self.encode(data, self.errors)
- return data
- def readline(self, size=None):
- if size is None:
- data = self.reader.readline()
- else:
- data = self.reader.readline(size)
- data, bytesencoded = self.encode(data, self.errors)
- return data
- def readlines(self, sizehint=None):
- data = self.reader.read()
- data, bytesencoded = self.encode(data, self.errors)
- return data.splitlines(1)
- def next(self):
- """ Return the next decoded line from the input stream."""
- data = self.reader.next()
- data, bytesencoded = self.encode(data, self.errors)
- return data
- def __iter__(self):
- return self
- def write(self, data):
- data, bytesdecoded = self.decode(data, self.errors)
- return self.writer.write(data)
- def writelines(self, list):
- data = ''.join(list)
- data, bytesdecoded = self.decode(data, self.errors)
- return self.writer.write(data)
- def reset(self):
- self.reader.reset()
- self.writer.reset()
- def __getattr__(self, name,
- getattr=getattr):
- """ Inherit all other methods from the underlying stream.
- """
- return getattr(self.stream, name)
- def __enter__(self):
- return self
- def __exit__(self, type, value, tb):
- self.stream.close()
- ### Shortcuts
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        When an encoding is given, files are always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file.  If it is None the plain file object returned by the
        builtin open() is returned unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If the codec lookup or the construction of the wrapper fails,
        the already opened file is closed before the exception is
        re-raised.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Do not leak the file handle when the encoding is unknown or
        # the wrapper cannot be built; the original exception is
        # propagated unchanged.
        file.close()
        raise
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file so that encodings are translated transparently.

        Strings written to the wrapper are interpreted according to
        data_encoding and written to the original file using
        file_encoding.  Strings read from the file are interpreted
        using file_encoding and handed to the caller using
        data_encoding.  The intermediate encoding will usually be
        Unicode but depends on the specified codecs.

        file_encoding defaults to data_encoding when not given.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned StreamRecoder carries the two extra attributes
        .data_encoding and .file_encoding, set to the (defaulted)
        parameters of the same name, for introspection.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    frontend = lookup(data_encoding)
    backend = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            frontend.encode, frontend.decode,
                            backend.streamreader, backend.streamwriter,
                            errors)
    # Add attributes to simplify introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
- ### Helpers for codec lookup
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.encode
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.decode
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental encoder
    raise LookupError(encoding)
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but has no incremental decoder
    raise LookupError(encoding)
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamreader
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamwriter
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string from iterator through an IncrementalEncoder for
    encoding and yields each non-empty result, followed by whatever the
    encoder still emits when it is finalized.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    make_encoder = getincrementalencoder(encoding)
    encoder = make_encoder(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Finalize: an empty input with final=True flushes pending output
    tail = encoder.encode("", True)
    if tail:
        yield tail
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every string from iterator through an IncrementalDecoder for
    encoding and yields each non-empty result, followed by whatever the
    decoder still emits when it is finalized.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    make_decoder = getincrementaldecoder(encoding)
    decoder = make_decoder(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Finalize: an empty input with final=True flushes pending output
    tail = decoder.decode("", True)
    if tail:
        yield tail
- ### Helpers for charmap-based codecs
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary in which every element of the rng
        sequence is mapped to itself.

    """
    return dict((item, item) for item in rng)
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        A target that occurs more than once in the decoding map is
        mapped to None (undefined mapping), causing an exception when
        encountered by the charmap codec during translation.

        One example where this happens is cp875.py which decodes
        multiple characters to U+001A.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        # First sighting keeps the source; any repeat marks it undefined
        encoding_map[target] = None if target in encoding_map else source
    return encoding_map
- ### error handlers
try:
    # Fetch the predefined error handlers in one go; the first
    # LookupError aborts the whole unpacking.
    (strict_errors,
     ignore_errors,
     replace_errors,
     xmlcharrefreplace_errors,
     backslashreplace_errors) = map(lookup_error, (
        "strict",
        "ignore",
        "replace",
        "xmlcharrefreplace",
        "backslashreplace"))
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
- # Tell modulefinder that using codecs probably needs the encodings
- # package
- _false = 0
- if _false:
- import encodings
- ### Tests
# Ad-hoc demo, only executed when the module is run as a script: both
# standard streams are replaced by EncodedFile wrappers.
if __name__ == '__main__':

    # Make stdout translate Latin-1 output into UTF-8 output
    # (data_encoding='latin-1', file_encoding='utf-8')
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    # (data_encoding='utf-8', file_encoding='latin-1')
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
|