""" Python 'utf-8-sig' Codec
This works similarly to UTF-8, with the following changes:

* On encoding/writing, a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading, if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
def encode(input, errors='strict'):
    """Encode *input* as UTF-8 preceded by the UTF-8 BOM (stateless).

    Args:
        input: The text to encode.
        errors: Error-handling scheme forwarded to ``codecs.utf_8_encode``.

    Returns:
        A ``(data, length)`` tuple: ``codecs.BOM_UTF8`` followed by the
        UTF-8 encoding of *input*, and ``len(input)``.
    """
    encoded = codecs.utf_8_encode(input, errors)[0]
    return (codecs.BOM_UTF8 + encoded, len(input))
def decode(input, errors='strict'):
    """Decode UTF-8 bytes, skipping one leading BOM if present (stateless).

    Args:
        input: The bytes to decode.
        errors: Error-handling scheme forwarded to ``codecs.utf_8_decode``.

    Returns:
        A ``(text, consumed)`` tuple. ``consumed`` counts the bytes of
        *input* that were used, including the three BOM bytes when a BOM
        was skipped.
    """
    prefix = 0
    if input[:3] == codecs.BOM_UTF8:
        # Strip the signature, but remember its length so the reported
        # byte count still refers to the caller's original buffer.
        input = input[3:]
        prefix = 3
    # final=True: this is a one-shot decode, so truncated sequences are errors.
    (output, consumed) = codecs.utf_8_decode(input, errors, True)
    return (output, consumed + prefix)
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental UTF-8 encoder that emits the BOM before its first output.

    NOTE(review): the name of the state flag (``first``) is not visible in
    the reviewed excerpt -- confirm it against the rest of the file.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # True until the BOM has been written.
        self.first = True

    def encode(self, input, final=False):
        """Return the UTF-8 bytes for *input*, BOM-prefixed on the first call.

        ``final`` is accepted for interface compatibility and is not used.
        """
        if self.first:
            # Clear the flag before encoding, so the BOM is attempted once.
            self.first = False
            return (codecs.BOM_UTF8
                    + codecs.utf_8_encode(input, self.errors)[0])
        return codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Reset the encoder so the next ``encode`` call emits a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = True
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-8 decoder that drops a BOM at the start of the stream.

    NOTE(review): the name of the state flag (``first``) is not visible in
    the reviewed excerpt -- confirm it against the rest of the file.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # True while the start of the stream has not been examined yet.
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode *input*, returning ``(text, bytes_consumed)``.

        While fewer than three bytes are available and they could still be
        the beginning of a BOM, nothing is consumed so the caller buffers
        them and retries with more data.
        """
        if self.first:
            if len(input) < 3:
                if codecs.BOM_UTF8.startswith(input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return (u"", 0)
                # Short input that cannot be a BOM: decode it normally.
                self.first = False
            else:
                self.first = False
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    # Account for the three skipped BOM bytes.
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Reset the decoder so a BOM is looked for again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True
class StreamWriter(codecs.StreamWriter):
    """Stream writer that prefixes the first encoded chunk with a BOM."""

    def reset(self):
        """Reset the writer so the next ``encode`` call emits a BOM again."""
        codecs.StreamWriter.reset(self)
        # Drop the per-instance override installed by encode(), if any, so
        # the class-level (BOM-writing) method becomes visible again.
        try:
            del self.encode
        except AttributeError:
            pass

    def encode(self, input, errors='strict'):
        """Encode *input* with a leading BOM; later calls use plain UTF-8."""
        # Shadow this method on the instance so the BOM is produced once.
        self.encode = codecs.utf_8_encode
        # Module-level encode(): prepends codecs.BOM_UTF8.
        return encode(input, errors)
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a BOM at the start of the stream."""

    def reset(self):
        """Reset the reader so a BOM is looked for again."""
        codecs.StreamReader.reset(self)
        # Drop the per-instance override installed by decode(), if any, so
        # the class-level (BOM-detecting) method becomes visible again.
        try:
            del self.decode
        except AttributeError:
            pass

    def decode(self, input, errors='strict'):
        """Decode *input*, returning ``(text, bytes_consumed)``.

        Once the BOM question is settled, this method is shadowed on the
        instance by ``codecs.utf_8_decode`` for all later calls.
        """
        if len(input) < 3:
            if codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return (u"", 0)
        elif input[:3] == codecs.BOM_UTF8:
            self.decode = codecs.utf_8_decode
            (output, consumed) = codecs.utf_8_decode(input[3:], errors)
            # Account for the three skipped BOM bytes.
            return (output, consumed + 3)
        # (else) no BOM present
        self.decode = codecs.utf_8_decode
        return codecs.utf_8_decode(input, errors)
### encodings module API
103 return codecs.CodecInfo(
107 incrementalencoder=IncrementalEncoder,
108 incrementaldecoder=IncrementalDecoder,
109 streamreader=StreamReader,
110 streamwriter=StreamWriter,