diff --git a/Lib/_pycodecs.py b/Lib/_pycodecs.py index 4068bd56693..98dec3c614d 100644 --- a/Lib/_pycodecs.py +++ b/Lib/_pycodecs.py @@ -22,10 +22,10 @@ The builtin Unicode codecs use the following interface: - _encode(Unicode_object[,errors='strict']) -> + _encode(Unicode_object[,errors='strict']) -> (string object, bytes consumed) - _decode(char_buffer_obj[,errors='strict']) -> + _decode(char_buffer_obj[,errors='strict']) -> (Unicode object, bytes consumed) _encode() interfaces also accept non-Unicode object as @@ -44,48 +44,76 @@ From PyPy v1.0.0 """ -#from unicodecodec import * - -__all__ = ['register', 'lookup', 'lookup_error', 'register_error', 'encode', 'decode', - 'latin_1_encode', 'mbcs_decode', 'readbuffer_encode', 'escape_encode', - 'utf_8_decode', 'raw_unicode_escape_decode', 'utf_7_decode', - 'unicode_escape_encode', 'latin_1_decode', 'utf_16_decode', - 'unicode_escape_decode', 'ascii_decode', 'charmap_encode', 'charmap_build', - 'unicode_internal_encode', 'unicode_internal_decode', 'utf_16_ex_decode', - 'escape_decode', 'charmap_decode', 'utf_7_encode', 'mbcs_encode', - 'ascii_encode', 'utf_16_encode', 'raw_unicode_escape_encode', 'utf_8_encode', - 'utf_16_le_encode', 'utf_16_be_encode', 'utf_16_le_decode', 'utf_16_be_decode', - 'utf_32_ex_decode',] +# from unicodecodec import * + +__all__ = [ + "register", + "lookup", + "lookup_error", + "register_error", + "encode", + "decode", + "latin_1_encode", + "mbcs_decode", + "readbuffer_encode", + "escape_encode", + "utf_8_decode", + "raw_unicode_escape_decode", + "utf_7_decode", + "unicode_escape_encode", + "latin_1_decode", + "utf_16_decode", + "unicode_escape_decode", + "ascii_decode", + "charmap_encode", + "charmap_build", + "unicode_internal_encode", + "unicode_internal_decode", + "utf_16_ex_decode", + "escape_decode", + "charmap_decode", + "utf_7_encode", + "mbcs_encode", + "ascii_encode", + "utf_16_encode", + "raw_unicode_escape_encode", + "utf_8_encode", + "utf_16_le_encode", + "utf_16_be_encode", + "utf_16_le_decode", + "utf_16_be_decode", + "utf_32_ex_decode", +] import sys import warnings from _codecs import * -def latin_1_encode( obj, errors='strict'): - """None - """ +def latin_1_encode(obj, errors="strict"): + """None""" res = PyUnicode_EncodeLatin1(obj, len(obj), errors) res = bytes(res) return res, len(obj) + + # XXX MBCS codec might involve ctypes ? def mbcs_decode(): - """None - """ + """None""" pass -def readbuffer_encode( obj, errors='strict'): - """None - """ + +def readbuffer_encode(obj, errors="strict"): + """None""" if isinstance(obj, str): res = obj.encode() else: res = bytes(obj) return res, len(obj) -def escape_encode( obj, errors='strict'): - """None - """ + +def escape_encode(obj, errors="strict"): + """None""" if not isinstance(obj, bytes): raise TypeError("must be bytes") s = repr(obj).encode() @@ -94,85 +122,88 @@ def escape_encode( obj, errors='strict'): v = v.replace(b"'", b"\\'").replace(b'\\"', b'"') return v, len(obj) -def raw_unicode_escape_decode( data, errors='strict', final=False): - """None - """ - res = PyUnicode_DecodeRawUnicodeEscape(data, len(data), errors, final) - res = ''.join(res) - return res, len(data) -def utf_7_decode( data, errors='strict', final=False): - """None - """ +def raw_unicode_escape_decode(data, errors="strict", final=True): + """None""" + res, consumed = PyUnicode_DecodeRawUnicodeEscape(data, len(data), errors, final) + res = "".join(res) + return res, consumed + + +def utf_7_decode(data, errors="strict", final=False): + """None""" res, consumed = PyUnicode_DecodeUTF7(data, len(data), errors, final) - res = ''.join(res) + res = "".join(res) return res, consumed -def unicode_escape_encode( obj, errors='strict'): - """None - """ + +def unicode_escape_encode(obj, errors="strict"): + """None""" res = unicodeescape_string(obj, len(obj), 0) - res = b''.join(res) + res = b"".join(res) return res, len(obj) -def latin_1_decode( data, errors='strict'): - """None - """ + +def latin_1_decode(data, errors="strict"): + """None""" res = PyUnicode_DecodeLatin1(data, len(data), errors) - res = ''.join(res) + res = "".join(res) return res, len(data) -def utf_16_decode( data, errors='strict', final=False): - """None - """ + +def utf_16_decode(data, errors="strict", final=False): + """None""" consumed = len(data) if final: consumed = 0 - res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'native', final) - res = ''.join(res) + res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful( + data, len(data), errors, "native", final + ) + res = "".join(res) return res, consumed -def unicode_escape_decode( data, errors='strict', final=False): - """None - """ - res = PyUnicode_DecodeUnicodeEscape(data, len(data), errors, final) - res = ''.join(res) - return res, len(data) + +def unicode_escape_decode(data, errors="strict", final=True): + """None""" + res, consumed = PyUnicode_DecodeUnicodeEscape(data, len(data), errors, final) + res = "".join(res) + return res, consumed -def ascii_decode( data, errors='strict'): - """None - """ +def ascii_decode(data, errors="strict"): + """None""" res = PyUnicode_DecodeASCII(data, len(data), errors) - res = ''.join(res) + res = "".join(res) return res, len(data) -def charmap_encode(obj, errors='strict', mapping='latin-1'): - """None - """ + +def charmap_encode(obj, errors="strict", mapping="latin-1"): + """None""" res = PyUnicode_EncodeCharmap(obj, len(obj), mapping, errors) res = bytes(res) return res, len(obj) + def charmap_build(s): return {ord(c): i for i, c in enumerate(s)} + if sys.maxunicode == 65535: unicode_bytes = 2 else: unicode_bytes = 4 -def unicode_internal_encode( obj, errors='strict'): - """None - """ + +def unicode_internal_encode(obj, errors="strict"): + """None""" if type(obj) == str: p = bytearray() t = [ord(x) for x in obj] for i in t: b = bytearray() for j in range(unicode_bytes): - b.append(i%256) + b.append(i % 256) i >>= 8 if sys.byteorder == "big": b.reverse() @@ -180,12 +211,12 @@ def unicode_internal_encode( obj, errors='strict'): res = bytes(p) return res, len(res) else: - res = "You can do better than this" # XXX make this right + res = "You can do better than this" # XXX make this right return res, len(res) -def unicode_internal_decode( unistr, errors='strict'): - """None - """ + +def unicode_internal_decode(unistr, errors="strict"): + """None""" if type(unistr) == str: return unistr, len(unistr) else: @@ -199,232 +230,281 @@ def unicode_internal_decode( unistr, errors='strict'): start = 0 stop = unicode_bytes step = 1 - while i < len(unistr)-unicode_bytes+1: + while i < len(unistr) - unicode_bytes + 1: t = 0 h = 0 for j in range(start, stop, step): - t += ord(unistr[i+j])<<(h*8) + t += ord(unistr[i + j]) << (h * 8) h += 1 i += unicode_bytes p += chr(t) - res = ''.join(p) + res = "".join(p) return res, len(res) -def utf_16_ex_decode( data, errors='strict', byteorder=0, final=0): - """None - """ + +def utf_16_ex_decode(data, errors="strict", byteorder=0, final=0): + """None""" if byteorder == 0: - bm = 'native' + bm = "native" elif byteorder == -1: - bm = 'little' + bm = "little" else: - bm = 'big' + bm = "big" consumed = len(data) if final: consumed = 0 - res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, bm, final) - res = ''.join(res) + res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful( + data, len(data), errors, bm, final + ) + res = "".join(res) return res, consumed, byteorder -def utf_32_ex_decode( data, errors='strict', byteorder=0, final=0): - """None - """ + +def utf_32_ex_decode(data, errors="strict", byteorder=0, final=0): + """None""" if byteorder == 0: if len(data) < 4: if final and len(data): - if sys.byteorder == 'little': - bm = 'little' + if sys.byteorder == "little": + bm = "little" else: - bm = 'big' + bm = "big" res, consumed, _ = PyUnicode_DecodeUTF32Stateful( data, len(data), errors, bm, final ) - return ''.join(res), consumed, 0 - return '', 0, 0 - if data[0:4] == b'\xff\xfe\x00\x00': + return "".join(res), consumed, 0 + return "", 0, 0 + if data[0:4] == b"\xff\xfe\x00\x00": res, consumed, _ = PyUnicode_DecodeUTF32Stateful( - data[4:], len(data) - 4, errors, 'little', final + data[4:], len(data) - 4, errors, "little", final ) - return ''.join(res), consumed + 4, -1 - if data[0:4] == b'\x00\x00\xfe\xff': + return "".join(res), consumed + 4, -1 + if data[0:4] == b"\x00\x00\xfe\xff": res, consumed, _ = PyUnicode_DecodeUTF32Stateful( - data[4:], len(data) - 4, errors, 'big', final + data[4:], len(data) - 4, errors, "big", final ) - return ''.join(res), consumed + 4, 1 - if sys.byteorder == 'little': - bm = 'little' + return "".join(res), consumed + 4, 1 + if sys.byteorder == "little": + bm = "little" else: - bm = 'big' - res, consumed, _ = PyUnicode_DecodeUTF32Stateful(data, len(data), errors, bm, final) - return ''.join(res), consumed, 0 + bm = "big" + res, consumed, _ = PyUnicode_DecodeUTF32Stateful( + data, len(data), errors, bm, final + ) + return "".join(res), consumed, 0 if byteorder == -1: - res, consumed, _ = PyUnicode_DecodeUTF32Stateful(data, len(data), errors, 'little', final) - return ''.join(res), consumed, -1 + res, consumed, _ = PyUnicode_DecodeUTF32Stateful( + data, len(data), errors, "little", final + ) + return "".join(res), consumed, -1 + + res, consumed, _ = PyUnicode_DecodeUTF32Stateful( + data, len(data), errors, "big", final + ) + return "".join(res), consumed, 1 + - res, consumed, _ = PyUnicode_DecodeUTF32Stateful(data, len(data), errors, 'big', final) - return ''.join(res), consumed, 1 +def _is_hex_digit(b): + return ( + 0x30 <= b <= 0x39 # 0-9 + or 0x41 <= b <= 0x46 # A-F + or 0x61 <= b <= 0x66 + ) # a-f -# XXX needs error messages when the input is invalid -def escape_decode(data, errors='strict'): - """None - """ + +def escape_decode(data, errors="strict"): + if isinstance(data, str): + data = data.encode("latin-1") l = len(data) i = 0 res = bytearray() while i < l: - - if data[i] == '\\': + if data[i] == 0x5C: # '\\' i += 1 if i >= l: raise ValueError("Trailing \\ in string") - else: - if data[i] == '\\': - res += b'\\' - elif data[i] == 'n': - res += b'\n' - elif data[i] == 't': - res += b'\t' - elif data[i] == 'r': - res += b'\r' - elif data[i] == 'b': - res += b'\b' - elif data[i] == '\'': - res += b'\'' - elif data[i] == '\"': - res += b'\"' - elif data[i] == 'f': - res += b'\f' - elif data[i] == 'a': - res += b'\a' - elif data[i] == 'v': - res += b'\v' - elif '0' <= data[i] <= '9': - # emulate a strange wrap-around behavior of CPython: - # \400 is the same as \000 because 0400 == 256 - octal = data[i:i+3] - res.append(int(octal, 8) & 0xFF) - i += 2 - elif data[i] == 'x': - hexa = data[i+1:i+3] - res.append(int(hexa, 16)) + ch = data[i] + if ch == 0x5C: + res.append(0x5C) # \\ + elif ch == 0x27: + res.append(0x27) # \' + elif ch == 0x22: + res.append(0x22) # \" + elif ch == 0x61: + res.append(0x07) # \a + elif ch == 0x62: + res.append(0x08) # \b + elif ch == 0x66: + res.append(0x0C) # \f + elif ch == 0x6E: + res.append(0x0A) # \n + elif ch == 0x72: + res.append(0x0D) # \r + elif ch == 0x74: + res.append(0x09) # \t + elif ch == 0x76: + res.append(0x0B) # \v + elif ch == 0x0A: + pass # \ continuation + elif 0x30 <= ch <= 0x37: # \0-\7 octal + val = ch - 0x30 + if i + 1 < l and 0x30 <= data[i + 1] <= 0x37: + i += 1 + val = (val << 3) | (data[i] - 0x30) + if i + 1 < l and 0x30 <= data[i + 1] <= 0x37: + i += 1 + val = (val << 3) | (data[i] - 0x30) + res.append(val & 0xFF) + elif ch == 0x78: # \x hex + hex_count = 0 + for j in range(1, 3): + if i + j < l and _is_hex_digit(data[i + j]): + hex_count += 1 + else: + break + if hex_count < 2: + if errors == "strict": + raise ValueError("invalid \\x escape at position %d" % (i - 1)) + elif errors == "replace": + res.append(0x3F) # '?' + i += hex_count + else: + res.append(int(bytes(data[i + 1 : i + 3]), 16)) i += 2 + else: + import warnings + + warnings.warn( + '"\\%c" is an invalid escape sequence' % ch + if 0x20 <= ch < 0x7F + else '"\\x%02x" is an invalid escape sequence' % ch, + DeprecationWarning, + stacklevel=2, + ) + res.append(0x5C) + res.append(ch) else: res.append(data[i]) i += 1 - res = bytes(res) - return res, len(res) + return bytes(res), l + -def charmap_decode( data, errors='strict', mapping=None): - """None - """ +def charmap_decode(data, errors="strict", mapping=None): + """None""" res = PyUnicode_DecodeCharmap(data, len(data), mapping, errors) - res = ''.join(res) + res = "".join(res) return res, len(data) -def utf_7_encode( obj, errors='strict'): - """None - """ +def utf_7_encode(obj, errors="strict"): + """None""" res = PyUnicode_EncodeUTF7(obj, len(obj), 0, 0, errors) - res = b''.join(res) + res = b"".join(res) return res, len(obj) -def mbcs_encode( obj, errors='strict'): - """None - """ + +def mbcs_encode(obj, errors="strict"): + """None""" pass + + ## return (PyUnicode_EncodeMBCS( -## (obj), +## (obj), ## len(obj), ## errors), ## len(obj)) - -def ascii_encode( obj, errors='strict'): - """None - """ + +def ascii_encode(obj, errors="strict"): + """None""" res = PyUnicode_EncodeASCII(obj, len(obj), errors) res = bytes(res) return res, len(obj) -def utf_16_encode( obj, errors='strict'): - """None - """ - res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'native') + +def utf_16_encode(obj, errors="strict"): + """None""" + res = PyUnicode_EncodeUTF16(obj, len(obj), errors, "native") res = bytes(res) return res, len(obj) -def raw_unicode_escape_encode( obj, errors='strict'): - """None - """ + +def raw_unicode_escape_encode(obj, errors="strict"): + """None""" res = PyUnicode_EncodeRawUnicodeEscape(obj, len(obj)) res = bytes(res) return res, len(obj) -def utf_16_le_encode( obj, errors='strict'): - """None - """ - res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'little') + +def utf_16_le_encode(obj, errors="strict"): + """None""" + res = PyUnicode_EncodeUTF16(obj, len(obj), errors, "little") res = bytes(res) return res, len(obj) -def utf_16_be_encode( obj, errors='strict'): - """None - """ - res = PyUnicode_EncodeUTF16(obj, len(obj), errors, 'big') + +def utf_16_be_encode(obj, errors="strict"): + """None""" + res = PyUnicode_EncodeUTF16(obj, len(obj), errors, "big") res = bytes(res) return res, len(obj) -def utf_16_le_decode(data, errors='strict', final=0): - res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'little', final) - res = ''.join(res) + +def utf_16_le_decode(data, errors="strict", final=0): + res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful( + data, len(data), errors, "little", final + ) + res = "".join(res) return res, consumed -def utf_16_be_decode(data, errors='strict', final=0): - res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful(data, len(data), errors, 'big', final) - res = ''.join(res) + +def utf_16_be_decode(data, errors="strict", final=0): + res, consumed, byteorder = PyUnicode_DecodeUTF16Stateful( + data, len(data), errors, "big", final + ) + res = "".join(res) return res, consumed def STORECHAR32(ch, byteorder): """Store a 32-bit character as 4 bytes in the specified byte order.""" - b0 = ch & 0xff - b1 = (ch >> 8) & 0xff - b2 = (ch >> 16) & 0xff - b3 = (ch >> 24) & 0xff - if byteorder == 'little': + b0 = ch & 0xFF + b1 = (ch >> 8) & 0xFF + b2 = (ch >> 16) & 0xFF + b3 = (ch >> 24) & 0xFF + if byteorder == "little": return [b0, b1, b2, b3] else: # big-endian return [b3, b2, b1, b0] -def PyUnicode_EncodeUTF32(s, size, errors, byteorder='little'): +def PyUnicode_EncodeUTF32(s, size, errors, byteorder="little"): """Encode a Unicode string to UTF-32.""" p = [] bom = sys.byteorder - if byteorder == 'native': + if byteorder == "native": bom = sys.byteorder # Add BOM for native encoding p += STORECHAR32(0xFEFF, bom) - if byteorder == 'little': - bom = 'little' - elif byteorder == 'big': - bom = 'big' + if byteorder == "little": + bom = "little" + elif byteorder == "big": + bom = "big" pos = 0 while pos < len(s): ch = ord(s[pos]) if 0xD800 <= ch <= 0xDFFF: - if errors == 'surrogatepass': + if errors == "surrogatepass": p += STORECHAR32(ch, bom) pos += 1 else: res, pos = unicode_call_errorhandler( - errors, 'utf-32', 'surrogates not allowed', - s, pos, pos + 1, False) + errors, "utf-32", "surrogates not allowed", s, pos, pos + 1, False + ) for c in res: p += STORECHAR32(ord(c), bom) else: @@ -434,26 +514,26 @@ def PyUnicode_EncodeUTF32(s, size, errors, byteorder='little'): return p -def utf_32_encode(obj, errors='strict'): +def utf_32_encode(obj, errors="strict"): """UTF-32 encoding with BOM.""" - encoded = PyUnicode_EncodeUTF32(obj, len(obj), errors, 'native') + encoded = PyUnicode_EncodeUTF32(obj, len(obj), errors, "native") return bytes(encoded), len(obj) -def utf_32_le_encode(obj, errors='strict'): +def utf_32_le_encode(obj, errors="strict"): """UTF-32 little-endian encoding without BOM.""" - encoded = PyUnicode_EncodeUTF32(obj, len(obj), errors, 'little') + encoded = PyUnicode_EncodeUTF32(obj, len(obj), errors, "little") return bytes(encoded), len(obj) -def utf_32_be_encode(obj, errors='strict'): +def utf_32_be_encode(obj, errors="strict"): """UTF-32 big-endian encoding without BOM.""" - res = PyUnicode_EncodeUTF32(obj, len(obj), errors, 'big') + res = PyUnicode_EncodeUTF32(obj, len(obj), errors, "big") res = bytes(res) return res, len(obj) -def PyUnicode_DecodeUTF32Stateful(data, size, errors, byteorder='little', final=0): +def PyUnicode_DecodeUTF32Stateful(data, size, errors, byteorder="little", final=0): """Decode UTF-32 encoded bytes to Unicode string.""" if size == 0: return [], 0, 0 @@ -463,28 +543,44 @@ def PyUnicode_DecodeUTF32Stateful(data, size, errors, byteorder='little', final= aligned_size = (size // 4) * 4 while pos + 3 < aligned_size: - if byteorder == 'little': - ch = data[pos] | (data[pos+1] << 8) | (data[pos+2] << 16) | (data[pos+3] << 24) + if byteorder == "little": + ch = ( + data[pos] + | (data[pos + 1] << 8) + | (data[pos + 2] << 16) + | (data[pos + 3] << 24) + ) else: # big-endian - ch = (data[pos] << 24) | (data[pos+1] << 16) | (data[pos+2] << 8) | data[pos+3] + ch = ( + (data[pos] << 24) + | (data[pos + 1] << 16) + | (data[pos + 2] << 8) + | data[pos + 3] + ) # Validate code point if ch > 0x10FFFF: - if errors == 'strict': - raise UnicodeDecodeError('utf-32', bytes(data), pos, pos+4, - 'codepoint not in range(0x110000)') - elif errors == 'replace': - result.append('\ufffd') + if errors == "strict": + raise UnicodeDecodeError( + "utf-32", + bytes(data), + pos, + pos + 4, + "codepoint not in range(0x110000)", + ) + elif errors == "replace": + result.append("\ufffd") # 'ignore' - skip this character pos += 4 elif 0xD800 <= ch <= 0xDFFF: - if errors == 'surrogatepass': + if errors == "surrogatepass": result.append(chr(ch)) pos += 4 else: - msg = 'code point in surrogate code point range(0xd800, 0xe000)' + msg = "code point in surrogate code point range(0xd800, 0xe000)" res, pos = unicode_call_errorhandler( - errors, 'utf-32', msg, data, pos, pos + 4, True) + errors, "utf-32", msg, data, pos, pos + 4, True + ) result.append(res) else: result.append(chr(ch)) @@ -494,47 +590,57 @@ def PyUnicode_DecodeUTF32Stateful(data, size, errors, byteorder='little', final= if pos < size: if final: res, pos = unicode_call_errorhandler( - errors, 'utf-32', 'truncated data', - data, pos, size, True) + errors, "utf-32", "truncated data", data, pos, size, True + ) if res: result.append(res) return result, pos, 0 -def utf_32_decode(data, errors='strict', final=0): +def utf_32_decode(data, errors="strict", final=0): """UTF-32 decoding with BOM detection.""" if len(data) >= 4: # Check for BOM - if data[0:4] == b'\xff\xfe\x00\x00': + if data[0:4] == b"\xff\xfe\x00\x00": # UTF-32 LE BOM - res, consumed, _ = PyUnicode_DecodeUTF32Stateful(data[4:], len(data)-4, errors, 'little', final) - res = ''.join(res) + res, consumed, _ = PyUnicode_DecodeUTF32Stateful( + data[4:], len(data) - 4, errors, "little", final + ) + res = "".join(res) return res, consumed + 4 - elif data[0:4] == b'\x00\x00\xfe\xff': + elif data[0:4] == b"\x00\x00\xfe\xff": # UTF-32 BE BOM - res, consumed, _ = PyUnicode_DecodeUTF32Stateful(data[4:], len(data)-4, errors, 'big', final) - res = ''.join(res) + res, consumed, _ = PyUnicode_DecodeUTF32Stateful( + data[4:], len(data) - 4, errors, "big", final + ) + res = "".join(res) return res, consumed + 4 # Default to little-endian if no BOM - byteorder = 'little' if sys.byteorder == 'little' else 'big' - res, consumed, _ = PyUnicode_DecodeUTF32Stateful(data, len(data), errors, byteorder, final) - res = ''.join(res) + byteorder = "little" if sys.byteorder == "little" else "big" + res, consumed, _ = PyUnicode_DecodeUTF32Stateful( + data, len(data), errors, byteorder, final + ) + res = "".join(res) return res, consumed -def utf_32_le_decode(data, errors='strict', final=0): +def utf_32_le_decode(data, errors="strict", final=0): """UTF-32 little-endian decoding without BOM.""" - res, consumed, _ = PyUnicode_DecodeUTF32Stateful(data, len(data), errors, 'little', final) - res = ''.join(res) + res, consumed, _ = PyUnicode_DecodeUTF32Stateful( + data, len(data), errors, "little", final + ) + res = "".join(res) return res, consumed -def utf_32_be_decode(data, errors='strict', final=0): +def utf_32_be_decode(data, errors="strict", final=0): """UTF-32 big-endian decoding without BOM.""" - res, consumed, _ = PyUnicode_DecodeUTF32Stateful(data, len(data), errors, 'big', final) - res = ''.join(res) + res, consumed, _ = PyUnicode_DecodeUTF32Stateful( + data, len(data), errors, "big", final + ) + res = "".join(res) return res, consumed @@ -543,9 +649,9 @@ def utf_32_be_decode(data, errors='strict', final=0): ##import sys ##""" Python implementation of CPythons builtin unicode codecs. ## -## Generally the functions in this module take a list of characters an returns +## Generally the functions in this module take a list of characters an returns ## a list of characters. -## +## ## For use in the PyPy project""" @@ -555,62 +661,211 @@ def utf_32_be_decode(data, errors='strict', final=0): ## 1 - special ## 2 - whitespace (optional) ## 3 - RFC2152 Set O (optional) - + utf7_special = [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, - 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, - 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 2, + 1, + 1, + 2, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 1, + 2, + 3, + 3, + 3, + 3, + 3, + 3, + 0, + 0, + 0, + 3, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 3, + 3, + 3, + 3, + 0, + 3, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 3, + 1, + 3, + 3, + 3, + 3, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + 3, + 3, + 3, + 1, + 1, ] -unicode_latin1 = [None]*256 +unicode_latin1 = [None] * 256 def SPECIAL(c, encodeO, encodeWS): c = ord(c) - return (c>127 or utf7_special[c] == 1) or \ - (encodeWS and (utf7_special[(c)] == 2)) or \ - (encodeO and (utf7_special[(c)] == 3)) + return ( + (c > 127 or utf7_special[c] == 1) + or (encodeWS and (utf7_special[(c)] == 2)) + or (encodeO and (utf7_special[(c)] == 3)) + ) + + def B64(n): - return bytes([b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f]]) + return bytes( + [ + b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[ + (n) & 0x3F + ] + ] + ) + + def B64CHAR(c): - return (c.isalnum() or (c) == b'+' or (c) == b'/') + return c.isalnum() or (c) == b"+" or (c) == b"/" + + def UB64(c): - if (c) == b'+' : - return 62 - elif (c) == b'/': - return 63 - elif (c) >= b'a': - return ord(c) - 71 - elif (c) >= b'A': - return ord(c) - 65 - else: + if (c) == b"+": + return 62 + elif (c) == b"/": + return 63 + elif (c) >= b"a": + return ord(c) - 71 + elif (c) >= b"A": + return ord(c) - 65 + else: return ord(c) + 4 -def ENCODE( ch, bits) : + +def ENCODE(ch, bits): out = [] - while (bits >= 6): - out += B64(ch >> (bits-6)) - bits -= 6 + while bits >= 6: + out += B64(ch >> (bits - 6)) + bits -= 6 return out, bits + def _IS_BASE64(ch): - return (ord('A') <= ch <= ord('Z')) or (ord('a') <= ch <= ord('z')) or \ - (ord('0') <= ch <= ord('9')) or ch == ord('+') or ch == ord('/') + return ( + (ord("A") <= ch <= ord("Z")) + or (ord("a") <= ch <= ord("z")) + or (ord("0") <= ch <= ord("9")) + or ch == ord("+") + or ch == ord("/") + ) + def _FROM_BASE64(ch): - if ch == ord('+'): return 62 - if ch == ord('/'): return 63 - if ch >= ord('a'): return ch - 71 - if ch >= ord('A'): return ch - 65 - if ch >= ord('0'): return ch - ord('0') + 52 + if ch == ord("+"): + return 62 + if ch == ord("/"): + return 63 + if ch >= ord("a"): + return ch - 71 + if ch >= ord("A"): + return ch - 65 + if ch >= ord("0"): + return ch - ord("0") + 52 return -1 + def _DECODE_DIRECT(ch): - return ch <= 127 and ch != ord('+') + return ch <= 127 and ch != ord("+") + def PyUnicode_DecodeUTF7(s, size, errors, final=False): if size == 0: @@ -633,12 +888,16 @@ def PyUnicode_DecodeUTF7(s, size, errors, final=False): base64bits += 6 i += 1 if base64bits >= 16: - outCh = (base64buffer >> (base64bits - 16)) & 0xffff + outCh = (base64buffer >> (base64bits - 16)) & 0xFFFF base64bits -= 16 base64buffer &= (1 << base64bits) - 1 if surrogate: if 0xDC00 <= outCh <= 0xDFFF: - ch2 = 0x10000 + ((surrogate - 0xD800) << 10) + (outCh - 0xDC00) + ch2 = ( + 0x10000 + + ((surrogate - 0xD800) << 10) + + (outCh - 0xDC00) + ) p.append(chr(ch2)) surrogate = 0 continue @@ -656,7 +915,8 @@ def PyUnicode_DecodeUTF7(s, size, errors, final=False): i += 1 errmsg = "partial character in shift sequence" out, i = unicode_call_errorhandler( - errors, 'utf-7', errmsg, s, startinpos, i) + errors, "utf-7", errmsg, s, startinpos, i + ) p.append(out) continue else: @@ -664,25 +924,27 @@ def PyUnicode_DecodeUTF7(s, size, errors, final=False): i += 1 errmsg = "non-zero padding bits in shift sequence" out, i = unicode_call_errorhandler( - errors, 'utf-7', errmsg, s, startinpos, i) + errors, "utf-7", errmsg, s, startinpos, i + ) p.append(out) continue if surrogate and _DECODE_DIRECT(ch): p.append(chr(surrogate)) surrogate = 0 - if ch == ord('-'): + if ch == ord("-"): i += 1 - elif ch == ord('+'): + elif ch == ord("+"): startinpos = i i += 1 - if i < size and s[i] == ord('-'): + if i < size and s[i] == ord("-"): i += 1 - p.append('+') + p.append("+") elif i < size and not _IS_BASE64(s[i]): i += 1 errmsg = "ill-formed sequence" out, i = unicode_call_errorhandler( - errors, 'utf-7', errmsg, s, startinpos, i) + errors, "utf-7", errmsg, s, startinpos, i + ) p.append(out) else: inShift = True @@ -698,7 +960,8 @@ def PyUnicode_DecodeUTF7(s, size, errors, final=False): i += 1 errmsg = "unexpected special character" out, i = unicode_call_errorhandler( - errors, 'utf-7', errmsg, s, startinpos, i) + errors, "utf-7", errmsg, s, startinpos, i + ) p.append(out) if inShift and not final: @@ -708,11 +971,13 @@ def PyUnicode_DecodeUTF7(s, size, errors, final=False): if surrogate or base64bits >= 6 or (base64bits > 0 and base64buffer != 0): errmsg = "unterminated shift sequence" out, i = unicode_call_errorhandler( - errors, 'utf-7', errmsg, s, startinpos, size) + errors, "utf-7", errmsg, s, startinpos, size + ) p.append(out) return p, size + def _ENCODE_DIRECT(ch, encodeSetO, encodeWhiteSpace): c = ord(ch) if isinstance(ch, str) else ch if c > 127: @@ -725,6 +990,7 @@ def _ENCODE_DIRECT(ch, encodeSetO, encodeWhiteSpace): return not encodeSetO return False + def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors): inShift = False base64bits = 0 @@ -741,8 +1007,8 @@ def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors): base64buffer = 0 base64bits = 0 inShift = False - if B64CHAR(ch) or ch == '-': - out.append(b'-') + if B64CHAR(ch) or ch == "-": + out.append(b"-") out.append(bytes([ch_ord])) else: # encode character in base64 @@ -765,12 +1031,12 @@ def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors): base64bits -= 6 base64buffer &= (1 << base64bits) - 1 if base64bits else 0 else: - if ch == '+': - out.append(b'+-') + if ch == "+": + out.append(b"+-") elif _ENCODE_DIRECT(ch, encodeSetO, encodeWhiteSpace): out.append(bytes([ch_ord])) else: - out.append(b'+') + out.append(b"+") inShift = True # encode character in base64 if ch_ord >= 0x10000: @@ -795,95 +1061,96 @@ def PyUnicode_EncodeUTF7(s, size, encodeSetO, encodeWhiteSpace, errors): if i + 1 < size: ch2 = s[i + 1] if _ENCODE_DIRECT(ch2, encodeSetO, encodeWhiteSpace): - if B64CHAR(ch2) or ch2 == '-': - out.append(b'-') + if B64CHAR(ch2) or ch2 == "-": + out.append(b"-") inShift = False else: - out.append(b'-') + out.append(b"-") inShift = False if base64bits: out.append(B64(base64buffer << (6 - base64bits))) if inShift: - out.append(b'-') + out.append(b"-") return out -unicode_empty = '' -def unicodeescape_string(s, size, quotes): +unicode_empty = "" + +def unicodeescape_string(s, size, quotes): p = [] - if (quotes) : - if (s.find('\'') != -1 and s.find('"') == -1): + if quotes: + if s.find("'") != -1 and s.find('"') == -1: p.append(b'"') else: - p.append(b'\'') + p.append(b"'") pos = 0 - while (pos < size): + while pos < size: ch = s[pos] - #/* Escape quotes */ - if (quotes and (ch == p[1] or ch == '\\')): - p.append(b'\\%c' % ord(ch)) + # /* Escape quotes */ + if quotes and (ch == p[1] or ch == "\\"): + p.append(b"\\%c" % ord(ch)) pos += 1 continue -#ifdef Py_UNICODE_WIDE - #/* Map 21-bit characters to '\U00xxxxxx' */ - elif (ord(ch) >= 0x10000): - p.append(b'\\U%08x' % ord(ch)) + # ifdef Py_UNICODE_WIDE + # /* Map 21-bit characters to '\U00xxxxxx' */ + elif ord(ch) >= 0x10000: + p.append(b"\\U%08x" % ord(ch)) pos += 1 - continue -#endif - #/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ - elif (ord(ch) >= 0xD800 and ord(ch) < 0xDC00): + continue + # endif + # /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ + elif ord(ch) >= 0xD800 and ord(ch) < 0xDC00: pos += 1 ch2 = s[pos] - - if (ord(ch2) >= 0xDC00 and ord(ch2) <= 0xDFFF): + + if ord(ch2) >= 0xDC00 and ord(ch2) <= 0xDFFF: ucs = (((ord(ch) & 0x03FF) << 10) | (ord(ch2) & 0x03FF)) + 0x00010000 - p.append(b'\\U%08x' % ucs) + p.append(b"\\U%08x" % ucs) pos += 1 continue - - #/* Fall through: isolated surrogates are copied as-is */ + + # /* Fall through: isolated surrogates are copied as-is */ pos -= 1 - - #/* Map 16-bit characters to '\uxxxx' */ - if (ord(ch) >= 256): - p.append(b'\\u%04x' % ord(ch)) - - #/* Map special whitespace to '\t', \n', '\r' */ - elif (ch == '\t'): - p.append(b'\\t') - - elif (ch == '\n'): - p.append(b'\\n') - - elif (ch == '\r'): - p.append(b'\\r') - - elif (ch == '\\'): - p.append(b'\\\\') - - #/* Map non-printable US ASCII to '\xhh' */ - elif (ch < ' ' or ch >= chr(0x7F)) : - p.append(b'\\x%02x' % ord(ch)) - #/* Copy everything else as-is */ + + # /* Map 16-bit characters to '\uxxxx' */ + if ord(ch) >= 256: + p.append(b"\\u%04x" % ord(ch)) + + # /* Map special whitespace to '\t', \n', '\r' */ + elif ch == "\t": + p.append(b"\\t") + + elif ch == "\n": + p.append(b"\\n") + + elif ch == "\r": + p.append(b"\\r") + + elif ch == "\\": + p.append(b"\\\\") + + # /* Map non-printable US ASCII to '\xhh' */ + elif ch < " " or ch >= chr(0x7F): + p.append(b"\\x%02x" % ord(ch)) + # /* Copy everything else as-is */ else: p.append(bytes([ord(ch)])) pos += 1 - if (quotes): + if quotes: p.append(p[0]) return p -def PyUnicode_DecodeASCII(s, size, errors): -# /* ASCII is equivalent to the first 128 ordinals in Unicode. */ - if (size == 1 and ord(s) < 128) : +def PyUnicode_DecodeASCII(s, size, errors): + # /* ASCII is equivalent to the first 128 ordinals in Unicode. */ + if size == 1 and ord(s) < 128: return [chr(ord(s))] - if (size == 0): - return [''] #unicode('') + if size == 0: + return [""] # unicode('') p = [] pos = 0 while pos < len(s): @@ -892,54 +1159,50 @@ def PyUnicode_DecodeASCII(s, size, errors): p += chr(c) pos += 1 else: - res = unicode_call_errorhandler( - errors, "ascii", "ordinal not in range(128)", - s, pos, pos+1) + errors, "ascii", "ordinal not in range(128)", s, pos, pos + 1 + ) p += res[0] pos = res[1] return p -def PyUnicode_EncodeASCII(p, size, errors): +def PyUnicode_EncodeASCII(p, size, errors): return unicode_encode_ucs1(p, size, errors, 128) -def PyUnicode_AsASCIIString(unistr): +def PyUnicode_AsASCIIString(unistr): if not type(unistr) == str: raise TypeError - return PyUnicode_EncodeASCII(unistr, - len(unistr), - None) + return PyUnicode_EncodeASCII(unistr, len(unistr), None) -def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder='native', final=True): - bo = 0 #/* assume native ordering by default */ +def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder="native", final=True): + bo = 0 # /* assume native ordering by default */ consumed = 0 errmsg = "" - if sys.byteorder == 'little': + if sys.byteorder == "little": ihi = 1 ilo = 0 else: ihi = 0 ilo = 1 - - #/* Unpack UTF-16 encoded data */ + # /* Unpack UTF-16 encoded data */ -## /* Check for BOM marks (U+FEFF) in the input and adjust current -## byte order setting accordingly. In native mode, the leading BOM -## mark is skipped, in all other modes, it is copied to the output -## stream as-is (giving a ZWNBSP character). */ + ## /* Check for BOM marks (U+FEFF) in the input and adjust current + ## byte order setting accordingly. In native mode, the leading BOM + ## mark is skipped, in all other modes, it is copied to the output + ## stream as-is (giving a ZWNBSP character). */ q = 0 p = [] - if byteorder == 'native': - if (size >= 2): + if byteorder == "native": + if size >= 2: bom = (s[ihi] << 8) | s[ilo] -#ifdef BYTEORDER_IS_LITTLE_ENDIAN - if sys.byteorder == 'little': - if (bom == 0xFEFF): + # ifdef BYTEORDER_IS_LITTLE_ENDIAN + if sys.byteorder == "little": + if bom == 0xFEFF: q += 2 bo = -1 elif bom == 0xFFFE: @@ -952,126 +1215,125 @@ def PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder='native', final=Tru elif bom == 0xFFFE: q += 2 bo = -1 - elif byteorder == 'little': + elif byteorder == "little": bo = -1 else: bo = 1 - - if (size == 0): - return [''], 0, bo - - if (bo == -1): - #/* force LE */ + + if size == 0: + return [""], 0, bo + + if bo == -1: + # /* force LE */ ihi = 1 ilo = 0 - elif (bo == 1): - #/* force BE */ + elif bo == 1: + # /* force BE */ ihi = 0 ilo = 1 - while (q < len(s)): - - #/* remaining bytes at the end? (size should be even) */ - if (len(s) - q < 2): + while q < len(s): + # /* remaining bytes at the end? (size should be even) */ + if len(s) - q < 2: if not final: break res, q = unicode_call_errorhandler( - errors, 'utf-16', "truncated data", - s, q, len(s), True) + errors, "utf-16", "truncated data", s, q, len(s), True + ) p.append(res) break - ch = (s[q+ihi] << 8) | s[q+ilo] + ch = (s[q + ihi] << 8) | s[q + ilo] - if (ch < 0xD800 or ch > 0xDFFF): + if ch < 0xD800 or ch > 0xDFFF: p.append(chr(ch)) q += 2 continue - #/* UTF-16 code pair: high surrogate */ - if (0xD800 <= ch <= 0xDBFF): - if (q + 4 <= len(s)): - ch2 = (s[q+2+ihi] << 8) | s[q+2+ilo] - if (0xDC00 <= ch2 <= 0xDFFF): + # /* UTF-16 code pair: high surrogate */ + if 0xD800 <= ch <= 0xDBFF: + if q + 4 <= len(s): + ch2 = (s[q + 2 + ihi] << 8) | s[q + 2 + ilo] + if 0xDC00 <= ch2 <= 0xDFFF: # Valid surrogate pair - always assemble p.append(chr((((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000)) q += 4 continue else: # High surrogate followed by non-low-surrogate - if errors == 'surrogatepass': + if errors == "surrogatepass": p.append(chr(ch)) q += 2 continue res, q = unicode_call_errorhandler( - errors, 'utf-16', "illegal UTF-16 surrogate", - s, q, q + 2, True) + errors, "utf-16", "illegal UTF-16 surrogate", s, q, q + 2, True + ) p.append(res) else: # High surrogate at end of data if not final: break - if errors == 'surrogatepass': + if errors == "surrogatepass": p.append(chr(ch)) q += 2 continue res, q = unicode_call_errorhandler( - errors, 'utf-16', "unexpected end of data", - s, q, len(s), True) + errors, "utf-16", "unexpected end of data", s, q, len(s), True + ) p.append(res) else: # Low surrogate without preceding high surrogate - if errors == 'surrogatepass': + if errors == "surrogatepass": p.append(chr(ch)) q += 2 continue res, q = unicode_call_errorhandler( - errors, 'utf-16', "illegal encoding", - s, q, q + 2, True) + errors, "utf-16", "illegal encoding", s, q, q + 2, True + ) p.append(res) return p, q, bo + # moved out of local scope, especially because it didn't # have any nested variables. + def STORECHAR(CH, byteorder): - hi = (CH >> 8) & 0xff - lo = CH & 0xff - if byteorder == 'little': + hi = (CH >> 8) & 0xFF + lo = CH & 0xFF + if byteorder == "little": return [lo, hi] else: return [hi, lo] -def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'): -# /* Offsets from p for storing byte pairs in the right order. */ +def PyUnicode_EncodeUTF16(s, size, errors, byteorder="little"): + # /* Offsets from p for storing byte pairs in the right order. */ - p = [] bom = sys.byteorder - if (byteorder == 'native'): - + if byteorder == "native": bom = sys.byteorder p += STORECHAR(0xFEFF, bom) - - if (byteorder == 'little' ): - bom = 'little' - elif (byteorder == 'big'): - bom = 'big' + + if byteorder == "little": + bom = "little" + elif byteorder == "big": + bom = "big" pos = 0 while pos < len(s): ch = ord(s[pos]) if 0xD800 <= ch <= 0xDFFF: - if errors == 'surrogatepass': + if errors == "surrogatepass": p += STORECHAR(ch, bom) pos += 1 else: res, pos = unicode_call_errorhandler( - errors, 'utf-16', 'surrogates not allowed', - s, pos, pos + 1, False) + errors, "utf-16", "surrogates not allowed", s, pos, pos + 1, False + ) for c in res: cp = ord(c) cp2 = 0 @@ -1097,123 +1359,149 @@ def PyUnicode_EncodeUTF16(s, size, errors, byteorder='little'): def PyUnicode_DecodeMBCS(s, size, errors): pass + def PyUnicode_EncodeMBCS(p, size, errors): pass -def unicode_call_errorhandler(errors, encoding, - reason, input, startinpos, endinpos, decode=True): - + +def unicode_call_errorhandler( + errors, encoding, reason, input, startinpos, endinpos, decode=True +): errorHandler = lookup_error(errors) if decode: - exceptionObject = UnicodeDecodeError(encoding, input, startinpos, endinpos, reason) + exceptionObject = UnicodeDecodeError( + encoding, input, startinpos, endinpos, reason + ) else: - exceptionObject = UnicodeEncodeError(encoding, input, startinpos, endinpos, reason) + exceptionObject = UnicodeEncodeError( + encoding, input, startinpos, endinpos, reason + ) res = errorHandler(exceptionObject) - if isinstance(res, tuple) and isinstance(res[0], (str, bytes)) and isinstance(res[1], int): + if ( + isinstance(res, tuple) + and isinstance(res[0], (str, bytes)) + and isinstance(res[1], int) + ): newpos = res[1] - if (newpos < 0): + if newpos < 0: newpos = len(input) + newpos if newpos < 0 or newpos > len(input): - raise IndexError( "position %d from error handler out of bounds" % newpos) + raise IndexError("position %d from error handler out of bounds" % newpos) return res[0], newpos else: - raise TypeError("encoding error handler must return (unicode, int) tuple, not %s" % repr(res)) + raise TypeError( + "encoding error handler must return (unicode, int) tuple, not %s" + % repr(res) + ) + + +# /* --- Latin-1 Codec ------------------------------------------------------ */ -#/* --- Latin-1 Codec ------------------------------------------------------ */ def PyUnicode_DecodeLatin1(s, size, errors): - #/* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ -## if (size == 1): -## return [PyUnicode_FromUnicode(s, 1)] + # /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */ + ## if (size == 1): + ## return [PyUnicode_FromUnicode(s, 1)] pos = 0 p = [] - while (pos < size): + while pos < size: p += chr(s[pos]) pos += 1 return p + def unicode_encode_ucs1(p, size, errors, limit): - if limit == 256: reason = "ordinal not in range(256)" encoding = "latin-1" else: reason = "ordinal not in range(128)" encoding = "ascii" - - if (size == 0): + + if size == 0: return [] res = bytearray() pos = 0 while pos < len(p): - #for ch in p: + # for ch in p: ch = p[pos] - + if ord(ch) < limit: res.append(ord(ch)) pos += 1 else: - #/* startpos for collecting unencodable chars */ - collstart = pos - collend = pos+1 + # /* startpos for collecting unencodable chars */ + collstart = pos + collend = pos + 1 while collend < len(p) and ord(p[collend]) >= limit: collend += 1 - x = unicode_call_errorhandler(errors, encoding, reason, p, collstart, collend, False) + x = unicode_call_errorhandler( + errors, encoding, reason, p, collstart, collend, False + ) replacement = x[0] if isinstance(replacement, bytes): res += replacement else: res += replacement.encode() pos = x[1] - + return res + def PyUnicode_EncodeLatin1(p, size, errors): res = unicode_encode_ucs1(p, size, errors, 256) return res -hexdigits = [ord(hex(i)[-1]) for i in range(16)]+[ord(hex(i)[-1].upper()) for i in range(10, 16)] + +hexdigits = [ord(hex(i)[-1]) for i in range(16)] + [ + ord(hex(i)[-1].upper()) for i in range(10, 16) +] + def hex_number_end(s, pos, digits): target_end = pos + digits - while pos < target_end and pos < len(s) and s[pos] in hexdigits: + while pos < target_end and pos < len(s) and s[pos] in hexdigits: pos += 1 return pos + def hexescape(s, pos, digits, message, errors): ch = 0 p = [] number_end = hex_number_end(s, pos, digits) if number_end - pos != digits: - x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-2, number_end) + x = unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 2, number_end + ) p.append(x[0]) pos = x[1] else: - ch = int(s[pos:pos+digits], 16) - #/* when we get here, ch is a 32-bit unicode character */ + ch = int(s[pos : pos + digits], 16) + # /* when we get here, ch is a 32-bit unicode character */ if ch <= sys.maxunicode: p.append(chr(ch)) pos += digits - elif (ch <= 0x10ffff): + elif ch <= 0x10FFFF: ch -= 0x10000 p.append(chr(0xD800 + (ch >> 10))) - p.append(chr(0xDC00 + (ch & 0x03FF))) + p.append(chr(0xDC00 + (ch & 0x03FF))) pos += digits else: message = "illegal Unicode character" - x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-2, - pos+digits) + x = unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 2, pos + digits + ) p.append(x[0]) pos = x[1] res = p return res, pos + def PyUnicode_DecodeUnicodeEscape(s, size, errors, final): + if size == 0: + return "", 0 - if (size == 0): - return '' - if isinstance(s, str): s = s.encode() @@ -1221,129 +1509,166 @@ def PyUnicode_DecodeUnicodeEscape(s, size, errors, final): p = [] pos = 0 - while (pos < size): -## /* Non-escape characters are interpreted as Unicode ordinals */ - if (chr(s[pos]) != '\\') : + while pos < size: + ## /* Non-escape characters are interpreted as Unicode ordinals */ + if s[pos] != ord("\\"): p.append(chr(s[pos])) pos += 1 continue -## /* \ - Escapes */ - else: - pos += 1 - if pos >= len(s): - errmessage = "\\ at end of string" - unicode_call_errorhandler(errors, "unicodeescape", errmessage, s, pos-1, size) - ch = chr(s[pos]) - pos += 1 - ## /* \x escapes */ - if ch == '\n': pass - elif ch == '\\': p += '\\' - elif ch == '\'': p += '\'' - elif ch == '\"': p += '\"' - elif ch == 'b' : p += '\b' - elif ch == 'f' : p += '\014' #/* FF */ - elif ch == 't' : p += '\t' - elif ch == 'n' : p += '\n' - elif ch == 'r' : p += '\r' - elif ch == 'v' : p += '\013' #break; /* VT */ - elif ch == 'a' : p += '\007' # break; /* BEL, not classic C */ - elif '0' <= ch <= '7': - x = ord(ch) - ord('0') - if pos < size: - ch = chr(s[pos]) - if '0' <= ch <= '7': - pos += 1 - x = (x<<3) + ord(ch) - ord('0') - if pos < size: - ch = chr(s[pos]) - if '0' <= ch <= '7': - pos += 1 - x = (x<<3) + ord(ch) - ord('0') - p.append(chr(x)) - ## /* hex escapes */ - ## /* \xXX */ - elif ch == 'x': + ## /* \ - Escapes */ + escape_start = pos + pos += 1 + if pos >= size: + if not final: + pos = escape_start + break + errmessage = "\\ at end of string" + unicode_call_errorhandler( + errors, "unicodeescape", errmessage, s, pos - 1, size + ) + break + ch = chr(s[pos]) + pos += 1 + ## /* \x escapes */ + if ch == "\n": + pass + elif ch == "\\": + p += "\\" + elif ch == "'": + p += "'" + elif ch == '"': + p += '"' + elif ch == "b": + p += "\b" + elif ch == "f": + p += "\014" # /* FF */ + elif ch == "t": + p += "\t" + elif ch == "n": + p += "\n" + elif ch == "r": + p += "\r" + elif ch == "v": + p += "\013" # break; /* VT */ + elif ch == "a": + p += "\007" # break; /* BEL, not classic C */ + elif "0" <= ch <= "7": + x = ord(ch) - ord("0") + if pos < size: + ch = chr(s[pos]) + if "0" <= ch <= "7": + pos += 1 + x = (x << 3) + ord(ch) - ord("0") + if pos < size: + ch = chr(s[pos]) + if "0" <= ch <= "7": + pos += 1 + x = (x << 3) + ord(ch) - ord("0") + p.append(chr(x)) + ## /* hex escapes */ + ## /* \xXX */ + elif ch in ("x", "u", "U"): + if ch == "x": digits = 2 message = "truncated \\xXX escape" - x = hexescape(s, pos, digits, message, errors) - p += x[0] - pos = x[1] - - # /* \uXXXX */ - elif ch == 'u': + elif ch == "u": digits = 4 message = "truncated \\uXXXX escape" + else: + digits = 8 + message = "truncated \\UXXXXXXXX escape" + number_end = hex_number_end(s, pos, digits) + if number_end - pos != digits: + if not final: + pos = escape_start + break x = hexescape(s, pos, digits, message, errors) p += x[0] pos = x[1] - - # /* \UXXXXXXXX */ - elif ch == 'U': - digits = 8 - message = "truncated \\UXXXXXXXX escape" + else: x = hexescape(s, pos, digits, message, errors) p += x[0] pos = x[1] -## /* \N{name} */ - elif ch == 'N': - message = "malformed \\N character escape" - # pos += 1 - look = pos - try: - import unicodedata - except ImportError: - message = "\\N escapes not supported (can't load unicodedata module)" - unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, size) - if look < size and chr(s[look]) == '{': - #/* look for the closing brace */ - while (look < size and chr(s[look]) != '}'): - look += 1 - if (look > pos+1 and look < size and chr(s[look]) == '}'): - #/* found a name. look it up in the unicode database */ - message = "unknown Unicode character name" - st = s[pos+1:look] - try: - chr_codec = unicodedata.lookup("%s" % st) - except LookupError as e: - x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) - else: - x = chr_codec, look + 1 - p.append(x[0]) - pos = x[1] - else: - x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) - else: - x = unicode_call_errorhandler(errors, "unicodeescape", message, s, pos-1, look+1) + ## /* \N{name} */ + elif ch == "N": + message = "malformed \\N character escape" + look = pos + try: + import unicodedata + except ImportError: + message = "\\N escapes not supported (can't load unicodedata module)" + unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 1, size + ) + continue + if look < size and chr(s[look]) == "{": + # /* look for the closing brace */ + while look < size and chr(s[look]) != "}": + look += 1 + if look > pos + 1 and look < size and chr(s[look]) == "}": + # /* found a name. look it up in the unicode database */ + message = "unknown Unicode character name" + st = s[pos + 1 : look] + try: + chr_codec = unicodedata.lookup("%s" % st) + except LookupError as e: + x = unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 1, look + 1 + ) + else: + x = chr_codec, look + 1 + p.append(x[0]) + pos = x[1] + else: + if not final: + pos = escape_start + break + x = unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 1, look + 1 + ) + p.append(x[0]) + pos = x[1] else: - if not found_invalid_escape: - found_invalid_escape = True - warnings.warn("invalid escape sequence '\\%c'" % ch, DeprecationWarning, 2) - p.append('\\') - p.append(ch) - return p + if not final: + pos = escape_start + break + x = unicode_call_errorhandler( + errors, "unicodeescape", message, s, pos - 1, look + 1 + ) + p.append(x[0]) + pos = x[1] + else: + if not found_invalid_escape: + found_invalid_escape = True + warnings.warn( + "invalid escape sequence '\\%c'" % ch, DeprecationWarning, 2 + ) + p.append("\\") + p.append(ch) + return p, pos + def PyUnicode_EncodeRawUnicodeEscape(s, size): - - if (size == 0): - return b'' + if size == 0: + return b"" p = bytearray() for ch in s: -# /* Map 32-bit characters to '\Uxxxxxxxx' */ - if (ord(ch) >= 0x10000): - p += b'\\U%08x' % ord(ch) - elif (ord(ch) >= 256) : -# /* Map 16-bit characters to '\uxxxx' */ - p += b'\\u%04x' % (ord(ch)) -# /* Copy everything else as-is */ + # /* Map 32-bit characters to '\Uxxxxxxxx' */ + if ord(ch) >= 0x10000: + p += b"\\U%08x" % ord(ch) + elif ord(ch) >= 256: + # /* Map 16-bit characters to '\uxxxx' */ + p += b"\\u%04x" % (ord(ch)) + # /* Copy everything else as-is */ else: p.append(ord(ch)) - - #p += '\0' + + # p += '\0' return p -def charmapencode_output(c, mapping): +def charmapencode_output(c, mapping): rep = mapping[c] if isinstance(rep, int): if rep < 256: @@ -1359,27 +1684,34 @@ def charmapencode_output(c, mapping): else: raise TypeError("character mapping must return integer, None or str") -def PyUnicode_EncodeCharmap(p, size, mapping='latin-1', errors='strict'): -## /* the following variable is used for caching string comparisons -## * -1=not initialized, 0=unknown, 1=strict, 2=replace, -## * 3=ignore, 4=xmlcharrefreplace */ +def PyUnicode_EncodeCharmap(p, size, mapping="latin-1", errors="strict"): + ## /* the following variable is used for caching string comparisons + ## * -1=not initialized, 0=unknown, 1=strict, 2=replace, + ## * 3=ignore, 4=xmlcharrefreplace */ -# /* Default to Latin-1 */ - if mapping == 'latin-1': + # /* Default to Latin-1 */ + if mapping == "latin-1": return PyUnicode_EncodeLatin1(p, size, errors) - if (size == 0): - return b'' + if size == 0: + return b"" inpos = 0 res = [] - while (inpos", p, inpos, inpos+1, False) + x = unicode_call_errorhandler( + errors, + "charmap", + "character maps to ", + p, + inpos, + inpos + 1, + False, + ) replacement = x[0] if isinstance(replacement, bytes): res += list(replacement) @@ -1388,119 +1720,120 @@ def PyUnicode_EncodeCharmap(p, size, mapping='latin-1', errors='strict'): for y in replacement: res += charmapencode_output(ord(y), mapping) except KeyError: - raise UnicodeEncodeError("charmap", p, inpos, inpos+1, - "character maps to ") + raise UnicodeEncodeError( + "charmap", p, inpos, inpos + 1, "character maps to " + ) inpos += 1 return res -def PyUnicode_DecodeCharmap(s, size, mapping, errors): -## /* Default to Latin-1 */ - if (mapping == None): +def PyUnicode_DecodeCharmap(s, size, mapping, errors): + ## /* Default to Latin-1 */ + if mapping == None: return PyUnicode_DecodeLatin1(s, size, errors) - if (size == 0): - return '' + if size == 0: + return "" p = [] inpos = 0 - while (inpos< len(s)): - - #/* Get mapping (char ordinal -> integer, Unicode char or None) */ + while inpos < len(s): + # /* Get mapping (char ordinal -> integer, Unicode char or None) */ ch = s[inpos] try: x = mapping[ch] if isinstance(x, int): - if x < 65536: + if x == 0xFFFE: + raise KeyError + if 0 <= x <= 0x10FFFF: p += chr(x) else: - raise TypeError("character mapping must be in range(65536)") + raise TypeError( + "character mapping must be in range(0x%x)" % (0x110000,) + ) elif isinstance(x, str): + if len(x) == 1 and x == "\ufffe": + raise KeyError p += x - elif not x: + elif x is None: raise KeyError else: raise TypeError - except KeyError: - x = unicode_call_errorhandler(errors, "charmap", - "character maps to ", s, inpos, inpos+1) + except (KeyError, IndexError): + x = unicode_call_errorhandler( + errors, "charmap", "character maps to ", s, inpos, inpos + 1 + ) p += x[0] inpos += 1 return p -def PyUnicode_DecodeRawUnicodeEscape(s, size, errors, final): - if (size == 0): - return '' +def PyUnicode_DecodeRawUnicodeEscape(s, size, errors, final): + if size == 0: + return "", 0 if isinstance(s, str): s = s.encode() pos = 0 p = [] - while (pos < len(s)): - ch = chr(s[pos]) - #/* Non-escape characters are interpreted as Unicode ordinals */ - if (ch != '\\'): - p.append(ch) + while pos < len(s): + # /* Non-escape characters are interpreted as Unicode ordinals */ + if s[pos] != ord("\\"): + p.append(chr(s[pos])) pos += 1 - continue + continue startinpos = pos -## /* \u-escapes are only interpreted iff the number of leading -## backslashes is odd */ + p_len_before = len(p) + ## /* \u-escapes are only interpreted iff the number of leading + ## backslashes is odd */ bs = pos while pos < size: - if (s[pos] != ord('\\')): + if s[pos] != ord("\\"): break p.append(chr(s[pos])) pos += 1 - - if (pos >= size): + + if pos >= size: + if not final: + del p[p_len_before:] + pos = startinpos break - if (((pos - bs) & 1) == 0 or - (s[pos] != ord('u') and s[pos] != ord('U'))) : + if ((pos - bs) & 1) == 0 or (s[pos] != ord("u") and s[pos] != ord("U")): p.append(chr(s[pos])) pos += 1 continue - + p.pop(-1) - if s[pos] == ord('u'): - count = 4 - else: - count = 8 + count = 4 if s[pos] == ord("u") else 8 pos += 1 - #/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ + # /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ number_end = hex_number_end(s, pos, count) if number_end - pos != count: + if not final: + del p[p_len_before:] + pos = startinpos + break res = unicode_call_errorhandler( - errors, "rawunicodeescape", "truncated \\uXXXX", - s, pos-2, number_end) + errors, "rawunicodeescape", "truncated \\uXXXX", s, pos - 2, number_end + ) p.append(res[0]) pos = res[1] else: - x = int(s[pos:pos+count], 16) - #ifndef Py_UNICODE_WIDE - if sys.maxunicode > 0xffff: - if (x > sys.maxunicode): - res = unicode_call_errorhandler( - errors, "rawunicodeescape", "\\Uxxxxxxxx out of range", - s, pos-2, pos+count) - pos = res[1] - p.append(res[0]) - else: - p.append(chr(x)) - pos += count + x = int(s[pos : pos + count], 16) + if x > sys.maxunicode: + res = unicode_call_errorhandler( + errors, + "rawunicodeescape", + "\\Uxxxxxxxx out of range", + s, + pos - 2, + pos + count, + ) + pos = res[1] + p.append(res[0]) else: - if (x > 0x10000): - res = unicode_call_errorhandler( - errors, "rawunicodeescape", "\\Uxxxxxxxx out of range", - s, pos-2, pos+count) - pos = res[1] - p.append(res[0]) - - #endif - else: - p.append(chr(x)) - pos += count + p.append(chr(x)) + pos += count - return p + return p, pos diff --git a/Lib/pickletools.py b/Lib/pickletools.py index e08db712a6f..254b6c7fcc9 100644 --- a/Lib/pickletools.py +++ b/Lib/pickletools.py @@ -335,7 +335,7 @@ def read_stringnl(f, decode=True, stripquotes=True, *, encoding='latin-1'): ValueError: no newline found when trying to read stringnl Embedded escapes are undone in the result. - >>> read_stringnl(io.BytesIO(br"'a\n\\b\x00c\td'" + b"\n'e'")) # TODO: RUSTPYTHON # doctest: +EXPECTED_FAILURE + >>> read_stringnl(io.BytesIO(br"'a\n\\b\x00c\td'" + b"\n'e'")) 'a\n\\b\x00c\td' """ diff --git a/Lib/test/datetimetester.py b/Lib/test/datetimetester.py index 1bb74c6d969..8352b69f7b2 100644 --- a/Lib/test/datetimetester.py +++ b/Lib/test/datetimetester.py @@ -1620,7 +1620,6 @@ def test_pickling(self): self.assertEqual(orig, derived) self.assertEqual(orig.__reduce__(), orig.__reduce_ex__(2)) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_compat_unpickle(self): tests = [ b"cdatetime\ndate\n(S'\\x07\\xdf\\x0b\\x1b'\ntR.", @@ -2407,7 +2406,6 @@ def test_pickling_subclass_datetime(self): self.assertEqual(orig, derived) self.assertTrue(isinstance(derived, SubclassDatetime)) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_compat_unpickle(self): tests = [ b'cdatetime\ndatetime\n(' @@ -3768,7 +3766,6 @@ def test_pickling_subclass_time(self): self.assertEqual(orig, derived) self.assertTrue(isinstance(derived, SubclassTime)) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_compat_unpickle(self): tests = [ (b"cdatetime\ntime\n(S'\\x14;\\x10\\x00\\x10\\x00'\ntR.", @@ -4186,7 +4183,6 @@ def test_pickling(self): self.assertEqual(derived.tzname(), 'cookie') self.assertEqual(orig.__reduce__(), orig.__reduce_ex__(2)) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_compat_unpickle(self): tests = [ b"cdatetime\ntime\n(S'\\x05\\x06\\x07\\x01\\xe2@'\n" @@ -4652,7 +4648,6 @@ def test_pickling(self): self.assertEqual(derived.tzname(), 'cookie') self.assertEqual(orig.__reduce__(), orig.__reduce_ex__(2)) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_compat_unpickle(self): tests = [ b'cdatetime\ndatetime\n' diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py index 232121b6210..85364299f0a 100644 --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1239,7 +1239,6 @@ def test_raw(self): if b != b'\\': self.assertEqual(decode(b + b'0'), (b + b'0', 2)) - @unittest.expectedFailure # TODO: RUSTPYTHON; + (b'[]', 4) def test_escape(self): decode = codecs.escape_decode check = coding_checker(self, decode) @@ -1296,7 +1295,6 @@ def test_warnings(self): r'"\\501" is an invalid octal escape sequence'): self.assertEqual(decode(br'\x\501', 'ignore'), (b'A', 6)) - @unittest.expectedFailure # TODO: RUSTPYTHON; ValueError: not raised by escape_decode def test_errors(self): decode = codecs.escape_decode self.assertRaises(ValueError, decode, br"\x") @@ -2387,7 +2385,6 @@ def test_decoder_state(self): class CharmapTest(unittest.TestCase): - @unittest.expectedFailure # TODO: RUSTPYTHON; IndexError: index out of range def test_decode_with_string_map(self): self.assertEqual( codecs.charmap_decode(b"\x00\x01\x02", "strict", "abc"), @@ -2443,7 +2440,6 @@ def test_decode_with_string_map(self): ("", len(allbytes)) ) - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: UnicodeDecodeError not raised by charmap_decode def test_decode_with_int2str_map(self): self.assertEqual( codecs.charmap_decode(b"\x00\x01\x02", "strict", @@ -2560,7 +2556,6 @@ def test_decode_with_int2str_map(self): b"\x00\x01\x02", "strict", {0: "A", 1: 'Bb', 2: 999999999} ) - @unittest.expectedFailure # TODO: RUSTPYTHON; TypeError: character mapping must be in range(65536) def test_decode_with_int2int_map(self): a = ord('a') b = ord('b') @@ -2805,7 +2800,6 @@ def test_decode_errors(self): self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: '\x00\t\n\r\\' != '\x00\t\n\r' def test_partial(self): self.check_partial( "\x00\t\n\r\\\xff\uffff\U00010000", @@ -2849,7 +2843,6 @@ def test_partial(self): def test_incremental_surrogatepass(self): return super().test_incremental_surrogatepass() - @unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: 'unicodeescape' codec can't decode bytes in position 72-75: truncated \uXXXX escape def test_readline(self): return super().test_readline() @@ -2908,7 +2901,6 @@ def test_decode_errors(self): self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10)) self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10)) - @unittest.expectedFailure # TODO: RUSTPYTHON; - \ def test_partial(self): self.check_partial( "\x00\t\n\r\\\xff\uffff\U00010000", @@ -2938,11 +2930,9 @@ def test_partial(self): ] ) - @unittest.expectedFailure # TODO: RUSTPYTHON; - \ def test_incremental_surrogatepass(self): return super().test_incremental_surrogatepass() - @unittest.expectedFailure # TODO: RUSTPYTHON; UnicodeDecodeError: 'rawunicodeescape' codec can't decode bytes in position 72-76: truncated \uXXXX def test_readline(self): return super().test_readline() @@ -2990,7 +2980,6 @@ def test_ascii(self): self.assertEqual("foo\udc80bar".encode("ascii", "surrogateescape"), b"foo\x80bar") - @unittest.expectedFailure # TODO: RUSTPYTHON; Result: FAILURE def test_charmap(self): # bad byte: \xa5 is unmapped in iso-8859-3 self.assertEqual(b"foo\xa5bar".decode("iso-8859-3", "surrogateescape"), @@ -3183,7 +3172,6 @@ def test_binary_to_text_denylists_text_transforms(self): bad_input.decode("rot_13") self.assertIsNone(failure.exception.__cause__) - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'error' object has no attribute '__notes__'. Did you mean: '__ne__'? @unittest.skipUnless(zlib, "Requires zlib support") def test_custom_zlib_error_is_noted(self): # Check zlib codec gives a good error for malformed input @@ -3192,7 +3180,6 @@ def test_custom_zlib_error_is_noted(self): codecs.decode(b"hello", "zlib_codec") self.assertEqual(msg, failure.exception.__notes__[0]) - @unittest.expectedFailure # TODO: RUSTPYTHON; - AttributeError: 'Error' object has no attribute '__notes__' def test_custom_hex_error_is_noted(self): # Check hex codec gives a good error for malformed input import binascii @@ -3292,55 +3279,46 @@ def check_note(self, obj_to_raise, msg, exc_type=RuntimeError): with self.assertNoted("decoding", exc_type, msg): codecs.decode(b"bytes input", self.codec_name) - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'RuntimeError' object has no attribute '__notes__'. Did you mean: '__ne__'? def test_raise_by_type(self): self.check_note(RuntimeError, "") - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'RuntimeError' object has no attribute '__notes__'. Did you mean: '__ne__'? def test_raise_by_value(self): msg = "This should be noted" self.check_note(RuntimeError(msg), msg) - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'MyRuntimeError' object has no attribute '__notes__'. Did you mean: '__ne__'? def test_raise_grandchild_subclass_exact_size(self): msg = "This should be noted" class MyRuntimeError(RuntimeError): __slots__ = () self.check_note(MyRuntimeError(msg), msg, MyRuntimeError) - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'MyRuntimeError' object has no attribute '__notes__'. Did you mean: '__ne__'? def test_raise_subclass_with_weakref_support(self): msg = "This should be noted" class MyRuntimeError(RuntimeError): pass self.check_note(MyRuntimeError(msg), msg, MyRuntimeError) - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'CustomInit' object has no attribute '__notes__'. Did you mean: '__ne__'? def test_init_override(self): class CustomInit(RuntimeError): def __init__(self): pass self.check_note(CustomInit, "") - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'CustomNew' object has no attribute '__notes__'. Did you mean: '__ne__'? def test_new_override(self): class CustomNew(RuntimeError): def __new__(cls): return super().__new__(cls) self.check_note(CustomNew, "") - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'RuntimeError' object has no attribute '__notes__'. Did you mean: '__ne__'? def test_instance_attribute(self): msg = "This should be noted" exc = RuntimeError(msg) exc.attr = 1 self.check_note(exc, "^{}$".format(msg)) - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'RuntimeError' object has no attribute '__notes__'. Did you mean: '__ne__'? def test_non_str_arg(self): self.check_note(RuntimeError(1), "1") - @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: 'RuntimeError' object has no attribute '__notes__'. Did you mean: '__ne__'? def test_multiple_args(self): msg_re = r"^\('a', 'b', 'c'\)$" self.check_note(RuntimeError('a', 'b', 'c'), msg_re) @@ -3357,7 +3335,6 @@ def test_codec_lookup_failure(self): with self.assertRaisesRegex(LookupError, msg): codecs.decode(b"bytes input", self.codec_name) - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: "^'exception_notes_test' encoder returned 'str' instead of 'bytes'; use codecs.encode\(\) to encode to arbitrary types$" does not match "'exception_notes_test' encoder returned 'str' instead of 'bytes'; use codecs.encode() to encode arbitrary types" def test_unflagged_non_text_codec_handling(self): # The stdlib non-text codecs are now marked so they're # pre-emptively skipped by the text model related methods @@ -3985,7 +3962,6 @@ def test_rot13_func(self): class CodecNameNormalizationTest(unittest.TestCase): """Test codec name normalization""" - @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: Tuples differ: (1, 2, 3, 4) != (None, None, None, None) def test_codecs_lookup(self): FOUND = (1, 2, 3, 4) NOT_FOUND = (None, None, None, None) diff --git a/Lib/test/test_pickle.py b/Lib/test/test_pickle.py index 1a14024db08..865406738e5 100644 --- a/Lib/test/test_pickle.py +++ b/Lib/test/test_pickle.py @@ -73,15 +73,12 @@ def loads(self, buf, **kwds): u = self.unpickler(f, **kwds) return u.load() - @unittest.expectedFailure # TODO: RUSTPYTHON def test_badly_escaped_string(self): return super().test_badly_escaped_string() - @unittest.expectedFailure # TODO: RUSTPYTHON def test_correctly_quoted_string(self): return super().test_correctly_quoted_string() - @unittest.expectedFailure # TODO: RUSTPYTHON def test_load_python2_str_as_bytes(self): return super().test_load_python2_str_as_bytes() @@ -182,7 +179,6 @@ def loads(self, buf, **kwds): test_find_class = None test_custom_find_class = None - @unittest.expectedFailure # TODO: RUSTPYTHON def test_badly_escaped_string(self): return super().test_badly_escaped_string() @@ -202,7 +198,6 @@ def test_bytes_memoization(self): def test_c_methods(self): return super().test_c_methods() - @unittest.expectedFailure # TODO: RUSTPYTHON def test_correctly_quoted_string(self): return super().test_correctly_quoted_string() @@ -210,7 +205,6 @@ def test_correctly_quoted_string(self): def test_in_band_buffers(self): return super().test_in_band_buffers() - @unittest.expectedFailure # TODO: RUSTPYTHON def test_load_python2_str_as_bytes(self): return super().test_load_python2_str_as_bytes() diff --git a/crates/vm/src/codecs.rs b/crates/vm/src/codecs.rs index cdae4c2ba13..cca33eba2e1 100644 --- a/crates/vm/src/codecs.rs +++ b/crates/vm/src/codecs.rs @@ -220,10 +220,11 @@ impl CodecsRegistry { } pub(crate) fn register_manual(&self, name: &str, codec: PyCodec) -> PyResult<()> { + let name = normalize_encoding_name(name); self.inner .write() .search_cache - .insert(name.to_owned(), codec); + .insert(name.into_owned(), codec); Ok(()) } @@ -283,7 +284,9 @@ impl CodecsRegistry { vm: &VirtualMachine, ) -> PyResult { let codec = self.lookup(encoding, vm)?; - codec.encode(obj, errors, vm) + codec.encode(obj, errors, vm).inspect_err(|exc| { + Self::add_codec_note(exc, "encoding", encoding, vm); + }) } pub fn decode( @@ -294,7 +297,9 @@ impl CodecsRegistry { vm: &VirtualMachine, ) -> PyResult { let codec = self.lookup(encoding, vm)?; - codec.decode(obj, errors, vm) + codec.decode(obj, errors, vm).inspect_err(|exc| { + Self::add_codec_note(exc, "decoding", encoding, vm); + }) } pub fn encode_text( @@ -306,12 +311,15 @@ impl CodecsRegistry { ) -> PyResult { let codec = self._lookup_text_encoding(encoding, "codecs.encode()", vm)?; codec - .encode(obj.into(), errors, vm)? + .encode(obj.into(), errors, vm) + .inspect_err(|exc| { + Self::add_codec_note(exc, "encoding", encoding, vm); + })? .downcast() .map_err(|obj| { vm.new_type_error(format!( "'{}' encoder returned '{}' instead of 'bytes'; use codecs.encode() to \ - encode arbitrary types", + encode to arbitrary types", encoding, obj.class().name(), )) @@ -326,20 +334,55 @@ impl CodecsRegistry { vm: &VirtualMachine, ) -> PyResult { let codec = self._lookup_text_encoding(encoding, "codecs.decode()", vm)?; - codec.decode(obj, errors, vm)?.downcast().map_err(|obj| { - vm.new_type_error(format!( - "'{}' decoder returned '{}' instead of 'str'; use codecs.decode() \ - to encode arbitrary types", - encoding, - obj.class().name(), - )) - }) + codec + .decode(obj, errors, vm) + .inspect_err(|exc| { + Self::add_codec_note(exc, "decoding", encoding, vm); + })? + .downcast() + .map_err(|obj| { + vm.new_type_error(format!( + "'{}' decoder returned '{}' instead of 'str'; use codecs.decode() to \ + decode to arbitrary types", + encoding, + obj.class().name(), + )) + }) + } + + fn add_codec_note( + exc: &crate::builtins::PyBaseExceptionRef, + operation: &str, + encoding: &str, + vm: &VirtualMachine, + ) { + let note = format!("{operation} with '{encoding}' codec failed"); + let _ = vm.call_method(exc.as_object(), "add_note", (vm.ctx.new_str(note),)); } pub fn register_error(&self, name: String, handler: PyObjectRef) -> Option { self.inner.write().errors.insert(name, handler) } + pub fn unregister_error(&self, name: &str, vm: &VirtualMachine) -> PyResult { + const BUILTIN_ERROR_HANDLERS: &[&str] = &[ + "strict", + "ignore", + "replace", + "xmlcharrefreplace", + "backslashreplace", + "namereplace", + "surrogatepass", + "surrogateescape", + ]; + if BUILTIN_ERROR_HANDLERS.contains(&name) { + return Err(vm.new_value_error(format!( + "cannot un-register built-in error handler '{name}'" + ))); + } + Ok(self.inner.write().errors.remove(name).is_some()) + } + pub fn lookup_error_opt(&self, name: &str) -> Option { self.inner.read().errors.get(name).cloned() } @@ -351,19 +394,28 @@ impl CodecsRegistry { } fn normalize_encoding_name(encoding: &str) -> Cow<'_, str> { - if let Some(i) = encoding.find(|c: char| c == ' ' || c.is_ascii_uppercase()) { - let mut out = encoding.as_bytes().to_owned(); - for byte in &mut out[i..] { - if *byte == b' ' { - *byte = b'-'; - } else { - byte.make_ascii_lowercase(); + // _Py_normalize_encoding: collapse non-alphanumeric/non-dot chars into + // single underscore, strip non-ASCII, lowercase ASCII letters. + let needs_transform = encoding + .bytes() + .any(|b| b.is_ascii_uppercase() || !b.is_ascii_alphanumeric() && b != b'.'); + if !needs_transform { + return encoding.into(); + } + let mut out = String::with_capacity(encoding.len()); + let mut punct = false; + for c in encoding.chars() { + if c.is_ascii_alphanumeric() || c == '.' { + if punct && !out.is_empty() { + out.push('_'); } + out.push(c.to_ascii_lowercase()); + punct = false; + } else { + punct = true; } - String::from_utf8(out).unwrap().into() - } else { - encoding.into() } + out.into() } #[derive(Eq, PartialEq)] @@ -416,7 +468,7 @@ impl StandardEncoding { } else { None } - } else if encoding == "CP_UTF8" { + } else if encoding == "cp65001" { Some(Self::Utf8) } else { None diff --git a/crates/vm/src/stdlib/codecs.rs b/crates/vm/src/stdlib/codecs.rs index f1fdbf1bdcd..161876c965a 100644 --- a/crates/vm/src/stdlib/codecs.rs +++ b/crates/vm/src/stdlib/codecs.rs @@ -98,6 +98,21 @@ mod _codecs { vm.state.codec_registry.lookup_error(name.as_str(), vm) } + #[pyfunction] + fn _unregister_error(errors: PyStrRef, vm: &VirtualMachine) -> PyResult { + if errors.as_wtf8().as_bytes().contains(&0) { + return Err(cstring_error(vm)); + } + if !errors.as_wtf8().is_utf8() { + return Err(vm.new_unicode_encode_error( + "'utf-8' codec can't encode character: surrogates not allowed".to_owned(), + )); + } + vm.state + .codec_registry + .unregister_error(errors.as_str(), vm) + } + type EncodeResult = PyResult<(Vec, usize)>; #[derive(FromArgs)]