diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 76446f5c919..b415c5907fa 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -851,7 +851,6 @@ def test_other_escapes(self): with self.subTest(c): self.assertRaises(re.PatternError, re.compile, '[\\%c]' % c) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_named_unicode_escapes(self): # test individual Unicode named escapes self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<')) diff --git a/Lib/test/test_ucn.py b/Lib/test/test_ucn.py index 69b58da0202..10f262cff12 100644 --- a/Lib/test/test_ucn.py +++ b/Lib/test/test_ucn.py @@ -203,7 +203,6 @@ def check_version(testfile): with self.assertRaises(KeyError): unicodedata.ucd_3_2_0.lookup(seqname) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_errors(self): self.assertRaises(TypeError, unicodedata.name) self.assertRaises(TypeError, unicodedata.name, 'xx') diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index e14123aaa65..ceae20e8cb2 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -24,10 +24,9 @@ class UnicodeMethodsTest(unittest.TestCase): # update this, if the database changes - expectedchecksum = '63aa77dcb36b0e1df082ee2a6071caeda7f0955e' + expectedchecksum = '9e43ee3929471739680c0e705482b4ae1c4122e4' - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; + 9e43ee3929471739680c0e705482b4ae1c4122e4 @requires_resource('cpu') def test_method_checksum(self): h = hashlib.sha1() @@ -79,10 +78,9 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest): # Update this if the database changes. Make sure to do a full rebuild # (e.g. 'make distclean && make') to get the correct checksum. - expectedchecksum = '232affd2a50ec4bd69d2482aa0291385cbdefaba' + expectedchecksum = '23ab09ed4abdf93db23b97359108ed630dd8311d' - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; AttributeError: module 'unicodedata' has no attribute 'digit' @requires_resource('cpu') def test_function_checksum(self): data = [] @@ -122,11 +120,9 @@ def test_no_names_in_pua(self): char = chr(i) self.assertRaises(ValueError, self.db.name, char) - # TODO: RUSTPYTHON; LookupError: undefined character name 'LATIN SMLL LETR A' - @unittest.expectedFailure def test_lookup_nonexistant(self): # just make sure that lookup can fail - for nonexistant in [ + for nonexistent in [ "LATIN SMLL LETR A", "OPEN HANDS SIGHS", "DREGS", @@ -134,10 +130,8 @@ def test_lookup_nonexistant(self): "MODIFIER LETTER CYRILLIC SMALL QUESTION MARK", "???", ]: - self.assertRaises(KeyError, self.db.lookup, nonexistant) + self.assertRaises(KeyError, self.db.lookup, nonexistent) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_digit(self): self.assertEqual(self.db.digit('A', None), None) self.assertEqual(self.db.digit('9'), 9) @@ -150,8 +144,6 @@ def test_digit(self): self.assertRaises(TypeError, self.db.digit, 'xx') self.assertRaises(ValueError, self.db.digit, 'x') - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_numeric(self): self.assertEqual(self.db.numeric('A',None), None) self.assertEqual(self.db.numeric('9'), 9) @@ -165,8 +157,6 @@ def test_numeric(self): self.assertRaises(TypeError, self.db.numeric, 'xx') self.assertRaises(ValueError, self.db.numeric, 'x') - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_decimal(self): self.assertEqual(self.db.decimal('A',None), None) self.assertEqual(self.db.decimal('9'), 9) @@ -189,8 +179,7 @@ def test_category(self): self.assertRaises(TypeError, self.db.category) self.assertRaises(TypeError, self.db.category, 'xx') - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; - L def test_bidirectional(self): self.assertEqual(self.db.bidirectional('\uFFFE'), '') self.assertEqual(self.db.bidirectional(' '), 'WS') @@ -200,8 +189,6 @@ def test_bidirectional(self): self.assertRaises(TypeError, self.db.bidirectional) self.assertRaises(TypeError, self.db.bidirectional, 'xx') - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_decomposition(self): self.assertEqual(self.db.decomposition('\uFFFE'),'') self.assertEqual(self.db.decomposition('\u00bc'), ' 0031 2044 0034') @@ -218,8 +205,6 @@ def test_mirrored(self): self.assertRaises(TypeError, self.db.mirrored) self.assertRaises(TypeError, self.db.mirrored, 'xx') - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_combining(self): self.assertEqual(self.db.combining('\uFFFE'), 0) self.assertEqual(self.db.combining('a'), 0) @@ -247,8 +232,7 @@ def test_issue10254(self): b = 'C\u0338' * 20 + '\xC7' self.assertEqual(self.db.normalize('NFC', a), b) - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; ? + def test_issue29456(self): # Fix #29456 u1176_str_a = '\u1100\u1176\u11a8' @@ -275,8 +259,7 @@ def test_east_asian_width(self): self.assertEqual(eaw('\u2010'), 'A') self.assertEqual(eaw('\U00020000'), 'W') - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; + W def test_east_asian_width_unassigned(self): eaw = self.db.east_asian_width # unassigned @@ -294,8 +277,7 @@ def test_east_asian_width_unassigned(self): self.assertEqual(eaw(char), 'A') self.assertIs(self.db.name(char, None), None) - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; + N def test_east_asian_width_9_0_changes(self): self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N') self.assertEqual(self.db.east_asian_width('\u231a'), 'W') @@ -307,8 +289,7 @@ def test_disallow_instantiation(self): # Ensure that the type disallows instantiation (bpo-43916) check_disallow_instantiation(self, unicodedata.UCD) - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; --- @force_not_colorized def test_failed_import_during_compiling(self): # Issue 4367 @@ -326,8 +307,6 @@ def test_failed_import_during_compiling(self): "(can't load unicodedata module)" self.assertIn(error, result.err.decode("ascii")) - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_decimal_numeric_consistent(self): # Test that decimal and numeric are consistent, # i.e. if a character has a decimal value, @@ -341,8 +320,6 @@ def test_decimal_numeric_consistent(self): count += 1 self.assertTrue(count >= 10) # should have tested at least the ASCII digits - # TODO: RUSTPYTHON - @unittest.expectedFailure def test_digit_numeric_consistent(self): # Test that digit and numeric are consistent, # i.e. if a character has a digit value, @@ -359,8 +336,7 @@ def test_digit_numeric_consistent(self): def test_bug_1704793(self): self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346') - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: False is not true def test_ucd_510(self): import unicodedata # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0 @@ -384,8 +360,7 @@ def test_bug_5828(self): [0] ) - # TODO: RUSTPYTHON - @unittest.expectedFailure + @unittest.expectedFailure # TODO: RUSTPYTHON; + Dž def test_bug_4971(self): # LETTER DZ WITH CARON: DZ, Dz, dz self.assertEqual("\u01c4".title(), "\u01c5") @@ -414,7 +389,6 @@ def unistr(data): data = [int(x, 16) for x in data.split(" ")] return "".join([chr(x) for x in data]) - @unittest.expectedFailure # TODO: RUSTPYTHON @requires_resource('network') @requires_resource('cpu') def test_normalization(self): @@ -502,6 +476,29 @@ def test_bug_834676(self): # Check for bug 834676 unicodedata.normalize('NFC', '\ud55c\uae00') + def test_normalize_return_type(self): + # gh-129569: normalize() return type must always be str + normalize = unicodedata.normalize + + class MyStr(str): + pass + + normalization_forms = ("NFC", "NFKC", "NFD", "NFKD") + input_strings = ( + # normalized strings + "", + "ascii", + # unnormalized strings + "\u1e0b\u0323", + "\u0071\u0307\u0323", + ) + + for form in normalization_forms: + for input_str in input_strings: + with self.subTest(form=form, input_str=input_str): + self.assertIs(type(normalize(form, input_str)), str) + self.assertIs(type(normalize(form, MyStr(input_str))), str) + if __name__ == "__main__": unittest.main() diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py index 5e6a46b3d35..b2bde5a9b1d 100644 --- a/Lib/test/test_urlparse.py +++ b/Lib/test/test_urlparse.py @@ -1495,7 +1495,6 @@ def test_all(self): expected.append(name) self.assertCountEqual(urllib.parse.__all__, expected) - @unittest.expectedFailure # TODO: RUSTPYTHON def test_urlsplit_normalization(self): # Certain characters should never occur in the netloc, # including under normalization. diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs index a859e39df10..a575c1ae7e5 100644 --- a/crates/stdlib/src/unicodedata.rs +++ b/crates/stdlib/src/unicodedata.rs @@ -43,7 +43,7 @@ mod unicodedata { }; use itertools::Itertools; use rustpython_common::wtf8::{CodePoint, Wtf8Buf}; - use ucd::{Codepoint, EastAsianWidth}; + use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType}; use unic_char_property::EnumeratedCharProperty; use unic_normal::StrNormalForm; use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion}; @@ -62,9 +62,15 @@ mod unicodedata { "lookup", "name", "bidirectional", + "combining", + "decimal", + "decomposition", + "digit", "east_asian_width", - "normalize", + "is_normalized", "mirrored", + "normalize", + "numeric", ] { module.set_attr(attr, ucd.get_attr(attr, vm)?, vm)?; } @@ -125,7 +131,11 @@ mod unicodedata { { return Ok(character.to_string()); } - Err(vm.new_lookup_error(format!("undefined character name '{name}'"))) + Err(vm.new_key_error( + vm.ctx + .new_str(format!("undefined character name '{name}'")) + .into(), + )) } #[pymethod] @@ -189,6 +199,19 @@ mod unicodedata { Ok(normalized_text) } + #[pymethod] + fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult { + use super::NormalizeForm::*; + let text = unistr.as_wtf8(); + let normalized: Wtf8Buf = match form { + Nfc => text.map_utf8(|s| s.nfc()).collect(), + Nfkc => text.map_utf8(|s| s.nfkc()).collect(), + Nfd => text.map_utf8(|s| s.nfd()).collect(), + Nfkd => text.map_utf8(|s| s.nfkd()).collect(), + }; + Ok(text == &*normalized) + } + #[pymethod] fn mirrored(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { match self.extract_char(character, vm)? { @@ -204,12 +227,120 @@ mod unicodedata { } } + #[pymethod] + fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { + Ok(self + .extract_char(character, vm)? + .and_then(|c| c.to_char()) + .map_or(0, |ch| ch.canonical_combining_class() as i32)) + } + + #[pymethod] + fn decomposition(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { + let ch = match self.extract_char(character, vm)?.and_then(|c| c.to_char()) { + Some(ch) => ch, + None => return Ok(String::new()), + }; + let chars: Vec = ch.decomposition_map().collect(); + // If decomposition maps to just the character itself, there's no decomposition + if chars.len() == 1 && chars[0] == ch { + return Ok(String::new()); + } + let hex_parts = chars.iter().map(|c| format!("{:04X}", *c as u32)).join(" "); + let tag = match ch.decomposition_type() { + Some(DecompositionType::Canonical) | None => return Ok(hex_parts), + Some(dt) => decomposition_type_tag(dt), + }; + Ok(format!("<{tag}> {hex_parts}")) + } + + #[pymethod] + fn digit( + &self, + character: PyStrRef, + default: OptionalArg, + vm: &VirtualMachine, + ) -> PyResult { + let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char()); + if let Some(ch) = ch + && matches!( + ch.numeric_type(), + Some(NumericType::Decimal) | Some(NumericType::Digit) + ) + && let Some(Number::Integer(n)) = ch.numeric_value() + { + return Ok(vm.ctx.new_int(n).into()); + } + default.ok_or_else(|| vm.new_value_error("not a digit")) + } + + #[pymethod] + fn decimal( + &self, + character: PyStrRef, + default: OptionalArg, + vm: &VirtualMachine, + ) -> PyResult { + let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char()); + if let Some(ch) = ch + && ch.numeric_type() == Some(NumericType::Decimal) + && let Some(Number::Integer(n)) = ch.numeric_value() + { + return Ok(vm.ctx.new_int(n).into()); + } + default.ok_or_else(|| vm.new_value_error("not a decimal")) + } + + #[pymethod] + fn numeric( + &self, + character: PyStrRef, + default: OptionalArg, + vm: &VirtualMachine, + ) -> PyResult { + let ch = self.extract_char(character, vm)?.and_then(|c| c.to_char()); + if let Some(ch) = ch { + match ch.numeric_value() { + Some(Number::Integer(n)) => { + return Ok(vm.ctx.new_float(n as f64).into()); + } + Some(Number::Rational(num, den)) => { + return Ok(vm.ctx.new_float(num as f64 / den as f64).into()); + } + None => {} + } + } + default.ok_or_else(|| vm.new_value_error("not a numeric character")) + } + #[pygetset] fn unidata_version(&self) -> String { self.unic_version.to_string() } } + fn decomposition_type_tag(dt: DecompositionType) -> &'static str { + match dt { + DecompositionType::Canonical => "canonical", + DecompositionType::Compat => "compat", + DecompositionType::Circle => "circle", + DecompositionType::Final => "final", + DecompositionType::Font => "font", + DecompositionType::Fraction => "fraction", + DecompositionType::Initial => "initial", + DecompositionType::Isolated => "isolated", + DecompositionType::Medial => "medial", + DecompositionType::Narrow => "narrow", + DecompositionType::Nobreak => "noBreak", + DecompositionType::Small => "small", + DecompositionType::Square => "square", + DecompositionType::Sub => "sub", + DecompositionType::Super => "super", + DecompositionType::Vertical => "vertical", + DecompositionType::Wide => "wide", + } + } + trait EastAsianWidthAbbrName { fn abbr_name(&self) -> &'static str; } diff --git a/crates/vm/src/stdlib/time.rs b/crates/vm/src/stdlib/time.rs index eafd8e47b0b..05790fe332b 100644 --- a/crates/vm/src/stdlib/time.rs +++ b/crates/vm/src/stdlib/time.rs @@ -939,7 +939,10 @@ mod decl { /// Extract fields from StructTimeData into a libc::tm for mktime. #[cfg(any(unix, windows))] - pub(super) fn tm_from_struct_time(t: &StructTimeData, vm: &VirtualMachine) -> PyResult { + pub(super) fn tm_from_struct_time( + t: &StructTimeData, + vm: &VirtualMachine, + ) -> PyResult { let invalid_tuple = || vm.new_type_error("mktime(): illegal time tuple argument"); let year: i32 = t .tm_year @@ -1002,9 +1005,7 @@ mod decl { } let secs = float.floor(); if secs < libc::time_t::MIN as f64 || secs > libc::time_t::MAX as f64 { - return Err( - vm.new_overflow_error("timestamp out of range for platform time_t") - ); + return Err(vm.new_overflow_error("timestamp out of range for platform time_t")); } Ok(secs as libc::time_t) }