Skip to content

Commit 33a3ec8

Browse files
committed
Handle unicode string slicing with graphemes
1 parent 0f87d15 commit 33a3ec8

5 files changed

Lines changed: 77 additions & 11 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/snippets/unicode_slicing.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
def test_slice_bounds(s):
2+
# End out of range
3+
assert s[0:100] == s
4+
assert s[0:-100] == ''
5+
# Start out of range
6+
assert s[100:1] == ''
7+
# Out of range both sides
8+
# This is the behaviour in cpython
9+
# assert s[-100:100] == s
10+
11+
def expect_index_error(s, index):
12+
try:
13+
s[index]
14+
except IndexError:
15+
pass
16+
else:
17+
assert False
18+
19+
unicode_str = "∀∂"
20+
assert unicode_str[0] == "∀"
21+
assert unicode_str[1] == "∂"
22+
assert unicode_str[-1] == "∂"
23+
24+
test_slice_bounds(unicode_str)
25+
expect_index_error(unicode_str, 100)
26+
expect_index_error(unicode_str, -100)
27+
28+
ascii_str = "hello world"
29+
test_slice_bounds(ascii_str)
30+
assert ascii_str[0] == "h"
31+
assert ascii_str[1] == "e"
32+
assert ascii_str[-1] == "d"
33+

vm/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,5 @@ serde_json = "1.0.26"
1717
byteorder = "1.2.6"
1818
regex = "1"
1919
statrs = "0.10.0"
20-
caseless = "0.2.1"
20+
caseless = "0.2.1"
21+
unicode-segmentation = "1.2.1"

vm/src/obj/objsequence.rs

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,12 @@ pub trait PySliceableSequence {
1313
fn len(&self) -> usize;
1414
fn get_pos(&self, p: i32) -> usize {
1515
if p < 0 {
16-
self.len() - ((-p) as usize)
16+
if -p as usize > self.len() {
17+
// return something that is out of bounds so `get_item` raises an IndexError
18+
self.len() + 1
19+
} else {
20+
self.len() - ((-p) as usize)
21+
}
1722
} else if p as usize > self.len() {
1823
// This is for the slicing case where the end element is greater than the length of the
1924
// sequence
@@ -52,7 +57,7 @@ pub trait PySliceableSequence {
5257
}
5358
}
5459

55-
impl PySliceableSequence for Vec<PyObjectRef> {
60+
impl<T: Clone> PySliceableSequence for Vec<T> {
5661
fn do_slice(&self, start: usize, stop: usize) -> Self {
5762
self[start..stop].to_vec()
5863
}
@@ -78,8 +83,8 @@ pub fn get_item(
7883
let obj = elements[pos_index].clone();
7984
Ok(obj)
8085
} else {
81-
let value_error = vm.context().exceptions.value_error.clone();
82-
Err(vm.new_exception(value_error, "Index out of bounds!".to_string()))
86+
let index_error = vm.context().exceptions.index_error.clone();
87+
Err(vm.new_exception(index_error, "Index out of bounds!".to_string()))
8388
}
8489
}
8590
PyObjectPayload::Slice {

vm/src/obj/objstr.rs

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@ use num_traits::ToPrimitive;
1111
use std::hash::{Hash, Hasher};
1212
// rust's builtin to_lowercase isn't sufficient for casefold
1313
extern crate caseless;
14+
extern crate unicode_segmentation;
15+
16+
use self::unicode_segmentation::UnicodeSegmentation;
1417

1518
pub fn init(context: &PyContext) {
1619
let ref str_type = context.str_type;
@@ -980,22 +983,45 @@ fn str_new(vm: &mut VirtualMachine, args: PyFuncArgs) -> PyResult {
980983

981984
impl PySliceableSequence for String {
982985
fn do_slice(&self, start: usize, stop: usize) -> Self {
983-
self[start..stop].to_string()
986+
to_graphemes(self)
987+
.get(start..stop)
988+
.map_or(String::default(), |c| c.join(""))
984989
}
990+
985991
fn do_stepped_slice(&self, start: usize, stop: usize, step: usize) -> Self {
986-
self[start..stop].chars().step_by(step).collect()
992+
if let Some(s) = to_graphemes(self).get(start..stop) {
993+
return s
994+
.iter()
995+
.cloned()
996+
.step_by(step)
997+
.collect::<Vec<String>>()
998+
.join("");
999+
}
1000+
String::default()
9871001
}
1002+
9881003
fn len(&self) -> usize {
989-
self.len()
1004+
to_graphemes(self).len()
9901005
}
9911006
}
9921007

1008+
/// Convert a string-able `value` to a vec of graphemes
1009+
/// represents the string according to user perceived characters
1010+
fn to_graphemes<S: AsRef<str>>(value: S) -> Vec<String> {
1011+
UnicodeSegmentation::graphemes(value.as_ref(), true)
1012+
.map(String::from)
1013+
.collect()
1014+
}
1015+
9931016
pub fn subscript(vm: &mut VirtualMachine, value: &str, b: PyObjectRef) -> PyResult {
994-
// let value = a
9951017
if objtype::isinstance(&b, &vm.ctx.int_type()) {
9961018
let pos = objint::get_value(&b).to_i32().unwrap();
997-
let idx = value.to_string().get_pos(pos);
998-
Ok(vm.new_str(value[idx..idx + 1].to_string()))
1019+
let graphemes = to_graphemes(value);
1020+
let idx = graphemes.get_pos(pos);
1021+
graphemes
1022+
.get(idx)
1023+
.map(|c| vm.new_str(c.to_string()))
1024+
.ok_or(vm.new_index_error("string index out of range".to_string()))
9991025
} else {
10001026
match &(*b.borrow()).payload {
10011027
&PyObjectPayload::Slice {

0 commit comments

Comments
 (0)