builtin: add a rune iterator method to strings, allowing for `for i, r in s.runes_iterator() {` without first allocating an array for all the runes (#24769)

This commit is contained in:
Delyan Angelov 2025-06-21 12:33:14 +03:00 committed by GitHub
parent 502f0e7e77
commit 194db24829
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 91 additions and 20 deletions

View file

@ -2979,3 +2979,40 @@ fn data_to_hex_string(data &u8, len int) string {
hex[dst] = 0 hex[dst] = 0
return tos(hex, dst) return tos(hex, dst)
} }
// RunesIterator holds the state needed to walk over the runes of a string.
// Values are created by `string.runes_iterator()` and advanced by `next()`.
pub struct RunesIterator {
mut:
// s is the string being iterated.
s string
// i is the byte offset inside `s` at which the next rune starts.
i int
}
// runes_iterator returns a RunesIterator over the runes of the string `s`,
// positioned at its first byte.
// Use it as `for r in s.runes_iterator() {` wherever `for r in s.runes() {`
// would otherwise be written; unlike .runes(), it does not need an
// intermediate allocation of an array.
pub fn (s string) runes_iterator() RunesIterator {
	iter := RunesIterator{
		i: 0
		s: s
	}
	return iter
}
// next returns the next rune of the iterated string, or `none` once the byte
// offset `ri.i` has reached the end of the string. It is the method that is
// called for each iteration of `for r in s.runes_iterator() {`.
// A multi-byte sequence that is cut short by the end of the string is decoded
// from only the bytes that remain, and 0 is returned when utf8_char_len
// reports a length above 4.
pub fn (mut ri RunesIterator) next() ?rune {
	if ri.i >= ri.s.len {
		return none
	}
	first := unsafe { ri.s.str[ri.i] }
	char_len := utf8_char_len(first)
	if char_len == 1 {
		// single byte: the byte value itself is the result
		ri.i++
		return first
	}
	start := &u8(unsafe { &ri.s.str[ri.i] })
	// never hand the decoder more bytes than the string still holds
	remaining := ri.s.len - ri.i
	len := if char_len <= remaining { char_len } else { remaining }
	// advance by the reported length, even when the sequence is truncated or rejected
	ri.i += char_len
	if char_len > 4 {
		return 0
	}
	return rune(impl_utf8_to_utf32(start, len))
}

View file

@ -0,0 +1,32 @@
// check prints `s` together with the runes obtained from s.runes() and from
// s.runes_iterator(), then asserts that both approaches yield the same runes.
fn check(s string) {
	expected := s.runes()
	println('')
	println('> s: ${s}')
	println('> s.len: ${s.len:-4}')
	println('> srunes.len: ${expected.len:-4}')
	// collect what the iterator produces, one rune at a time
	mut collected := []rune{}
	for current in s.runes_iterator() {
		collected << current
	}
	println('> srunes: ${expected}')
	println('> iterated: ${collected}')
	assert expected == collected
}
// test_ascii runs check on a string made only of ASCII letters and digits.
fn test_ascii() {
	ascii_only := 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'
	check(ascii_only)
}
// test_mixed runs check on a string that mixes ASCII, Cyrillic and an emoji.
fn test_mixed() {
	mixed := 'abc,,привет,💰'
	check(mixed)
}
// test_emoji_and_for_i_r_in_iterator runs check on a single emoji, then uses
// the two-variable form `for i, r in s.runes_iterator() {` and asserts that
// every rune matches the rune at the same index in s.runes().
fn test_emoji_and_for_i_r_in_iterator() {
	s := '💰'
	check(s)
	expected := s.runes()
	for idx, current in s.runes_iterator() {
		eprintln('> i: ${idx} | r: ${current}')
		assert expected[idx] == current
	}
}

View file

@ -78,34 +78,36 @@ pub fn utf32_decode_to_buffer(code u32, mut buf &u8) int {
// it is used in vlib/builtin/string.v, // it is used in vlib/builtin/string.v,
// and also in vlib/v/gen/c/cgen.v // and also in vlib/v/gen/c/cgen.v
pub fn (_rune string) utf32_code() int { pub fn (_rune string) utf32_code() int {
if res := _rune.bytes().utf8_to_utf32() { if _rune.len > 4 {
return int(res)
}
return 0 return 0
}
return int(impl_utf8_to_utf32(&u8(_rune.str), _rune.len))
} }
// convert array of utf8 bytes to single utf32 value // convert array of utf8 bytes to single utf32 value
// will error if more than 4 bytes are submitted // will error if more than 4 bytes are submitted
@[direct_array_access]
pub fn (_bytes []u8) utf8_to_utf32() !rune { pub fn (_bytes []u8) utf8_to_utf32() !rune {
if _bytes.len == 0 {
return 0
}
// return ASCII unchanged
if _bytes.len == 1 {
return rune(_bytes[0])
}
if _bytes.len > 4 { if _bytes.len > 4 {
return error('attempted to decode too many bytes, utf-8 is limited to four bytes maximum') return error('attempted to decode too many bytes, utf-8 is limited to four bytes maximum')
} }
return impl_utf8_to_utf32(&u8(_bytes.data), _bytes.len)
}
mut b := u8(int(_bytes[0])) @[direct_array_access]
fn impl_utf8_to_utf32(_bytes &u8, _bytes_len int) rune {
b = b << _bytes.len if _bytes_len == 0 {
return 0
}
// return ASCII unchanged
if _bytes_len == 1 {
return unsafe { rune(_bytes[0]) }
}
mut b := u8(int(unsafe { _bytes[0] }))
b = b << _bytes_len
mut res := rune(b) mut res := rune(b)
mut shift := 6 - _bytes.len mut shift := 6 - _bytes_len
for i := 1; i < _bytes.len; i++ { for i := 1; i < _bytes_len; i++ {
c := rune(_bytes[i]) c := rune(unsafe { _bytes[i] })
res = rune(res) << shift res = rune(res) << shift
res |= c & 63 // 0x3f res |= c & 63 // 0x3f
shift = 6 shift = 6

View file

@ -5,5 +5,5 @@ _result_ok(&(string[]) { s }, (_result*)(&_t2), sizeof(string));
} else { } else {
return (_result_string){ .is_error=true, .err=_v_error(_S("empty")), .data={E_STRUCT} }; return (_result_string){ .is_error=true, .err=_v_error(_S("empty")), .data={E_STRUCT} };
} }
return _t1; return _t2;
} }