scanner: fix backslashes followed directly by newline in string literals (fix #20291) (#20296)

This commit is contained in:
Delyan Angelov 2023-12-29 07:21:49 +02:00 committed by GitHub
parent 680b0d463a
commit 0df6fcce8c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 65 additions and 17 deletions

View file

@ -1067,22 +1067,57 @@ fn test_split_into_lines() {
}
}
fn test_string_literal_with_backslash() {
a := 'HelloWorld'
const single_backslash = '\\'
const double_backslash = '\\\\'
const newline = '\n'
// vfmt off
fn test_string_literal_with_backslash_followed_by_newline() {
// Note: `\` is followed *directly* by a newline, then some more whitespace, then a non-whitespace string.
// In this case, the `\` is treated as a line continuation: the `\`, the newline, and the whitespace
// at the beginning of the next line should all be left out of the resulting string.
//
// See also https://doc.rust-lang.org/reference/tokens.html#string-literals
// >> Both byte sequences are normally translated to U+000A, but as a special exception,
// when an unescaped U+005C character occurs immediately before the line-break,
// the U+005C character, the line-break, and all whitespace at the beginning of the
// next line are ignored.
a := 'Hello\
World'
assert a == 'HelloWorld'
b := 'OneTwoThree'
assert b == 'OneTwoThree'
}
// Here, `\\\` means `\\` followed by `\`, followed by a newline.
// The `\\` is a single escaped `\`, which should go into the literal; the final `\`, together with
// the newline and the whitespace after it, is a line continuation, and should simply be dropped.
// The same applies to `\\\\\`, which is `\\\\` followed by `\`, i.e. two escaped backslashes,
// with a line continuation after them:
b := 'One \
Two Three \\\
Four \\\\
Five \\\\\
end'
assert b == 'One Two Three ${single_backslash}Four ${double_backslash}${newline} Five ${double_backslash}end'
// Note `\\` is followed *directly* by a newline, but `\\` is just an escape for `\`,
// and thus the newline has no special meaning, and should go into the string literal.
c := 'Hello\\
World'
assert c == 'Hello\\\n World'
d := 'One\\
Two Three \\
Four'
assert d == 'One\\\n Two Three \\\n Four'
}
// vfmt on
/*
type MyString = string
fn test_string_alias() {
s := MyString('hi')
ss := s + '!'
assert ss == 'hi!'
}
*/
// sort an array of structs, by their string field values

View file

@ -1239,7 +1239,7 @@ pub fn (mut s Scanner) ident_string() string {
backslash_count++
}
// end of string
if c == s.quote && (is_raw || backslash_count % 2 == 0) {
if c == s.quote && (is_raw || backslash_count & 1 == 0) {
// handle '123\\' backslash at the end
break
}
@ -1253,7 +1253,7 @@ pub fn (mut s Scanner) ident_string() string {
s.inc_line_number()
}
// Escape `\x` `\u` `\U`
if backslash_count % 2 == 1 && !is_raw && !is_cstr {
if backslash_count & 1 == 1 && !is_raw && !is_cstr {
// Escape `\x`
if c == `x` {
if s.text[s.pos + 1] == s.quote || !(s.text[s.pos + 1].is_hex_digit()
@ -1287,13 +1287,13 @@ pub fn (mut s Scanner) ident_string() string {
u32_escapes_pos << s.pos - 1
}
// Unknown escape sequence
if !is_escape_sequence(c) && !c.is_digit() {
if !is_escape_sequence(c) && !c.is_digit() && c != `\n` {
s.error('`${c.ascii_str()}` unknown escape sequence')
}
}
// ${var} (ignore in vfmt mode) (skip \$)
if prevc == `$` && c == `{` && !is_raw
&& s.count_symbol_before(s.pos - 2, scanner.backslash) % 2 == 0 {
&& s.count_symbol_before(s.pos - 2, scanner.backslash) & 1 == 0 {
s.is_inside_string = true
if s.is_enclosed_inter {
s.is_nested_enclosed_inter = true
@ -1306,7 +1306,7 @@ pub fn (mut s Scanner) ident_string() string {
}
// $var
if prevc == `$` && util.is_name_char(c) && !is_raw
&& s.count_symbol_before(s.pos - 2, scanner.backslash) % 2 == 0 {
&& s.count_symbol_before(s.pos - 2, scanner.backslash) & 1 == 0 {
s.is_inside_string = true
s.is_inter_start = true
s.pos -= 2
@ -1483,13 +1483,26 @@ fn trim_slash_line_break(s string) string {
mut start := 0
mut ret_str := s
for {
// find the position of the first `\` followed by a newline, after `start`:
idx := ret_str.index_after('\\\n', start)
if idx != -1 {
ret_str = ret_str[..idx] + ret_str[idx + 2..].trim_left(' \n\t\v\f\r')
start = idx
} else {
if idx == -1 {
break
}
start = idx
// Here, ret_str[idx] is `\`, and ret_str[idx+1] is a newline.
// Count the run of consecutive backslashes that ends at idx. If the count is odd, the last
// backslash is unescaped, so it, the newline, and the whitespace after it form a line-break
// that is stripped. If the count is even, the text is left untouched:
mut nbackslashes := 0
for eidx := idx; eidx >= 0 && ret_str[eidx] == `\\`; eidx-- {
nbackslashes++
}
// eprintln('>> start: ${start:-5} | nbackslashes: ${nbackslashes:-5} | ret_str: $ret_str')
if idx == 0 || (nbackslashes & 1) == 1 {
ret_str = ret_str[..idx] + ret_str[idx + 2..].trim_left(' \n\t\v\f\r')
} else {
// ensure the loop will terminate, when we could not strip anything:
start++
}
}
return ret_str
}
@ -1560,7 +1573,7 @@ pub fn (mut s Scanner) ident_char() string {
// e.g. (octal) \141 (hex) \x61 or (unicode) \u2605 or (32 bit unicode) \U00002605
// we don't handle binary escape codes in rune literals
orig := c
if c.len % 2 == 0
if c.len & 1 == 0
&& (escaped_hex || escaped_unicode_16 || escaped_unicode_32 || escaped_octal) {
if escaped_unicode_16 {
// there can only be one, so attempt to decode it now