scanner: fix backslashes followed directly by newline in string literals (fix #20291) (#20296)

2025-09-13 14:32:26 +03:00 · 2023-12-29 07:21:49 +02:00 · 2023-12-29 07:21:49 +02:00 · 0df6fcce8c
commit 0df6fcce8c
parent 680b0d463a
2 changed files with 65 additions and 17 deletions
--- a/vlib/builtin/string_test.v
+++ b/vlib/builtin/string_test.v
@ -1067,22 +1067,57 @@ fn test_split_into_lines() {
 	}
 }

-fn test_string_literal_with_backslash() {
-	a := 'HelloWorld'
+const single_backslash = '\\'
+const double_backslash = '\\\\'
+const newline = '\n'
+
+// vfmt off
+fn test_string_literal_with_backslash_followed_by_newline() {
+	// Note: `\` is followed *directly* by a newline, then some more whitespace, then non-whitespace text.
+	// In this case, the `\` is treated as a line-break marker, and the whitespace after it on the new line
+	// should simply be ignored.
+	//
+	// See also https://doc.rust-lang.org/reference/tokens.html#string-literals
+	// >> Both byte sequences are normally translated to U+000A, but as a special exception,
+	// when an unescaped U+005C character occurs immediately before the line-break,
+	// the U+005C character, the line-break, and all whitespace at the beginning of the
+	// next line are ignored.
+	a := 'Hello\
+             World'
 	assert a == 'HelloWorld'

-	b := 'OneTwoThree'
-	assert b == 'OneTwoThree'
-}
+	// Here, `\\\` means `\\` followed by `\`, followed by a newline.
+	// The first is a single escaped `\`, which should go into the literal; the second, together with
+	// the newline and the whitespace after it, is a line-break, and should simply be ignored.
+	// The same applies to `\\\\\`, which is `\\\\` followed by `\`, i.e. two escaped backslashes
+	// (a literal double backslash), with a line-break after them:
+	b := 'One \
+	         Two Three \\\
+             Four \\\\
+    Five \\\\\
+    end'
+	assert b == 'One Two Three ${single_backslash}Four ${double_backslash}${newline}    Five ${double_backslash}end'
+	
+	// Note `\\` is followed *directly* by a newline, but `\\` is just an escape for `\`,
+	// and thus the newline has no special meaning, and should go into the string literal.
+	c := 'Hello\\
+        World'
+	assert c == 'Hello\\\n        World'
+
+	d := 'One\\
+    Two Three \\
+    Four'
+	assert d == 'One\\\n    Two Three \\\n    Four'
+}
+// vfmt on

-/*
 type MyString = string

 fn test_string_alias() {
 	s := MyString('hi')
 	ss := s + '!'
+	assert ss == 'hi!'
 }
-*/

 // sort an array of structs, by their string field values

--- a/vlib/v/scanner/scanner.v
+++ b/vlib/v/scanner/scanner.v
@ -1239,7 +1239,7 @@ pub fn (mut s Scanner) ident_string() string {
 			backslash_count++
 		}
 		// end of string
-		if c == s.quote && (is_raw || backslash_count % 2 == 0) {
+		if c == s.quote && (is_raw || backslash_count & 1 == 0) {
 			// handle '123\\' backslash at the end
 			break
 		}
@ -1253,7 +1253,7 @@ pub fn (mut s Scanner) ident_string() string {
 			s.inc_line_number()
 		}
 		// Escape `\x` `\u` `\U`
-		if backslash_count % 2 == 1 && !is_raw && !is_cstr {
+		if backslash_count & 1 == 1 && !is_raw && !is_cstr {
 			// Escape `\x`
 			if c == `x` {
 				if s.text[s.pos + 1] == s.quote || !(s.text[s.pos + 1].is_hex_digit()
@ -1287,13 +1287,13 @@ pub fn (mut s Scanner) ident_string() string {
 				u32_escapes_pos << s.pos - 1
 			}
 			// Unknown escape sequence
-			if !is_escape_sequence(c) && !c.is_digit() {
+			if !is_escape_sequence(c) && !c.is_digit() && c != `\n` {
 				s.error('`${c.ascii_str()}` unknown escape sequence')
 			}
 		}
 		// ${var} (ignore in vfmt mode) (skip \$)
 		if prevc == `$` && c == `{` && !is_raw
-			&& s.count_symbol_before(s.pos - 2, scanner.backslash) % 2 == 0 {
+			&& s.count_symbol_before(s.pos - 2, scanner.backslash) & 1 == 0 {
 			s.is_inside_string = true
 			if s.is_enclosed_inter {
 				s.is_nested_enclosed_inter = true
@ -1306,7 +1306,7 @@ pub fn (mut s Scanner) ident_string() string {
 		}
 		// $var
 		if prevc == `$` && util.is_name_char(c) && !is_raw
-			&& s.count_symbol_before(s.pos - 2, scanner.backslash) % 2 == 0 {
+			&& s.count_symbol_before(s.pos - 2, scanner.backslash) & 1 == 0 {
 			s.is_inside_string = true
 			s.is_inter_start = true
 			s.pos -= 2
@ -1483,13 +1483,26 @@ fn trim_slash_line_break(s string) string {
 	mut start := 0
 	mut ret_str := s
 	for {
+		// find the position of the first `\` followed by a newline, after `start`:
 		idx := ret_str.index_after('\\\n', start)
-		if idx != -1 {
-			ret_str = ret_str[..idx] + ret_str[idx + 2..].trim_left(' \n\t\v\f\r')
-			start = idx
-		} else {
+		if idx == -1 {
 			break
 		}
+		start = idx
+		// Here, ret_str[idx] is `\`, and ret_str[idx+1] is a newline.
+		// Count the consecutive backslashes ending at idx: if the count is odd, the last one is unescaped,
+		// so it, the newline and the whitespace after it form a line-break to strip; if even, keep them:
+		mut nbackslashes := 0
+		for eidx := idx; eidx >= 0 && ret_str[eidx] == `\\`; eidx-- {
+			nbackslashes++
+		}
+		// eprintln('>> start: ${start:-5} | nbackslashes: ${nbackslashes:-5} | ret_str: $ret_str')
+		if idx == 0 || (nbackslashes & 1) == 1 {
+			ret_str = ret_str[..idx] + ret_str[idx + 2..].trim_left(' \n\t\v\f\r')
+		} else {
+			// ensure the loop will terminate, when we could not strip anything:
+			start++
+		}
 	}
 	return ret_str
 }
@ -1560,7 +1573,7 @@ pub fn (mut s Scanner) ident_char() string {
 		// e.g. (octal) \141 (hex) \x61 or (unicode) \u2605 or (32 bit unicode) \U00002605
 		// we don't handle binary escape codes in rune literals
 		orig := c
-		if c.len % 2 == 0
+		if c.len & 1 == 0
 			&& (escaped_hex || escaped_unicode_16 || escaped_unicode_32 || escaped_octal) {
 			if escaped_unicode_16 {
 				// there can only be one, so attempt to decode it now