encoding.html: implement unescape() (#19267)

2025-09-15 23:42:28 +03:00 · 2023-09-05 07:29:24 +02:00 · 2023-09-05 07:29:24 +02:00 · 273341685a
commit 273341685a
parent c126450201
3 changed files with 2258 additions and 5 deletions
--- a/vlib/encoding/html/escape.v
+++ b/vlib/encoding/html/escape.v
@ -1,13 +1,24 @@
 module html
 import encoding.hex
 import strconv
 // EscapeConfig holds the optional settings accepted by `escape`.
 [params]
 pub struct EscapeConfig {
 	// quote selects whether `"` and `'` are converted in addition to `&`, `<` and `>`.
 	// It defaults to true.
 	quote bool = true
 }
 // UnescapeConfig holds the optional settings accepted by `unescape`.
 [params]
 pub struct UnescapeConfig {
 	// Embedded so that `unescape` can read the same `quote` option as `escape`.
 	EscapeConfig
 	// all makes `unescape` hand the whole input to `unescape_all` instead of only
 	// reversing the sequences produced by `escape`. It defaults to false.
 	all bool
 }
 // Replacement tables passed to `string.replace_each`. Each table is a flat list of
 // `from, to` pairs; the `unescape_*` tables hold the same pairs as the `escape_*`
 // tables with `from` and `to` swapped.
 const (
-	html_replacement_table       = ['&', '&amp;', '<', '&lt;', '>', '&gt;']
+	escape_seq         = ['&', '&amp;', '<', '&lt;', '>', '&gt;']
-	html_quote_replacement_table = ['"', '&#34;', "'", '&#39;'] // `'&#34;'` is shorter than `'&quot;'`
+	escape_quote_seq   = ['"', '&#34;', "'", '&#39;']
 	unescape_seq       = ['&amp;', '&', '&lt;', '<', '&gt;', '>']
 	unescape_quote_seq = ['&#34;', '"', '&#39;', "'"]
 )
 // escape converts special characters in the input, specifically "<", ">", and "&"
@ -16,10 +27,74 @@ const (
 // **Note:** escape() supports funky accents by doing nothing about them. V's UTF-8
 // support through `string` is robust enough to deal with these cases.
 pub fn escape(input string, config EscapeConfig) string {
 	// NOTE(review): this context line still references `html_replacement_table`, which the
 	// const block in this same diff replaces with `escape_seq`, and `tag_free_input` is not
 	// used by the added lines below - presumably this line is meant to be removed; confirm.
 	tag_free_input := input.replace_each(html.html_replacement_table)
 	// `&`, `<` and `>` are always replaced; quotes are replaced only when `config.quote` is set.
 	return if config.quote {
-		tag_free_input.replace_each(html.html_quote_replacement_table)
+		input.replace_each(html.escape_seq).replace_each(html.escape_quote_seq)
 	} else {
-		tag_free_input
+		input.replace_each(html.escape_seq)
 	}
 }
 // unescape converts entities like "&lt;" to "<". By default it is the converse of `escape`.
 // If `all` is set to true, it handles named, numeric, and hex values - for example,
 // `'&apos;'`, `'&#39;'`, and `'&#x27;'` then unescape to "'".
 // When `all` is false, the quote entities `&#34;` and `&#39;` are only converted if
 // `quote` is true (the default); `&amp;`, `&lt;` and `&gt;` are always converted.
 pub fn unescape(input string, config UnescapeConfig) string {
 	if config.all {
 		return unescape_all(input)
 	}
 	// Quote entities must be decoded BEFORE `&amp;`. In the opposite order the `&` produced
 	// from `&amp;` can join the following text into a fresh `&#39;`, so that
 	// `escape('&#39;')` == `'&amp;#39;'` would unescape to `'` instead of back to `&#39;`.
 	// Decoding quotes first is safe because its output (`"` and `'`) never contains `&`.
 	quotes_done := if config.quote { input.replace_each(html.unescape_quote_seq) } else { input }
 	return quotes_done.replace_each(html.unescape_seq)
 }
 // unescape_all decodes every character reference found in `input`:
 // named entities (looked up in `named_references`), decimal references such as `&#39;`
 // and hexadecimal references such as `&#x27;`.
 // Text that cannot be decoded is copied to the output unchanged. That covers an `&` that
 // is not closed by a `;` before the next `&` or the end of the input, an unknown entity
 // name, and an empty, malformed or out-of-range numeric reference.
 fn unescape_all(input string) string {
 	mut result := []rune{}
 	runes := input.runes()
 	mut i := 0
 	for i < runes.len {
 		if runes[i] != `&` {
 			result << runes[i]
 			i++
 			continue
 		}
 		// Look for the `;` that closes this reference. Stopping at the next `&` keeps a
 		// stray ampersand (as in `a & b &lt; c`) from swallowing the reference after it.
 		mut j := i + 1
 		for j < runes.len && runes[j] != `;` && runes[j] != `&` {
 			j++
 		}
 		if j >= runes.len || runes[j] != `;` {
 			// Unterminated `&`: copy it and the text scanned so far literally, then resume
 			// at `j` (either the next `&` or the end of the input). `j > i`, so this advances.
 			result << runes[i..j]
 			i = j
 			continue
 		}
 		// Here `runes[j]` is the closing `;`, so `runes[i..j + 1]` is the whole reference.
 		entity := runes[i + 1..j].string()
 		if entity.starts_with('#') {
 			// Numeric escape sequences (e.g., &#39; or &#x27;)
 			if code_point := decode_numeric_reference(entity[1..]) {
 				result << code_point
 			} else {
 				// Leave invalid sequences unchanged
 				result << runes[i..j + 1]
 			}
 		} else {
 			// Named entity (e.g., &lt;)
 			if v := named_references[entity] {
 				result << v
 			} else {
 				// Leave unknown entities unchanged
 				result << runes[i..j + 1]
 			}
 		}
 		i = j + 1
 	}
 	return result.string()
 }
 // decode_numeric_reference converts the text found between `&#` and `;` to the rune it
 // names. A leading `x` or `X` selects hexadecimal, otherwise the digits are decimal.
 // It returns `none` when there are no digits, when a character is not a digit of the
 // selected base, or when the value is not a Unicode scalar value (above U+10FFFF or
 // inside the surrogate range U+D800..U+DFFF).
 fn decode_numeric_reference(code string) ?rune {
 	if code.len == 0 {
 		return none
 	}
 	is_hex := code[0] == `x` || code[0] == `X`
 	digits := if is_hex { code[1..] } else { code }
 	if digits.len == 0 {
 		return none
 	}
 	// Validate explicitly so that signs, underscores and whitespace are never accepted.
 	for c in digits {
 		valid := if is_hex { c.is_hex_digit() } else { c.is_digit() }
 		if !valid {
 			return none
 		}
 	}
 	base := if is_hex { 16 } else { 10 }
 	// The value is the code point itself, not a sequence of UTF-8 bytes.
 	value := strconv.parse_uint(digits, base, 32) or { return none }
 	if value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF) {
 		return none
 	}
 	return rune(u32(value))
 }
--- a/vlib/encoding/html/escape_test.v
+++ b/vlib/encoding/html/escape_test.v
@ -20,3 +20,51 @@ fn test_escape_html() {
 	assert html.escape('café') == 'café'
 	assert html.escape('<p>façade</p>') == '&lt;p&gt;façade&lt;/p&gt;'
 }
 // test_unescape_html checks the default mode of `unescape`, which only reverses the
 // sequences that `escape` produces. Each row is `[input, expected]`.
 fn test_unescape_html() {
 	cases := [
 		// Test different formats
 		['&#39;&#x27;&apos;', "'&#x27;&apos;"],
 		// Converse escape tests
 		['&lt;&gt;&amp;', '<>&'],
 		['No change', 'No change'],
 		['&lt;b&gt;Bold text&lt;/b&gt;', '<b>Bold text</b>'],
 		['&lt;img /&gt;', '<img />'],
 		['&#39; onmouseover=&#39;alert(1)&#39;', "' onmouseover='alert(1)'"],
 		['&lt;a href=&#39;http://www.example.com&#39;&gt;link&lt;/a&gt;', "<a href='http://www.example.com'>link</a>"],
 		['&lt;script&gt;alert(&#39;hello&#39;);&lt;/script&gt;', "<script>alert('hello');</script>"],
 		// Cases obtained from:
 		// https://github.com/apache/commons-lang/blob/master/src/test/java/org/apache/commons/lang3/StringEscapeUtilsTest.java
 		['plain text', 'plain text'],
 		['', ''],
 		['bread &amp; butter', 'bread & butter'],
 		['&#34;bread&#34; &amp; butter', '"bread" & butter'],
 		['greater than &gt;', 'greater than >'],
 		['&lt; less than', '< less than'],
 		// Leave accents as-is
 		['café', 'café'],
 		['&lt;p&gt;façade&lt;/p&gt;', '<p>façade</p>'],
 	]
 	for pair in cases {
 		assert html.unescape(pair[0]) == pair[1]
 	}
 }
 // test_unescape_all_html checks `unescape` with `all: true`, where named, decimal and
 // hexadecimal references are all decoded. Each row is `[input, expected]`.
 fn test_unescape_all_html() {
 	cases := [
 		// Test different formats
 		['&#39;&#x27;&apos;', "'''"],
 		// Converse escape tests
 		['&lt;&gt;&amp;', '<>&'],
 		['No change', 'No change'],
 		['&lt;b&gt;Bold text&lt;/b&gt;', '<b>Bold text</b>'],
 		['&lt;img /&gt;', '<img />'],
 		['&#39; onmouseover=&#39;alert(1)&#39;', "' onmouseover='alert(1)'"],
 		['&lt;a href=&#39;http://www.example.com&#39;&gt;link&lt;/a&gt;', "<a href='http://www.example.com'>link</a>"],
 		['&lt;script&gt;alert(&#39;hello&#39;);&lt;/script&gt;', "<script>alert('hello');</script>"],
 		// Cases obtained from:
 		// https://github.com/apache/commons-lang/blob/master/src/test/java/org/apache/commons/lang3/StringEscapeUtilsTest.java
 		['plain text', 'plain text'],
 		['', ''],
 		['bread &amp; butter', 'bread & butter'],
 		['&#34;bread&#34; &amp; butter', '"bread" & butter'],
 		['greater than &gt;', 'greater than >'],
 		['&lt; less than', '< less than'],
 		// Leave accents as-is
 		['café', 'café'],
 		['&lt;p&gt;façade&lt;/p&gt;', '<p>façade</p>'],
 	]
 	for pair in cases {
 		assert html.unescape(pair[0], all: true) == pair[1]
 	}
 }
--- a/vlib/encoding/html/named_references.v
+++ b/vlib/encoding/html/named_references.v