mirror of
https://github.com/vlang/v.git
synced 2025-09-15 23:42:28 +03:00
encoding.html: implement unescape()
(#19267)
This commit is contained in:
parent
c126450201
commit
273341685a
3 changed files with 2258 additions and 5 deletions
|
@ -1,13 +1,24 @@
|
||||||
module html
|
module html
|
||||||
|
|
||||||
|
import encoding.hex
|
||||||
|
import strconv
|
||||||
|
|
||||||
[params]
|
[params]
|
||||||
pub struct EscapeConfig {
|
pub struct EscapeConfig {
|
||||||
quote bool = true
|
quote bool = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[params]
|
||||||
|
pub struct UnescapeConfig {
|
||||||
|
EscapeConfig
|
||||||
|
all bool
|
||||||
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
html_replacement_table = ['&', '&amp;', '<', '&lt;', '>', '&gt;']
|
escape_seq = ['&', '&amp;', '<', '&lt;', '>', '&gt;']
|
||||||
html_quote_replacement_table = ['"', '&#34;', "'", '&#39;'] // `'&#34;'` is shorter than `'&quot;'`
|
escape_quote_seq = ['"', '&#34;', "'", '&#39;']
|
||||||
|
unescape_seq = ['&amp;', '&', '&lt;', '<', '&gt;', '>']
|
||||||
|
unescape_quote_seq = ['&#34;', '"', '&#39;', "'"]
|
||||||
)
|
)
|
||||||
|
|
||||||
// escape converts special characters in the input, specifically "<", ">", and "&"
|
// escape converts special characters in the input, specifically "<", ">", and "&"
|
||||||
|
@ -16,10 +27,74 @@ const (
|
||||||
// **Note:** escape() supports funky accents by doing nothing about them. V's UTF-8
|
// **Note:** escape() supports funky accents by doing nothing about them. V's UTF-8
|
||||||
// support through `string` is robust enough to deal with these cases.
|
// support through `string` is robust enough to deal with these cases.
|
||||||
pub fn escape(input string, config EscapeConfig) string {
|
pub fn escape(input string, config EscapeConfig) string {
|
||||||
tag_free_input := input.replace_each(html.html_replacement_table)
|
|
||||||
return if config.quote {
|
return if config.quote {
|
||||||
tag_free_input.replace_each(html.html_quote_replacement_table)
|
input.replace_each(html.escape_seq).replace_each(html.escape_quote_seq)
|
||||||
} else {
|
} else {
|
||||||
tag_free_input
|
input.replace_each(html.escape_seq)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// unescape converts entities like "&lt;" to "<". By default it is the converse of `escape`.
|
||||||
|
// If `all` is set to true, it handles named, numeric, and hex values - for example,
|
||||||
|
// `'&apos;'`, `'&#39;'`, and `'&#x27;'` then unescape to "'".
|
||||||
|
pub fn unescape(input string, config UnescapeConfig) string {
|
||||||
|
return if config.all {
|
||||||
|
unescape_all(input)
|
||||||
|
} else if config.quote {
|
||||||
|
input.replace_each(html.unescape_seq).replace_each(html.unescape_quote_seq)
|
||||||
|
} else {
|
||||||
|
input.replace_each(html.unescape_seq)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn unescape_all(input string) string {
|
||||||
|
mut result := []rune{}
|
||||||
|
runes := input.runes()
|
||||||
|
mut i := 0
|
||||||
|
outer: for i < runes.len {
|
||||||
|
if runes[i] == `&` {
|
||||||
|
mut j := i + 1
|
||||||
|
for j < runes.len && runes[j] != `;` {
|
||||||
|
j++
|
||||||
|
}
|
||||||
|
if j < runes.len && runes[i + 1] == `#` {
|
||||||
|
// Numeric escape sequences (e.g., &#39; or &#x27;)
|
||||||
|
code := runes[i + 2..j].string()
|
||||||
|
if code[0] == `x` {
|
||||||
|
// Hexadecimal escape sequence
|
||||||
|
for c in code[1..] {
|
||||||
|
if !c.is_hex_digit() {
|
||||||
|
// Leave invalid sequences unchanged
|
||||||
|
result << runes[i..j + 1]
|
||||||
|
i = j + 1
|
||||||
|
continue outer
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result << hex.decode(code[1..]) or { []u8{} }.bytestr().runes()
|
||||||
|
} else {
|
||||||
|
// Decimal escape sequence
|
||||||
|
if v := strconv.atoi(code) {
|
||||||
|
result << v
|
||||||
|
} else {
|
||||||
|
// Leave invalid sequences unchanged
|
||||||
|
result << runes[i..j + 1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// Named entity (e.g., &lt;)
|
||||||
|
entity := runes[i + 1..j].string()
|
||||||
|
if v := named_references[entity] {
|
||||||
|
result << v
|
||||||
|
} else {
|
||||||
|
// Leave unknown entities unchanged
|
||||||
|
result << runes[i..j + 1]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i = j + 1
|
||||||
|
} else {
|
||||||
|
result << runes[i]
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result.string()
|
||||||
|
}
|
||||||
|
|
|
@ -20,3 +20,51 @@ fn test_escape_html() {
|
||||||
assert html.escape('café') == 'café'
|
assert html.escape('café') == 'café'
|
||||||
assert html.escape('<p>façade</p>') == '&lt;p&gt;façade&lt;/p&gt;'
|
assert html.escape('<p>façade</p>') == '&lt;p&gt;façade&lt;/p&gt;'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn test_unescape_html() {
|
||||||
|
// Test different formats
|
||||||
|
assert html.unescape(''''') == "'''"
|
||||||
|
// Converse escape tests
|
||||||
|
assert html.unescape('&lt;&gt;&amp;') == '<>&'
|
||||||
|
assert html.unescape('No change') == 'No change'
|
||||||
|
assert html.unescape('&lt;b&gt;Bold text&lt;/b&gt;') == '<b>Bold text</b>'
|
||||||
|
assert html.unescape('&lt;img /&gt;') == '<img />'
|
||||||
|
assert html.unescape('&#39; onmouseover=&#39;alert(1)&#39;') == "' onmouseover='alert(1)'"
|
||||||
|
assert html.unescape('&lt;a href=&#39;http://www.example.com&#39;&gt;link&lt;/a&gt;') == "<a href='http://www.example.com'>link</a>"
|
||||||
|
assert html.unescape('&lt;script&gt;alert(&#39;hello&#39;);&lt;/script&gt;') == "<script>alert('hello');</script>"
|
||||||
|
// Cases obtained from:
|
||||||
|
// https://github.com/apache/commons-lang/blob/master/src/test/java/org/apache/commons/lang3/StringEscapeUtilsTest.java
|
||||||
|
assert html.unescape('plain text') == 'plain text'
|
||||||
|
assert html.unescape('') == ''
|
||||||
|
assert html.unescape('bread &amp; butter') == 'bread & butter'
|
||||||
|
assert html.unescape('"bread" & butter') == '"bread" & butter'
|
||||||
|
assert html.unescape('greater than &gt;') == 'greater than >'
|
||||||
|
assert html.unescape('&lt; less than') == '< less than'
|
||||||
|
// Leave accents as-is
|
||||||
|
assert html.unescape('café') == 'café'
|
||||||
|
assert html.unescape('&lt;p&gt;façade&lt;/p&gt;') == '<p>façade</p>'
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_unescape_all_html() {
|
||||||
|
// Test different formats
|
||||||
|
assert html.unescape(''''', all: true) == "'''"
|
||||||
|
// Converse escape tests
|
||||||
|
assert html.unescape('&lt;&gt;&amp;', all: true) == '<>&'
|
||||||
|
assert html.unescape('No change', all: true) == 'No change'
|
||||||
|
assert html.unescape('&lt;b&gt;Bold text&lt;/b&gt;', all: true) == '<b>Bold text</b>'
|
||||||
|
assert html.unescape('&lt;img /&gt;', all: true) == '<img />'
|
||||||
|
assert html.unescape('' onmouseover='alert(1)'', all: true) == "' onmouseover='alert(1)'"
|
||||||
|
assert html.unescape('<a href='http://www.example.com'>link</a>', all: true) == "<a href='http://www.example.com'>link</a>"
|
||||||
|
assert html.unescape('<script>alert('hello');</script>', all: true) == "<script>alert('hello');</script>"
|
||||||
|
// Cases obtained from:
|
||||||
|
// https://github.com/apache/commons-lang/blob/master/src/test/java/org/apache/commons/lang3/StringEscapeUtilsTest.java
|
||||||
|
assert html.unescape('plain text', all: true) == 'plain text'
|
||||||
|
assert html.unescape('', all: true) == ''
|
||||||
|
assert html.unescape('bread &amp; butter', all: true) == 'bread & butter'
|
||||||
|
assert html.unescape('"bread" & butter', all: true) == '"bread" & butter'
|
||||||
|
assert html.unescape('greater than &gt;', all: true) == 'greater than >'
|
||||||
|
assert html.unescape('&lt; less than', all: true) == '< less than'
|
||||||
|
// Leave accents as-is
|
||||||
|
assert html.unescape('café', all: true) == 'café'
|
||||||
|
assert html.unescape('&lt;p&gt;façade&lt;/p&gt;', all: true) == '<p>façade</p>'
|
||||||
|
}
|
||||||
|
|
2130
vlib/encoding/html/named_references.v
Normal file
2130
vlib/encoding/html/named_references.v
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue