mirror of
https://github.com/vlang/v.git
synced 2025-09-15 15:32:27 +03:00
encoding.html: implement unescape()
(#19267)
This commit is contained in:
parent
c126450201
commit
273341685a
3 changed files with 2258 additions and 5 deletions
|
@ -1,13 +1,24 @@
|
|||
module html
|
||||
|
||||
import encoding.hex
|
||||
import strconv
|
||||
|
||||
// EscapeConfig holds the optional settings accepted by `escape`.
[params]
pub struct EscapeConfig {
	// quote selects whether the quote replacement table is applied in
	// addition to the base table. It is on by default.
	quote bool = true
}
|
||||
|
||||
// UnescapeConfig holds the optional settings accepted by `unescape`.
// It embeds EscapeConfig, so the `quote` setting is available here as well.
[params]
pub struct UnescapeConfig {
	EscapeConfig
	// all makes `unescape` hand the input to `unescape_all` instead of using
	// the fixed replacement tables. It is off by default.
	all bool
}
|
||||
|
||||
// Replacement tables in the flat `[from, to, from, to, ...]` layout expected by
// `string.replace_each`.
const (
	// Earlier names of the escape tables, kept so that code still referring to
	// them continues to resolve.
	html_replacement_table       = ['&', '&amp;', '<', '&lt;', '>', '&gt;']
	html_quote_replacement_table = ['"', '&#34;', "'", '&#39;'] // `'&#34;'` is shorter than `'&quot;'`
	// character -> reference
	escape_seq       = ['&', '&amp;', '<', '&lt;', '>', '&gt;']
	escape_quote_seq = ['"', '&#34;', "'", '&#39;']
	// reference -> character
	unescape_seq       = ['&amp;', '&', '&lt;', '<', '&gt;', '>']
	unescape_quote_seq = ['&#34;', '"', '&#39;', "'"]
)
|
||||
|
||||
// escape converts special characters in the input, specifically "<", ">", and "&"
|
||||
|
@ -16,10 +27,74 @@ const (
|
|||
// **Note:** escape() supports funky accents by doing nothing about them. V's UTF-8
|
||||
// support through `string` is robust enough to deal with these cases.
|
||||
pub fn escape(input string, config EscapeConfig) string {
	// The base table is always applied; the quote table is applied on top of
	// its result only when `config.quote` is set.
	escaped := input.replace_each(html.escape_seq)
	if !config.quote {
		return escaped
	}
	return escaped.replace_each(html.escape_quote_seq)
}
|
||||
|
||||
// unescape converts entities like "&lt;" to "<". By default it is the converse of `escape`.
// If `all` is set to true, it handles named, numeric, and hex values - for example,
// `'&apos;'`, `'&#39;'`, and `'&#x27;'` then unescape to "'".
// If `quote` is set to false (and `all` is not set), quote references are left untouched.
pub fn unescape(input string, config UnescapeConfig) string {
	if config.all {
		return unescape_all(input)
	}
	if !config.quote {
		return input.replace_each(html.unescape_seq)
	}
	// The quote table runs before the base table. In the opposite order the `&`
	// produced from "&amp;" could join the characters that follow it into a quote
	// reference and be decoded a second time ("&amp;#39;" must give "&#39;", not "'").
	return input.replace_each(html.unescape_quote_seq).replace_each(html.unescape_seq)
}
|
||||
|
||||
// unescape_all decodes every character reference found in `input`: named
// references present in `named_references`, decimal references ("&#39;") and
// hexadecimal references ("&#x27;" or "&#X27;"). Text that does not form a
// valid, `;`-terminated reference is copied to the result unchanged.
fn unescape_all(input string) string {
	runes := input.runes()
	mut result := []rune{cap: runes.len}
	mut i := 0
	for i < runes.len {
		if runes[i] != `&` {
			result << runes[i]
			i++
			continue
		}
		// Look for the `;` closing this candidate reference. The scan also stops
		// at the next `&`, because a reference cannot contain one; this keeps a
		// stray `&` from swallowing a valid reference that follows it.
		mut j := i + 1
		for j < runes.len && runes[j] != `;` && runes[j] != `&` {
			j++
		}
		if j == runes.len || runes[j] != `;` {
			// Unterminated: copy the text as is and resume at the next `&`, if any.
			result << runes[i..j]
			i = j
			continue
		}
		body := runes[i + 1..j].string()
		if body.len > 1 && body[0] == `#` {
			// Numeric escape sequences (e.g., &#39; or &#x27;)
			if code_point := decode_numeric_reference(body[1..]) {
				result << code_point
			} else {
				// Leave invalid sequences unchanged
				result << runes[i..j + 1]
			}
		} else {
			// Named entity (e.g., &lt;)
			if v := named_references[body] {
				result << v
			} else {
				// Leave unknown entities unchanged
				result << runes[i..j + 1]
			}
		}
		i = j + 1
	}
	return result.string()
}

// decode_numeric_reference parses the text found between "&#" and ";" in a
// numeric character reference. A leading `x` or `X` selects hexadecimal;
// otherwise the digits are read as decimal. It returns `none` when the digits
// are missing or malformed, and when the value is 0, a surrogate, or above
// U+10FFFF.
fn decode_numeric_reference(code string) ?rune {
	if code.len == 0 {
		return none
	}
	mut value := u32(0)
	if code[0] == `x` || code[0] == `X` {
		digits := code[1..]
		if digits.len == 0 {
			return none
		}
		for c in digits {
			if !c.is_hex_digit() {
				return none
			}
		}
		// hex.decode returns the big-endian bytes of the number. Fold them into
		// one code point rather than interpreting them as UTF-8 text.
		decoded := hex.decode(digits) or { return none }
		for b in decoded {
			value = (value << 8) | u32(b)
			if value > 0x10FFFF {
				return none
			}
		}
	} else {
		// Reject signs and any other non-digit before handing over to atoi.
		for c in code {
			if !c.is_digit() {
				return none
			}
		}
		n := strconv.atoi(code) or { return none }
		if n < 0 || n > 0x10FFFF {
			return none
		}
		value = u32(n)
	}
	if value == 0 || (value >= 0xD800 && value <= 0xDFFF) {
		return none
	}
	return rune(value)
}
|
||||
|
|
|
@ -20,3 +20,51 @@ fn test_escape_html() {
|
|||
assert html.escape('café') == 'café'
|
||||
assert html.escape('<p>façade</p>') == '<p>façade</p>'
|
||||
}
|
||||
|
||||
// test_unescape_html checks the default mode of `unescape`, which handles the
// references produced by `escape`.
fn test_unescape_html() {
	// Test different formats
	assert html.unescape('&#39;&#39;&#39;') == "'''"
	// Converse escape tests
	assert html.unescape('&lt;&gt;&amp;') == '<>&'
	assert html.unescape('No change') == 'No change'
	assert html.unescape('&lt;b&gt;Bold text&lt;/b&gt;') == '<b>Bold text</b>'
	assert html.unescape('&lt;img /&gt;') == '<img />'
	assert html.unescape('&#39; onmouseover=&#39;alert(1)&#39;') == "' onmouseover='alert(1)'"
	assert html.unescape('&lt;a href=&#39;http://www.example.com&#39;&gt;link&lt;/a&gt;') == "<a href='http://www.example.com'>link</a>"
	assert html.unescape('&lt;script&gt;alert(&#39;hello&#39;);&lt;/script&gt;') == "<script>alert('hello');</script>"
	// Cases obtained from:
	// https://github.com/apache/commons-lang/blob/master/src/test/java/org/apache/commons/lang3/StringEscapeUtilsTest.java
	assert html.unescape('plain text') == 'plain text'
	assert html.unescape('') == ''
	assert html.unescape('bread &amp; butter') == 'bread & butter'
	assert html.unescape('&#34;bread&#34; &amp; butter') == '"bread" & butter'
	assert html.unescape('greater than &gt;') == 'greater than >'
	assert html.unescape('&lt; less than') == '< less than'
	// Leave accents as-is
	assert html.unescape('café') == 'café'
	assert html.unescape('&lt;p&gt;façade&lt;/p&gt;') == '<p>façade</p>'
}
|
||||
|
||||
// test_unescape_all_html repeats the default-mode cases with `all: true`, and
// additionally checks that named, decimal and hexadecimal forms are decoded.
fn test_unescape_all_html() {
	// Test different formats
	assert html.unescape('&apos;&#39;&#x27;', all: true) == "'''"
	// Converse escape tests
	assert html.unescape('&lt;&gt;&amp;', all: true) == '<>&'
	assert html.unescape('No change', all: true) == 'No change'
	assert html.unescape('&lt;b&gt;Bold text&lt;/b&gt;', all: true) == '<b>Bold text</b>'
	assert html.unescape('&lt;img /&gt;', all: true) == '<img />'
	assert html.unescape('&#39; onmouseover=&#39;alert(1)&#39;', all: true) == "' onmouseover='alert(1)'"
	assert html.unescape('&lt;a href=&#39;http://www.example.com&#39;&gt;link&lt;/a&gt;',
		all: true) == "<a href='http://www.example.com'>link</a>"
	assert html.unescape('&lt;script&gt;alert(&#39;hello&#39;);&lt;/script&gt;', all: true) == "<script>alert('hello');</script>"
	// Cases obtained from:
	// https://github.com/apache/commons-lang/blob/master/src/test/java/org/apache/commons/lang3/StringEscapeUtilsTest.java
	assert html.unescape('plain text', all: true) == 'plain text'
	assert html.unescape('', all: true) == ''
	assert html.unescape('bread &amp; butter', all: true) == 'bread & butter'
	assert html.unescape('&#34;bread&#34; &amp; butter', all: true) == '"bread" & butter'
	assert html.unescape('greater than &gt;', all: true) == 'greater than >'
	assert html.unescape('&lt; less than', all: true) == '< less than'
	// Leave accents as-is
	assert html.unescape('café', all: true) == 'café'
	assert html.unescape('&lt;p&gt;façade&lt;/p&gt;', all: true) == '<p>façade</p>'
}
|
||||
|
|
2130
vlib/encoding/html/named_references.v
Normal file
2130
vlib/encoding/html/named_references.v
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue