vlib: add an encoding.xml module with parser, validation, entity encoding, unit tests (#19708)

This commit is contained in:
Subhomoy Haldar 2023-11-06 13:14:30 +00:00 committed by GitHub
parent 01022e918e
commit 35558df96c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
48 changed files with 2004 additions and 1 deletions

7
.gitignore vendored
View file

@ -130,4 +130,9 @@ vls.log
wasm.v
TAGS
tags
vlib/builtin/js/*.js
# ignore large GTK *.gir files
Gtk-4.0.gir
*.gir
vlib/builtin/js/*.js

View file

@ -0,0 +1,44 @@
## Description
`xml` is a module to parse XML documents into a tree structure. It also supports
validation of XML documents against a DTD.
Note that this is not a streaming XML parser. It reads the entire document into
memory and then parses it. This is not a problem for small documents, but it
might be a problem for extremely large documents (several hundred megabytes or more).
## Usage
### Parsing XML Files
There are three different ways to parse an XML Document:
1. Pass the entire XML document as a string to `XMLDocument.from_string`.
2. Specify a file path to `XMLDocument.from_file`.
3. Use a source that implements `io.Reader` and pass it to `XMLDocument.from_reader`.
```v
import encoding.xml
//...
doc := xml.XMLDocument.from_file('test/sample.xml')!
```
### Validating XML Documents
Simply call `validate` on the parsed XML document.
### Querying
Use the `get_element_by_id` and `get_elements_by_tag` methods defined on the `XMLDocument` struct.
### Escaping and Un-escaping XML Entities
When the `validate` method is called, the XML document is parsed and all text
nodes are un-escaped. This means that the text nodes will contain the actual
text and not the escaped version of the text.
When the XML document is serialized (using `str` or `pretty_str`), all text nodes are escaped.
The escaping and un-escaping can also be done manually using the `escape_text` and
`unescape_text` methods.

View file

@ -0,0 +1,148 @@
module xml
import strings
// pretty_str renders the node and all of its descendants as indented XML text.
// `original_indent` is the string emitted once per nesting level, `depth` is the nesting
// level of this node, and `reverse_entities` is the table handed to `escape_text` when
// text children are written.
pub fn (node XMLNode) pretty_str(original_indent string, depth int, reverse_entities map[string]string) string {
	// Indentation of this node's own tags; a non-positive depth yields none.
	own_indent := if depth > 0 { original_indent.repeat(depth) } else { '' }
	// Text, comment and CDATA children sit one level deeper than the node itself.
	child_indent := own_indent + original_indent

	mut out := strings.new_builder(1024)
	// Opening tag; attribute values are written verbatim.
	out.write_string(own_indent)
	out.write_string('<' + node.name)
	for attr_name, attr_value in node.attributes {
		out.write_string(' ${attr_name}="${attr_value}"')
	}
	out.write_string('>\n')

	// Every child is followed by exactly one newline.
	for child in node.children {
		match child {
			string {
				out.write_string(child_indent)
				out.write_string(escape_text(child, reverse_entities: reverse_entities))
			}
			XMLNode {
				// Nested elements produce their own indentation.
				out.write_string(child.pretty_str(original_indent, depth + 1, reverse_entities))
			}
			XMLComment {
				out.write_string(child_indent)
				out.write_string('<!--${child.text}-->')
			}
			XMLCData {
				out.write_string(child_indent)
				out.write_string('<![CDATA[${child.text}]]>')
			}
		}
		out.write_u8(`\n`)
	}

	// Closing tag, aligned with the opening tag.
	out.write_string(own_indent)
	out.write_string('</${node.name}>')
	return out.str()
}
// pretty_str renders the DTD items as a bracketed block with one declaration per line,
// each line prefixed with `indent`. An empty list renders as an empty string.
fn (list []DTDListItem) pretty_str(indent string) string {
	if list.len == 0 {
		return ''
	}
	mut out := strings.new_builder(1024)
	out.write_string('[\n')
	for item in list {
		declaration := match item {
			DTDEntity { '${indent}<!ENTITY ${item.name} "${item.value}">' }
			DTDElement { '${indent}<!ELEMENT ${item.name} ${item.definition}>' }
		}
		out.write_string(declaration)
		out.write_u8(`\n`)
	}
	out.write_u8(`]`)
	return out.str()
}
// pretty_str renders the DOCTYPE declaration.
// A string DTD is written as a `SYSTEM "..."` reference; an inline definition is written
// as its bracketed item list followed by a newline. An empty string DTD or an empty
// item list renders as an empty string.
fn (doctype DocumentType) pretty_str(indent string) string {
	match doctype.dtd {
		string {
			system_id := doctype.dtd
			if system_id.len == 0 {
				return ''
			}
			return '<!DOCTYPE ${doctype.name} SYSTEM "${system_id}">'
		}
		DocumentTypeDefinition {
			items := doctype.dtd.list
			if items.len == 0 {
				return ''
			}
			mut out := strings.new_builder(1024)
			out.write_string('<!DOCTYPE ${doctype.name} ')
			out.write_string(items.pretty_str(indent))
			out.write_string('>\n')
			return out.str()
		}
	}
}
// pretty_str serializes the whole document: the XML declaration, the DOCTYPE, the
// comments stored on the document (one per line) and finally the root element.
// `indent` is the string used for each indentation level.
pub fn (doc XMLDocument) pretty_str(indent string) string {
	mut out := strings.new_builder(1024)
	out.write_string('<?xml version="${doc.version}" encoding="${doc.encoding}"?>')
	out.write_u8(`\n`)
	// The DOCTYPE line is always terminated, even when the DOCTYPE renders as empty.
	out.write_string(doc.doctype.pretty_str(indent))
	out.write_u8(`\n`)
	for comment in doc.comments {
		out.write_string('<!--${comment.text}-->\n')
	}
	out.write_string(doc.root.pretty_str(indent, 0, doc.parsed_reverse_entities))
	return out.str()
}
// str returns a string representation of the XML document. It uses a 2-space indentation
// to pretty-print the document.
pub fn (doc XMLDocument) str() string {
	// Two spaces per level, as promised by the doc comment above.
	return doc.pretty_str('  ')
}

View file

@ -0,0 +1,79 @@
module xml
import strings
// default_entities maps an entity name (the part between `&` and `;`) to the text it
// stands for. It is the default lookup table of `UnescapeConfig`.
pub const default_entities = {
'lt': '<'
'gt': '>'
'amp': '&'
'apos': "'"
'quot': '"'
}
// default_entities_reverse is the inverse table: it maps raw text to the entity name that
// replaces it. It is the default lookup table of `EscapeConfig`.
pub const default_entities_reverse = {
'<': 'lt'
'>': 'gt'
'&': 'amp'
"'": 'apos'
'"': 'quot'
}
// EscapeConfig carries the optional parameters of `escape_text`.
[params]
pub struct EscapeConfig {
// reverse_entities maps raw text to the entity name used to replace it.
reverse_entities map[string]string = xml.default_entities_reverse
}
// escape_text replaces every key of `config.reverse_entities` found in `content` with
// the matching `&name;` entity reference. The table defaults to
// default_entities_reverse and can be overridden through `config`.
pub fn escape_text(content string, config EscapeConfig) string {
	// replace_each expects a flat list: [target, replacement, target, replacement, ...]
	mut replacements := []string{cap: 2 * config.reverse_entities.len}
	for raw_text, entity_name in config.reverse_entities {
		replacements << raw_text
		replacements << '&${entity_name};'
	}
	return content.replace_each(replacements)
}
// UnescapeConfig carries the optional parameters of `unescape_text`.
[params]
pub struct UnescapeConfig {
// entities maps an entity name to the text that replaces the `&name;` reference.
entities map[string]string = xml.default_entities
}
// unescape_text replaces every `&name;` reference in `content` with the text registered
// for `name` in `config.entities`. The table defaults to default_entities and can be
// overridden through `config`.
// It returns an error when a `&` is not followed by a terminating `;`, or when the
// referenced name is not present in the table.
pub fn unescape_text(content string, config UnescapeConfig) !string {
	runes := content.runes()
	mut out := strings.new_builder(content.len)
	mut pos := 0
	for pos < runes.len {
		current := runes[pos]
		if current != `&` {
			// Ordinary character: copy it through unchanged.
			out.write_rune(current)
			pos++
			continue
		}
		// Locate the `;` that terminates the reference starting at `pos`.
		mut terminator := pos + 1
		for terminator < runes.len && runes[terminator] != `;` {
			terminator++
		}
		if terminator == runes.len {
			return error('Unexpected end of string while parsing entity.')
		}
		entity := runes[pos + 1..terminator].string()
		if entity !in config.entities {
			return error('Unknown entity: ' + entity)
		}
		out.write_string(config.entities[entity])
		// Resume right after the `;`.
		pos = terminator + 1
	}
	return out.str()
}

View file

@ -0,0 +1,35 @@
module main
import encoding.xml
// test_escape checks escape_text with the default entity table and with a table that
// has been extended by one custom entity.
fn test_escape() {
	// Each pair is [input, expected output] under the default table.
	default_cases := [
		['Normal string', 'Normal string'],
		['12 < 34', '12 &lt; 34'],
		['12 > 34', '12 &gt; 34'],
		['12 & 34', '12 &amp; 34'],
		['He said, "Very well, let us proceed."', 'He said, &quot;Very well, let us proceed.&quot;'],
		["He said, 'Very well, let us proceed.'", 'He said, &apos;Very well, let us proceed.&apos;'],
		['Do not escape ©.', 'Do not escape ©.'],
	]
	for pair in default_cases {
		assert xml.escape_text(pair[0]) == pair[1]
	}

	// Once registered, the extra entity is escaped like the built-in ones.
	mut extended := xml.default_entities_reverse.clone()
	extended['©'] = 'copy'
	assert xml.escape_text('Do escape ©.', reverse_entities: extended) == 'Do escape &copy;.'
}
// test_unescape checks unescape_text with the default entity table, verifies that
// unknown entities are rejected, and verifies that a custom entity is honoured.
fn test_unescape() ! {
	assert xml.unescape_text('Normal string')! == 'Normal string'
	assert xml.unescape_text('12 &lt; 34')! == '12 < 34'
	assert xml.unescape_text('12 &gt; 34')! == '12 > 34'
	assert xml.unescape_text('12 &amp; 34')! == '12 & 34'
	assert xml.unescape_text('He said, &quot;Very well, let us proceed.&quot;')! == 'He said, "Very well, let us proceed."'
	assert xml.unescape_text('He said, &apos;Very well, let us proceed.&apos;')! == "He said, 'Very well, let us proceed.'"

	// Unknown entities must produce an error. A bare `or { assert ... }` would pass
	// silently if the call unexpectedly succeeded, so the success branch fails explicitly.
	if unescaped := xml.unescape_text('12 &invalid; 34') {
		assert false, 'expected an error for "&invalid;", got "${unescaped}"'
	} else {
		assert err.msg() == 'Unknown entity: invalid'
	}
	if unescaped := xml.unescape_text('Do not unescape &copy;') {
		assert false, 'expected an error for "&copy;", got "${unescaped}"'
	} else {
		assert err.msg() == 'Unknown entity: copy'
	}

	// Once registered, the extra entity is resolved like the built-in ones.
	mut entities := xml.default_entities.clone()
	entities['copy'] = '©'
	assert xml.unescape_text('Do unescape &copy;.', entities: entities)! == 'Do unescape ©.'
}

604
vlib/encoding/xml/parser.v Normal file
View file

@ -0,0 +1,604 @@
module xml
import io
import os
import strings
// Constants shared by the parsing functions in this file.
const (
// Attributes assumed when the XML declaration carries no attributes at all.
default_prolog_attributes = {
'version': '1.0'
'encoding': 'UTF-8'
}
// Initial capacity of the string builders used while parsing.
default_string_builder_cap = 32
// Lengths of the DTD declaration keywords, used to slice past them.
element_len = '<!ELEMENT'.len
entity_len = '<!ENTITY'.len
// Bytes expected after "<!D" for a DOCTYPE declaration.
doctype_chars = 'OCTYPE'.bytes()
// Bytes expected after "<!" for a comment.
double_dash = '--'.bytes()
// Bytes expected after "<!" for a CDATA section ...
c_tag = '[C'.bytes()
// ... followed by these bytes.
data_chars = 'DATA'.bytes()
)
// Helper types to assist in parsing
// TextSpan is a pair of indices into a string; parse_attributes and parse_element use
// `start..end` as a half-open slice range.
struct TextSpan {
mut:
start int
end int
}
// AttributeParserState is the state of the scanner in parse_attributes.
enum AttributeParserState {
// reading an attribute name (up to the "=")
key
// just past the "=", waiting for the opening quote
eq
// inside a quoted attribute value
value
}
// parse_attributes parses the attribute part of a tag (the text after the tag name)
// into a name -> value map. Attribute names are trimmed of surrounding whitespace;
// values are returned exactly as written between their quotes.
//
// A value may be quoted with `'` or `"` and ends only at the same quote character that
// opened it, so the other quote may appear inside the value (e.g. title="it's").
// Whitespace between `=` and the opening quote is accepted.
//
// It returns an error if the text contains `<`, if `=` is followed by another `=`, or
// if `=` is followed by anything other than whitespace or a quote.
fn parse_attributes(attribute_contents string) !map[string]string {
	if attribute_contents.contains_u8(`<`) {
		return error('Malformed XML. Found "<" in attribute string: "${attribute_contents}"')
	}
	mut attributes := map[string]string{}
	mut state := AttributeParserState.key
	mut key_span, mut value_span := TextSpan{}, TextSpan{}
	// Quote character that opened the value currently being read.
	mut opening_quote := u8(0)
	for index, ch in attribute_contents {
		match state {
			.key {
				if ch == `=` {
					state = AttributeParserState.eq
				} else {
					key_span.end++
				}
			}
			.eq {
				match ch {
					`=` {
						return error('Duplicate "=" in attribute string: "${attribute_contents}"')
					}
					`'`, `"` {
						opening_quote = ch
						state = AttributeParserState.value
						value_span.start = index + 1
					}
					` `, `\t`, `\n`, `\r` {
						// XML allows whitespace around "="; keep waiting for the quote.
					}
					else {
						return error('Invalid character in attribute string: "${attribute_contents}"')
					}
				}
			}
			.value {
				// Only the matching quote terminates the value.
				if ch == opening_quote {
					state = AttributeParserState.key
					value_span.end = index
					key := attribute_contents[key_span.start..key_span.end].trim_space()
					attributes[key] = attribute_contents[value_span.start..value_span.end]
					// The next attribute name starts right after the closing quote.
					key_span.start = index + 1
					key_span.end = index + 1
				}
			}
		}
	}
	return attributes
}
// parse_comment reads the body of a comment from `reader` up to and including the
// closing "-->". The callers in this file consume the opening "<!--" before calling it.
// A "--" that is not immediately followed by ">" is reported as an error.
fn parse_comment(mut reader io.Reader) !XMLComment {
	mut text := strings.new_builder(xml.default_string_builder_cap)
	mut scratch := [u8(0)]
	for {
		current := next_char(mut reader, mut scratch)!
		if current != `-` {
			text.write_u8(current)
			continue
		}
		following := next_char(mut reader, mut scratch)!
		if following != `-` {
			// A lone dash is ordinary comment text.
			text.write_u8(current)
			text.write_u8(following)
			continue
		}
		// "--" seen: the only thing allowed next is the closing ">".
		if next_char(mut reader, mut scratch)! != `>` {
			return error('XML Comment not closed. Expected ">".')
		}
		break
	}
	return XMLComment{text.str()}
}
// CDATAParserState tracks how many consecutive "]" characters parse_cdata has just seen
// while it looks for the terminating "]]>".
enum CDATAParserState {
// the last character was not "]"
normal
// exactly one "]" has just been read
single
// two or more "]" have just been read
double
}
// parse_cdata reads a CDATA section from `reader` up to and including the closing "]]>"
// and returns the text between the opening "[" and that terminator.
// The caller in this file consumes "<![CDATA" first, so the first character read here
// must be the "[" that opens the section body.
//
// It returns an error when that opening "[" is missing (e.g. "<![CDATA]]>" or
// "<![CDATAx...]]>"); previously such input caused an out-of-range slice or silently
// dropped a character.
fn parse_cdata(mut reader io.Reader) !XMLCData {
	mut contents_buf := strings.new_builder(xml.default_string_builder_cap)
	mut state := CDATAParserState.normal
	mut local_buf := [u8(0)]
	for {
		ch := next_char(mut reader, mut local_buf)!
		contents_buf.write_u8(ch)
		match ch {
			`]` {
				match state {
					.double {
						// Another ] after the ]] for some reason. Keep the state
					}
					.single {
						state = .double
					}
					.normal {
						state = .single
					}
				}
			}
			`>` {
				match state {
					.double {
						// "]]>" completes the section.
						break
					}
					else {
						state = .normal
					}
				}
			}
			else {
				state = .normal
			}
		}
	}
	contents := contents_buf.str().trim_space()
	// The loop above only exits right after "]]>", so the terminator is always present.
	// What still has to be verified is the opening bracket.
	if contents.len < 4 || contents[0] != `[` {
		return error('CDATA section not opened. Expected "[" after "<![CDATA".')
	}
	// Strip the leading "[" and the trailing "]]>".
	return XMLCData{contents[1..contents.len - 3]}
}
// parse_entity parses one "<!ENTITY name value>" declaration at the start of `contents`.
// It returns the entity together with the text that follows the declaration's ">".
// It returns an error when the declaration is not closed or lacks a name or a value.
fn parse_entity(contents string) !(DTDEntity, string) {
	// The declaration ends at the first '>' after its start.
	closing := contents.index('>') or { return error('Entity declaration not closed.') }
	declaration := contents[xml.entity_len..closing]

	name := declaration.trim_left(' \t\n').all_before(' ')
	if name.len == 0 {
		return error('Entity is missing name.')
	}

	// Everything after the name is the value, with surrounding quotes removed.
	value := declaration.all_after_first(name).trim_space().trim('"\'')
	if value.len == 0 {
		return error('Entity is missing value.')
	}

	// TODO: Add support for SYSTEM and PUBLIC entities
	rest := contents[closing + 1..]
	return DTDEntity{name, value}, rest
}
// parse_element parses one "<!ELEMENT name (child,child,...)>" declaration at the start
// of `contents`. It returns the element together with the text that follows the
// declaration's ">".
//
// It returns an error when the declaration is not closed, the name is missing or
// contains an unsupported character, or the definition is not a parenthesised list.
fn parse_element(contents string) !(DTDElement, string) {
	// We find the nearest '>' to the start of the ELEMENT
	element_end := contents.index('>') or { return error('Element declaration not closed.') }
	element_contents := contents[xml.element_len..element_end].trim_left(' \t\n')
	mut name_span := TextSpan{}
	for ch in element_contents {
		match ch {
			` `, `\t`, `\n` {
				break
			}
			// Characters accepted in an element name:
			// 1. Lowercase alphabet - a-z
			// 2. Uppercase alphabet - A-Z
			// 3. Numbers - 0-9
			// 4. Underscore - _
			// 5. Colon - :
			// 6. Period - .
			// 7. Hyphen - - (e.g. "return-value")
			`a`...`z`, `A`...`Z`, `0`...`9`, `_`, `:`, `.`, `-` {
				name_span.end++
			}
			else {
				// Show the character itself rather than its numeric byte value.
				return error('Invalid character in element name: "${ch.ascii_str()}"')
			}
		}
	}
	name := element_contents[name_span.start..name_span.end].trim_left(' \t\n')
	if name.len == 0 {
		return error('Element is missing name.')
	}
	definition_string := element_contents.all_after_first(name).trim_space().trim('"\'')
	definition := if definition_string.starts_with('(') {
		// We have a list of possible children
		// Ensure that both ( and ) are present
		if !definition_string.ends_with(')') {
			return error('Element declaration not closed.')
		}
		definition_string.trim('()').split(',')
	} else {
		// Invalid definition
		return error('Invalid element definition: ${definition_string}')
	}
	// TODO: Add support for SYSTEM and PUBLIC entities
	return DTDElement{name, definition}, contents[element_end + 1..]
}
// parse_doctype reads the remainder of a DOCTYPE declaration from `reader` (the caller in
// this file consumes "<!DOCTYPE" first) and parses the "[ ... ]" list of `<!ENTITY` and
// `<!ELEMENT` declarations it contains.
// It returns an error for any list item that is neither of those two kinds.
fn parse_doctype(mut reader io.Reader) !DocumentType {
	// The declaration may contain nested "<...>" items, so track the nesting level and
	// stop at the ">" that closes the DOCTYPE itself.
	mut nesting := 1
	mut raw := strings.new_builder(xml.default_string_builder_cap)
	mut scratch := [u8(0)]
	for nesting > 0 {
		ch := next_char(mut reader, mut scratch)!
		raw.write_u8(ch)
		if ch == `<` {
			nesting++
		} else if ch == `>` {
			nesting--
		}
	}

	declaration := raw.str().trim_space()
	name := declaration.all_before('[').trim_space()

	// Consume the list one declaration at a time.
	mut pending := declaration.all_after('[').all_before(']').trim_space()
	mut items := []DTDListItem{}
	for pending.len > 0 {
		if pending.starts_with('<!ENTITY') {
			entity, rest := parse_entity(pending)!
			items << entity
			pending = rest.trim_space()
			continue
		}
		if pending.starts_with('<!ELEMENT') {
			element, rest := parse_element(pending)!
			items << element
			pending = rest.trim_space()
			continue
		}
		return error('Unknown DOCTYPE list item: ${pending}')
	}

	return DocumentType{
		name: name
		dtd: DocumentTypeDefinition{
			list: items
		}
	}
}
// parse_prolog reads everything that precedes the root element: the optional
// `<?xml ... ?>` declaration, followed by any comments and at most one DOCTYPE.
// It returns the collected Prolog together with the first character of the root
// element's tag (the character right after its "<"), which has already been consumed
// from `reader`.
// When the document does not start with "<?", an empty Prolog is returned immediately
// together with the character that followed the first "<".
fn parse_prolog(mut reader io.Reader) !(Prolog, u8) {
// Skip leading whitespace before the first "<"
mut local_buf := [u8(0)]
mut ch := next_char(mut reader, mut local_buf)!
for {
match ch {
` `, `\t`, `\n` {
ch = next_char(mut reader, mut local_buf)!
continue
}
`<` {
break
}
else {
return error('Expecting a prolog or root node starting with "<".')
}
}
}
ch = next_char(mut reader, mut local_buf)!
if ch != `?` {
// No XML declaration: `ch` is handed back as the first character of the root tag.
return Prolog{}, ch
}
// Match the literal "xml" one character at a time.
ch = next_char(mut reader, mut local_buf)!
if ch != `x` {
return error('Expecting a prolog starting with "<?x".')
}
ch = next_char(mut reader, mut local_buf)!
if ch != `m` {
return error('Expecting a prolog starting with "<?xm".')
}
ch = next_char(mut reader, mut local_buf)!
if ch != `l` {
return error('Expecting a prolog starting with "<?xml".')
}
mut prolog_buffer := strings.new_builder(xml.default_string_builder_cap)
// Keep reading character by character until we find the end of the prolog
mut found_question_mark := false
for {
ch = next_char(mut reader, mut local_buf)!
match ch {
`?` {
if found_question_mark {
return error('Invalid prolog: Two question marks found in a row.')
}
found_question_mark = true
}
`>` {
if found_question_mark {
break
}
return error('Invalid prolog: Found ">" before "?".')
}
else {
// A "?" that was not followed by ">" belongs to the attribute text.
if found_question_mark {
found_question_mark = false
prolog_buffer.write_u8(`?`)
}
prolog_buffer.write_u8(ch)
}
}
}
prolog_attributes := prolog_buffer.str().trim_space()
attributes := if prolog_attributes.len == 0 {
xml.default_prolog_attributes
} else {
parse_attributes(prolog_attributes)!
}
version := attributes['version'] or { return error('XML declaration missing version.') }
encoding := attributes['encoding'] or { 'UTF-8' }
mut comments := []XMLComment{}
// Placeholder used when the document has no DOCTYPE.
mut doctype := DocumentType{
name: ''
dtd: ''
}
mut found_doctype := false
// Collect comments and the DOCTYPE until the root element's tag starts.
for {
ch = next_char(mut reader, mut local_buf)!
match ch {
` `, `\t`, `\n` {
continue
}
`<` {
// We have a comment, DOCTYPE, or root node
ch = next_char(mut reader, mut local_buf)!
match ch {
`!` {
// A comment or DOCTYPE
match next_char(mut reader, mut local_buf)! {
`-` {
// A comment
if next_char(mut reader, mut local_buf)! != `-` {
return error('Invalid comment.')
}
comments << parse_comment(mut reader)!
}
`D` {
if found_doctype {
return error('Duplicate DOCTYPE declaration.')
}
// <!D -> OCTYPE
mut doc_buf := []u8{len: 6}
if reader.read(mut doc_buf)! != 6 {
return error('Invalid DOCTYPE.')
}
if doc_buf != xml.doctype_chars {
return error('Invalid DOCTYPE.')
}
found_doctype = true
doctype = parse_doctype(mut reader)!
}
else {
return error('Unsupported control sequence found in prolog.')
}
}
}
else {
// We have found the start of the root node
break
}
}
}
// Any other character between the declarations is ignored.
else {}
}
}
return Prolog{
version: version
encoding: encoding
doctype: doctype
comments: comments
}, ch
}
// parse_children reads the content of the element `name` from `reader` until the
// matching closing tag and returns the finished node with the given `attributes`.
// Text runs are trimmed of surrounding whitespace, have "\r\n" normalised to "\n", and
// are dropped when empty.
//
// Children are stored in document order: pending text is flushed whenever markup
// starts, so text that precedes a comment or a CDATA section is no longer emitted after
// it (or merged with the text that follows it).
//
// It returns an error when the closing tag does not match `name`, when a comment or
// CDATA marker is incomplete or unknown, or when a child element fails to parse.
fn parse_children(name string, attributes map[string]string, mut reader io.Reader) !XMLNode {
	mut inner_contents := strings.new_builder(xml.default_string_builder_cap)
	mut children := []XMLNodeContents{}
	mut local_buf := [u8(0)]
	for {
		ch := next_char(mut reader, mut local_buf)!
		if ch != `<` {
			// Plain text: keep collecting until markup starts.
			inner_contents.write_u8(ch)
			continue
		}
		second_char := next_char(mut reader, mut local_buf)!
		// Markup starts here. Flush the text collected so far before handling it, so
		// that the children keep the order they have in the document.
		// (Builder.str() also empties the builder for the next text run.)
		pending_text := inner_contents.str().trim_space()
		if pending_text.len > 0 {
			children << pending_text.replace('\r\n', '\n')
		}
		match second_char {
			`!` {
				// Comment, CDATA
				mut next_two := [u8(0), 0]
				if reader.read(mut next_two)! != 2 {
					return error('Invalid XML. Incomplete comment or CDATA declaration.')
				}
				if next_two == xml.double_dash {
					// Comment
					comment := parse_comment(mut reader)!
					children << comment
				} else if next_two == xml.c_tag {
					// <![CDATA -> DATA
					mut cdata_buf := []u8{len: 4}
					if reader.read(mut cdata_buf)! != 4 {
						return error('Invalid XML. Incomplete CDATA declaration.')
					}
					if cdata_buf != xml.data_chars {
						return error('Invalid XML. Expected "CDATA" after "<![C".')
					}
					cdata := parse_cdata(mut reader)!
					children << cdata
				} else {
					return error('Invalid XML. Unknown control sequence: ${next_two.bytestr()}')
				}
			}
			`/` {
				// End of node: the closing tag must be exactly "</name>".
				mut node_end_buffer := []u8{len: name.len + 1}
				if reader.read(mut node_end_buffer)! != name.len + 1 {
					return error('Invalid XML. Incomplete node end.')
				}
				mut ending_chars := name.bytes()
				ending_chars << `>`
				if node_end_buffer != ending_chars {
					return error('XML node <${name}> not closed.')
				}
				return XMLNode{
					name: name
					attributes: attributes
					children: children
				}
			}
			else {
				// Start of child node
				child := parse_single_node(second_char, mut reader) or {
					if err.msg() == 'XML node cannot start with "</".' {
						return error('XML node <${name}> not closed.')
					} else {
						return err
					}
				}
				children << child
			}
		}
	}
	return error('XML node <${name}> not closed.')
}
// parse_single_node parses one element whose opening "<" and first tag character
// (`first_char`) have already been consumed from `reader`. It reads the rest of the
// opening tag, parses the attributes and, unless the tag is self-closing, the children.
//
// The tag name is taken from the complete tag text, so single-character names such as
// `<p>`, `<a href="...">` and `<a/>` are handled, and the "/" of a self-closing tag
// written without a space (`<br/>`) is not part of the name.
//
// It returns an error when the tag starts with "/" or has no name, and passes on any
// error from the attribute or child parsers.
fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
	// We're expecting an opening tag
	if first_char == `/` {
		return error('XML node cannot start with "</".')
	}
	mut contents := strings.new_builder(xml.default_string_builder_cap)
	contents.write_u8(first_char)
	mut local_buf := [u8(0)]
	for {
		ch := next_char(mut reader, mut local_buf)!
		if ch == `>` {
			break
		}
		contents.write_u8(ch)
	}
	tag_contents := contents.str().trim_space()
	parts := tag_contents.split_any(' \t\n')
	if parts.len == 0 {
		return error('XML node is missing a name.')
	}
	name := parts[0].trim_right('/')
	if name.len == 0 {
		return error('XML node is missing a name.')
	}
	// Everything after the name is attribute text.
	attribute_string := tag_contents[name.len..].trim_space()
	attributes := parse_attributes(attribute_string)!
	// Check if it is a self-closing tag
	if tag_contents.ends_with('/') {
		// We're not looking for children and inner text
		return XMLNode{
			name: name
			attributes: attributes
		}
	}
	return parse_children(name, attributes, mut reader)
}
// XMLDocument.from_string parses the XML document held in `raw_contents`.
pub fn XMLDocument.from_string(raw_contents string) !XMLDocument {
	// Expose the bytes of the string through the in-memory reader.
	mut source := FullBufferReader{
		contents: raw_contents.bytes()
	}
	document := XMLDocument.from_reader(mut source)!
	return document
}
// XMLDocument.from_file parses the XML document stored at `path`. The whole file is read
// into memory before parsing starts; for very large inputs consider
// XMLDocument.from_reader instead.
pub fn XMLDocument.from_file(path string) !XMLDocument {
	file_bytes := os.read_bytes(path)!
	mut source := FullBufferReader{
		contents: file_bytes
	}
	document := XMLDocument.from_reader(mut source)!
	return document
}
// XMLDocument.from_reader parses an XML document from any source that implements
// io.Reader. It is the most general entry point; from_string and from_file both call it.
// Running out of input while the prolog is being read is reported as
// 'XML document is empty.'.
pub fn XMLDocument.from_reader(mut reader io.Reader) !XMLDocument {
	prolog, first_char := parse_prolog(mut reader) or {
		ran_out_of_input := err is os.Eof || err is io.Eof
			|| err.msg() == 'Unexpected End Of File.'
		if ran_out_of_input {
			return error('XML document is empty.')
		}
		return err
	}
	root_node := parse_single_node(first_char, mut reader)!
	return XMLDocument{
		version: prolog.version
		encoding: prolog.encoding
		comments: prolog.comments
		doctype: prolog.doctype
		root: root_node
	}
}

60
vlib/encoding/xml/query.v Normal file
View file

@ -0,0 +1,60 @@
module xml
// get_element_by_id searches this node and its descendants, depth first, and returns
// the first element whose `id` attribute equals `id`, or none if there is no such
// element.
fn (node XMLNode) get_element_by_id(id string) ?XMLNode {
	// The node itself is checked before its children.
	if 'id' in node.attributes && node.attributes['id'] == id {
		return node
	}
	// Only element children can carry attributes; text, comments and CDATA are skipped.
	for child in node.children {
		if child is XMLNode {
			if found := child.get_element_by_id(id) {
				return found
			}
		}
	}
	return none
}
// get_elements_by_tag collects this node (when its name equals `tag`) followed by every
// matching descendant, in depth-first order.
fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode {
	mut found := []XMLNode{}
	if node.name == tag {
		found << node
	}
	for child in node.children {
		match child {
			XMLNode {
				found << child.get_elements_by_tag(tag)
			}
			else {}
		}
	}
	return found
}
// get_element_by_id searches the document, starting at the root element, and returns
// the first element whose `id` attribute equals `id`, or none when there is no match.
pub fn (doc XMLDocument) get_element_by_id(id string) ?XMLNode {
	root := doc.root
	return root.get_element_by_id(id)
}
// get_elements_by_tag returns every element in the document, the root included, whose
// tag name equals `tag`.
pub fn (doc XMLDocument) get_elements_by_tag(tag string) []XMLNode {
	root := doc.root
	return root.get_elements_by_tag(tag)
}

View file

@ -0,0 +1,30 @@
module xml
import io
// next_char reads from `reader` into `buf` and returns the first byte of `buf`.
// Callers in this module pass a one-byte buffer. An error from the reader is passed on;
// a read that yields zero bytes is reported as 'Unexpected End Of File.'.
fn next_char(mut reader io.Reader, mut buf []u8) !u8 {
	bytes_read := reader.read(mut buf)!
	if bytes_read == 0 {
		return error('Unexpected End Of File.')
	}
	return buf[0]
}
// FullBufferReader serves reads from a byte array that is held entirely in memory.
struct FullBufferReader {
// contents is the complete input.
contents []u8
mut:
// position is the index in `contents` of the next byte to hand out.
position int
}
// read copies as many of the remaining bytes as fit into `buf`, advances the read
// position by that amount and returns the number of bytes copied.
// It returns io.Eof once every byte of the contents has been handed out.
[direct_array_access]
fn (mut fbr FullBufferReader) read(mut buf []u8) !int {
	if fbr.position >= fbr.contents.len {
		return io.Eof{}
	}
	// copy transfers min(buf.len, remaining bytes) elements and reports how many.
	copied := copy(mut buf, fbr.contents[fbr.position..])
	fbr.position += copied
	return copied
}

View file

@ -0,0 +1,89 @@
module main
import encoding.xml
import os
// test_large_gtk_file parses the GTK 4 GIR description (an optional local fixture) and
// checks two well-known entries: the `GtkWindow` class and the `gtk_window_new`
// constructor. The test is skipped when the fixture file is not present.
fn test_large_gtk_file() ! {
// Note: If you are contributing to this project, you should download the
// GIR file from https://raw.githubusercontent.com/gtk-rs/gir-files/master/Gtk-4.0.gir
// and place it in the same directory as this file.
path := os.join_path(os.dir(@FILE), 'Gtk-4.0.gir')
if !os.exists(path) {
println('Skipping test_large_gtk_file because file does not exist.')
return
}
actual := xml.XMLDocument.from_file(path) or {
return error('Failed to parse large GTK XML file')
}
// First check: the GtkWindow class must exist and carry the expected attributes.
mut valid := false
for elm in actual.get_elements_by_tag('class') {
if 'c:type' in elm.attributes && elm.attributes['c:type'] == 'GtkWindow' {
assert elm.attributes['parent'] == 'Widget'
assert elm.attributes['c:symbol-prefix'] == 'window'
valid = true
}
}
assert valid, 'GtkWindow class not found!'
// Second check: the gtk_window_new constructor must match this subtree exactly.
valid = false
for elm in actual.get_elements_by_tag('constructor') {
if 'c:identifier' in elm.attributes && elm.attributes['c:identifier'] == 'gtk_window_new' {
assert elm == xml.XMLNode{
name: 'constructor'
attributes: {
'name': 'new'
'c:identifier': 'gtk_window_new'
}
children: [
xml.XMLNodeContents(xml.XMLNode{
name: 'doc'
attributes: {
'xml:space': 'preserve'
}
children: [
xml.XMLNodeContents('Creates a new `GtkWindow`.
To get an undecorated window (no window borders), use
[method@Gtk.Window.set_decorated].
All top-level windows created by gtk_window_new() are stored
in an internal top-level window list. This list can be obtained
from [func@Gtk.Window.list_toplevels]. Due to GTK keeping a
reference to the window internally, gtk_window_new() does not
return a reference to the caller.
To delete a `GtkWindow`, call [method@Gtk.Window.destroy].'),
]
}),
xml.XMLNodeContents(xml.XMLNode{
name: 'return-value'
attributes: {
'transfer-ownership': 'none'
}
children: [
xml.XMLNodeContents(xml.XMLNode{
name: 'doc'
attributes: {
'xml:space': 'preserve'
}
children: [xml.XMLNodeContents('a new `GtkWindow`.')]
}),
xml.XMLNodeContents(xml.XMLNode{
name: 'type'
attributes: {
'name': 'Widget'
'c:type': 'GtkWidget*'
}
children: []
}),
]
}),
]
}
valid = true
}
}
assert valid, 'gtk_window_new constructor not found!'
}

View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<message>
<greeting>
Hello, World!
</greeting>
</message>

View file

@ -0,0 +1,23 @@
import os
import encoding.xml
// test_valid_parsing parses hello_world.xml and compares the result with the expected
// document: a <message> element containing a single <greeting> with its text.
fn test_valid_parsing() ! {
	parsed := xml.XMLDocument.from_file(os.join_path(os.dir(@FILE), 'hello_world.xml'))!
	greeting := xml.XMLNode{
		name: 'greeting'
		children: [xml.XMLNodeContents('Hello, World!')]
	}
	wanted := xml.XMLDocument{
		root: xml.XMLNode{
			name: 'message'
			children: [xml.XMLNodeContents(greeting)]
		}
	}
	assert wanted == parsed, 'Parsed XML document should be equal to expected XML document'
}

View file

@ -0,0 +1,6 @@
<note>
<to>Tove</to>
<from>Jani</from>
<heading>Reminder</heading>
<body>Don't forget me this weekend!</body>
</note>

View file

@ -0,0 +1,41 @@
import os
import encoding.xml
// note_field builds one child of the <note> element: an element called `name` whose
// only child is the text `text`.
fn note_field(name string, text string) xml.XMLNodeContents {
	return xml.XMLNodeContents(xml.XMLNode{
		name: name
		children: [xml.XMLNodeContents(text)]
	})
}

// test_valid_parsing parses note.xml and compares the result with the expected
// document: a <note> element with four text-only children.
fn test_valid_parsing() ! {
	note_path := os.join_path(os.dir(@FILE), 'note.xml')
	wanted := xml.XMLDocument{
		root: xml.XMLNode{
			name: 'note'
			children: [
				note_field('to', 'Tove'),
				note_field('from', 'Jani'),
				note_field('heading', 'Reminder'),
				note_field('body', "Don't forget me this weekend!"),
			]
		}
	}
	parsed := xml.XMLDocument.from_file(note_path)!
	assert wanted == parsed, 'Parsed XML document should be equal to expected XML document'
}

View file

@ -0,0 +1,34 @@
<CATALOG>
<CD>
<TITLE>Empire Burlesque</TITLE>
<ARTIST>Bob Dylan</ARTIST>
<COUNTRY>USA</COUNTRY>
<COMPANY>Columbia</COMPANY>
<PRICE>10.90</PRICE>
<YEAR>1985</YEAR>
</CD>
<CD>
<TITLE>Hide your heart</TITLE>
<ARTIST>Bonnie Tyler</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>CBS Records</COMPANY>
<PRICE>9.90</PRICE>
<YEAR>1988</YEAR>
</CD>
<CD>
<TITLE>Greatest Hits</TITLE>
<ARTIST>Dolly Parton</ARTIST>
<COUNTRY>USA</COUNTRY>
<COMPANY>RCA</COMPANY>
<PRICE>9.90</PRICE>
<YEAR>1982</YEAR>
</CD>
<CD>
<TITLE>Still got the blues</TITLE>
<ARTIST>Gary Moore</ARTIST>
<COUNTRY>UK</COUNTRY>
<COMPANY>Virgin records</COMPANY>
<PRICE>10.20</PRICE>
<YEAR>1990</YEAR>
</CD>
</CATALOG>

View file

@ -0,0 +1,181 @@
import os
import encoding.xml
// cd_field builds one leaf of a <CD> entry: an element called `name` whose only child
// is the text `text`.
fn cd_field(name string, text string) xml.XMLNodeContents {
	return xml.XMLNodeContents(xml.XMLNode{
		name: name
		children: [xml.XMLNodeContents(text)]
	})
}

// cd_entry builds one <CD> element with its six fields in the order used by the
// catalog file.
fn cd_entry(title string, artist string, country string, company string, price string, year string) xml.XMLNodeContents {
	return xml.XMLNodeContents(xml.XMLNode{
		name: 'CD'
		children: [
			cd_field('TITLE', title),
			cd_field('ARTIST', artist),
			cd_field('COUNTRY', country),
			cd_field('COMPANY', company),
			cd_field('PRICE', price),
			cd_field('YEAR', year),
		]
	})
}

// test_valid_parsing parses cd_catalog.xml and compares the result with the expected
// document: a <CATALOG> element holding four <CD> entries.
fn test_valid_parsing() ! {
	catalog_path := os.join_path(os.dir(@FILE), 'cd_catalog.xml')
	wanted := xml.XMLDocument{
		root: xml.XMLNode{
			name: 'CATALOG'
			children: [
				cd_entry('Empire Burlesque', 'Bob Dylan', 'USA', 'Columbia', '10.90', '1985'),
				cd_entry('Hide your heart', 'Bonnie Tyler', 'UK', 'CBS Records', '9.90', '1988'),
				cd_entry('Greatest Hits', 'Dolly Parton', 'USA', 'RCA', '9.90', '1982'),
				cd_entry('Still got the blues', 'Gary Moore', 'UK', 'Virgin records', '10.20', '1990'),
			]
		}
	}
	parsed := xml.XMLDocument.from_file(catalog_path)!
	assert wanted == parsed, 'Parsed XML document should be equal to expected XML document'
}

View file

@ -0,0 +1 @@
XML document is empty.

View file

@ -0,0 +1 @@
<sample>Single root element.</sample>

View file

@ -0,0 +1,18 @@
import os
import encoding.xml
// test_valid_parsing parses root.xml and compares the result with the expected
// document: a single <sample> element that contains only text.
fn test_valid_parsing() ! {
	parsed := xml.XMLDocument.from_file(os.join_path(os.dir(@FILE), 'root.xml'))!
	sample := xml.XMLNode{
		name: 'sample'
		children: [xml.XMLNodeContents('Single root element.')]
	}
	wanted := xml.XMLDocument{
		root: sample
	}
	assert wanted == parsed, 'Parsed XML document should be equal to expected XML document'
}

View file

@ -0,0 +1,14 @@
<level1>
<level2>
<level3>
<level4>
Deeply nested content.
</level4>
</level3>
</level2>
<level2>
<level3>
Less deeply nested content.
</level3>
</level2>
</level1>

View file

@ -0,0 +1,44 @@
import os
import encoding.xml
// test_valid_parsing parses nested.xml and compares the result with the expected
// document: a <level1> root with two <level2> branches of different depth.
fn test_valid_parsing() ! {
	// Expected tree, assembled from the innermost element outwards.
	deepest := xml.XMLNode{
		name: 'level4'
		children: [xml.XMLNodeContents('Deeply nested content.')]
	}
	deep_branch := xml.XMLNode{
		name: 'level2'
		children: [
			xml.XMLNodeContents(xml.XMLNode{
				name: 'level3'
				children: [xml.XMLNodeContents(deepest)]
			}),
		]
	}
	shallow_branch := xml.XMLNode{
		name: 'level2'
		children: [
			xml.XMLNodeContents(xml.XMLNode{
				name: 'level3'
				children: [xml.XMLNodeContents('Less deeply nested content.')]
			}),
		]
	}
	wanted := xml.XMLDocument{
		root: xml.XMLNode{
			name: 'level1'
			children: [xml.XMLNodeContents(deep_branch), xml.XMLNodeContents(shallow_branch)]
		}
	}
	parsed := xml.XMLDocument.from_file(os.join_path(os.dir(@FILE), 'nested.xml'))!
	assert wanted == parsed, 'Parsed XML document should be equal to expected XML document'
}

View file

@ -0,0 +1,5 @@
<letter>
Dear Mr. <name>John Smith</name>.
Your order <orderid>1032</orderid>
will be shipped on <shipdate>2001-07-13</shipdate>.
</letter>

View file

@ -0,0 +1,33 @@
import os
import encoding.xml
// test_valid_parsing parses mixed.xml and compares the result with the expected
// document: a <letter> whose children alternate between text runs and elements.
fn test_valid_parsing() ! {
	letter_path := os.join_path(os.dir(@FILE), 'mixed.xml')
	// The three inline elements that interrupt the letter's text.
	recipient := xml.XMLNode{
		name: 'name'
		children: [xml.XMLNodeContents('John Smith')]
	}
	order_id := xml.XMLNode{
		name: 'orderid'
		children: [xml.XMLNodeContents('1032')]
	}
	ship_date := xml.XMLNode{
		name: 'shipdate'
		children: [xml.XMLNodeContents('2001-07-13')]
	}
	wanted := xml.XMLDocument{
		root: xml.XMLNode{
			name: 'letter'
			children: [
				xml.XMLNodeContents('Dear Mr.'),
				xml.XMLNodeContents(recipient),
				xml.XMLNodeContents('.\n Your order'),
				xml.XMLNodeContents(order_id),
				xml.XMLNodeContents('will be shipped on'),
				xml.XMLNodeContents(ship_date),
				xml.XMLNodeContents('.'),
			]
		}
	}
	parsed := xml.XMLDocument.from_file(letter_path)!
	assert wanted == parsed, 'Parsed XML document should be equal to expected XML document'
}

View file

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Employee Information-->
<address>
<!-- Full or first name -->
<name>Jones</name>
<!-- Registered name of the company -> -->
<company>ABSystems</company>
<phone>
<!-- Phone with country code -) -->
(046) 1233-44778
</phone>
</address>

View file

@ -0,0 +1,42 @@
import os
import encoding.xml
// test_valid_parsing verifies that the comments of `comment.xml` are kept:
// the comment placed before the root element is expected in the document's
// `comments`, while comments inside an element are expected among the
// children of that element.
fn test_valid_parsing() ! {
	path := os.join_path(os.dir(@FILE), 'comment.xml')
	name_node := xml.XMLNode{
		name: 'name'
		children: ['Jones']
	}
	company_node := xml.XMLNode{
		name: 'company'
		children: ['ABSystems']
	}
	phone_node := xml.XMLNode{
		name: 'phone'
		children: [
			xml.XMLComment{
				text: ' Phone with country code -) '
			},
			'(046) 1233-44778',
		]
	}
	expected := xml.XMLDocument{
		comments: [
			xml.XMLComment{
				text: ' Employee Information'
			},
		]
		root: xml.XMLNode{
			name: 'address'
			children: [
				xml.XMLComment{
					text: ' Full or first name '
				},
				name_node,
				xml.XMLComment{
					text: ' Registered name of the company -> '
				},
				company_node,
				phone_node,
			]
		}
	}
	actual := xml.XMLDocument.from_file(path)!
	assert expected == actual, 'Parsed XML document should be equal to expected XML document'
}

View file

@ -0,0 +1 @@
Malformed XML. Found "<" in attribute string: "<body"

View file

@ -0,0 +1 @@
<message <body>Sample</body></message>

View file

@ -0,0 +1 @@
XML node <warning> not closed.

View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<message>
<warning>
Hello World
<!--missing </warning> -->
</message>

View file

@ -0,0 +1,4 @@
<sample>
<html>This is &lt;b&gt;bold&lt;/b&gt;</html>
<html><![CDATA[This is <b>bold</b>]]></html>
</sample>

View file

@ -0,0 +1,29 @@
module main
import os
import encoding.xml
// test_valid_parsing parses `cdata.xml` and checks both ways of embedding
// markup-like text: entity-escaped text, which is expected unchanged because
// `validate` is not called here, and a CDATA section, which is expected as an
// `XMLCData` child holding the raw text.
fn test_valid_parsing() {
	path := os.join_path(os.dir(@FILE), 'cdata.xml')
	escaped_html := xml.XMLNode{
		name: 'html'
		children: ['This is &lt;b&gt;bold&lt;/b&gt;']
	}
	cdata_html := xml.XMLNode{
		name: 'html'
		children: [
			xml.XMLCData{
				text: 'This is <b>bold</b>'
			},
		]
	}
	expected := xml.XMLDocument{
		root: xml.XMLNode{
			name: 'sample'
			children: [escaped_html, cdata_html]
		}
	}
	actual := xml.XMLDocument.from_file(path)!
	assert expected == actual, 'Parsed XML document should be equal to expected XML document'
}

View file

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE body [
<!ENTITY warning "Warning: Something bad happened... please refresh and try again.">
]>
<body>
<message> &warning; </message>
</body>

View file

@ -0,0 +1,41 @@
module main
import os
import encoding.xml
// test_valid_parsing parses and validates `entity.xml`, whose DOCTYPE declares
// a `warning` entity. After validation the `&warning;` reference in the
// `message` element is expected to be replaced by the entity value, and the
// document is expected to carry the value -> name mapping for that entity.
fn test_valid_parsing() {
	path := os.join_path(os.dir(@FILE), 'entity.xml')
	warning_text := 'Warning: Something bad happened... please refresh and try again.'

	mut reverse_entities := xml.default_entities_reverse.clone()
	reverse_entities[warning_text] = 'warning'

	body_doctype := xml.DocumentType{
		name: 'body'
		dtd: xml.DocumentTypeDefinition{
			name: ''
			list: [
				xml.DTDEntity{
					name: 'warning'
					value: warning_text
				},
			]
		}
	}
	body_root := xml.XMLNode{
		name: 'body'
		children: [
			xml.XMLNode{
				name: 'message'
				children: [warning_text]
			},
		]
	}
	expected := xml.XMLDocument{
		parsed_reverse_entities: reverse_entities
		doctype: body_doctype
		root: body_root
	}
	actual := xml.XMLDocument.from_file(path)!.validate()!
	assert expected == actual, 'Parsed XML document should be equal to expected XML document'
}

View file

@ -0,0 +1,71 @@
module main
import os
import encoding.xml
// test_valid_parsing parses and validates `element.xml`, a `note` document
// whose inline DTD declares the `note` element and its four `#PCDATA` children.
// Both the parsed DTD element list and the element tree are compared.
fn test_valid_parsing() {
	path := os.join_path(os.dir(@FILE), 'element.xml')
	// Element declarations, in the order they appear in the DOCTYPE.
	note_dtd := xml.DocumentTypeDefinition{
		name: ''
		list: [
			xml.DTDElement{
				name: 'note'
				definition: ['to', 'from', 'heading', 'body']
			},
			xml.DTDElement{
				name: 'to'
				definition: ['#PCDATA']
			},
			xml.DTDElement{
				name: 'from'
				definition: ['#PCDATA']
			},
			xml.DTDElement{
				name: 'heading'
				definition: ['#PCDATA']
			},
			xml.DTDElement{
				name: 'body'
				definition: ['#PCDATA']
			},
		]
	}
	to_node := xml.XMLNode{
		name: 'to'
		children: ['Tove']
	}
	from_node := xml.XMLNode{
		name: 'from'
		children: ['Jani']
	}
	heading_node := xml.XMLNode{
		name: 'heading'
		children: ['Reminder']
	}
	body_node := xml.XMLNode{
		name: 'body'
		children: ["Don't forget me this weekend!"]
	}
	expected := xml.XMLDocument{
		doctype: xml.DocumentType{
			name: 'note'
			dtd: note_dtd
		}
		root: xml.XMLNode{
			name: 'note'
			children: [to_node, from_node, heading_node, body_node]
		}
	}
	actual := xml.XMLDocument.from_file(path)!.validate()!
	assert expected == actual, 'Parsed XML document should be equal to expected XML document'
}

View file

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE note [
<!ELEMENT note (to,from,heading,body)>
<!ELEMENT to (#PCDATA)>
<!ELEMENT from (#PCDATA)>
<!ELEMENT heading (#PCDATA)>
<!ELEMENT body (#PCDATA)>
]>
<note>
<to>Tove</to>
<from>Jani</from>
<heading>Reminder</heading>
<body>Don't forget me this weekend!</body>
</note>

View file

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<book category="web">
<title lang="en" code:type="const char*">Learning XML</title>
<author attr=" surrounding spaces ">Erik T. Ray</author>
<year>2003</year>
<price>39.95</price>
</book>

View file

@ -0,0 +1,45 @@
module main
import os
import encoding.xml
// test_valid_parsing parses `attributes.xml` (stored next to this test file)
// and checks that element attributes end up in the `attributes` map of the
// corresponding node, keyed by attribute name.
fn test_valid_parsing() {
path := os.join_path(os.dir(@FILE), 'attributes.xml')
expected := xml.XMLDocument{
root: xml.XMLNode{
name: 'book'
attributes: {
'category': 'web'
}
children: [
xml.XMLNode{
name: 'title'
attributes: {
'lang': 'en'
// The attribute name contains a colon and is expected verbatim as the key.
'code:type': 'const char*'
}
children: ['Learning XML']
},
xml.XMLNode{
name: 'author'
attributes: {
// Whitespace around the attribute value is expected to be kept.
'attr': ' surrounding spaces '
}
children: ['Erik T. Ray']
},
// Elements without attributes leave the `attributes` field at its default.
xml.XMLNode{
name: 'year'
children: ['2003']
},
xml.XMLNode{
name: 'price'
children: ['39.95']
},
]
}
}
actual := xml.XMLDocument.from_file(path)!
assert expected == actual, 'Parsed XML document should be equal to expected XML document'
}

View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE body [
<!ENTITY>
]>
<body>
</body>

View file

@ -0,0 +1 @@
Entity is missing name.

View file

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE body [
<!ENTITY missing>
]>
<body>
&missing;
</body>

View file

@ -0,0 +1 @@
Entity is missing value.

View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE note [
<!ELEMENT>
]>
<note>
</note>

View file

@ -0,0 +1 @@
Element is missing name.

View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE note [
<!ELEMENT note invalid>
]>
<note>
</note>

View file

@ -0,0 +1 @@
Invalid element definition: invalid

View file

@ -0,0 +1,25 @@
module main
import os
import encoding.xml
// spec_files holds the paths of all the XML files found under the `local`
// directory that sits next to this test file, obtained recursively.
const spec_files = os.walk_ext(os.join_path(os.dir(@FILE), 'local'), 'xml')
// test_can_parse_all_files tries to parse every file listed in `spec_files`.
// A file may fail to parse only when its directory contains an
// `expected_error.txt` whose content (ignoring surrounding whitespace) equals
// the parser's error message; any other parse failure fails the test.
fn test_can_parse_all_files() ! {
	assert spec_files.len > 0, 'No XML files found in the spec directory'
	for file in spec_files {
		// Only success or failure of parsing matters here, so the document is discarded.
		_ := xml.XMLDocument.from_file(file) or {
			// Parsing failed. Keep the parser error under its own name, because
			// `err` refers to the `os.read_file` error inside the nested `or` block.
			parse_err := err
			error_file := os.join_path(os.dir(file), 'expected_error.txt')
			error_text := os.read_file(error_file) or {
				// No expected error. Fail the test and report why parsing failed.
				return error('Failed to parse XML file: ' + file + ': ' + parse_err.msg())
			}
			// Check if the error message matches the expected error.
			assert parse_err.msg().trim_space() == error_text.trim_space()
			continue
		}
		// NOTE(review): a file that parses successfully although its directory
		// has an `expected_error.txt` is not reported; consider whether it should be.
	}
}

71
vlib/encoding/xml/types.v Normal file
View file

@ -0,0 +1,71 @@
module xml
// XMLNodeContents is anything that can appear as a child of an XMLNode:
// a CDATA section, a comment, a nested node, or plain text (`string`).
pub type XMLNodeContents = XMLCData | XMLComment | XMLNode | string
// XMLCData represents a CDATA section of an XML document.
pub struct XMLCData {
pub:
// text is the content of the section; it must be set when the struct is created.
text string [required]
}
// XMLComment represents a comment of an XML document.
pub struct XMLComment {
pub:
// text is the content of the comment; it must be set when the struct is created.
text string [required]
}
// XMLNode represents a single XML node. It contains the node name,
// a map of attributes, and a list of children. The children can be
// other XML nodes, CDATA, plain text, or comments.
pub struct XMLNode {
pub:
// name is the element name; it must be set when the struct is created.
name string [required]
// attributes maps each attribute name to its value.
attributes map[string]string
// children lists the contents of the node.
children []XMLNodeContents
}
// XMLDocument is the struct that represents a single XML document.
// It contains the prolog and the single root node. The prolog struct
// is embedded into the XMLDocument struct, so that the prolog fields
// are accessible directly from this struct.
// Public prolog fields include version, encoding, comments preceding
// the root node, and the document type definition.
pub struct XMLDocument {
Prolog
pub:
// root is the single root node; it must be set when the struct is created.
root XMLNode [required]
}
// DTDListItem is a single declaration stored in a DocumentTypeDefinition:
// either an element declaration or an entity declaration.
pub type DTDListItem = DTDElement | DTDEntity
// DTDEntity represents an entity declaration of a DTD: the entity name
// and the value it stands for. Both fields must be set on creation.
pub struct DTDEntity {
name string [required]
value string [required]
}
// DTDElement represents an element declaration of a DTD. Both fields must
// be set on creation.
pub struct DTDElement {
name string [required]
// definition lists the names the element may contain, for example child
// element names or '#PCDATA'.
definition []string [required]
}
// DocumentTypeDefinition holds the declarations of a DTD.
pub struct DocumentTypeDefinition {
name string
// list contains the element and entity declarations.
list []DTDListItem
}
// DocumentType represents the DOCTYPE of an XML document.
pub struct DocumentType {
// name is the DOCTYPE name; `XMLDocument.validate` compares it with the
// name of the root element when it is not empty.
name string [required]
// dtd is either a parsed DocumentTypeDefinition or a string.
// NOTE(review): the meaning of the string form is not visible here — confirm.
dtd DTDInfo
}
// DTDInfo is the DTD information attached to a DocumentType: either a
// DocumentTypeDefinition or a string.
type DTDInfo = DocumentTypeDefinition | string
// Prolog groups the document-level information that is embedded into
// XMLDocument. Every field has a default, so a document can be created
// by specifying only its root.
struct Prolog {
// parsed_reverse_entities maps entity values back to entity names. It starts
// as a copy of `default_entities_reverse`; `XMLDocument.validate` returns a
// document in which the entities declared in the DTD have been added.
parsed_reverse_entities map[string]string = default_entities_reverse.clone()
pub:
version string = '1.0'
encoding string = 'UTF-8'
// doctype defaults to an unnamed DOCTYPE whose DTD is the empty string.
doctype DocumentType = DocumentType{
name: ''
dtd: ''
}
// comments holds the comments that are not part of the root node.
comments []XMLComment
}

View file

@ -0,0 +1,96 @@
module xml
// validate returns a copy of `node` in which every text child has been passed
// through `unescape_text` with the given `entities`, and every child node has
// been validated recursively. When `elements` holds a definition for the node
// (other than the single entry '#PCDATA'), each child node's name must be
// listed in that definition, otherwise an error is returned. CDATA and
// comment children are copied unchanged.
fn (node XMLNode) validate(elements map[string]DTDElement, entities map[string]string) !XMLNode {
	allowed_names := elements[node.name].definition
	// A definition consisting only of '#PCDATA' places no restriction on child nodes.
	pcdata_only := allowed_names.len == 1 && allowed_names[0] == '#PCDATA'
	check_child_names := node.name in elements && !pcdata_only

	mut validated := []XMLNodeContents{cap: node.children.len}
	for item in node.children {
		match item {
			XMLNode {
				if check_child_names && item.name !in allowed_names {
					return error('Invalid child element ${item.name} for ${node.name}')
				}
				validated << item.validate(elements, entities)!
			}
			string {
				validated << unescape_text(item, entities: entities)!
			}
			else {
				// CDATA and comments need no processing.
				validated << item
			}
		}
	}
	return XMLNode{
		name: node.name
		attributes: node.attributes
		children: validated
	}
}
// validate checks the document against its inline DTD and, on success,
// returns a new document in which entity references in text nodes have been
// expanded. An error is returned when the DTD declares an element or an
// entity twice, when a node contains a child that its element definition does
// not list, or when the DOCTYPE name is not empty and differs from the name
// of the root element. A document whose DTD is a string is returned as is.
pub fn (doc XMLDocument) validate() !XMLDocument {
	// Being able to parse the document already means that it is well-formed.
	match doc.doctype.dtd {
		string {
			// TODO: Validate the document against the DTD string.
			return doc
		}
		DocumentTypeDefinition {
			// Collect the declarations of the DTD, starting from the default entities.
			mut element_defs := map[string]DTDElement{}
			mut entity_values := default_entities.clone()
			mut entity_names := default_entities_reverse.clone()
			for declaration in doc.doctype.dtd.list {
				match declaration {
					DTDEntity {
						if declaration.name in entity_values {
							return error('Duplicate entity definition for ${declaration.name}')
						}
						entity_values[declaration.name] = declaration.value
						entity_names[declaration.value] = declaration.name
					}
					DTDElement {
						if declaration.name in element_defs {
							return error('Duplicate element definition for ${declaration.name}')
						}
						element_defs[declaration.name] = declaration
					}
				}
			}

			// The tree is validated first; the DOCTYPE name is compared afterwards.
			validated_root := doc.root.validate(element_defs, entity_values)!
			doctype_name := doc.doctype.name
			if doctype_name.len > 0 && doctype_name != validated_root.name {
				return error('Root element ${validated_root.name} does not match DOCTYPE ${doctype_name}')
			}

			return XMLDocument{
				version: doc.version
				encoding: doc.encoding
				doctype: doc.doctype
				comments: doc.comments
				root: validated_root
				parsed_reverse_entities: entity_names
			}
		}
	}
}