vlib: add an encoding.xml module with parser, validation, entity encoding, unit tests (#19708)

2025-09-13 14:32:26 +03:00 · 2023-11-06 13:14:30 +00:00 · 2023-11-06 13:14:30 +00:00 · 35558df96c
commit 35558df96c
parent 01022e918e
48 changed files with 2004 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -130,4 +130,9 @@ vls.log
 wasm.v
 TAGS
 tags
-vlib/builtin/js/*.js
+
+# ignore large GTK *.gir files
+Gtk-4.0.gir
+*.gir
+
+vlib/builtin/js/*.js
--- a/vlib/encoding/xml/README.md
+++ b/vlib/encoding/xml/README.md
@ -0,0 +1,44 @@
+## Description
+
+`xml` is a module to parse XML documents into a tree structure. It also supports
+validation of XML documents against a DTD.
+
+Note that this is not a streaming XML parser. It reads the entire document into
+memory and then parses it. This is not a problem for small documents, but it
+might be a problem for extremely large documents (several hundred megabytes or more).
+
+## Usage
+
+### Parsing XML Files
+
+There are three different ways to parse an XML Document:
+
+1. Pass the entire XML document as a string to `XMLDocument.from_string`.
+2. Specify a file path to `XMLDocument.from_file`.
+3. Use a source that implements `io.Reader` and pass it to `XMLDocument.from_reader`.
+
+```v
+import encoding.xml
+
+//...
+doc := xml.XMLDocument.from_file('test/sample.xml')!
+```
+
+### Validating XML Documents
+
+Simply call `validate` on the parsed XML document.
+
+### Querying
+
+Check the `get_element...` methods defined on the XMLDocument struct.
+
+### Escaping and Un-escaping XML Entities
+
+When the `validate` method is called, the XML document is parsed and all text
+nodes are un-escaped. This means that the text nodes will contain the actual
+text and not the escaped version of the text.
+
+When the XML document is serialized (using `str` or `pretty_str`), all text nodes are escaped.
+
+The escaping and un-escaping can also be done manually using the `escape_text` and
+`unescape_text` functions.
--- a/vlib/encoding/xml/encoding.v
+++ b/vlib/encoding/xml/encoding.v
@ -0,0 +1,148 @@
+module xml
+
+import strings
+
+// pretty_str renders the node and, recursively, its children as indented XML.
+// `original_indent` is the string used for one level of indentation, `depth` is the nesting
+// level of this node, and `reverse_entities` is the map handed to `escape_text` for text children.
+pub fn (node XMLNode) pretty_str(original_indent string, depth int, reverse_entities map[string]string) string {
+	// Indentation of this node's own tags, and of its direct non-element children
+	node_indent := original_indent.repeat(depth)
+	child_indent := node_indent + original_indent
+
+	mut out := strings.new_builder(1024)
+
+	// Opening tag, including all attributes
+	out.write_string(node_indent)
+	out.write_string('<')
+	out.write_string(node.name)
+	for attr_name, attr_value in node.attributes {
+		out.write_string(' ')
+		out.write_string(attr_name)
+		out.write_string('="')
+		out.write_string(attr_value)
+		out.write_string('"')
+	}
+	out.write_string('>\n')
+
+	// One line per child. Element children indent themselves through the recursive call.
+	for child in node.children {
+		match child {
+			string {
+				out.write_string(child_indent)
+				out.write_string(escape_text(child, reverse_entities: reverse_entities))
+			}
+			XMLNode {
+				out.write_string(child.pretty_str(original_indent, depth + 1, reverse_entities))
+			}
+			XMLComment {
+				out.write_string(child_indent)
+				out.write_string('<!--')
+				out.write_string(child.text)
+				out.write_string('-->')
+			}
+			XMLCData {
+				out.write_string(child_indent)
+				out.write_string('<![CDATA[')
+				out.write_string(child.text)
+				out.write_string(']]>')
+			}
+		}
+		out.write_string('\n')
+	}
+
+	// Closing tag
+	out.write_string(node_indent)
+	out.write_string('</')
+	out.write_string(node.name)
+	out.write_string('>')
+	return out.str()
+}
+
+// pretty_str renders the DTD declarations as a bracketed block with one declaration per line,
+// each line prefixed with `indent`. An empty list yields an empty string.
+fn (list []DTDListItem) pretty_str(indent string) string {
+	if list.len == 0 {
+		return ''
+	}
+
+	mut out := strings.new_builder(1024)
+	out.write_string('[\n')
+	for entry in list {
+		declaration := match entry {
+			DTDEntity { '${indent}<!ENTITY ${entry.name} "${entry.value}">' }
+			DTDElement { '${indent}<!ELEMENT ${entry.name} ${entry.definition}>' }
+		}
+		out.write_string(declaration)
+		out.write_string('\n')
+	}
+	out.write_string(']')
+	return out.str()
+}
+
+// pretty_str renders the DOCTYPE declaration of a document.
+// When `dtd` is a string, it is emitted as a `SYSTEM "..."` reference; when it is a
+// DocumentTypeDefinition, its list of declarations is emitted inline, followed by a newline.
+// An empty string or an empty declaration list yields an empty string.
+fn (doctype DocumentType) pretty_str(indent string) string {
+	match doctype.dtd {
+		string {
+			content := doctype.dtd
+			return if content.len > 0 {
+				'<!DOCTYPE ${doctype.name} SYSTEM "${content}">'
+			} else {
+				''
+			}
+		}
+		DocumentTypeDefinition {
+			if doctype.dtd.list.len == 0 {
+				return ''
+			}
+
+			mut builder := strings.new_builder(1024)
+			builder.write_string('<!DOCTYPE ')
+			builder.write_string(doctype.name)
+			builder.write_string(' ')
+			// The list renders itself, including the surrounding "[" and "]"
+			builder.write_string(doctype.dtd.list.pretty_str(indent))
+			builder.write_string('>')
+			builder.write_u8(`\n`)
+			return builder.str()
+		}
+	}
+}
+
+// pretty_str serializes the whole document: the XML declaration, the DOCTYPE, the
+// document-level comments and finally the root node. `indent` is the string used for one
+// level of indentation.
+pub fn (doc XMLDocument) pretty_str(indent string) string {
+	mut out := strings.new_builder(1024)
+
+	// XML declaration
+	out.write_string('<?xml version="${doc.version}" encoding="${doc.encoding}"?>')
+	out.write_string('\n')
+
+	// DOCTYPE (may render as an empty string)
+	out.write_string(doc.doctype.pretty_str(indent))
+	out.write_string('\n')
+
+	// Document-level comments, one per line
+	for comment in doc.comments {
+		out.write_string('<!--')
+		out.write_string(comment.text)
+		out.write_string('-->')
+		out.write_string('\n')
+	}
+
+	// The root node starts at depth 0
+	out.write_string(doc.root.pretty_str(indent, 0, doc.parsed_reverse_entities))
+	return out.str()
+}
+
+// str returns the pretty-printed XML document, using two spaces per indentation level.
+// It is a shorthand for `doc.pretty_str('  ')`.
+pub fn (doc XMLDocument) str() string {
+	return doc.pretty_str('  ')
+}
--- a/vlib/encoding/xml/entity.v
+++ b/vlib/encoding/xml/entity.v
@ -0,0 +1,79 @@
+module xml
+
+import strings
+
+// default_entities maps entity names (without the surrounding `&` and `;`) to the text
+// they stand for. It is the default lookup table of `unescape_text`.
+pub const default_entities = {
+	'lt':   '<'
+	'gt':   '>'
+	'amp':  '&'
+	'apos': "'"
+	'quot': '"'
+}
+
+// default_entities_reverse is the inverse of `default_entities`: it maps raw text to the
+// name of the entity that replaces it. It is the default lookup table of `escape_text`.
+pub const default_entities_reverse = {
+	'<': 'lt'
+	'>': 'gt'
+	'&': 'amp'
+	"'": 'apos'
+	'"': 'quot'
+}
+
+// EscapeConfig holds the optional parameters of `escape_text`.
+[params]
+pub struct EscapeConfig {
+	// reverse_entities maps raw text to the entity name that should replace it
+	reverse_entities map[string]string = xml.default_entities_reverse
+}
+
+// escape_text replaces every occurrence of a key of `config.reverse_entities` in `content`
+// with the matching entity reference (`&name;`). When no config is passed,
+// `default_entities_reverse` is used.
+pub fn escape_text(content string, config EscapeConfig) string {
+	// replace_each takes a flat list of alternating [target, replacement, ...] entries
+	mut replacement_pairs := []string{cap: 2 * config.reverse_entities.len}
+	for raw_text, entity_name in config.reverse_entities {
+		replacement_pairs << raw_text
+		replacement_pairs << '&${entity_name};'
+	}
+	return content.replace_each(replacement_pairs)
+}
+
+// UnescapeConfig holds the optional parameters of `unescape_text`.
+[params]
+pub struct UnescapeConfig {
+	// entities maps entity names (without `&` and `;`) to the text they stand for
+	entities map[string]string = xml.default_entities
+}
+
+// unescape_text replaces every entity reference (`&name;`) in `content` with the text that
+// `config.entities` maps `name` to. When no config is passed, `default_entities` is used.
+// It returns an error when an `&` is not followed by a `;` before the end of the string,
+// or when the referenced entity name is not present in `config.entities`.
+pub fn unescape_text(content string, config UnescapeConfig) !string {
+	mut buffer := strings.new_builder(content.len)
+	mut index := 0
+	runes := content.runes()
+	for index < runes.len {
+		match runes[index] {
+			`&` {
+				// Collect the entity name: everything between the "&" and the next ";"
+				mut offset := 1
+				mut entity_buf := strings.new_builder(8)
+				for index + offset < runes.len && runes[index + offset] != `;` {
+					entity_buf.write_rune(runes[index + offset])
+					offset++
+				}
+				// Did we reach the end of the string?
+				if index + offset == runes.len {
+					return error('Unexpected end of string while parsing entity.')
+				}
+				// Did we find a valid entity?
+				entity := entity_buf.str()
+				if entity in config.entities {
+					buffer.write_string(config.entities[entity])
+					// Move onto the ";"; the index++ at the end of the loop then steps past it
+					index += offset
+				} else {
+					return error('Unknown entity: ' + entity)
+				}
+			}
+			else {
+				buffer.write_rune(runes[index])
+			}
+		}
+		index++
+	}
+	return buffer.str()
+}
--- a/vlib/encoding/xml/entity_test.v
+++ b/vlib/encoding/xml/entity_test.v
@ -0,0 +1,35 @@
+module main
+
+import encoding.xml
+
+// test_escape checks that escape_text replaces the five default entities, leaves other
+// characters alone, and honours a custom reverse entity map.
+fn test_escape() {
+	assert xml.escape_text('Normal string') == 'Normal string'
+	assert xml.escape_text('12 < 34') == '12 &lt; 34'
+	assert xml.escape_text('12 > 34') == '12 &gt; 34'
+	assert xml.escape_text('12 & 34') == '12 &amp; 34'
+	assert xml.escape_text('He said, "Very well, let us proceed."') == 'He said, &quot;Very well, let us proceed.&quot;'
+	assert xml.escape_text("He said, 'Very well, let us proceed.'") == 'He said, &apos;Very well, let us proceed.&apos;'
+
+	// © is not one of the default entities, so it must be left untouched
+	assert xml.escape_text('Do not escape ©.') == 'Do not escape ©.'
+
+	// With an extended map, © is escaped as well
+	mut reverse_entities := xml.default_entities_reverse.clone()
+	reverse_entities['©'] = 'copy'
+	assert xml.escape_text('Do escape ©.', reverse_entities: reverse_entities) == 'Do escape &copy;.'
+}
+
+// test_unescape checks that unescape_text resolves the five default entities, rejects
+// unknown entities with a descriptive error, and honours a custom entity map.
+fn test_unescape() ! {
+	assert xml.unescape_text('Normal string')! == 'Normal string'
+	assert xml.unescape_text('12 &lt; 34')! == '12 < 34'
+	assert xml.unescape_text('12 &gt; 34')! == '12 > 34'
+	assert xml.unescape_text('12 &amp; 34')! == '12 & 34'
+	assert xml.unescape_text('He said, &quot;Very well, let us proceed.&quot;')! == 'He said, "Very well, let us proceed."'
+	assert xml.unescape_text('He said, &apos;Very well, let us proceed.&apos;')! == "He said, 'Very well, let us proceed.'"
+
+	// Unknown entities must be rejected. Record that the error branch really ran, so that an
+	// unexpected success cannot make the test pass silently.
+	mut rejected_invalid := false
+	xml.unescape_text('12 &invalid; 34') or {
+		rejected_invalid = true
+		assert err.msg() == 'Unknown entity: invalid'
+	}
+	assert rejected_invalid, 'Expected an error for the unknown entity "invalid"'
+
+	mut rejected_copy := false
+	xml.unescape_text('Do not unescape &copy;') or {
+		rejected_copy = true
+		assert err.msg() == 'Unknown entity: copy'
+	}
+	assert rejected_copy, 'Expected an error for the unknown entity "copy"'
+
+	// With an extended map, &copy; is resolved as well
+	mut entities := xml.default_entities.clone()
+	entities['copy'] = '©'
+	assert xml.unescape_text('Do unescape &copy;.', entities: entities)! == 'Do unescape ©.'
+}
--- a/vlib/encoding/xml/parser.v
+++ b/vlib/encoding/xml/parser.v
@ -0,0 +1,604 @@
+module xml
+
+import io
+import os
+import strings
+
+const (
+	// Attributes used when the XML declaration is present but carries no attributes
+	default_prolog_attributes = {
+		'version':  '1.0'
+		'encoding': 'UTF-8'
+	}
+	// Initial capacity of the string builders used while parsing
+	default_string_builder_cap = 32
+
+	// Lengths of the DTD declaration prefixes that parse_element / parse_entity skip over
+	element_len                = '<!ELEMENT'.len
+	entity_len                 = '<!ENTITY'.len
+
+	// Byte sequences expected after a partially consumed marker:
+	// "<!D" + "OCTYPE", "<!" + "--", "<!" + "[C", and "<![C" + "DATA"
+	doctype_chars              = 'OCTYPE'.bytes()
+	double_dash                = '--'.bytes()
+	c_tag                      = '[C'.bytes()
+	data_chars                 = 'DATA'.bytes()
+)
+
+// Helper types to assist in parsing
+
+// TextSpan marks a region of a string by its offsets. The parsers in this file use it as a
+// half-open range, i.e. `text[span.start..span.end]`.
+struct TextSpan {
+mut:
+	start int
+	end   int
+}
+
+// AttributeParserState is the state of the attribute string scanner in parse_attributes.
+enum AttributeParserState {
+	key // reading an attribute name, up to the "="
+	eq // the "=" was just read; a quote character must follow
+	value // inside a quoted attribute value
+}
+
+// parse_attributes parses the attribute portion of a tag (the text after the tag name) into
+// a map of attribute names to their raw values. The values are returned as written in the
+// source; no entity un-escaping is done here.
+//
+// A value is closed only by the same quote character that opened it, so a value may contain
+// the other quote character (e.g. `title="it's"`).
+//
+// It returns an error when the string contains "<", or when a "=" is not immediately
+// followed by a quote character.
+fn parse_attributes(attribute_contents string) !map[string]string {
+	if attribute_contents.contains_u8(`<`) {
+		return error('Malformed XML. Found "<" in attribute string: "${attribute_contents}"')
+	}
+	mut attributes := map[string]string{}
+
+	mut state := AttributeParserState.key
+	mut key_span, mut value_span := TextSpan{}, TextSpan{}
+	// The quote character (' or ") that opened the value currently being read
+	mut opening_quote := u8(0)
+
+	for index, ch in attribute_contents {
+		match state {
+			.key {
+				match ch {
+					`=` {
+						state = AttributeParserState.eq
+					}
+					else {
+						key_span.end++
+					}
+				}
+			}
+			.eq {
+				match ch {
+					`=` {
+						return error('Duplicate "=" in attribute string: "${attribute_contents}"')
+					}
+					`'`, `"` {
+						state = AttributeParserState.value
+						opening_quote = ch
+						value_span.start = index + 1
+					}
+					else {
+						return error('Invalid character in attribute string: "${attribute_contents}"')
+					}
+				}
+			}
+			.value {
+				// Only the quote that opened the value may close it
+				if ch == opening_quote {
+					state = AttributeParserState.key
+					value_span.end = index
+					attributes[attribute_contents[key_span.start..key_span.end].trim_space()] = attribute_contents[value_span.start..value_span.end]
+
+					// The next key starts right after the closing quote
+					key_span.start = index + 1
+					key_span.end = index + 1
+				}
+			}
+		}
+	}
+
+	return attributes
+}
+
+// parse_comment reads the text of a comment from `reader` up to and including the closing
+// "-->". The callers in this file invoke it after having consumed the opening "<!--".
+// It returns an error when "--" is followed by anything other than ">", or when the reader
+// runs out of data before the comment is closed.
+fn parse_comment(mut reader io.Reader) !XMLComment {
+	mut comment_buffer := strings.new_builder(xml.default_string_builder_cap)
+
+	mut local_buf := [u8(0)]
+	for {
+		ch := next_char(mut reader, mut local_buf)!
+		match ch {
+			`-` {
+				// A "-" may start the closing "-->", so look at the next character
+				after_ch := next_char(mut reader, mut local_buf)!
+				if after_ch == `-` {
+					if next_char(mut reader, mut local_buf)! == `>` {
+						break
+					}
+					return error('XML Comment not closed. Expected ">".')
+				} else {
+					// A lone "-": keep it, together with the character that followed it
+					comment_buffer.write_u8(ch)
+					comment_buffer.write_u8(after_ch)
+				}
+			}
+			else {
+				comment_buffer.write_u8(ch)
+			}
+		}
+	}
+
+	comment_contents := comment_buffer.str()
+	return XMLComment{comment_contents}
+}
+
+// CDATAParserState tracks how many consecutive "]" characters parse_cdata has just read,
+// in order to detect the closing "]]>".
+enum CDATAParserState {
+	normal // the last character was not a "]"
+	single // exactly one "]" was just read
+	double // two or more "]" were just read
+}
+
+// parse_cdata reads a CDATA section from `reader` up to and including the closing "]]>".
+// The caller in this file invokes it after having consumed "<![CDATA", so the first
+// character read here is expected to be the "[" that precedes the data.
+fn parse_cdata(mut reader io.Reader) !XMLCData {
+	mut contents_buf := strings.new_builder(xml.default_string_builder_cap)
+
+	mut state := CDATAParserState.normal
+	mut local_buf := [u8(0)]
+
+	for {
+		ch := next_char(mut reader, mut local_buf)!
+		// Every character is recorded, including the leading "[" and the closing "]]>"
+		contents_buf.write_u8(ch)
+		match ch {
+			`]` {
+				match state {
+					.double {
+						// Another ] after the ]] for some reason. Keep the state
+					}
+					.single {
+						state = .double
+					}
+					.normal {
+						state = .single
+					}
+				}
+			}
+			`>` {
+				match state {
+					.double {
+						// "]]>" was read: the section is complete
+						break
+					}
+					else {
+						state = .normal
+					}
+				}
+			}
+			else {
+				state = .normal
+			}
+		}
+	}
+
+	contents := contents_buf.str().trim_space()
+	if !contents.ends_with(']]>') {
+		return error('CDATA section not closed.')
+	}
+	// Drop the first character (presumably the leading "[") and the trailing "]]>"
+	return XMLCData{contents[1..contents.len - 3]}
+}
+
+// parse_entity parses a single `<!ENTITY name "value">` declaration located at the start of
+// `contents`. It returns the parsed entity together with the text that follows the
+// declaration's closing ">".
+// It returns an error when there is no ">", or when the name or the value is empty.
+fn parse_entity(contents string) !(DTDEntity, string) {
+	// We find the nearest '>' to the start of the ENTITY
+	entity_end := contents.index('>') or { return error('Entity declaration not closed.') }
+	// Skip the "<!ENTITY" prefix
+	entity_contents := contents[xml.entity_len..entity_end]
+
+	// The name is the first space-delimited token
+	name := entity_contents.trim_left(' \t\n').all_before(' ')
+	if name.len == 0 {
+		return error('Entity is missing name.')
+	}
+	// The value is what follows the name, stripped of whitespace and surrounding quotes
+	value := entity_contents.all_after_first(name).trim_space().trim('"\'')
+	if value.len == 0 {
+		return error('Entity is missing value.')
+	}
+
+	// TODO: Add support for SYSTEM and PUBLIC entities
+
+	return DTDEntity{name, value}, contents[entity_end + 1..]
+}
+
+// parse_element parses a single `<!ELEMENT name (child, ...)>` declaration located at the
+// start of `contents`. It returns the parsed element together with the text that follows
+// the declaration's closing ">".
+// It returns an error when there is no ">", when the name is empty or contains an invalid
+// character, or when the definition is not a parenthesized list.
+fn parse_element(contents string) !(DTDElement, string) {
+	// We find the nearest '>' to the start of the ELEMENT
+	element_end := contents.index('>') or { return error('Element declaration not closed.') }
+	// Skip the "<!ELEMENT" prefix and any whitespace before the name
+	element_contents := contents[xml.element_len..element_end].trim_left(' \t\n')
+
+	mut name_span := TextSpan{}
+
+	for ch in element_contents {
+		match ch {
+			` `, `\t`, `\n` {
+				// The name ends at the first whitespace character
+				break
+			}
+			// Valid characters in an element name are:
+			// 1. Lowercase alphabet - a-z
+			// 2. Uppercase alphabet - A-Z
+			// 3. Numbers - 0-9
+			// 4. Underscore - _
+			// 5. Colon - :
+			// 6. Period - .
+			// 7. Hyphen - -
+			`a`...`z`, `A`...`Z`, `0`...`9`, `_`, `:`, `.`, `-` {
+				name_span.end++
+			}
+			else {
+				return error('Invalid character in element name: "${ch}"')
+			}
+		}
+	}
+
+	name := element_contents[name_span.start..name_span.end].trim_left(' \t\n')
+	if name.len == 0 {
+		return error('Element is missing name.')
+	}
+	definition_string := element_contents.all_after_first(name).trim_space().trim('"\'')
+
+	definition := if definition_string.starts_with('(') {
+		// We have a list of possible children
+
+		// Ensure that both ( and ) are present
+		if !definition_string.ends_with(')') {
+			return error('Element declaration not closed.')
+		}
+
+		definition_string.trim('()').split(',')
+	} else {
+		// Invalid definition
+		return error('Invalid element definition: ${definition_string}')
+	}
+
+	// TODO: Add support for SYSTEM and PUBLIC entities
+
+	return DTDElement{name, definition}, contents[element_end + 1..]
+}
+
+// parse_doctype reads a DOCTYPE declaration from `reader` up to its matching ">" and parses
+// the `<!ENTITY ...>` and `<!ELEMENT ...>` declarations found between "[" and "]".
+// The caller in this file invokes it after having consumed "<!DOCTYPE".
+// It returns an error for any other kind of item in the list.
+fn parse_doctype(mut reader io.Reader) !DocumentType {
+	// We may have more < in the doctype so keep count
+	mut depth := 1
+	mut doctype_buffer := strings.new_builder(xml.default_string_builder_cap)
+	mut local_buf := [u8(0)]
+	for {
+		ch := next_char(mut reader, mut local_buf)!
+		doctype_buffer.write_u8(ch)
+		match ch {
+			`<` {
+				depth++
+			}
+			`>` {
+				depth--
+				// Depth 0 means that the ">" closing the DOCTYPE itself was read
+				if depth == 0 {
+					break
+				}
+			}
+			else {}
+		}
+	}
+
+	doctype_contents := doctype_buffer.str().trim_space()
+
+	// The name is taken from the text in front of the "["
+	name := doctype_contents.all_before('[').trim_space()
+
+	mut list_contents := doctype_contents.all_after('[').all_before(']').trim_space()
+	mut items := []DTDListItem{}
+
+	// Consume one declaration per iteration until nothing is left
+	for list_contents.len > 0 {
+		if list_contents.starts_with('<!ENTITY') {
+			entity, remaining := parse_entity(list_contents)!
+			items << entity
+			list_contents = remaining.trim_space()
+		} else if list_contents.starts_with('<!ELEMENT') {
+			element, remaining := parse_element(list_contents)!
+			items << element
+			list_contents = remaining.trim_space()
+		} else {
+			return error('Unknown DOCTYPE list item: ${list_contents}')
+		}
+	}
+
+	return DocumentType{
+		name: name
+		dtd: DocumentTypeDefinition{
+			list: items
+		}
+	}
+}
+
+// parse_prolog reads everything in front of the root node: the optional XML declaration
+// (`<?xml ... ?>`), comments, and at most one DOCTYPE declaration.
+// It returns the collected Prolog together with the character that follows the "<" of the
+// root node, which has already been consumed from `reader`.
+fn parse_prolog(mut reader io.Reader) !(Prolog, u8) {
+	// Skip leading whitespace in front of the first "<"
+	mut local_buf := [u8(0)]
+	mut ch := next_char(mut reader, mut local_buf)!
+	for {
+		match ch {
+			` `, `\t`, `\n` {
+				ch = next_char(mut reader, mut local_buf)!
+				continue
+			}
+			`<` {
+				break
+			}
+			else {
+				return error('Expecting a prolog or root node starting with "<".')
+			}
+		}
+	}
+
+	ch = next_char(mut reader, mut local_buf)!
+	if ch != `?` {
+		// No XML declaration: return a default Prolog and the character that followed the "<"
+		return Prolog{}, ch
+	}
+
+	ch = next_char(mut reader, mut local_buf)!
+	if ch != `x` {
+		return error('Expecting a prolog starting with "<?x".')
+	}
+
+	ch = next_char(mut reader, mut local_buf)!
+	if ch != `m` {
+		return error('Expecting a prolog starting with "<?xm".')
+	}
+
+	ch = next_char(mut reader, mut local_buf)!
+	if ch != `l` {
+		return error('Expecting a prolog starting with "<?xml".')
+	}
+
+	mut prolog_buffer := strings.new_builder(xml.default_string_builder_cap)
+
+	// Keep reading character by character until we find the end of the prolog
+	mut found_question_mark := false
+
+	for {
+		ch = next_char(mut reader, mut local_buf)!
+		match ch {
+			`?` {
+				if found_question_mark {
+					return error('Invalid prolog: Two question marks found in a row.')
+				}
+				found_question_mark = true
+			}
+			`>` {
+				if found_question_mark {
+					break
+				}
+				return error('Invalid prolog: Found ">" before "?".')
+			}
+			else {
+				// A "?" that was not followed by ">" belongs to the declaration's contents
+				if found_question_mark {
+					found_question_mark = false
+					prolog_buffer.write_u8(`?`)
+				}
+				prolog_buffer.write_u8(ch)
+			}
+		}
+	}
+
+	prolog_attributes := prolog_buffer.str().trim_space()
+
+	attributes := if prolog_attributes.len == 0 {
+		xml.default_prolog_attributes
+	} else {
+		parse_attributes(prolog_attributes)!
+	}
+
+	version := attributes['version'] or { return error('XML declaration missing version.') }
+	encoding := attributes['encoding'] or { 'UTF-8' }
+
+	mut comments := []XMLComment{}
+	mut doctype := DocumentType{
+		name: ''
+		dtd: ''
+	}
+	mut found_doctype := false
+	// Collect comments and the DOCTYPE until the start of the root node is found
+	for {
+		ch = next_char(mut reader, mut local_buf)!
+		match ch {
+			` `, `\t`, `\n` {
+				continue
+			}
+			`<` {
+				// We have a comment, DOCTYPE, or root node
+				ch = next_char(mut reader, mut local_buf)!
+				match ch {
+					`!` {
+						// A comment or DOCTYPE
+						match next_char(mut reader, mut local_buf)! {
+							`-` {
+								// A comment
+								if next_char(mut reader, mut local_buf)! != `-` {
+									return error('Invalid comment.')
+								}
+								comments << parse_comment(mut reader)!
+							}
+							`D` {
+								if found_doctype {
+									return error('Duplicate DOCTYPE declaration.')
+								}
+								// <!D -> OCTYPE
+								mut doc_buf := []u8{len: 6}
+								if reader.read(mut doc_buf)! != 6 {
+									return error('Invalid DOCTYPE.')
+								}
+								if doc_buf != xml.doctype_chars {
+									return error('Invalid DOCTYPE.')
+								}
+								found_doctype = true
+								doctype = parse_doctype(mut reader)!
+							}
+							else {
+								return error('Unsupported control sequence found in prolog.')
+							}
+						}
+					}
+					else {
+						// We have found the start of the root node
+						break
+					}
+				}
+			}
+			// Any other character between the declarations is ignored
+			else {}
+		}
+	}
+
+	return Prolog{
+		version: version
+		encoding: encoding
+		doctype: doctype
+		comments: comments
+	}, ch
+}
+
+// parse_children reads the contents of the element `name` from `reader`, starting right
+// after its opening tag and ending with its closing tag `</name>`. Text, comments, CDATA
+// sections and child elements are collected as children of the returned node, which also
+// carries the given `attributes`.
+fn parse_children(name string, attributes map[string]string, mut reader io.Reader) !XMLNode {
+	// Text collected since the last child was added
+	mut inner_contents := strings.new_builder(xml.default_string_builder_cap)
+
+	mut children := []XMLNodeContents{}
+	mut local_buf := [u8(0)]
+
+	for {
+		ch := next_char(mut reader, mut local_buf)!
+		match ch {
+			`<` {
+				second_char := next_char(mut reader, mut local_buf)!
+				match second_char {
+					`!` {
+						// Comment, CDATA
+						mut next_two := [u8(0), 0]
+						if reader.read(mut next_two)! != 2 {
+							return error('Invalid XML. Incomplete comment or CDATA declaration.')
+						}
+						if next_two == xml.double_dash {
+							// Comment
+							comment := parse_comment(mut reader)!
+							children << comment
+						} else if next_two == xml.c_tag {
+							// <![CDATA -> DATA
+							mut cdata_buf := []u8{len: 4}
+							if reader.read(mut cdata_buf)! != 4 {
+								return error('Invalid XML. Incomplete CDATA declaration.')
+							}
+							if cdata_buf != xml.data_chars {
+								return error('Invalid XML. Expected "CDATA" after "<![C".')
+							}
+							cdata := parse_cdata(mut reader)!
+							children << cdata
+						} else {
+							return error('Invalid XML. Unknown control sequence: ${next_two.bytestr()}')
+						}
+					}
+					`/` {
+						// End of node
+						// Read exactly as many bytes as "name>" takes and compare them
+						mut node_end_buffer := []u8{len: name.len + 1}
+						if reader.read(mut node_end_buffer)! != name.len + 1 {
+							return error('Invalid XML. Incomplete node end.')
+						}
+
+						mut ending_chars := name.bytes()
+						ending_chars << `>`
+
+						if node_end_buffer != ending_chars {
+							return error('XML node <${name}> not closed.')
+						}
+
+						collected_contents := inner_contents.str().trim_space()
+						if collected_contents.len > 0 {
+							// We have some inner text
+							children << collected_contents.replace('\r\n', '\n')
+						}
+						return XMLNode{
+							name: name
+							attributes: attributes
+							children: children
+						}
+					}
+					else {
+						// Start of child node
+						child := parse_single_node(second_char, mut reader) or {
+							if err.msg() == 'XML node cannot start with "</".' {
+								return error('XML node <${name}> not closed.')
+							} else {
+								return err
+							}
+						}
+						// Text in front of the child becomes its own child, placed before it
+						text := inner_contents.str().trim_space()
+						if text.len > 0 {
+							children << text.replace('\r\n', '\n')
+						}
+						children << child
+					}
+				}
+			}
+			else {
+				inner_contents.write_u8(ch)
+			}
+		}
+	}
+	return error('XML node <${name}> not closed.')
+}
+
+// parse_single_node parses one element whose opening "<" and first name character have
+// already been consumed; `first_char` is that first character. It reads the rest of the
+// opening tag from `reader` and, unless the tag is self-closing, continues with
+// `parse_children` to read the element's contents up to its closing tag.
+//
+// It returns an error when the tag starts with "</", when it has no name, when its
+// attributes are malformed, or when the reader runs out of data.
+fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
+	// We're expecting an opening tag
+	if first_char == `/` {
+		return error('XML node cannot start with "</".')
+	}
+
+	mut contents := strings.new_builder(xml.default_string_builder_cap)
+	contents.write_u8(first_char)
+
+	// Collect everything up to the ">" that ends the opening tag. Every character read is
+	// checked, so that single-character names such as <a> end at the right place.
+	mut local_buf := [u8(0)]
+	for {
+		ch := next_char(mut reader, mut local_buf)!
+		if ch == `>` {
+			break
+		}
+		contents.write_u8(ch)
+	}
+
+	tag_contents := contents.str().trim_space()
+
+	// Check if it is a self-closing tag. The trailing "/" is removed up front, so that it
+	// ends up neither in the name (<br/>) nor in the attribute string.
+	is_self_closing := tag_contents.ends_with('/')
+	tag_body := if is_self_closing {
+		tag_contents[..tag_contents.len - 1].trim_space()
+	} else {
+		tag_contents
+	}
+
+	parts := tag_body.split_any(' \t\n')
+	if parts.len == 0 || parts[0].len == 0 {
+		return error('XML node is missing a name.')
+	}
+	name := parts[0]
+
+	// tag_body is trimmed, so the name starts at offset 0 and the attributes follow it
+	attributes := parse_attributes(tag_body[name.len..].trim_space())!
+
+	if is_self_closing {
+		// We're not looking for children and inner text
+		return XMLNode{
+			name: name
+			attributes: attributes
+		}
+	}
+
+	return parse_children(name, attributes, mut reader)
+}
+
+// XMLDocument.from_string parses an XML document from a string.
+// The bytes of the string are wrapped in an in-memory reader and handed to
+// XMLDocument.from_reader.
+pub fn XMLDocument.from_string(raw_contents string) !XMLDocument {
+	mut reader := FullBufferReader{
+		contents: raw_contents.bytes()
+	}
+	return XMLDocument.from_reader(mut reader)!
+}
+
+// XMLDocument.from_file parses an XML document from the file at `path`. Note that the file is
+// read into memory in its entirety and then parsed. If the file is too large for that, try
+// using the XMLDocument.from_reader function instead.
+// It returns an error when the file cannot be read or when parsing fails.
+pub fn XMLDocument.from_file(path string) !XMLDocument {
+	mut reader := FullBufferReader{
+		contents: os.read_bytes(path)!
+	}
+	return XMLDocument.from_reader(mut reader)!
+}
+
+// XMLDocument.from_reader parses an XML document from any source that implements the
+// io.Reader interface. It is the most generic entry point; the other constructors wrap
+// their input in a reader and delegate to it.
+// Running out of input while still reading the prolog is reported as an empty document.
+pub fn XMLDocument.from_reader(mut reader io.Reader) !XMLDocument {
+	prolog, first_char := parse_prolog(mut reader) or {
+		reached_end := err is os.Eof || err is io.Eof || err.msg() == 'Unexpected End Of File.'
+		if reached_end {
+			return error('XML document is empty.')
+		}
+		return err
+	}
+
+	// The prolog parser has already consumed the "<" and the first character of the root node
+	root_node := parse_single_node(first_char, mut reader)!
+
+	return XMLDocument{
+		version: prolog.version
+		encoding: prolog.encoding
+		comments: prolog.comments
+		doctype: prolog.doctype
+		root: root_node
+	}
+}
--- a/vlib/encoding/xml/query.v
+++ b/vlib/encoding/xml/query.v
@ -0,0 +1,60 @@
+module xml
+
+// get_element_by_id searches this node and then, depth-first, its element children for the
+// first node whose `id` attribute equals `id`. It returns none when there is no such node.
+fn (node XMLNode) get_element_by_id(id string) ?XMLNode {
+	// The node itself matches when it has an "id" attribute with the requested value
+	if 'id' in node.attributes && node.attributes['id'] == id {
+		return node
+	}
+
+	// Otherwise look into the element children; text, comments and CDATA are skipped
+	for child in node.children {
+		if child is XMLNode {
+			if found := child.get_element_by_id(id) {
+				return found
+			}
+		}
+	}
+
+	return none
+}
+
+// get_elements_by_tag collects this node, if its name equals `tag`, followed by all matching
+// nodes among its descendants, in depth-first order.
+fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode {
+	mut found := []XMLNode{}
+
+	if node.name == tag {
+		found << node
+	}
+
+	// Descend into element children only; text, comments and CDATA are skipped
+	for child in node.children {
+		match child {
+			XMLNode {
+				found << child.get_elements_by_tag(tag)
+			}
+			else {}
+		}
+	}
+
+	return found
+}
+
+// get_element_by_id returns the first element with the given id, or none if no
+// such element exists. The search starts at the root node of the document.
+pub fn (doc XMLDocument) get_element_by_id(id string) ?XMLNode {
+	return doc.root.get_element_by_id(id)
+}
+
+// get_elements_by_tag returns all elements with the given tag name, starting the search at
+// the root node of the document. The result is empty when no element matches.
+pub fn (doc XMLDocument) get_elements_by_tag(tag string) []XMLNode {
+	return doc.root.get_elements_by_tag(tag)
+}
--- a/vlib/encoding/xml/reader_util.v
+++ b/vlib/encoding/xml/reader_util.v
@ -0,0 +1,30 @@
+module xml
+
+import io
+
+// next_char reads from `reader` into `buf` and returns the first byte of `buf`.
+// The callers in this file pass a one-element buffer, so that one byte is read per call.
+// It returns an error when the reader reports that zero bytes were read; errors returned by
+// the reader itself are propagated unchanged.
+fn next_char(mut reader io.Reader, mut buf []u8) !u8 {
+	if reader.read(mut buf)! == 0 {
+		return error('Unexpected End Of File.')
+	}
+	return buf[0]
+}
+
+// FullBufferReader serves reads from a byte buffer that is held completely in memory.
+struct FullBufferReader {
+	// contents is the complete data to read from
+	contents []u8
+mut:
+	// position is the offset into `contents` of the next byte to hand out
+	position int
+}
+
+// read copies up to `buf.len` of the remaining bytes into `buf`, advances the read position
+// accordingly, and returns the number of bytes copied. Once all contents have been
+// consumed, it returns `io.Eof`.
+[direct_array_access]
+fn (mut fbr FullBufferReader) read(mut buf []u8) !int {
+	if fbr.position >= fbr.contents.len {
+		return io.Eof{}
+	}
+	remaining := fbr.contents.len - fbr.position
+	// Never copy more than the buffer can hold, nor more than what is left
+	n := if buf.len < remaining { buf.len } else { remaining }
+	unsafe {
+		vmemcpy(&u8(buf.data), &u8(fbr.contents.data) + fbr.position, n)
+	}
+	fbr.position += n
+	return n
+}
--- a/vlib/encoding/xml/test/gtk/gtk_test.v
+++ b/vlib/encoding/xml/test/gtk/gtk_test.v
@ -0,0 +1,89 @@
+module main
+
+import encoding.xml
+import os
+
+// test_large_gtk_file parses the GTK 4 GIR file, when it is present next to this test, and
+// checks the attributes of the GtkWindow class and the full subtree of the gtk_window_new
+// constructor. The test is skipped when the file does not exist.
+fn test_large_gtk_file() ! {
+	// Note: If you are contributing to this project, you should download the
+	// GIR file from https://raw.githubusercontent.com/gtk-rs/gir-files/master/Gtk-4.0.gir
+	// and place it in the same directory as this file.
+	path := os.join_path(os.dir(@FILE), 'Gtk-4.0.gir')
+	if !os.exists(path) {
+		println('Skipping test_large_gtk_file because file does not exist.')
+		return
+	}
+
+	actual := xml.XMLDocument.from_file(path) or {
+		return error('Failed to parse large GTK XML file')
+	}
+
+	// `valid` records whether the element we are looking for was found at all
+	mut valid := false
+	for elm in actual.get_elements_by_tag('class') {
+		if 'c:type' in elm.attributes && elm.attributes['c:type'] == 'GtkWindow' {
+			assert elm.attributes['parent'] == 'Widget'
+			assert elm.attributes['c:symbol-prefix'] == 'window'
+			valid = true
+		}
+	}
+	assert valid, 'GtkWindow class not found!'
+
+	valid = false
+	for elm in actual.get_elements_by_tag('constructor') {
+		if 'c:identifier' in elm.attributes && elm.attributes['c:identifier'] == 'gtk_window_new' {
+			assert elm == xml.XMLNode{
+				name: 'constructor'
+				attributes: {
+					'name':         'new'
+					'c:identifier': 'gtk_window_new'
+				}
+				children: [
+					xml.XMLNodeContents(xml.XMLNode{
+						name: 'doc'
+						attributes: {
+							'xml:space': 'preserve'
+						}
+						children: [
+							xml.XMLNodeContents('Creates a new `GtkWindow`.
+
+To get an undecorated window (no window borders), use
+[method@Gtk.Window.set_decorated].
+
+All top-level windows created by gtk_window_new() are stored
+in an internal top-level window list. This list can be obtained
+from [func@Gtk.Window.list_toplevels]. Due to GTK keeping a
+reference to the window internally, gtk_window_new() does not
+return a reference to the caller.
+
+To delete a `GtkWindow`, call [method@Gtk.Window.destroy].'),
+						]
+					}),
+					xml.XMLNodeContents(xml.XMLNode{
+						name: 'return-value'
+						attributes: {
+							'transfer-ownership': 'none'
+						}
+						children: [
+							xml.XMLNodeContents(xml.XMLNode{
+								name: 'doc'
+								attributes: {
+									'xml:space': 'preserve'
+								}
+								children: [xml.XMLNodeContents('a new `GtkWindow`.')]
+							}),
+							xml.XMLNodeContents(xml.XMLNode{
+								name: 'type'
+								attributes: {
+									'name':   'Widget'
+									'c:type': 'GtkWidget*'
+								}
+								children: []
+							}),
+						]
+					}),
+				]
+			}
+			valid = true
+		}
+	}
+	assert valid, 'gtk_window_new constructor not found!'
+}
--- a/vlib/encoding/xml/test/local/01_mdn_example/hello_world.xml
+++ b/vlib/encoding/xml/test/local/01_mdn_example/hello_world.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<message>
+  <greeting>
+    Hello, World!
+  </greeting>
+</message>
--- a/vlib/encoding/xml/test/local/01_mdn_example/hello_world_test.v
+++ b/vlib/encoding/xml/test/local/01_mdn_example/hello_world_test.v
@ -0,0 +1,23 @@
+import os
+import encoding.xml
+
+// test_valid_parsing parses hello_world.xml, located next to this test, and compares the
+// result with the expected document tree.
+fn test_valid_parsing() ! {
+	path := os.join_path(os.dir(@FILE), 'hello_world.xml')
+
+	// The text of <greeting> is expected without its surrounding whitespace
+	expected := xml.XMLDocument{
+		root: xml.XMLNode{
+			name: 'message'
+			children: [
+				xml.XMLNode{
+					name: 'greeting'
+					children: [
+						'Hello, World!',
+					]
+				},
+			]
+		}
+	}
+	actual := xml.XMLDocument.from_file(path)!
+
+	assert expected == actual, 'Parsed XML document should be equal to expected XML document'
+}
--- a/vlib/encoding/xml/test/local/02_note_message/note.xml
+++ b/vlib/encoding/xml/test/local/02_note_message/note.xml
@ -0,0 +1,6 @@
+<note>
+  <to>Tove</to>
+  <from>Jani</from>
+  <heading>Reminder</heading>
+  <body>Don't forget me this weekend!</body>
+</note>
--- a/vlib/encoding/xml/test/local/02_note_message/note_test.v
+++ b/vlib/encoding/xml/test/local/02_note_message/note_test.v
@ -0,0 +1,41 @@
+import os
+import encoding.xml
+
+// test_valid_parsing parses note.xml (stored next to this file) and compares
+// it with a hand-built document: a `note` root with the text-only elements
+// `to`, `from`, `heading` and `body`, in that order.
+fn test_valid_parsing() ! {
+	xml_path := os.join_path(os.dir(@FILE), 'note.xml')
+	got := xml.XMLDocument.from_file(xml_path)!
+
+	want := xml.XMLDocument{
+		root: xml.XMLNode{
+			name: 'note'
+			children: [
+				xml.XMLNode{
+					name: 'to'
+					children: ['Tove']
+				},
+				xml.XMLNode{
+					name: 'from'
+					children: ['Jani']
+				},
+				xml.XMLNode{
+					name: 'heading'
+					children: ['Reminder']
+				},
+				xml.XMLNode{
+					name: 'body'
+					children: ["Don't forget me this weekend!"]
+				},
+			]
+		}
+	}
+
+	assert want == got, 'Parsed XML document should be equal to expected XML document'
+}
--- a/vlib/encoding/xml/test/local/03_cd_catalogue/cd_catalog.xml
+++ b/vlib/encoding/xml/test/local/03_cd_catalogue/cd_catalog.xml
@ -0,0 +1,34 @@
+<CATALOG>
+  <CD>
+    <TITLE>Empire Burlesque</TITLE>
+    <ARTIST>Bob Dylan</ARTIST>
+    <COUNTRY>USA</COUNTRY>
+    <COMPANY>Columbia</COMPANY>
+    <PRICE>10.90</PRICE>
+    <YEAR>1985</YEAR>
+  </CD>
+  <CD>
+    <TITLE>Hide your heart</TITLE>
+    <ARTIST>Bonnie Tyler</ARTIST>
+    <COUNTRY>UK</COUNTRY>
+    <COMPANY>CBS Records</COMPANY>
+    <PRICE>9.90</PRICE>
+    <YEAR>1988</YEAR>
+  </CD>
+  <CD>
+    <TITLE>Greatest Hits</TITLE>
+    <ARTIST>Dolly Parton</ARTIST>
+    <COUNTRY>USA</COUNTRY>
+    <COMPANY>RCA</COMPANY>
+    <PRICE>9.90</PRICE>
+    <YEAR>1982</YEAR>
+  </CD>
+  <CD>
+    <TITLE>Still got the blues</TITLE>
+    <ARTIST>Gary Moore</ARTIST>
+    <COUNTRY>UK</COUNTRY>
+    <COMPANY>Virgin records</COMPANY>
+    <PRICE>10.20</PRICE>
+    <YEAR>1990</YEAR>
+  </CD>
+</CATALOG>
--- a/vlib/encoding/xml/test/local/03_cd_catalogue/cd_test.v
+++ b/vlib/encoding/xml/test/local/03_cd_catalogue/cd_test.v
@ -0,0 +1,181 @@
+import os
+import encoding.xml
+
+// cd_field builds the element `<name>text</name>`: a node called `name`
+// whose only child is the plain text `text`.
+fn cd_field(name string, text string) xml.XMLNodeContents {
+	return xml.XMLNodeContents(xml.XMLNode{
+		name: name
+		children: [xml.XMLNodeContents(text)]
+	})
+}
+
+// cd_entry builds one `CD` element with its six text-only children in the
+// order used by cd_catalog.xml: TITLE, ARTIST, COUNTRY, COMPANY, PRICE, YEAR.
+fn cd_entry(title string, artist string, country string, company string, price string, year string) xml.XMLNodeContents {
+	return xml.XMLNodeContents(xml.XMLNode{
+		name: 'CD'
+		children: [
+			cd_field('TITLE', title),
+			cd_field('ARTIST', artist),
+			cd_field('COUNTRY', country),
+			cd_field('COMPANY', company),
+			cd_field('PRICE', price),
+			cd_field('YEAR', year),
+		]
+	})
+}
+
+// test_valid_parsing parses cd_catalog.xml (stored next to this file) and
+// compares it with a hand-built `CATALOG` root containing four `CD` entries.
+fn test_valid_parsing() ! {
+	xml_path := os.join_path(os.dir(@FILE), 'cd_catalog.xml')
+	got := xml.XMLDocument.from_file(xml_path)!
+
+	want := xml.XMLDocument{
+		root: xml.XMLNode{
+			name: 'CATALOG'
+			children: [
+				cd_entry('Empire Burlesque', 'Bob Dylan', 'USA', 'Columbia', '10.90',
+					'1985'),
+				cd_entry('Hide your heart', 'Bonnie Tyler', 'UK', 'CBS Records', '9.90',
+					'1988'),
+				cd_entry('Greatest Hits', 'Dolly Parton', 'USA', 'RCA', '9.90', '1982'),
+				cd_entry('Still got the blues', 'Gary Moore', 'UK', 'Virgin records',
+					'10.20', '1990'),
+			]
+		}
+	}
+
+	assert want == got, 'Parsed XML document should be equal to expected XML document'
+}
--- a/vlib/encoding/xml/test/local/04_empty_file/empty.xml
+++ b/vlib/encoding/xml/test/local/04_empty_file/empty.xml
--- a/vlib/encoding/xml/test/local/04_empty_file/expected_error.txt
+++ b/vlib/encoding/xml/test/local/04_empty_file/expected_error.txt
@ -0,0 +1 @@
+XML document is empty.
--- a/vlib/encoding/xml/test/local/05_single_element/root.xml
+++ b/vlib/encoding/xml/test/local/05_single_element/root.xml
@ -0,0 +1 @@
+<sample>Single root element.</sample>
--- a/vlib/encoding/xml/test/local/05_single_element/root_test.v
+++ b/vlib/encoding/xml/test/local/05_single_element/root_test.v
@ -0,0 +1,18 @@
+import os
+import encoding.xml
+
+// test_valid_parsing parses root.xml (stored next to this file), a document
+// that consists of nothing but a `sample` root element with one text child.
+fn test_valid_parsing() ! {
+	xml_path := os.join_path(os.dir(@FILE), 'root.xml')
+	got := xml.XMLDocument.from_file(xml_path)!
+
+	want := xml.XMLDocument{
+		root: xml.XMLNode{
+			name: 'sample'
+			children: ['Single root element.']
+		}
+	}
+
+	assert want == got, 'Parsed XML document should be equal to expected XML document'
+}
--- a/vlib/encoding/xml/test/local/06_nested_elements/nested.xml
+++ b/vlib/encoding/xml/test/local/06_nested_elements/nested.xml
@ -0,0 +1,14 @@
+<level1>
+  <level2>
+    <level3>
+      <level4>
+        Deeply nested content.
+      </level4>
+    </level3>
+  </level2>
+  <level2>
+    <level3>
+      Less deeply nested content.
+    </level3>
+  </level2>
+</level1>
--- a/vlib/encoding/xml/test/local/06_nested_elements/nested_test.v
+++ b/vlib/encoding/xml/test/local/06_nested_elements/nested_test.v
@ -0,0 +1,44 @@
+import os
+import encoding.xml
+
+// test_valid_parsing parses nested.xml (stored next to this file) and checks
+// that both `level2` branches are reproduced: the first one goes down to a
+// `level4` element, the second one stops at `level3`. The expected text
+// children carry no surrounding whitespace even though the source file puts
+// each text on its own indented line.
+fn test_valid_parsing() ! {
+	xml_path := os.join_path(os.dir(@FILE), 'nested.xml')
+	got := xml.XMLDocument.from_file(xml_path)!
+
+	want := xml.XMLDocument{
+		root: xml.XMLNode{
+			name: 'level1'
+			children: [
+				xml.XMLNode{
+					name: 'level2'
+					children: [
+						xml.XMLNode{
+							name: 'level3'
+							children: [
+								xml.XMLNode{
+									name: 'level4'
+									children: ['Deeply nested content.']
+								},
+							]
+						},
+					]
+				},
+				xml.XMLNode{
+					name: 'level2'
+					children: [
+						xml.XMLNode{
+							name: 'level3'
+							children: ['Less deeply nested content.']
+						},
+					]
+				},
+			]
+		}
+	}
+
+	assert want == got, 'Parsed XML document should be equal to expected XML document'
+}
--- a/vlib/encoding/xml/test/local/07_mixed_contents/mixed.xml
+++ b/vlib/encoding/xml/test/local/07_mixed_contents/mixed.xml
@ -0,0 +1,5 @@
+<letter>
+  Dear Mr. <name>John Smith</name>.
+  Your order <orderid>1032</orderid>
+  will be shipped on <shipdate>2001-07-13</shipdate>.
+</letter>
--- a/vlib/encoding/xml/test/local/07_mixed_contents/mixed_test.v
+++ b/vlib/encoding/xml/test/local/07_mixed_contents/mixed_test.v
@ -0,0 +1,33 @@
+import os
+import encoding.xml
+
+// test_valid_parsing parses mixed.xml (stored next to this file), where text
+// and elements alternate inside the `letter` root. Each expected text chunk
+// has its leading and trailing whitespace removed, while whitespace inside a
+// chunk (the newline and indent in '.\n  Your order') is kept.
+fn test_valid_parsing() ! {
+	xml_path := os.join_path(os.dir(@FILE), 'mixed.xml')
+	got := xml.XMLDocument.from_file(xml_path)!
+
+	want := xml.XMLDocument{
+		root: xml.XMLNode{
+			name: 'letter'
+			children: [
+				'Dear Mr.',
+				xml.XMLNode{
+					name: 'name'
+					children: [
+						'John Smith',
+					]
+				},
+				'.\n  Your order',
+				xml.XMLNode{
+					name: 'orderid'
+					children: [
+						'1032',
+					]
+				},
+				'will be shipped on',
+				xml.XMLNode{
+					name: 'shipdate'
+					children: [
+						'2001-07-13',
+					]
+				},
+				'.',
+			]
+		}
+	}
+
+	assert want == got, 'Parsed XML document should be equal to expected XML document'
+}
--- a/vlib/encoding/xml/test/local/08_comments/comment.xml
+++ b/vlib/encoding/xml/test/local/08_comments/comment.xml
@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!-- Employee Information-->
+<address>
+  <!-- Full or first name -->
+  <name>Jones</name>
+  <!-- Registered name of the company -> -->
+  <company>ABSystems</company>
+  <phone>
+    <!-- Phone with country code -) -->
+    (046) 1233-44778
+  </phone>
+</address>
--- a/vlib/encoding/xml/test/local/08_comments/comment_test.v
+++ b/vlib/encoding/xml/test/local/08_comments/comment_test.v
@ -0,0 +1,42 @@
+import os
+import encoding.xml
+
+// test_valid_parsing parses comment.xml (stored next to this file).
+// The comment that precedes the root is expected in the document's
+// `comments` list, while comments inside elements are expected as children
+// of the enclosing node. Comment text keeps its surrounding spaces, and the
+// sequences `->` and `-)` inside a comment do not terminate it.
+fn test_valid_parsing() ! {
+	xml_path := os.join_path(os.dir(@FILE), 'comment.xml')
+	got := xml.XMLDocument.from_file(xml_path)!
+
+	want := xml.XMLDocument{
+		comments: [
+			xml.XMLComment{
+				text: ' Employee Information'
+			},
+		]
+		root: xml.XMLNode{
+			name: 'address'
+			children: [
+				xml.XMLComment{
+					text: ' Full or first name '
+				},
+				xml.XMLNode{
+					name: 'name'
+					children: [
+						'Jones',
+					]
+				},
+				xml.XMLComment{
+					text: ' Registered name of the company -> '
+				},
+				xml.XMLNode{
+					name: 'company'
+					children: [
+						'ABSystems',
+					]
+				},
+				xml.XMLNode{
+					name: 'phone'
+					children: [
+						xml.XMLComment{
+							text: ' Phone with country code -) '
+						},
+						'(046) 1233-44778',
+					]
+				},
+			]
+		}
+	}
+
+	assert want == got, 'Parsed XML document should be equal to expected XML document'
+}
--- a/vlib/encoding/xml/test/local/09_malformed/expected_error.txt
+++ b/vlib/encoding/xml/test/local/09_malformed/expected_error.txt
@ -0,0 +1 @@
+Malformed XML. Found "<" in attribute string: "<body"
--- a/vlib/encoding/xml/test/local/09_malformed/malformed.xml
+++ b/vlib/encoding/xml/test/local/09_malformed/malformed.xml
@ -0,0 +1 @@
+<message <body>Sample</body></message>
--- a/vlib/encoding/xml/test/local/10_missing_tag/expected_error.txt
+++ b/vlib/encoding/xml/test/local/10_missing_tag/expected_error.txt
@ -0,0 +1 @@
+XML node <warning> not closed.
--- a/vlib/encoding/xml/test/local/10_missing_tag/malformed.xml
+++ b/vlib/encoding/xml/test/local/10_missing_tag/malformed.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<message>
+    <warning>
+        Hello World
+    <!--missing </warning> -->
+</message>
--- a/vlib/encoding/xml/test/local/11_cdata_content/cdata.xml
+++ b/vlib/encoding/xml/test/local/11_cdata_content/cdata.xml
@ -0,0 +1,4 @@
+<sample>
+  <html>This is &lt;b&gt;bold&lt;/b&gt;</html>
+  <html><![CDATA[This is <b>bold</b>]]></html>
+</sample>
--- a/vlib/encoding/xml/test/local/11_cdata_content/cdata_test.v
+++ b/vlib/encoding/xml/test/local/11_cdata_content/cdata_test.v
@ -0,0 +1,29 @@
+module main
+
+import os
+import encoding.xml
+
+// test_valid_parsing parses cdata.xml (stored next to this file), which holds
+// the same markup twice: once entity-escaped and once inside a CDATA section.
+// Without a call to `validate`, the escaped text is expected verbatim
+// (`&lt;` stays `&lt;`), and the CDATA section is expected as an `XMLCData`
+// child carrying the raw text.
+fn test_valid_parsing() {
+	xml_path := os.join_path(os.dir(@FILE), 'cdata.xml')
+	got := xml.XMLDocument.from_file(xml_path)!
+
+	want := xml.XMLDocument{
+		root: xml.XMLNode{
+			name: 'sample'
+			children: [
+				xml.XMLNode{
+					name: 'html'
+					children: [
+						'This is &lt;b&gt;bold&lt;/b&gt;',
+					]
+				},
+				xml.XMLNode{
+					name: 'html'
+					children: [
+						xml.XMLCData{
+							text: 'This is <b>bold</b>'
+						},
+					]
+				},
+			]
+		}
+	}
+
+	assert want == got, 'Parsed XML document should be equal to expected XML document'
+}
--- a/vlib/encoding/xml/test/local/12_doctype_entity/entity.xml
+++ b/vlib/encoding/xml/test/local/12_doctype_entity/entity.xml
@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE body	[
+  <!ENTITY warning "Warning: Something bad happened... please refresh and try again.">
+]>  
+<body>
+  <message> &warning; </message>
+</body>
--- a/vlib/encoding/xml/test/local/12_doctype_entity/spec_entity_test.v
+++ b/vlib/encoding/xml/test/local/12_doctype_entity/spec_entity_test.v
@ -0,0 +1,41 @@
+module main
+
+import os
+import encoding.xml
+
+// test_valid_parsing parses and validates entity.xml (stored next to this
+// file), whose inline DTD declares the entity `warning`.
+// After `validate`, the `&warning;` reference in `message` is expected to be
+// replaced by the entity value, and the document's reverse entity table is
+// expected to hold the default entries plus a value -> 'warning' entry.
+fn test_valid_parsing() {
+	xml_path := os.join_path(os.dir(@FILE), 'entity.xml')
+	got := xml.XMLDocument.from_file(xml_path)!.validate()!
+
+	mut value_to_name := xml.default_entities_reverse.clone()
+	value_to_name['Warning: Something bad happened... please refresh and try again.'] = 'warning'
+
+	want := xml.XMLDocument{
+		parsed_reverse_entities: value_to_name
+		doctype: xml.DocumentType{
+			name: 'body'
+			dtd: xml.DocumentTypeDefinition{
+				name: ''
+				list: [
+					xml.DTDEntity{
+						name: 'warning'
+						value: 'Warning: Something bad happened... please refresh and try again.'
+					},
+				]
+			}
+		}
+		root: xml.XMLNode{
+			name: 'body'
+			children: [
+				xml.XMLNode{
+					name: 'message'
+					children: ['Warning: Something bad happened... please refresh and try again.']
+				},
+			]
+		}
+	}
+
+	assert want == got, 'Parsed XML document should be equal to expected XML document'
+}
--- a/vlib/encoding/xml/test/local/13_doctype_element/doctype_test.v
+++ b/vlib/encoding/xml/test/local/13_doctype_element/doctype_test.v
@ -0,0 +1,71 @@
+module main
+
+import os
+import encoding.xml
+
+// test_valid_parsing parses and validates element.xml (stored next to this
+// file), whose inline DTD declares the elements `note`, `to`, `from`,
+// `heading` and `body`. Each `<!ELEMENT ...>` declaration is expected as a
+// `DTDElement` whose definition lists the names found between the
+// parentheses, and the validated tree is expected to match the declarations.
+fn test_valid_parsing() {
+	xml_path := os.join_path(os.dir(@FILE), 'element.xml')
+	got := xml.XMLDocument.from_file(xml_path)!.validate()!
+
+	want := xml.XMLDocument{
+		doctype: xml.DocumentType{
+			name: 'note'
+			dtd: xml.DocumentTypeDefinition{
+				name: ''
+				list: [
+					xml.DTDElement{
+						name: 'note'
+						definition: [
+							'to',
+							'from',
+							'heading',
+							'body',
+						]
+					},
+					xml.DTDElement{
+						name: 'to'
+						definition: [
+							'#PCDATA',
+						]
+					},
+					xml.DTDElement{
+						name: 'from'
+						definition: [
+							'#PCDATA',
+						]
+					},
+					xml.DTDElement{
+						name: 'heading'
+						definition: [
+							'#PCDATA',
+						]
+					},
+					xml.DTDElement{
+						name: 'body'
+						definition: [
+							'#PCDATA',
+						]
+					},
+				]
+			}
+		}
+		root: xml.XMLNode{
+			name: 'note'
+			children: [
+				xml.XMLNode{
+					name: 'to'
+					children: ['Tove']
+				},
+				xml.XMLNode{
+					name: 'from'
+					children: ['Jani']
+				},
+				xml.XMLNode{
+					name: 'heading'
+					children: ['Reminder']
+				},
+				xml.XMLNode{
+					name: 'body'
+					children: ["Don't forget me this weekend!"]
+				},
+			]
+		}
+	}
+
+	assert want == got, 'Parsed XML document should be equal to expected XML document'
+}
--- a/vlib/encoding/xml/test/local/13_doctype_element/element.xml
+++ b/vlib/encoding/xml/test/local/13_doctype_element/element.xml
@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE note [
+  <!ELEMENT note (to,from,heading,body)>
+  <!ELEMENT to	(#PCDATA)>
+  <!ELEMENT from (#PCDATA)>
+  <!ELEMENT heading (#PCDATA)>
+  <!ELEMENT body (#PCDATA)>
+]>
+<note>
+  <to>Tove</to>
+  <from>Jani</from>
+  <heading>Reminder</heading>
+  <body>Don't forget me this weekend!</body>
+</note>
--- a/vlib/encoding/xml/test/local/14_attributes/attributes.xml
+++ b/vlib/encoding/xml/test/local/14_attributes/attributes.xml
@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<book category="web">
+  <title lang="en" code:type="const char*">Learning XML</title>
+  <author attr=" surrounding spaces ">Erik T. Ray</author>
+  <year>2003</year>
+  <price>39.95</price>
+</book>
--- a/vlib/encoding/xml/test/local/14_attributes/attributes_test.v
+++ b/vlib/encoding/xml/test/local/14_attributes/attributes_test.v
@ -0,0 +1,45 @@
+module main
+
+import os
+import encoding.xml
+
+// test_valid_parsing parses attributes.xml (stored next to this file) and
+// checks the attribute maps of the parsed nodes. Attribute names may contain
+// a colon (`code:type`), and attribute values are expected exactly as
+// written between the quotes, including leading and trailing spaces.
+fn test_valid_parsing() {
+	xml_path := os.join_path(os.dir(@FILE), 'attributes.xml')
+	got := xml.XMLDocument.from_file(xml_path)!
+
+	want := xml.XMLDocument{
+		root: xml.XMLNode{
+			name: 'book'
+			attributes: {
+				'category': 'web'
+			}
+			children: [
+				xml.XMLNode{
+					name: 'title'
+					attributes: {
+						'lang':      'en'
+						'code:type': 'const char*'
+					}
+					children: [
+						'Learning XML',
+					]
+				},
+				xml.XMLNode{
+					name: 'author'
+					attributes: {
+						'attr': ' surrounding spaces '
+					}
+					children: [
+						'Erik T. Ray',
+					]
+				},
+				xml.XMLNode{
+					name: 'year'
+					children: [
+						'2003',
+					]
+				},
+				xml.XMLNode{
+					name: 'price'
+					children: [
+						'39.95',
+					]
+				},
+			]
+		}
+	}
+
+	assert want == got, 'Parsed XML document should be equal to expected XML document'
+}
--- a/vlib/encoding/xml/test/local/15_incomplete_entity_1/entity.xml
+++ b/vlib/encoding/xml/test/local/15_incomplete_entity_1/entity.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE body [
+  <!ENTITY>
+]>
+<body>
+</body>
--- a/vlib/encoding/xml/test/local/15_incomplete_entity_1/expected_error.txt
+++ b/vlib/encoding/xml/test/local/15_incomplete_entity_1/expected_error.txt
@ -0,0 +1 @@
+Entity is missing name.
--- a/vlib/encoding/xml/test/local/16_incomplete_entity_2/entity.xml
+++ b/vlib/encoding/xml/test/local/16_incomplete_entity_2/entity.xml
@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE body [
+  <!ENTITY missing>
+]>
+<body>
+  &missing;
+</body>
--- a/vlib/encoding/xml/test/local/16_incomplete_entity_2/expected_error.txt
+++ b/vlib/encoding/xml/test/local/16_incomplete_entity_2/expected_error.txt
@ -0,0 +1 @@
+Entity is missing value.
--- a/vlib/encoding/xml/test/local/17_incomplete_element_1/element.xml
+++ b/vlib/encoding/xml/test/local/17_incomplete_element_1/element.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE note [
+  <!ELEMENT>
+]>
+<note>
+</note>
--- a/vlib/encoding/xml/test/local/17_incomplete_element_1/expected_error.txt
+++ b/vlib/encoding/xml/test/local/17_incomplete_element_1/expected_error.txt
@ -0,0 +1 @@
+Element is missing name.
--- a/vlib/encoding/xml/test/local/18_incomplete_element_2/element.xml
+++ b/vlib/encoding/xml/test/local/18_incomplete_element_2/element.xml
@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE note [
+  <!ELEMENT note invalid>
+]>
+<note>
+</note>
--- a/vlib/encoding/xml/test/local/18_incomplete_element_2/expected_error.txt
+++ b/vlib/encoding/xml/test/local/18_incomplete_element_2/expected_error.txt
@ -0,0 +1 @@
+Invalid element definition: invalid
--- a/vlib/encoding/xml/test/spec_test.v
+++ b/vlib/encoding/xml/test/spec_test.v
@ -0,0 +1,25 @@
+module main
+
+import os
+import encoding.xml
+
+// spec_files lists every file with the `xml` extension found recursively
+// under the `local` directory that sits next to this test file.
+const spec_files = os.walk_ext(os.join_path(os.dir(@FILE), 'local'), 'xml')
+
+// test_can_parse_all_files parses every file listed in `spec_files`.
+// A file whose directory contains an `expected_error.txt` must fail to parse
+// with exactly that message (both sides are compared after trimming
+// surrounding whitespace). Every other file must parse successfully.
+fn test_can_parse_all_files() ! {
+	assert spec_files.len > 0, 'No XML files found in the spec directory'
+	for file in spec_files {
+		error_file := os.join_path(os.dir(file), 'expected_error.txt')
+		doc := xml.XMLDocument.from_file(file) or {
+			// Parsing failed. Save the message now: inside the nested `or`
+			// block below, `err` refers to the file-read error instead.
+			parse_error := err.msg()
+			error_text := os.read_file(error_file) or {
+				// No expected error. Fail the test and report why parsing failed.
+				return error('Failed to parse XML file: ' + file + ': ' + parse_error)
+			}
+			// Check if the error message matches the expected error.
+			assert parse_error.trim_space() == error_text.trim_space()
+			continue
+		}
+		// Parsing succeeded. If an error was expected for this file, that is a
+		// failure too; previously such a case passed silently.
+		assert !os.exists(error_file), 'Expected a parse error for XML file: ' + file
+	}
+}
--- a/vlib/encoding/xml/types.v
+++ b/vlib/encoding/xml/types.v
@ -0,0 +1,71 @@
+module xml
+
+// XMLNodeContents is anything that can appear as a child of an XMLNode:
+// a CDATA section, a comment, a nested node, or plain text.
+pub type XMLNodeContents = XMLCData | XMLComment | XMLNode | string
+
+// XMLCData represents a CDATA section. `text` holds the raw characters
+// found between `<![CDATA[` and `]]>`.
+pub struct XMLCData {
+pub:
+	text string [required]
+}
+
+// XMLComment represents an XML comment. `text` holds the characters found
+// between `<!--` and `-->`, including any surrounding spaces.
+pub struct XMLComment {
+pub:
+	text string [required]
+}
+
+// XMLNode represents a single XML node. It contains the node name,
+// a map of attributes, and a list of children. The children can be
+// other XML nodes, CDATA, plain text, or comments.
+pub struct XMLNode {
+pub:
+	// name is the tag name of the element.
+	name       string            [required]
+	// attributes maps each attribute name to its value.
+	attributes map[string]string
+	// children lists the node's contents in document order.
+	children   []XMLNodeContents
+}
+
+// XMLDocument is the struct that represents a single XML document.
+// It contains the prolog and the single root node. The prolog struct
+// is embedded into the XMLDocument struct, so that the prolog fields
+// are accessible directly from this struct.
+// Public prolog fields include version, encoding, comments preceding
+// the root node, and the document type.
+pub struct XMLDocument {
+	Prolog
+pub:
+	root XMLNode [required]
+}
+
+// DTDListItem is a single declaration inside a document type definition:
+// either an element declaration or an entity declaration.
+pub type DTDListItem = DTDElement | DTDEntity
+
+// DTDEntity is an entity declaration such as `<!ENTITY name "value">`.
+// `name` is the entity name (referenced in text as `&name;`) and `value`
+// is the text it stands for.
+pub struct DTDEntity {
+	name  string [required]
+	value string [required]
+}
+
+// DTDElement is an element declaration such as `<!ELEMENT note (to,from)>`.
+// `name` is the declared element and `definition` lists the entries of its
+// content definition, e.g. ['to', 'from'] or ['#PCDATA'].
+pub struct DTDElement {
+	name       string   [required]
+	definition []string [required]
+}
+
+// DocumentTypeDefinition is a parsed document type definition.
+// `list` holds its element and entity declarations in document order.
+// NOTE(review): `name` is empty for the inline DTDs exercised by the tests;
+// its meaning for other kinds of DTD is not visible here - confirm.
+pub struct DocumentTypeDefinition {
+	name string
+	list []DTDListItem
+}
+
+// DocumentType represents the `<!DOCTYPE ...>` declaration of a document.
+// `name` is the name given after `<!DOCTYPE`; `XMLDocument.validate` requires
+// it to equal the root element name when it is not empty. `dtd` is the
+// associated definition, either parsed or kept as a string.
+pub struct DocumentType {
+	name string  [required]
+	dtd  DTDInfo
+}
+
+// DTDInfo is either a parsed document type definition or a plain string.
+// Documents without a DOCTYPE use the empty string (see the Prolog default).
+// `XMLDocument.validate` does not yet validate against the string form.
+type DTDInfo = DocumentTypeDefinition | string
+
+// Prolog holds the document-level information that precedes the root node.
+// It is embedded into XMLDocument. The defaults describe a document without
+// an explicit XML declaration or DOCTYPE: version '1.0', encoding 'UTF-8',
+// an unnamed document type with an empty DTD string, and no comments.
+struct Prolog {
+	// parsed_reverse_entities maps an entity value back to its entity name.
+	// It starts as a copy of the default table; `XMLDocument.validate` adds
+	// the entities declared in the DTD.
+	// NOTE(review): presumably consulted when escaping text on
+	// serialization - confirm against the encoding code.
+	parsed_reverse_entities map[string]string = default_entities_reverse.clone()
+pub:
+	version  string       = '1.0'
+	encoding string       = 'UTF-8'
+	doctype  DocumentType = DocumentType{
+		name: ''
+		dtd: ''
+	}
+	// comments are the comments found before the root node.
+	comments []XMLComment
+}
--- a/vlib/encoding/xml/validation.v
+++ b/vlib/encoding/xml/validation.v
@ -0,0 +1,96 @@
+module xml
+
+// validate returns a copy of `node` whose subtree has been checked against
+// the element declarations in `elements`, and whose text children have been
+// passed through `unescape_text` with the given `entities`.
+//
+// Child element names are only checked when `node` itself is declared in
+// `elements` and its definition is not the single entry '#PCDATA'.
+// Comments and CDATA children are copied unchanged.
+// An error is returned for a child element that is not listed in the
+// definition, or when a nested validation or unescaping fails.
+fn (node XMLNode) validate(elements map[string]DTDElement, entities map[string]string) !XMLNode {
+	// Looking up an undeclared element yields an empty definition.
+	allowed := elements[node.name].definition
+	matches_everything := allowed.len == 1 && allowed[0] == '#PCDATA'
+	check_child_names := node.name in elements && !matches_everything
+
+	mut validated := []XMLNodeContents{cap: node.children.len}
+	for child in node.children {
+		match child {
+			string {
+				validated << unescape_text(child, entities: entities)!
+			}
+			XMLNode {
+				name := child.name
+				if check_child_names && name !in allowed {
+					return error('Invalid child element ${name} for ${node.name}')
+				}
+				validated << child.validate(elements, entities)!
+			}
+			else {
+				// Comments and CDATA need no validation.
+				validated << child
+			}
+		}
+	}
+
+	return XMLNode{
+		name: node.name
+		attributes: node.attributes
+		children: validated
+	}
+}
+
+// validate checks the document against its document type definition.
+// A document that was parsed at all is already well-formed.
+//
+// When the DTD is a parsed DocumentTypeDefinition, validate:
+//  - collects its element and entity declarations, rejecting duplicates
+//    (an entity that re-declares one of the default entities counts as one),
+//  - validates the tree from the root, expanding entities in text nodes,
+//  - requires a non-empty DOCTYPE name to equal the root element name,
+// and returns a new document with the validated root and the extended
+// reverse entity table.
+//
+// When the DTD is only a string, the document is returned unchanged.
+pub fn (doc XMLDocument) validate() !XMLDocument {
+	match doc.doctype.dtd {
+		string {
+			// TODO: Validate the document against the DTD string.
+			return doc
+		}
+		DocumentTypeDefinition {
+			mut declared_elements := map[string]DTDElement{}
+			mut known_entities := default_entities.clone()
+			mut value_to_name := default_entities_reverse.clone()
+
+			for item in doc.doctype.dtd.list {
+				match item {
+					DTDEntity {
+						name := item.name
+						if name in known_entities {
+							return error('Duplicate entity definition for ${name}')
+						}
+						known_entities[name] = item.value
+						value_to_name[item.value] = name
+					}
+					DTDElement {
+						name := item.name
+						if name in declared_elements {
+							return error('Duplicate element definition for ${name}')
+						}
+						declared_elements[name] = item
+					}
+				}
+			}
+
+			// Validate the whole tree first, then compare the root name
+			// with the DOCTYPE name.
+			new_root := doc.root.validate(declared_elements, known_entities)!
+			if doc.doctype.name.len > 0 && doc.doctype.name != new_root.name {
+				return error('Root element ${new_root.name} does not match DOCTYPE ${doc.doctype.name}')
+			}
+
+			return XMLDocument{
+				root: new_root
+				version: doc.version
+				encoding: doc.encoding
+				doctype: doc.doctype
+				comments: doc.comments
+				parsed_reverse_entities: value_to_name
+			}
+		}
+	}
+}
				`@ -0,0 +1 @@`
				`Malformed XML. Found "<" in attribute string: "<body"`