encoding.xml: make functions public, add documentation, tests, fix attribute parsing for self-closing tags (#19901)

2025-09-13 22:42:26 +03:00 · 2023-11-16 18:13:36 +00:00 · 2023-11-16 18:13:36 +00:00 · c9429f3331
commit c9429f3331
parent e9258c2a08
8 changed files with 482 additions and 54 deletions
--- a/vlib/encoding/xml/README.md
+++ b/vlib/encoding/xml/README.md
@ -7,6 +7,10 @@ Note that this is not a streaming XML parser. It reads the entire document into
 memory and then parses it. This is not a problem for small documents, but it
 might be a problem for extremely large documents (several hundred megabytes or more).
 The public function `parse_single_node` parses a single node from any
 implementation of `io.Reader`; the caller passes in the first character of the
 tag (the one right after `<`). This can help parse large XML documents on an
 element-by-element basis. Sample usage is provided in the `parser_test.v` file.
 ## Usage
 ### Parsing XML Files
--- a/vlib/encoding/xml/encoding.v
+++ b/vlib/encoding/xml/encoding.v
@ -26,38 +26,43 @@ pub fn (node XMLNode) pretty_str(original_indent string, depth int, reverse_enti
 		builder.write_string(value)
 		builder.write_u8(`"`)
 	}
-	builder.write_string('>\n')
+	if node.children.len > 0 {
-	for child in node.children {
+		builder.write_string('>\n')
-		match child {
+		for child in node.children {
-			string {
+			match child {
-				builder.write_string(indent)
+				string {
-				builder.write_string(original_indent)
+					builder.write_string(indent)
-				builder.write_string(escape_text(child, reverse_entities: reverse_entities))
+					builder.write_string(original_indent)
-			}
+					builder.write_string(escape_text(child, reverse_entities: reverse_entities))
-			XMLNode {
+				}
-				builder.write_string(child.pretty_str(original_indent, depth + 1, reverse_entities))
+				XMLNode {
-			}
+					builder.write_string(child.pretty_str(original_indent, depth + 1,
-			XMLComment {
+						reverse_entities))
-				builder.write_string(indent)
+				}
-				builder.write_string(original_indent)
+				XMLComment {
-				builder.write_string('<!--')
+					builder.write_string(indent)
-				builder.write_string(child.text)
+					builder.write_string(original_indent)
-				builder.write_string('-->')
+					builder.write_string('<!--')
-			}
+					builder.write_string(child.text)
-			XMLCData {
+					builder.write_string('-->')
-				builder.write_string(indent)
+				}
-				builder.write_string(original_indent)
+				XMLCData {
-				builder.write_string('<![CDATA[')
+					builder.write_string(indent)
-				builder.write_string(child.text)
+					builder.write_string(original_indent)
-				builder.write_string(']]>')
+					builder.write_string('<![CDATA[')
 					builder.write_string(child.text)
 					builder.write_string(']]>')
 				}
 			}
 			builder.write_u8(`\n`)
 		}
-		builder.write_u8(`\n`)
+		builder.write_string(indent)
 		builder.write_string('</')
 		builder.write_string(node.name)
 		builder.write_u8(`>`)
 	} else {
 		builder.write_string('/>')
 	}
 	builder.write_string(indent)
 	builder.write_string('</')
 	builder.write_string(node.name)
 	builder.write_u8(`>`)
 	return builder.str()
 }
@ -73,10 +78,20 @@ fn (list []DTDListItem) pretty_str(indent string) string {
 	for item in list {
 		match item {
 			DTDEntity {
-				builder.write_string('${indent}<!ENTITY ${item.name} "${item.value}">')
+				builder.write_string(indent)
 				builder.write_string('<!ENTITY ')
 				builder.write_string(item.name)
 				builder.write_string(' "')
 				builder.write_string(item.value)
 				builder.write_string('">')
 			}
 			DTDElement {
-				builder.write_string('${indent}<!ELEMENT ${item.name} ${item.definition}>')
+				builder.write_string(indent)
 				builder.write_string('<!ELEMENT ')
 				builder.write_string(item.name)
 				builder.write_string(' [')
 				builder.write_string(item.definition.join(', '))
 				builder.write_string(']>')
 			}
 		}
 		builder.write_u8(`\n`)
@ -86,11 +101,17 @@ fn (list []DTDListItem) pretty_str(indent string) string {
 }
 fn (doctype DocumentType) pretty_str(indent string) string {
 	mut builder := strings.new_builder(1024)
 	match doctype.dtd {
 		string {
 			content := doctype.dtd
 			return if content.len > 0 {
-				'<!DOCTYPE ${doctype.name} SYSTEM "${content}">'
+				builder.write_string('<!DOCTYPE ')
 				builder.write_string(doctype.name)
 				builder.write_string(' SYSTEM ')
 				builder.write_string(content)
 				builder.write_string('>\n')
 				builder.str()
 			} else {
 				''
 			}
@ -100,13 +121,11 @@ fn (doctype DocumentType) pretty_str(indent string) string {
 				return ''
 			}
 			mut builder := strings.new_builder(1024)
 			builder.write_string('<!DOCTYPE ')
 			builder.write_string(doctype.name)
 			builder.write_string(' ')
 			builder.write_string(doctype.dtd.list.pretty_str(indent))
-			builder.write_string('>')
+			builder.write_string('>\n')
 			builder.write_u8(`\n`)
 			return builder.str()
 		}
 	}
@ -117,7 +136,12 @@ fn (doctype DocumentType) pretty_str(indent string) string {
 pub fn (doc XMLDocument) pretty_str(indent string) string {
 	mut document_builder := strings.new_builder(1024)
-	prolog := '<?xml version="${doc.version}" encoding="${doc.encoding}"?>'
+	document_builder.write_string('<?xml version="')
 	document_builder.write_string(doc.version)
 	document_builder.write_string('" encoding="')
 	document_builder.write_string(doc.encoding)
 	document_builder.write_string('"?>\n')
 	comments := if doc.comments.len > 0 {
 		mut comments_buffer := strings.new_builder(512)
 		for comment in doc.comments {
@ -131,11 +155,14 @@ pub fn (doc XMLDocument) pretty_str(indent string) string {
 		''
 	}
-	document_builder.write_string(prolog)
+	doctype_string := doc.doctype.pretty_str(indent)
-	document_builder.write_u8(`\n`)
+	if doctype_string.len > 0 {
-	document_builder.write_string(doc.doctype.pretty_str(indent))
+		document_builder.write_string(doctype_string)
-	document_builder.write_u8(`\n`)
+		document_builder.write_u8(`\n`)
-	document_builder.write_string(comments)
+	}
 	if comments.len > 0 {
 		document_builder.write_string(comments)
 	}
 	document_builder.write_string(doc.root.pretty_str(indent, 0, doc.parsed_reverse_entities))
 	return document_builder.str()
--- a/vlib/encoding/xml/encoding_test.v
+++ b/vlib/encoding/xml/encoding_test.v
@ -0,0 +1,179 @@
 module main
 import encoding.xml
 // test_node builds several XMLNode trees by hand and checks that
 // `pretty_str` (tab indent, depth 0, default reverse entities) produces the
 // expected text for each of them.
 fn test_node() {
 	// Input trees: mixed node/text children in both orders, plus a deeper tree
 	// whose leaves have attributes but no children.
 	nodes := [
 		xml.XMLNode{
 			name: 'test'
 			attributes: {
 				'test:key':   ' test_value '
 				'test:other': '123456'
 			}
 			children: [
 				xml.XMLNode{
 					name: 'child'
 					attributes: {
 						'child:key': 'child_value'
 					}
 				},
 				'Sample text',
 			]
 		},
 		xml.XMLNode{
 			name: 's'
 			attributes: {
 				'k': 'v'
 			}
 			children: [
 				'Hello, world!',
 				xml.XMLNode{
 					name: 'c'
 					attributes: {
 						'k2': 'v2'
 					}
 				},
 			]
 		},
 		xml.XMLNode{
 			name: 'ext'
 			attributes: {
 				'uri':          '{B58B0392-4F1F-4190-BB64-5DF3571DCE5F}'
 				'xmlns:xcalcf': 'http://schemas.microsoft.com/office/spreadsheetml/2018/calcfeatures'
 			}
 			children: [
 				xml.XMLNode{
 					name: 'xcalcf:calcFeatures'
 					children: [
 						xml.XMLNode{
 							name: 'xcalcf:feature'
 							attributes: {
 								'name': 'microsoft.com:RD'
 							}
 						},
 						xml.XMLNode{
 							name: 'xcalcf:feature'
 							attributes: {
 								'name': 'microsoft.com:Single'
 							}
 						},
 						xml.XMLNode{
 							name: 'xcalcf:feature'
 							attributes: {
 								'name': 'microsoft.com:FV'
 							}
 						},
 						xml.XMLNode{
 							name: 'xcalcf:feature'
 							attributes: {
 								'name': 'microsoft.com:CNMTM'
 							}
 						},
 						xml.XMLNode{
 							name: 'xcalcf:feature'
 							attributes: {
 								'name': 'microsoft.com:LET_WF'
 							}
 						},
 						xml.XMLNode{
 							name: 'xcalcf:feature'
 							attributes: {
 								'name': 'microsoft.com:LAMBDA_WF'
 							}
 						},
 						xml.XMLNode{
 							name: 'xcalcf:feature'
 							attributes: {
 								'name': 'microsoft.com:ARRAYTEXT_WF'
 							}
 						},
 					]
 				},
 			]
 		},
 	]
 	// Expected output, index-aligned with `nodes`. Childless nodes are expected
 	// in self-closing form (`<c k2="v2"/>`); `trim_indent` is applied to each
 	// literal before comparison.
 	values := [
 		'
 		<test test:key=" test_value " test:other="123456">
 			<child child:key="child_value"/>
 			Sample text
 		</test>'.trim_indent(),
 		'
 		<s k="v">
 			Hello, world!
 			<c k2="v2"/>
 		</s>'.trim_indent(),
 		'
 		<ext uri="{B58B0392-4F1F-4190-BB64-5DF3571DCE5F}" xmlns:xcalcf="http://schemas.microsoft.com/office/spreadsheetml/2018/calcfeatures">
 			<xcalcf:calcFeatures>
 				<xcalcf:feature name="microsoft.com:RD"/>
 				<xcalcf:feature name="microsoft.com:Single"/>
 				<xcalcf:feature name="microsoft.com:FV"/>
 				<xcalcf:feature name="microsoft.com:CNMTM"/>
 				<xcalcf:feature name="microsoft.com:LET_WF"/>
 				<xcalcf:feature name="microsoft.com:LAMBDA_WF"/>
 				<xcalcf:feature name="microsoft.com:ARRAYTEXT_WF"/>
 			</xcalcf:calcFeatures>
 		</ext>'.trim_indent(),
 	]
 	// Compare each serialized node with the expectation at the same index.
 	for i, node in nodes {
 		assert node.pretty_str('\t', 0, xml.default_entities_reverse) == values[i]
 	}
 }
 // test_doc wraps hand-built root nodes in XMLDocument values and checks the
 // full document serialization produced by `pretty_str('\t')`.
 fn test_doc() {
 	// Only `root` is set on each document; the expected strings below assume the
 	// remaining prolog fields serialize as version 1.0 / UTF-8 with no doctype.
 	docs := [
 		xml.XMLDocument{
 			root: xml.XMLNode{
 				name: 'test'
 				attributes: {
 					'test:key':   ' test_value '
 					'test:other': '123456'
 				}
 				children: [
 					xml.XMLNode{
 						name: 'child'
 						attributes: {
 							'child:key': 'child_value'
 						}
 					},
 					'Sample text',
 				]
 			}
 		},
 		xml.XMLDocument{
 			root: xml.XMLNode{
 				name: 's'
 				attributes: {
 					'k': 'v'
 				}
 				children: [
 					'Hello, world!',
 					xml.XMLNode{
 						name: 'c'
 						attributes: {
 							'k2': 'v2'
 						}
 					},
 				]
 			}
 		},
 	]
 	// Expected output, index-aligned with `docs`: the XML declaration followed
 	// directly by the root element.
 	values := [
 		'
 		<?xml version="1.0" encoding="UTF-8"?>
 		<test test:key=" test_value " test:other="123456">
 			<child child:key="child_value"/>
 			Sample text
 		</test>'.trim_indent(),
 		'
 		<?xml version="1.0" encoding="UTF-8"?>
 		<s k="v">
 			Hello, world!
 			<c k2="v2"/>
 		</s>'.trim_indent(),
 	]
 	// Compare each serialized document with the expectation at the same index.
 	for i, doc in docs {
 		assert doc.pretty_str('\t') == values[i]
 	}
 }
--- a/vlib/encoding/xml/parser.v
+++ b/vlib/encoding/xml/parser.v
@ -541,7 +541,12 @@ fn parse_children(name string, attributes map[string]string, mut reader io.Reade
 	return error('XML node <${name}> not closed.')
 }
-fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
+// parse_single_node parses a single XML node from the reader. The first character of the tag is passed
 // in as the first_char parameter.
 // This function is meant to assist in parsing nested nodes one at a time. Using this function as
 // opposed to the recommended static functions makes it easier to parse smaller nodes in extremely large
 // XML documents without running out of memory.
 pub fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
 	mut contents := strings.new_builder(xml.default_string_builder_cap)
 	contents.write_u8(first_char)
@ -564,7 +569,7 @@ fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
 		// We're not looking for children and inner text
 		return XMLNode{
 			name: name
-			attributes: parse_attributes(tag_contents[name.len - 1..tag_contents.len].trim_space())!
+			attributes: parse_attributes(tag_contents[name.len..tag_contents.len - 1].trim_space())!
 		}
 	}
--- a/vlib/encoding/xml/parser_test.v
+++ b/vlib/encoding/xml/parser_test.v
@ -0,0 +1,125 @@
 module xml
 // Fixtures for the single-element parsing test: an XML snippet and the nodes
 // expected when its children are parsed one at a time.
 const (
 	// Source text: a <root> element with ten direct children (ids c1..c10),
 	// mixing self-closing tags with elements that contain text.
 	sample_doc = '
 <root>
 	<c id="c1"/>
 	<c id="c2">
 		Sample Text
 	</c>
 	<c id="c3"/>
 	<abc id="c4"/>
 	<xyz id="c5"/>
 	<c id="c6"/>
 	<cx id="c7"/>
 	<cd id="c8"/>
 	<child id="c9">
 		More Sample Text
 	</child>
 	<cz id="c10"/>
 </root>'
 	// Expected nodes, in document order, one per child of <root> above.
 	xml_elements = [
 		XMLNode{
 			name: 'c'
 			attributes: {
 				'id': 'c1'
 			}
 		},
 		XMLNode{
 			name: 'c'
 			attributes: {
 				'id': 'c2'
 			}
 			children: [
 				'Sample Text',
 			]
 		},
 		XMLNode{
 			name: 'c'
 			attributes: {
 				'id': 'c3'
 			}
 		},
 		XMLNode{
 			name: 'abc'
 			attributes: {
 				'id': 'c4'
 			}
 		},
 		XMLNode{
 			name: 'xyz'
 			attributes: {
 				'id': 'c5'
 			}
 		},
 		XMLNode{
 			name: 'c'
 			attributes: {
 				'id': 'c6'
 			}
 		},
 		XMLNode{
 			name: 'cx'
 			attributes: {
 				'id': 'c7'
 			}
 		},
 		XMLNode{
 			name: 'cd'
 			attributes: {
 				'id': 'c8'
 			}
 		},
 		XMLNode{
 			name: 'child'
 			attributes: {
 				'id': 'c9'
 			}
 			children: [
 				'More Sample Text',
 			]
 		},
 		XMLNode{
 			name: 'cz'
 			attributes: {
 				'id': 'c10'
 			}
 		},
 	]
 )
 // test_single_element_parsing walks the sample document one byte at a time and
 // hands every opening tag to parse_single_node, comparing each parsed node with
 // the matching entry of xml_elements.
 fn test_single_element_parsing() ! {
 	mut source := FullBufferReader{
 		contents: xml.sample_doc.bytes()
 	}
 	// Consume the first six bytes so that scanning starts past the opening of
 	// the root tag.
 	mut prefix := []u8{len: 6}
 	source.read(mut prefix)!
 	mut scratch := [u8(0)]
 	mut current := next_char(mut source, mut scratch)!
 	mut matched := 0
 	for matched < xml.xml_elements.len {
 		// Advance to the next tag start; this is a no-op when already on `<`.
 		for current != `<` {
 			current = next_char(mut source, mut scratch)!
 		}
 		following := next_char(mut source, mut scratch)!
 		// A `/` right after `<` is a closing tag: nothing to parse there.
 		if following != `/` {
 			node := parse_single_node(following, mut source)!
 			assert xml.xml_elements[matched] == node
 			matched++
 		}
 		current = next_char(mut source, mut scratch)!
 	}
 }
--- a/vlib/encoding/xml/query.v
+++ b/vlib/encoding/xml/query.v
@ -1,6 +1,8 @@
 module xml
-fn (node XMLNode) get_element_by_id(id string) ?XMLNode {
+// get_element_by_id returns the first element with the given id, or none if no
 // such element exists in the subtree rooted at this node.
 pub fn (node XMLNode) get_element_by_id(id string) ?XMLNode {
 	// Is this the node we're looking for?
 	if attribute_id := node.attributes['id'] {
 		if attribute_id == id {
@ -27,7 +29,9 @@ fn (node XMLNode) get_element_by_id(id string) ?XMLNode {
 	return none
 }
-fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode {
+// get_elements_by_tag returns all elements with the given tag name in the subtree
 // rooted at this node. If there are no such elements, an empty array is returned.
 pub fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode {
 	mut result := []XMLNode{}
 	if node.name == tag {
@ -48,13 +52,45 @@ fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode {
 	return result
 }
 // get_elements_by_attribute collects, in pre-order, every element of the subtree
 // rooted at this node whose `attribute` is present and equal to `value`.
 // The result is an empty array when nothing matches.
 pub fn (node XMLNode) get_elements_by_attribute(attribute string, value string) []XMLNode {
 	mut matches := []XMLNode{}
 	// The key must actually be present; a missing key never matches.
 	if attribute in node.attributes && node.attributes[attribute] == value {
 		matches << node
 	}
 	// Only element children can carry attributes; text, comments and CDATA are skipped.
 	for child in node.children {
 		match child {
 			XMLNode {
 				matches << child.get_elements_by_attribute(attribute, value)
 			}
 			else {}
 		}
 	}
 	return matches
 }
 // get_element_by_id returns the first element with the given id, or none if no
 // such element exists. The search is delegated to the document's root node.
 pub fn (doc XMLDocument) get_element_by_id(id string) ?XMLNode {
 	return doc.root.get_element_by_id(id)
 }
 // get_elements_by_attribute returns all elements with the given attribute-value pair.
 // If there are no such elements, an empty array is returned.
 // The search is delegated to the document's root node.
 pub fn (doc XMLDocument) get_elements_by_attribute(attribute string, value string) []XMLNode {
 	return doc.root.get_elements_by_attribute(attribute, value)
 }
 // get_elements_by_tag returns all elements with the given tag name.
 // If there are no such elements, an empty array is returned.
 // The search is delegated to the document's root node.
 pub fn (doc XMLDocument) get_elements_by_tag(tag string) []XMLNode {
 	return doc.root.get_elements_by_tag(tag)
 }
--- a/vlib/encoding/xml/query_test.v
+++ b/vlib/encoding/xml/query_test.v
@ -0,0 +1,52 @@
 module main
 import encoding.xml
 // Fixture for the query tests: a nested document in which every element below
 // <root> has a distinct `attr` value (value1..value12) and two elements also
 // carry an `id` attribute.
 const (
 	sample_document = '
 <root>
 	<a attr="value1">
 		<b id="middle-tag" attr="value2">
 			<c attr="value3">Text1</c>
 			<d attr="value4">Text2</d>
 			<e attr="value5">
 				<f id="innermost" attr="value6">Text3</f>
 				<g attr="value7">Text4</g>
 				<h attr="value8">Text5</h>
 			</e>
 			<i attr="value9">Text6</i>
 		</b>
 		<j attr="value10">Text7</j>
 	</a>
 	<k attr="value11">Text8</k>
 	<l attr="value12">Text9</l>
 </root>
 '
 )
 // test_querying parses sample_document and exercises the id, attribute and tag
 // lookups, both on the whole document and on a node found inside it.
 fn test_querying() ! {
 	document := xml.XMLDocument.from_string(sample_document)!
 	root := document.root
 	assert root.name == 'root'
 	assert root.children.len == 3
 	// Lookup by id starting from the document.
 	middle := document.get_element_by_id('middle-tag')?
 	assert middle.name == 'b'
 	assert middle.attributes['attr'] == 'value2'
 	assert middle.children.len == 4
 	// Lookup by id starting from a node inside the tree.
 	deepest := middle.get_element_by_id('innermost')?
 	assert deepest.name == 'f'
 	assert deepest.attributes['attr'] == 'value6'
 	// value1 through value12 each occur exactly once in the fixture.
 	for n in 1 .. 13 {
 		hits := document.get_elements_by_attribute('attr', 'value${n}')
 		assert hits.len == 1
 	}
 	// Lookup by tag name.
 	found := document.get_elements_by_tag('i')
 	assert found.len == 1
 	first := found[0]
 	assert first.name == 'i'
 	assert first.attributes['attr'] == 'value9'
 	assert first.children.len == 1
 	text := first.children[0] as string
 	assert text == 'Text6'
 }
--- a/vlib/encoding/xml/types.v
+++ b/vlib/encoding/xml/types.v
@ -4,12 +4,12 @@ pub type XMLNodeContents = XMLCData | XMLComment | XMLNode | string
 // XMLCData is a CDATA child of a node; `text` is the content that is written
 // between `<![CDATA[` and `]]>` when the node is serialized.
 pub struct XMLCData {
 pub:
-	text string [required]
+	text string @[required]
 }
 // XMLComment is a comment child of a node; `text` is the content that is
 // written between `<!--` and `-->` when the node is serialized.
 pub struct XMLComment {
 pub:
-	text string [required]
+	text string @[required]
 }
 // XMLNode represents a single XML node. It contains the node name,
@ -17,7 +17,7 @@ pub:
 // other XML nodes, CDATA, plain text, or comments.
 pub struct XMLNode {
 pub:
-	name       string            [required]
+	name       string            @[required]
 	attributes map[string]string
 	children   []XMLNodeContents
 }
@ -31,19 +31,19 @@ pub:
 pub struct XMLDocument {
 	Prolog
 pub:
-	root XMLNode [required]
+	root XMLNode @[required]
 }
 pub type DTDListItem = DTDElement | DTDEntity
 // DTDEntity is one entity declaration of a DTD list; it is serialized as
 // `<!ENTITY name "value">`.
 pub struct DTDEntity {
-	name  string [required]
+	name  string @[required]
-	value string [required]
+	value string @[required]
 }
 // DTDElement is one element declaration of a DTD list: the element name and
 // the list of strings that make up its definition.
 pub struct DTDElement {
-	name       string   [required]
+	name       string   @[required]
-	definition []string [required]
+	definition []string @[required]
 }
 pub struct DocumentTypeDefinition {
@ -52,7 +52,7 @@ pub struct DocumentTypeDefinition {
 }
 // DocumentType describes the document's DOCTYPE: `name` is written right after
 // `<!DOCTYPE ` and `dtd` holds the associated DTD information.
 pub struct DocumentType {
-	name string  [required]
+	name string  @[required]
 	dtd  DTDInfo
 }