encoding.xml: make functions public, add documentation, tests, fix attribute parsing for self-closing tags (#19901)

This commit is contained in:
Subhomoy Haldar 2023-11-16 18:13:36 +00:00 committed by GitHub
parent e9258c2a08
commit c9429f3331
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 482 additions and 54 deletions

View file

@ -7,6 +7,10 @@ Note that this is not a streaming XML parser. It reads the entire document into
memory and then parses it. This is not a problem for small documents, but it memory and then parses it. This is not a problem for small documents, but it
might be a problem for extremely large documents (several hundred megabytes or more). might be a problem for extremely large documents (several hundred megabytes or more).
The public function `parse_single_node` parses a single node from any
implementation of `io.Reader`, which makes it possible to process large XML documents
one element at a time. The first character of the tag must be passed in as
`first_char`. Sample usage is provided in the `parser_test.v` file.
## Usage ## Usage
### Parsing XML Files ### Parsing XML Files

View file

@ -26,38 +26,43 @@ pub fn (node XMLNode) pretty_str(original_indent string, depth int, reverse_enti
builder.write_string(value) builder.write_string(value)
builder.write_u8(`"`) builder.write_u8(`"`)
} }
builder.write_string('>\n') if node.children.len > 0 {
for child in node.children { builder.write_string('>\n')
match child { for child in node.children {
string { match child {
builder.write_string(indent) string {
builder.write_string(original_indent) builder.write_string(indent)
builder.write_string(escape_text(child, reverse_entities: reverse_entities)) builder.write_string(original_indent)
} builder.write_string(escape_text(child, reverse_entities: reverse_entities))
XMLNode { }
builder.write_string(child.pretty_str(original_indent, depth + 1, reverse_entities)) XMLNode {
} builder.write_string(child.pretty_str(original_indent, depth + 1,
XMLComment { reverse_entities))
builder.write_string(indent) }
builder.write_string(original_indent) XMLComment {
builder.write_string('<!--') builder.write_string(indent)
builder.write_string(child.text) builder.write_string(original_indent)
builder.write_string('-->') builder.write_string('<!--')
} builder.write_string(child.text)
XMLCData { builder.write_string('-->')
builder.write_string(indent) }
builder.write_string(original_indent) XMLCData {
builder.write_string('<![CDATA[') builder.write_string(indent)
builder.write_string(child.text) builder.write_string(original_indent)
builder.write_string(']]>') builder.write_string('<![CDATA[')
builder.write_string(child.text)
builder.write_string(']]>')
}
} }
builder.write_u8(`\n`)
} }
builder.write_u8(`\n`) builder.write_string(indent)
builder.write_string('</')
builder.write_string(node.name)
builder.write_u8(`>`)
} else {
builder.write_string('/>')
} }
builder.write_string(indent)
builder.write_string('</')
builder.write_string(node.name)
builder.write_u8(`>`)
return builder.str() return builder.str()
} }
@ -73,10 +78,20 @@ fn (list []DTDListItem) pretty_str(indent string) string {
for item in list { for item in list {
match item { match item {
DTDEntity { DTDEntity {
builder.write_string('${indent}<!ENTITY ${item.name} "${item.value}">') builder.write_string(indent)
builder.write_string('<!ENTITY ')
builder.write_string(item.name)
builder.write_string(' "')
builder.write_string(item.value)
builder.write_string('">')
} }
DTDElement { DTDElement {
builder.write_string('${indent}<!ELEMENT ${item.name} ${item.definition}>') builder.write_string(indent)
builder.write_string('<!ELEMENT ')
builder.write_string(item.name)
builder.write_string(' [')
builder.write_string(item.definition.join(', '))
builder.write_string(']>')
} }
} }
builder.write_u8(`\n`) builder.write_u8(`\n`)
@ -86,11 +101,17 @@ fn (list []DTDListItem) pretty_str(indent string) string {
} }
fn (doctype DocumentType) pretty_str(indent string) string { fn (doctype DocumentType) pretty_str(indent string) string {
mut builder := strings.new_builder(1024)
match doctype.dtd { match doctype.dtd {
string { string {
content := doctype.dtd content := doctype.dtd
return if content.len > 0 { return if content.len > 0 {
'<!DOCTYPE ${doctype.name} SYSTEM "${content}">' builder.write_string('<!DOCTYPE ')
builder.write_string(doctype.name)
builder.write_string(' SYSTEM ')
builder.write_string(content)
builder.write_string('>\n')
builder.str()
} else { } else {
'' ''
} }
@ -100,13 +121,11 @@ fn (doctype DocumentType) pretty_str(indent string) string {
return '' return ''
} }
mut builder := strings.new_builder(1024)
builder.write_string('<!DOCTYPE ') builder.write_string('<!DOCTYPE ')
builder.write_string(doctype.name) builder.write_string(doctype.name)
builder.write_string(' ') builder.write_string(' ')
builder.write_string(doctype.dtd.list.pretty_str(indent)) builder.write_string(doctype.dtd.list.pretty_str(indent))
builder.write_string('>') builder.write_string('>\n')
builder.write_u8(`\n`)
return builder.str() return builder.str()
} }
} }
@ -117,7 +136,12 @@ fn (doctype DocumentType) pretty_str(indent string) string {
pub fn (doc XMLDocument) pretty_str(indent string) string { pub fn (doc XMLDocument) pretty_str(indent string) string {
mut document_builder := strings.new_builder(1024) mut document_builder := strings.new_builder(1024)
prolog := '<?xml version="${doc.version}" encoding="${doc.encoding}"?>' document_builder.write_string('<?xml version="')
document_builder.write_string(doc.version)
document_builder.write_string('" encoding="')
document_builder.write_string(doc.encoding)
document_builder.write_string('"?>\n')
comments := if doc.comments.len > 0 { comments := if doc.comments.len > 0 {
mut comments_buffer := strings.new_builder(512) mut comments_buffer := strings.new_builder(512)
for comment in doc.comments { for comment in doc.comments {
@ -131,11 +155,14 @@ pub fn (doc XMLDocument) pretty_str(indent string) string {
'' ''
} }
document_builder.write_string(prolog) doctype_string := doc.doctype.pretty_str(indent)
document_builder.write_u8(`\n`) if doctype_string.len > 0 {
document_builder.write_string(doc.doctype.pretty_str(indent)) document_builder.write_string(doctype_string)
document_builder.write_u8(`\n`) document_builder.write_u8(`\n`)
document_builder.write_string(comments) }
if comments.len > 0 {
document_builder.write_string(comments)
}
document_builder.write_string(doc.root.pretty_str(indent, 0, doc.parsed_reverse_entities)) document_builder.write_string(doc.root.pretty_str(indent, 0, doc.parsed_reverse_entities))
return document_builder.str() return document_builder.str()

View file

@ -0,0 +1,179 @@
module main
import encoding.xml
// test_node checks the pretty-printed form of standalone XMLNode values.
// Each entry of `nodes` is rendered with `pretty_str` and compared against the
// string at the same index in `values`.
fn test_node() {
// Input nodes: a mix of attribute-only child elements and plain text children,
// plus one deeper tree whose element and attribute names contain colons.
nodes := [
xml.XMLNode{
name: 'test'
attributes: {
'test:key': ' test_value '
'test:other': '123456'
}
children: [
xml.XMLNode{
name: 'child'
attributes: {
'child:key': 'child_value'
}
},
'Sample text',
]
},
xml.XMLNode{
name: 's'
attributes: {
'k': 'v'
}
children: [
'Hello, world!',
xml.XMLNode{
name: 'c'
attributes: {
'k2': 'v2'
}
},
]
},
xml.XMLNode{
name: 'ext'
attributes: {
'uri': '{B58B0392-4F1F-4190-BB64-5DF3571DCE5F}'
'xmlns:xcalcf': 'http://schemas.microsoft.com/office/spreadsheetml/2018/calcfeatures'
}
children: [
xml.XMLNode{
name: 'xcalcf:calcFeatures'
children: [
xml.XMLNode{
name: 'xcalcf:feature'
attributes: {
'name': 'microsoft.com:RD'
}
},
xml.XMLNode{
name: 'xcalcf:feature'
attributes: {
'name': 'microsoft.com:Single'
}
},
xml.XMLNode{
name: 'xcalcf:feature'
attributes: {
'name': 'microsoft.com:FV'
}
},
xml.XMLNode{
name: 'xcalcf:feature'
attributes: {
'name': 'microsoft.com:CNMTM'
}
},
xml.XMLNode{
name: 'xcalcf:feature'
attributes: {
'name': 'microsoft.com:LET_WF'
}
},
xml.XMLNode{
name: 'xcalcf:feature'
attributes: {
'name': 'microsoft.com:LAMBDA_WF'
}
},
xml.XMLNode{
name: 'xcalcf:feature'
attributes: {
'name': 'microsoft.com:ARRAYTEXT_WF'
}
},
]
},
]
},
]
// Expected output, index-aligned with `nodes`. Elements without children are
// expected in self-closing form (`<child .../>`); elements with children get
// an explicit closing tag.
values := [
'
<test test:key=" test_value " test:other="123456">
<child child:key="child_value"/>
Sample text
</test>'.trim_indent(),
'
<s k="v">
Hello, world!
<c k2="v2"/>
</s>'.trim_indent(),
'
<ext uri="{B58B0392-4F1F-4190-BB64-5DF3571DCE5F}" xmlns:xcalcf="http://schemas.microsoft.com/office/spreadsheetml/2018/calcfeatures">
<xcalcf:calcFeatures>
<xcalcf:feature name="microsoft.com:RD"/>
<xcalcf:feature name="microsoft.com:Single"/>
<xcalcf:feature name="microsoft.com:FV"/>
<xcalcf:feature name="microsoft.com:CNMTM"/>
<xcalcf:feature name="microsoft.com:LET_WF"/>
<xcalcf:feature name="microsoft.com:LAMBDA_WF"/>
<xcalcf:feature name="microsoft.com:ARRAYTEXT_WF"/>
</xcalcf:calcFeatures>
</ext>'.trim_indent(),
]
// Render every node with a tab as the indent unit, starting at depth 0, and
// with the module's default reverse-entity table.
for i, node in nodes {
assert node.pretty_str('\t', 0, xml.default_entities_reverse) == values[i]
}
}
// test_doc checks the pretty-printed form of whole XMLDocument values.
// Each entry of `docs` is rendered with `pretty_str` and compared against the
// string at the same index in `values`.
fn test_doc() {
// Only `root` is set on each document; every other field keeps its default.
docs := [
xml.XMLDocument{
root: xml.XMLNode{
name: 'test'
attributes: {
'test:key': ' test_value '
'test:other': '123456'
}
children: [
xml.XMLNode{
name: 'child'
attributes: {
'child:key': 'child_value'
}
},
'Sample text',
]
}
},
xml.XMLDocument{
root: xml.XMLNode{
name: 's'
attributes: {
'k': 'v'
}
children: [
'Hello, world!',
xml.XMLNode{
name: 'c'
attributes: {
'k2': 'v2'
}
},
]
}
},
]
// Expected output, index-aligned with `docs`: the XML prolog line followed
// directly by the root element, with no DOCTYPE or comment lines in between.
// NOTE(review): version "1.0" / encoding "UTF-8" presumably come from the
// Prolog defaults, which are not visible in this file - confirm.
values := [
'
<?xml version="1.0" encoding="UTF-8"?>
<test test:key=" test_value " test:other="123456">
<child child:key="child_value"/>
Sample text
</test>'.trim_indent(),
'
<?xml version="1.0" encoding="UTF-8"?>
<s k="v">
Hello, world!
<c k2="v2"/>
</s>'.trim_indent(),
]
// Render every document with a tab as the indent unit.
for i, doc in docs {
assert doc.pretty_str('\t') == values[i]
}
}

View file

@ -541,7 +541,12 @@ fn parse_children(name string, attributes map[string]string, mut reader io.Reade
return error('XML node <${name}> not closed.') return error('XML node <${name}> not closed.')
} }
fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode { // parse_single_node parses a single XML node from the reader. The first character of the tag is passed
// in as the first_char parameter.
// This function is meant to assist in parsing nested nodes one at a time. Using this function as
// opposed to the recommended static functions makes it easier to parse smaller nodes in extremely large
// XML documents without running out of memory.
pub fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
mut contents := strings.new_builder(xml.default_string_builder_cap) mut contents := strings.new_builder(xml.default_string_builder_cap)
contents.write_u8(first_char) contents.write_u8(first_char)
@ -564,7 +569,7 @@ fn parse_single_node(first_char u8, mut reader io.Reader) !XMLNode {
// We're not looking for children and inner text // We're not looking for children and inner text
return XMLNode{ return XMLNode{
name: name name: name
attributes: parse_attributes(tag_contents[name.len - 1..tag_contents.len].trim_space())! attributes: parse_attributes(tag_contents[name.len..tag_contents.len - 1].trim_space())!
} }
} }

View file

@ -0,0 +1,125 @@
module xml
// Fixtures for test_single_element_parsing.
const (
// sample_doc is an XML document whose <root> element has ten direct child
// elements with ids c1 through c10. Most are self-closing; c2 and c9
// contain text.
sample_doc = '
<root>
<c id="c1"/>
<c id="c2">
Sample Text
</c>
<c id="c3"/>
<abc id="c4"/>
<xyz id="c5"/>
<c id="c6"/>
<cx id="c7"/>
<cd id="c8"/>
<child id="c9">
More Sample Text
</child>
<cz id="c10"/>
</root>'
// xml_elements lists the nodes expected from parsing the children of <root>
// in sample_doc one at a time, in document order.
xml_elements = [
XMLNode{
name: 'c'
attributes: {
'id': 'c1'
}
},
XMLNode{
name: 'c'
attributes: {
'id': 'c2'
}
children: [
'Sample Text',
]
},
XMLNode{
name: 'c'
attributes: {
'id': 'c3'
}
},
XMLNode{
name: 'abc'
attributes: {
'id': 'c4'
}
},
XMLNode{
name: 'xyz'
attributes: {
'id': 'c5'
}
},
XMLNode{
name: 'c'
attributes: {
'id': 'c6'
}
},
XMLNode{
name: 'cx'
attributes: {
'id': 'c7'
}
},
XMLNode{
name: 'cd'
attributes: {
'id': 'c8'
}
},
XMLNode{
name: 'child'
attributes: {
'id': 'c9'
}
children: [
'More Sample Text',
]
},
XMLNode{
name: 'cz'
attributes: {
'id': 'c10'
}
},
]
)
// test_single_element_parsing walks through `sample_doc` byte by byte and hands
// every opening tag it meets to `parse_single_node`, checking that the parsed
// nodes equal the entries of `xml_elements` in order.
fn test_single_element_parsing() ! {
	mut reader := FullBufferReader{
		contents: xml.sample_doc.bytes()
	}

	// Discard the first 6 bytes of the input (intended to step over the
	// opening "<root>" tag) before scanning for child elements.
	mut discarded := []u8{len: 6}
	reader.read(mut discarded)!

	mut local_buf := [u8(0)]
	mut ch := next_char(mut reader, mut local_buf)!

	mut count := 0
	for count < xml.xml_elements.len {
		if ch == `<` {
			// A tag starts here; the character after `<` tells closing tags
			// (which are ignored) apart from elements that must be parsed.
			next_ch := next_char(mut reader, mut local_buf)!
			if next_ch != `/` {
				parsed_element := parse_single_node(next_ch, mut reader)!
				assert xml.xml_elements[count] == parsed_element
				count++
			}
			ch = next_char(mut reader, mut local_buf)!
		} else {
			// Skip everything between tags until the next `<`.
			for ch != `<` {
				ch = next_char(mut reader, mut local_buf)!
			}
		}
	}
}

View file

@ -1,6 +1,8 @@
module xml module xml
fn (node XMLNode) get_element_by_id(id string) ?XMLNode { // get_element_by_id returns the first element with the given id, or none if no
// such element exists in the subtree rooted at this node.
pub fn (node XMLNode) get_element_by_id(id string) ?XMLNode {
// Is this the node we're looking for? // Is this the node we're looking for?
if attribute_id := node.attributes['id'] { if attribute_id := node.attributes['id'] {
if attribute_id == id { if attribute_id == id {
@ -27,7 +29,9 @@ fn (node XMLNode) get_element_by_id(id string) ?XMLNode {
return none return none
} }
fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode { // get_elements_by_tag returns all elements with the given tag name in the subtree
// rooted at this node. If there are no such elements, an empty array is returned.
pub fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode {
mut result := []XMLNode{} mut result := []XMLNode{}
if node.name == tag { if node.name == tag {
@ -48,13 +52,45 @@ fn (node XMLNode) get_elements_by_tag(tag string) []XMLNode {
return result return result
} }
// get_elements_by_attribute collects every element in the subtree rooted at this
// node (the node itself included) whose `attribute` is present and equal to `value`.
// The node itself, when it matches, comes first, followed by the matches found in
// each child in order. An empty array is returned when nothing matches.
pub fn (node XMLNode) get_elements_by_attribute(attribute string, value string) []XMLNode {
	mut matches := []XMLNode{}

	// The node itself qualifies only if the attribute exists and has the requested value.
	if attribute in node.attributes && node.attributes[attribute] == value {
		matches << node
	}

	// Only element children can carry attributes; text, comments and CDATA are skipped.
	for entry in node.children {
		match entry {
			XMLNode {
				matches << entry.get_elements_by_attribute(attribute, value)
			}
			else {}
		}
	}
	return matches
}
// get_element_by_id returns the first element with the given id, or none if no // get_element_by_id returns the first element with the given id, or none if no
// such element exists. // such element exists.
pub fn (doc XMLDocument) get_element_by_id(id string) ?XMLNode { pub fn (doc XMLDocument) get_element_by_id(id string) ?XMLNode {
return doc.root.get_element_by_id(id) return doc.root.get_element_by_id(id)
} }
// get_elements_by_attribute collects every element of the document whose `attribute`
// is set to `value`, by running the same query on the document's root node.
// An empty array is returned when nothing matches.
pub fn (doc XMLDocument) get_elements_by_attribute(attribute string, value string) []XMLNode {
	root := doc.root
	return root.get_elements_by_attribute(attribute, value)
}
// get_elements_by_tag returns all elements with the given tag name. // get_elements_by_tag returns all elements with the given tag name.
// If there are no such elements, an empty array is returned.
pub fn (doc XMLDocument) get_elements_by_tag(tag string) []XMLNode { pub fn (doc XMLDocument) get_elements_by_tag(tag string) []XMLNode {
return doc.root.get_elements_by_tag(tag) return doc.root.get_elements_by_tag(tag)
} }

View file

@ -0,0 +1,52 @@
module main
import encoding.xml
const (
// sample_document is the XML fixture queried by test_querying. <root> has
// three direct children (<a>, <k>, <l>). Every element other than <root>
// carries an `attr` attribute with a distinct value from "value1" to
// "value12", and two elements carry an `id` ("middle-tag" on <b>,
// "innermost" on <f>).
sample_document = '
<root>
<a attr="value1">
<b id="middle-tag" attr="value2">
<c attr="value3">Text1</c>
<d attr="value4">Text2</d>
<e attr="value5">
<f id="innermost" attr="value6">Text3</f>
<g attr="value7">Text4</g>
<h attr="value8">Text5</h>
</e>
<i attr="value9">Text6</i>
</b>
<j attr="value10">Text7</j>
</a>
<k attr="value11">Text8</k>
<l attr="value12">Text9</l>
</root>
'
)
// test_querying parses `sample_document` and exercises the lookup helpers on both
// the document and individual nodes: by id, by attribute/value pair and by tag name.
fn test_querying() ! {
	doc := xml.XMLDocument.from_string(sample_document)!

	// Shape of the root element.
	assert doc.root.name == 'root'
	assert doc.root.children.len == 3

	// Lookup by id, starting from the document.
	b_node := doc.get_element_by_id('middle-tag')?
	assert b_node.name == 'b'
	assert b_node.attributes['attr'] == 'value2'
	assert b_node.children.len == 4

	// Lookup by id, starting from a node inside the tree.
	f_node := b_node.get_element_by_id('innermost')?
	assert f_node.name == 'f'
	assert f_node.attributes['attr'] == 'value6'

	// Each of value1 .. value12 is used by exactly one element.
	for count := 1; count < 13; count++ {
		found := doc.get_elements_by_attribute('attr', 'value${count}')
		assert found.len == 1
	}

	// Lookup by tag name.
	i_tags := doc.get_elements_by_tag('i')
	assert i_tags.len == 1
	only_i := i_tags[0]
	assert only_i.name == 'i'
	assert only_i.attributes['attr'] == 'value9'
	assert only_i.children.len == 1
	assert only_i.children[0] as string == 'Text6'
}

View file

@ -4,12 +4,12 @@ pub type XMLNodeContents = XMLCData | XMLComment | XMLNode | string
pub struct XMLCData { pub struct XMLCData {
pub: pub:
text string [required] text string @[required]
} }
pub struct XMLComment { pub struct XMLComment {
pub: pub:
text string [required] text string @[required]
} }
// XMLNode represents a single XML node. It contains the node name, // XMLNode represents a single XML node. It contains the node name,
@ -17,7 +17,7 @@ pub:
// other XML nodes, CDATA, plain text, or comments. // other XML nodes, CDATA, plain text, or comments.
pub struct XMLNode { pub struct XMLNode {
pub: pub:
name string [required] name string @[required]
attributes map[string]string attributes map[string]string
children []XMLNodeContents children []XMLNodeContents
} }
@ -31,19 +31,19 @@ pub:
pub struct XMLDocument { pub struct XMLDocument {
Prolog Prolog
pub: pub:
root XMLNode [required] root XMLNode @[required]
} }
pub type DTDListItem = DTDElement | DTDEntity pub type DTDListItem = DTDElement | DTDEntity
pub struct DTDEntity { pub struct DTDEntity {
name string [required] name string @[required]
value string [required] value string @[required]
} }
pub struct DTDElement { pub struct DTDElement {
name string [required] name string @[required]
definition []string [required] definition []string @[required]
} }
pub struct DocumentTypeDefinition { pub struct DocumentTypeDefinition {
@ -52,7 +52,7 @@ pub struct DocumentTypeDefinition {
} }
pub struct DocumentType { pub struct DocumentType {
name string [required] name string @[required]
dtd DTDInfo dtd DTDInfo
} }