net.html: polish module, update docs (#7193)

This commit is contained in:
Ned Palacios 2020-12-10 03:08:15 +08:00 committed by GitHub
parent 5fa1e403ec
commit b952bf2e6b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 302 additions and 446 deletions

View file

@ -1,8 +1,9 @@
module html
import os
import strings
struct LexycalAttributes {
struct LexicalAttributes {
mut:
current_tag &Tag
open_tag bool
@ -12,44 +13,40 @@ mut:
is_attribute bool
opened_code_type string
line_count int
lexeme_builder string
lexeme_builder strings.Builder = strings.Builder{}
code_tags map[string]bool = {
'script': true
'style': true
}
}
fn (mut lxa LexycalAttributes) write_lexeme(data byte) {
mut temp := lxa.lexeme_builder
temp += data.str()
lxa.lexeme_builder = temp
'script': true
'style': true
}
}
// Parser is responsible for reading the HTML strings and converting them into a `DocumentObjectModel`.
pub struct Parser {
mut:
dom DocumentObjectModel
lexycal_attributes LexycalAttributes = LexycalAttributes{
current_tag: &Tag{}
}
lexical_attributes LexicalAttributes = LexicalAttributes{
current_tag: &Tag{}
}
filename string = 'direct-parse'
initialized bool
tags []&Tag
debug_file os.File
}
// This function is used to add a tag for the parser ignore it's content.
// For example, if you have an html or XML with a custom tag, like `<script>`, using this function,
// like `add_code_tag('script')` will make all `script` tags content be jumped,
// so you still have its content, but will not confuse the parser with it's `>` or `<`.
pub fn (mut parser Parser) add_code_tag(name string) {
if parser.lexycal_attributes.code_tags.keys().len <= 0 {
parser.lexycal_attributes.code_tags = map[string]bool{}
parser.lexycal_attributes.code_tags['script'] = true
parser.lexycal_attributes.code_tags['style'] = true
}
if name.len > 0 {
parser.lexycal_attributes.code_tags[name] = true
if name.len <= 0 {
return
}
parser.lexical_attributes.code_tags[name] = true
}
[inline]
fn (parser Parser) builder_str() string {
return parser.lexycal_attributes.lexeme_builder
return parser.lexical_attributes.lexeme_builder.after(0)
}
[if debug]
@ -65,28 +62,28 @@ fn (mut parser Parser) verify_end_comment(remove bool) bool {
lexeme := parser.builder_str()
last := lexeme[lexeme.len - 1]
penultimate := lexeme[lexeme.len - 2]
mut is_end_comment := false
if last.str() == '-' && penultimate.str() == '-' {
is_end_comment = true
}
is_end_comment := last == `-` && penultimate == `-`
if is_end_comment && remove {
temp := parser.lexycal_attributes.lexeme_builder
parser.lexycal_attributes.lexeme_builder = temp[0..temp.len - 2]
parser.lexical_attributes.lexeme_builder.go_back(2)
}
return is_end_comment
}
fn blank_string(data string) bool {
mut count := 0
for word in data {
if word == 9 || word == 32 {
for chr in data {
if chr == 9 || chr == 32 {
count++
}
}
return count == data.len
}
fn (mut parser Parser) initialize_all() {
// init initializes the parser.
fn (mut parser Parser) init() {
if parser.initialized {
return
}
parser.dom = DocumentObjectModel{
debug_file: parser.debug_file
root: &Tag{}
@ -94,181 +91,165 @@ fn (mut parser Parser) initialize_all() {
parser.add_code_tag('')
parser.tags = []&Tag{}
parser.dom.close_tags['/!document'] = true
parser.lexycal_attributes.current_tag = &Tag{}
parser.lexical_attributes.current_tag = &Tag{}
parser.initialized = true
}
fn (mut parser Parser) generate_tag() {
if !parser.lexycal_attributes.open_tag {
if parser.lexycal_attributes.current_tag.name.len > 0 ||
parser.lexycal_attributes.current_tag.content.len > 0 {
parser.tags << parser.lexycal_attributes.current_tag
}
parser.lexycal_attributes.current_tag = &Tag{}
if parser.lexical_attributes.open_tag {
return
}
if parser.lexical_attributes.current_tag.name.len > 0 ||
parser.lexical_attributes.current_tag.content.len > 0 {
parser.tags << parser.lexical_attributes.current_tag
}
parser.lexical_attributes.current_tag = &Tag{}
}
// split_parse parses the HTML fragment
pub fn (mut parser Parser) split_parse(data string) {
if !parser.initialized {
parser.initialize_all()
}
for word in data {
mut is_quotation := false // " or '
if word == 34 || word == 39 {
is_quotation = true
}
string_code := match word {
34 { 1 } // "
39 { 2 } // '
parser.init()
for chr in data {
// returns true if byte is a " or '
is_quote := chr == `"` || chr == `\'`
string_code := match chr {
`"` { 1 } // "
`\'` { 2 } // '
else { 0 }
}
if parser.lexycal_attributes.open_code { // here will verify all needed to know if open_code finishes and string in code
parser.lexycal_attributes.write_lexeme(word)
if parser.lexycal_attributes.open_string > 0 {
if parser.lexycal_attributes.open_string == string_code {
parser.lexycal_attributes.open_string = 0
}
} else if is_quotation {
parser.lexycal_attributes.open_string = string_code
} else if word == 62 { // only execute verification if is a > // here will verify < to know if code tag is finished
name_close_tag := '</' + parser.lexycal_attributes.opened_code_type + '>'
temp_string := parser.builder_str()
if temp_string.to_lower().ends_with(name_close_tag) {
parser.lexycal_attributes.open_code = false
if parser.lexical_attributes.open_code { // here will verify all needed to know if open_code finishes and string in code
parser.lexical_attributes.lexeme_builder.write_b(chr)
if parser.lexical_attributes.open_string > 0 &&
parser.lexical_attributes.open_string == string_code {
parser.lexical_attributes.open_string = 0
} else if is_quote {
parser.lexical_attributes.open_string = string_code
} else if chr == `>` { // only execute verification if is a > // here will verify < to know if code tag is finished
name_close_tag := '</$parser.lexical_attributes.opened_code_type>'
if parser.builder_str().to_lower().ends_with(name_close_tag) {
parser.lexical_attributes.open_code = false
// need to modify lexeme_builder to add script text as a content in next loop (not gave error in dom)
parser.lexycal_attributes.lexeme_builder = temp_string[0..temp_string.len -
name_close_tag.len]
parser.lexycal_attributes.current_tag.closed = true
parser.lexycal_attributes.current_tag.close_type = .new_tag
parser.lexical_attributes.lexeme_builder.go_back(name_close_tag.len)
parser.lexical_attributes.current_tag.closed = true
parser.lexical_attributes.current_tag.close_type = .new_tag
}
}
} else if parser.lexycal_attributes.open_comment {
if word == 62 && parser.verify_end_comment(false) { // close tag '>'
// parser.print_debug(parser.builder_str() + " >> " + parser.lexycal_attributes.line_count.str())
parser.lexycal_attributes.lexeme_builder = '' // strings.Builder{}
parser.lexycal_attributes.open_comment = false
parser.lexycal_attributes.open_tag = false
} else if parser.lexical_attributes.open_comment {
if chr == `>` && parser.verify_end_comment(false) { // close tag '>'
// parser.print_debug(parser.builder_str() + " >> " + parser.lexical_attributes.line_count.str())
parser.lexical_attributes.lexeme_builder.go_back_to(0)
parser.lexical_attributes.open_comment = false
parser.lexical_attributes.open_tag = false
} else {
parser.lexycal_attributes.write_lexeme(word)
parser.lexical_attributes.lexeme_builder.write_b(chr)
}
} else if parser.lexycal_attributes.open_string > 0 {
if parser.lexycal_attributes.open_string == string_code {
parser.lexycal_attributes.open_string = 0
parser.lexycal_attributes.write_lexeme(word)
} else if parser.lexical_attributes.open_string > 0 {
if parser.lexical_attributes.open_string == string_code {
parser.lexical_attributes.open_string = 0
parser.lexical_attributes.lexeme_builder.write_b(chr)
temp_lexeme := parser.builder_str()
if parser.lexycal_attributes.current_tag.last_attribute != '' {
lattr := parser.lexycal_attributes.current_tag.last_attribute
if parser.lexical_attributes.current_tag.last_attribute != '' {
lattr := parser.lexical_attributes.current_tag.last_attribute
nval := temp_lexeme.substr(1, temp_lexeme.len - 1)
// parser.print_debug(lattr + " = " + temp_lexeme)
parser.lexycal_attributes.current_tag.attributes[lattr] = nval
parser.lexycal_attributes.current_tag.last_attribute = ''
parser.lexical_attributes.current_tag.attributes[lattr] = nval
parser.lexical_attributes.current_tag.last_attribute = ''
} else {
parser.lexycal_attributes.current_tag.attributes[temp_lexeme.to_lower()] = '' // parser.print_debug(temp_lexeme)
parser.lexical_attributes.current_tag.attributes[temp_lexeme.to_lower()] = '' // parser.print_debug(temp_lexeme)
}
parser.lexycal_attributes.lexeme_builder = ''
parser.lexical_attributes.lexeme_builder.go_back_to(0)
} else {
parser.lexycal_attributes.write_lexeme(word)
parser.lexical_attributes.lexeme_builder.write_b(chr)
}
} else if parser.lexycal_attributes.open_tag {
if parser.lexycal_attributes.lexeme_builder.len == 0 && is_quotation {
parser.lexycal_attributes.open_string = string_code
parser.lexycal_attributes.write_lexeme(word)
} else if word == 62 { // close tag >
} else if parser.lexical_attributes.open_tag {
if parser.lexical_attributes.lexeme_builder.len == 0 && is_quote {
parser.lexical_attributes.open_string = string_code
parser.lexical_attributes.lexeme_builder.write_b(chr)
} else if chr == `>` { // close tag >
complete_lexeme := parser.builder_str().to_lower()
parser.lexycal_attributes.current_tag.closed = (complete_lexeme.len > 0 &&
complete_lexeme[complete_lexeme.len - 1] == 47) // if equals to /
if complete_lexeme.len > 0 && complete_lexeme[0] == 47 {
parser.lexical_attributes.current_tag.closed = (complete_lexeme.len > 0 &&
complete_lexeme[complete_lexeme.len - 1] == `/`) // if equals to /
if complete_lexeme.len > 0 && complete_lexeme[0] == `/` {
parser.dom.close_tags[complete_lexeme] = true
}
/*
else if complete_lexeme.len > 0 && complete_lexeme[complete_lexeme.len - 1] == 47 { // if end tag like "/>"
parser.lexycal_attributes.current_tag.closed = true
parser.lexical_attributes.current_tag.closed = true
}
*/
if parser.lexycal_attributes.current_tag.name == '' {
parser.lexycal_attributes.current_tag.name = complete_lexeme
if parser.lexical_attributes.current_tag.name == '' {
parser.lexical_attributes.current_tag.name = complete_lexeme
} else if complete_lexeme != '/' {
parser.lexycal_attributes.current_tag.attributes[complete_lexeme] = ''
parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
}
parser.lexycal_attributes.open_tag = false
parser.lexycal_attributes.lexeme_builder = '' // if tag name is code
if parser.lexycal_attributes.current_tag.name in parser.lexycal_attributes.code_tags {
parser.lexycal_attributes.open_code = true
parser.lexycal_attributes.opened_code_type = parser.lexycal_attributes.current_tag.name
parser.lexical_attributes.open_tag = false
parser.lexical_attributes.lexeme_builder.go_back_to(0) // if tag name is code
if parser.lexical_attributes.current_tag.name in parser.lexical_attributes.code_tags {
parser.lexical_attributes.open_code = true
parser.lexical_attributes.opened_code_type = parser.lexical_attributes.current_tag.name
}
// parser.print_debug(parser.lexycal_attributes.current_tag.name)
} else if word != 9 && word != 32 && word != 61 && word != 10 { // Tab, space, = and \n
parser.lexycal_attributes.write_lexeme(word)
} else if word != 10 {
// parser.print_debug(parser.lexical_attributes.current_tag.name)
} else if chr !in [byte(9), ` `, `=`, `\n`] { // Tab, space, = and \n
parser.lexical_attributes.lexeme_builder.write_b(chr)
} else if chr != 10 {
complete_lexeme := parser.builder_str().to_lower()
if parser.lexycal_attributes.current_tag.name == '' {
parser.lexycal_attributes.current_tag.name = complete_lexeme
if parser.lexical_attributes.current_tag.name == '' {
parser.lexical_attributes.current_tag.name = complete_lexeme
} else {
parser.lexycal_attributes.current_tag.attributes[complete_lexeme] = ''
parser.lexycal_attributes.current_tag.last_attribute = ''
if word == 61 { // if was a =
parser.lexycal_attributes.current_tag.last_attribute = complete_lexeme
parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
parser.lexical_attributes.current_tag.last_attribute = ''
if chr == `=` { // if was a =
parser.lexical_attributes.current_tag.last_attribute = complete_lexeme
}
}
parser.lexycal_attributes.lexeme_builder = '' // strings.Builder{}
parser.lexical_attributes.lexeme_builder.go_back_to(0)
}
if parser.builder_str() == '!--' {
parser.lexycal_attributes.open_comment = true
parser.lexical_attributes.open_comment = true
}
} else if word == 60 { // open tag '<'
} else if chr == `<` { // open tag '<'
temp_string := parser.builder_str()
if parser.lexycal_attributes.lexeme_builder.len >= 1 {
if parser.lexycal_attributes.current_tag.name.len > 1 &&
parser.lexycal_attributes.current_tag.name[0] == 47 && !blank_string(temp_string) {
if parser.lexical_attributes.lexeme_builder.len >= 1 {
if parser.lexical_attributes.current_tag.name.len > 1 &&
parser.lexical_attributes.current_tag.name[0] == 47 && !blank_string(temp_string) {
parser.tags << &Tag{
name: 'text'
content: temp_string
}
} else {
parser.lexycal_attributes.current_tag.content = temp_string // verify later who has this content
parser.lexical_attributes.current_tag.content = temp_string // verify later who has this content
}
}
// parser.print_debug(parser.lexycal_attributes.current_tag.str())
parser.lexycal_attributes.lexeme_builder = ''
// parser.print_debug(parser.lexical_attributes.current_tag.str())
parser.lexical_attributes.lexeme_builder.go_back_to(0)
parser.generate_tag()
parser.lexycal_attributes.open_tag = true
parser.lexical_attributes.open_tag = true
} else {
parser.lexycal_attributes.write_lexeme(word)
parser.lexical_attributes.lexeme_builder.write_b(chr)
}
}
}
pub fn (mut parser Parser) parse_html(data string, is_file bool) {
if !parser.initialized {
parser.initialize_all()
}
mut lines := []string{}
if is_file {
file_lines := os.read_lines(data) or {
eprintln('failed to read the file $data')
return
}
lines = file_lines
} else {
lines = data.split_into_lines()
}
// parse_html parses the given HTML string
pub fn (mut parser Parser) parse_html(data string) {
parser.init()
mut lines := data.split_into_lines()
for line in lines {
parser.lexycal_attributes.line_count++
parser.lexical_attributes.line_count++
parser.split_parse(line)
}
parser.generate_tag()
parser.dom.debug_file = parser.debug_file
parser.dom.construct(parser.tags) // println(parser.close_tags.keys())
parser.dom.construct(parser.tags)
}
// finalize finishes the parsing stage .
[inline]
pub fn (mut parser Parser) finalize() {
parser.generate_tag()
}
pub fn (parser Parser) get_tags() []&Tag {
return parser.tags
}
// get_dom returns the parser's current DOM representation.
pub fn (mut parser Parser) get_dom() DocumentObjectModel {
if !parser.dom.constructed {
parser.generate_tag()
@ -276,10 +257,3 @@ pub fn (mut parser Parser) get_dom() DocumentObjectModel {
}
return parser.dom
}
/*pub fn (mut parser Parser) get_xpath() XPath {
dom := parser.get_dom()
return XPath{
dom: dom
}
}*/