x.json2.decoder2: improve decoder, add checker (part 1) (#22545)

This commit is contained in:
Hitalo Souza 2024-10-17 06:37:46 -04:00 committed by GitHub
parent e4ffc7f224
commit 8556353b43
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 759 additions and 12 deletions

View file

@ -2,18 +2,29 @@ module decoder2
import time import time
// Node represents a node in a JSON decoder tree. Used to decode object in JSON.
// A node only stores where its key lives inside the JSON text, not the key itself.
struct Node {
	key_pos  int     // The position of the key in the JSON string.
	key_len  int     // The length of the key in the JSON string.
	children ?[]Node // The children nodes of the current node; `none` when there are no children.
}
// ValueInfo describes one value found by the checker in the JSON string: where it
// starts, how long it is and what kind of value it is. Strings, numbers, arrays,
// objects, object keys and object values each get their own entry.
struct ValueInfo {
position int // Index in the JSON string at which the value starts.
value_kind ValueKind // The kind of the value, derived from its first character.
mut:
// length is mutable because the entry is appended with length 0 when the value
// starts and is only filled in once the end of the value has been reached.
length int // The length of the value in the JSON string.
}
// Decoder represents a JSON decoder.
// The same struct is used for the validation pass (`checker_idx`, `values_info`)
// and for the decoding pass (`idx`, `value_info_idx`).
struct Decoder {
	json string // json is the JSON data to be decoded.
mut:
	values_info    []ValueInfo // values_info holds one entry per value found while checking the JSON.
	idx            int // idx is byte offset from the start in json
	checker_idx    int // checker_idx is the current index of the checker inside json.
	value_info_idx int // value_info_idx is the current index of the values_info.
}
pub enum ValueKind { pub enum ValueKind {
@ -23,34 +34,466 @@ pub enum ValueKind {
string_ string_
number number
boolean boolean
null
} }
// check_if_json_match checks that the JSON value in `val` has a kind that can be
// decoded into the generic type `T`.
// Only the first byte of `val` is inspected (through `get_value_kind`); the rest of
// the text is not validated here.
// It returns an error when `val` is empty, when the kind does not fit `T`, or when
// `T` is a type that is not handled.
fn check_if_json_match[T](val string) ! {
	// check if the JSON string is empty
	if val == '' {
		return error('empty string')
	}

	// check if generic type matches the JSON type
	value_kind := get_value_kind(val[0])

	$if T is $option {
		// TODO
	} $else $if T is $sumtype {
		// TODO
	} $else $if T is $alias {
		// TODO
	} $else $if T is $string {
		if value_kind != .string_ {
			return error('Expected string, but got ${value_kind}')
		}
	} $else $if T is time.Time {
		// time values are carried as JSON strings
		if value_kind != .string_ {
			return error('Expected string, but got ${value_kind}')
		}
	} $else $if T is $map {
		if value_kind != .object {
			return error('Expected object, but got ${value_kind}')
		}
	} $else $if T is $array {
		if value_kind != .array {
			return error('Expected array, but got ${value_kind}')
		}
	} $else $if T is $struct {
		if value_kind != .object {
			return error('Expected object, but got ${value_kind}')
		}
	} $else $if T in [$enum, $int, $float] {
		if value_kind != .number {
			return error('Expected number, but got ${value_kind}')
		}
	} $else $if T is bool {
		if value_kind != .boolean {
			return error('Expected boolean, but got ${value_kind}')
		}
	} $else {
		// this is the decoding side, so the message must not talk about encoding
		return error('cannot decode value with ${value_kind} type')
	}
}
// error returns an error that points at the byte at `checker.checker_idx`.
// The error text has three parts:
//   1. a leading newline,
//   2. the current line of the JSON text up to and including the offending byte,
//   3. a second line with a `^` caret placed under that byte, followed by `message`.
// For input without newlines the text is `'\n' + json[..idx + 1] + '\n' + idx spaces + '^ message'`.
fn (mut checker Decoder) error(message string) ! {
	// There is no byte to point at in an empty input; indexing it would be out of range.
	if checker.json.len == 0 {
		return error('\n\n^ ${message}')
	}
	// Keep the cursor inside the text even if the caller ran one past the end.
	idx := if checker.checker_idx < checker.json.len {
		checker.checker_idx
	} else {
		checker.json.len - 1
	}
	// Look for the start of the current line only *before* the cursor. A newline
	// after the cursor must not be used: it would produce an inverted slice and a
	// negative repeat count.
	last_new_line := checker.json[..idx].last_index_u8(`\n`)
	// `last_index_u8` yields -1 when there is no newline, which makes the line start 0.
	line_start := last_new_line + 1

	mut error_message := '\n'
	error_message += checker.json[line_start..idx + 1]
	error_message += '\n'
	// One space per byte that precedes the offending byte on its line.
	error_message += ' '.repeat(idx - line_start)
	error_message += '^ ${message}'

	return error(error_message)
}
// check_json_format checks that the value starting at `checker.checker_idx` in `val`
// is well formed and records it in `checker.values_info`.
// For every value (including object keys and nested values) a `ValueInfo` entry is
// appended when the value starts; its `length` is filled in once the value ends.
// Objects and arrays are checked by calling this method recursively.
// On success `checker.checker_idx` is left on the next delimiter (`,`, `:`, `}`, `]`)
// after the value, or on the last byte of the input.
// `val` is expected to be the same text as `checker.json`.
fn (mut checker Decoder) check_json_format(val string) ! {
	checker_end := checker.json.len

	// check if the JSON string is empty
	if val == '' {
		// there is no byte a positional error could point at
		return error('empty string')
	}

	// the first byte decides what kind of value has to follow
	value_kind := get_value_kind(val[checker.checker_idx])
	start_idx_position := checker.checker_idx
	checker.values_info << ValueInfo{
		position:   start_idx_position
		length:     0
		value_kind: value_kind
	}

	value_info_index := checker.values_info.len - 1
	match value_kind {
		.unknown {
			return checker.error('unknown value kind')
		}
		.null {
			// check if the JSON string is a null value
			if checker_end - checker.checker_idx <= 3 {
				return checker.error('EOF error: expecting `null`')
			}

			is_not_ok := unsafe {
				vmemcmp(checker.json.str + checker.checker_idx, 'null'.str, 4)
			}

			if is_not_ok != 0 {
				return checker.error('invalid null value. Got `${checker.json[checker.checker_idx..
					checker.checker_idx + 4]}` instead of `null`')
			}
			// stay on the last byte of `null`
			checker.checker_idx += 3
		}
		.object {
			// `{` must not be the last byte of the input
			if checker.checker_idx >= checker_end - 1 {
				return checker.error('EOF error: braces are not closed')
			}
			checker.checker_idx++
			for val[checker.checker_idx] != `}` {
				// skip whitespace, never moving past the last byte
				for checker.checker_idx < checker_end - 1
					&& val[checker.checker_idx] in [` `, `\t`, `\n`] {
					checker.checker_idx++
				}

				if val[checker.checker_idx] == `}` {
					break
				}

				match val[checker.checker_idx] {
					`"` {
						// Object key
						checker.check_json_format(val)!

						for val[checker.checker_idx] != `:` {
							if checker.checker_idx >= checker_end - 1 {
								return checker.error('EOF error: key colon not found')
							}
							if val[checker.checker_idx] !in [` `, `\t`, `\n`] {
								return checker.error('invalid value after object key')
							}
							checker.checker_idx++
						}
					}
					`[`, `{`, `0`...`9`, `-`, `n`, `t`, `f` {
						// not a key; rejected by the `:` check below
					}
					`]` {
						return checker.error('Expecting key. Found closing bracket')
					}
					`,` {
						return checker.error('invalid object key')
					}
					`:` {
						return checker.error('empty object key')
					}
					else {
						return checker.error('`${[val[checker.checker_idx]].bytestr()}` is an invalid object key')
					}
				}

				if val[checker.checker_idx] != `:` {
					return checker.error('Expecting `:` after object key')
				}
				// skip `:`, which must be followed by a value
				if checker.checker_idx >= checker_end - 1 {
					return checker.error('EOF error: braces are not closed')
				}
				checker.checker_idx++

				// skip whitespace, never moving past the last byte
				for checker.checker_idx < checker_end - 1
					&& val[checker.checker_idx] in [` `, `\t`, `\n`] {
					checker.checker_idx++
				}

				match val[checker.checker_idx] {
					`"`, `[`, `{`, `0`...`9`, `-`, `n`, `t`, `f` {
						// every path through this loop body leaves the loop
						for val[checker.checker_idx] != `}` {
							if checker.checker_idx >= checker_end - 1 {
								return checker.error('EOF error: object value not closed')
							}
							checker.check_json_format(val)!
							// whitespace
							for checker.checker_idx < checker_end - 1
								&& val[checker.checker_idx] in [` `, `\t`, `\n`] {
								checker.checker_idx++
							}
							if val[checker.checker_idx] == `}` {
								break
							}
							if checker.checker_idx >= checker_end - 1 {
								return checker.error('EOF error: braces are not closed')
							}

							if val[checker.checker_idx] == `,` {
								checker.checker_idx++
								for checker.checker_idx < checker_end - 1
									&& val[checker.checker_idx] in [` `, `\t`, `\n`] {
									checker.checker_idx++
								}
								if val[checker.checker_idx] != `"` {
									return checker.error('Expecting object key')
								} else {
									// next key found: let the outer loop handle it
									break
								}
							} else {
								// a value can only be followed by `,` or `}` inside an object
								return checker.error('Expecting `,` or `}` after object value')
							}
						}
					}
					else {
						return checker.error('invalid object value')
					}
				}
			}
			// checker_idx stays on the closing `}`, so that the length computed below
			// covers exactly the object and the shared code steps past the brace
		}
		.array {
			// there must be at least one more byte after `[`
			if checker_end >= checker.checker_idx + 2 {
				checker.checker_idx++
			} else {
				return checker.error('EOF error: There are not enough length for an array')
			}

			for val[checker.checker_idx] != `]` {
				// skip whitespace
				for val[checker.checker_idx] in [` `, `\t`, `\n`] {
					if checker.checker_idx >= checker_end - 1 {
						break
					}
					checker.checker_idx++
				}

				if val[checker.checker_idx] == `]` {
					// leave through the common tail so that the length gets recorded
					break
				}

				if checker.checker_idx >= checker_end - 1 {
					return checker.error('EOF error: array not closed')
				}

				checker.check_json_format(val)!

				// whitespace
				for checker.checker_idx < checker_end - 1
					&& val[checker.checker_idx] in [` `, `\t`, `\n`] {
					checker.checker_idx++
				}
				if val[checker.checker_idx] == `]` {
					break
				}
				if checker.checker_idx >= checker_end - 1 {
					return checker.error('EOF error: braces are not closed')
				}

				if val[checker.checker_idx] == `,` {
					checker.checker_idx++
					for checker.checker_idx < checker_end - 1
						&& val[checker.checker_idx] in [` `, `\t`, `\n`] {
						checker.checker_idx++
					}
					if val[checker.checker_idx] == `]` {
						return checker.error('Cannot use `,`, before `]`')
					}
					continue
				} else {
					return checker.error('`]` after value')
				}
			}
		}
		.string_ {
			// the opening quote must not be the last byte
			if checker.checker_idx >= checker_end - 1 {
				return checker.error('EOF error: string not closed')
			}

			checker.checker_idx++

			// walk to the closing quote; escaped bytes are consumed together with
			// their backslash, so an escaped quote never ends the string
			for val[checker.checker_idx] != `"` {
				if val[checker.checker_idx] == `\\` {
					if checker.checker_idx + 1 >= checker_end - 1 {
						return checker.error('invalid escape sequence')
					}
					escaped_char := val[checker.checker_idx + 1]
					match escaped_char {
						`/`, `b`, `f`, `n`, `r`, `t`, `"`, `\\` {
							// step onto the escaped byte; the increment at the end of
							// the loop body then moves past the whole pair
							checker.checker_idx++
						}
						`u` {
							// `\uXXXX`: the four hex digits end 5 bytes after the backslash
							escaped_char_last_index := checker.checker_idx + 5

							if escaped_char_last_index < checker_end - 1 {
								// 2 bytes for the unicode escape sequence `\u`
								checker.checker_idx += 2

								for checker.checker_idx <= escaped_char_last_index {
									match val[checker.checker_idx] {
										`0`...`9`, `a`...`f`, `A`...`F` {
											checker.checker_idx++
										}
										else {
											return checker.error('invalid unicode escape sequence')
										}
									}
								}
								// checker_idx is already on the byte after the sequence
								continue
							} else {
								// do not slice past the end of the input
								escape_end := if escaped_char_last_index + 1 < checker_end {
									escaped_char_last_index + 1
								} else {
									checker_end
								}
								return checker.error('short unicode escape sequence ${checker.json[checker.checker_idx..escape_end]}')
							}
						}
						else {
							return checker.error('unknown escape sequence')
						}
					}
				}
				if checker.checker_idx >= checker_end - 1 {
					return checker.error('EOF error: string not closed')
				}
				checker.checker_idx++
			}
		}
		.number {
			// check if the JSON string is a valid float or integer
			// the sign belongs to *this* value, not to the first byte of the input
			is_negative := val[checker.checker_idx] == `-`
			mut has_dot := false
			mut digits_count := 0

			if is_negative {
				// a lone `-` is not a number
				if checker.checker_idx >= checker_end - 1 {
					return checker.error('invalid number')
				}
				checker.checker_idx++
			}

			// validate every byte of the number, including the last one
			for {
				current := val[checker.checker_idx]
				if current == `.` {
					if has_dot {
						return checker.error('invalid float. Multiple dots')
					}
					has_dot = true
				} else if current == `-` {
					if is_negative {
						return checker.error('invalid float. Multiple negative signs')
					}
					return checker.error('invalid number')
				} else if current < `0` || current > `9` {
					return checker.error('invalid number')
				} else {
					digits_count++
					if digits_count > 64 {
						return checker.error('number exceeds 64 digits')
					}
				}

				// stop on the last byte of the number
				if checker.checker_idx >= checker_end - 1
					|| val[checker.checker_idx + 1] in [`,`, `}`, `]`, ` `, `\t`, `\n`] {
					break
				}
				checker.checker_idx++
			}
		}
		.boolean {
			// check if the JSON string is a valid boolean
			match val[checker.checker_idx] {
				`t` {
					if checker_end - checker.checker_idx <= 3 {
						return checker.error('EOF error: expecting `true`')
					}

					is_not_ok := unsafe {
						vmemcmp(checker.json.str + checker.checker_idx, 'true'.str, 4)
					}

					if is_not_ok != 0 {
						return checker.error('invalid boolean value. Got `${checker.json[checker.checker_idx..
							checker.checker_idx + 4]}` instead of `true`')
					}
					checker.checker_idx += 3
				}
				`f` {
					if checker_end - checker.checker_idx <= 4 {
						return checker.error('EOF error: expecting `false`')
					}

					is_not_ok := unsafe {
						vmemcmp(checker.json.str + checker.checker_idx, 'false'.str, 5)
					}

					if is_not_ok != 0 {
						return checker.error('invalid boolean value. Got `${checker.json[checker.checker_idx..
							checker.checker_idx + 5]}` instead of `false`')
					}

					checker.checker_idx += 4
				}
				else {
					return checker.error('invalid boolean')
				}
			}
		}
	}

	// at this point checker_idx is on the last byte of the value
	checker.values_info[value_info_index].length = checker.checker_idx + 1 - start_idx_position

	if checker.checker_idx < checker_end - 1 {
		checker.checker_idx++
	}

	for checker.checker_idx < checker_end - 1 && val[checker.checker_idx] !in [`,`, `:`, `}`, `]`] {
		// only whitespace may sit between a value and the next delimiter
		if val[checker.checker_idx] !in [` `, `\t`, `\n`] {
			checker.error('invalid value. Unexpected character after ${value_kind} end')!
		}
		checker.checker_idx++
	}
}
// decode decodes a JSON string into a specified type.
// The text is first checked with `check_json_format`, then its top-level kind is
// compared against `T` with `check_if_json_match`, and only then is the value
// decoded. An error from any of these steps is returned to the caller.
pub fn decode[T](val string) !T {
	mut decoder := Decoder{
		json:        val
		values_info: []ValueInfo{}
	}

	decoder.check_json_format(val)!
	check_if_json_match[T](val)!

	mut result := T{}
	decoder.decode_value(mut &result)!
	return result
}
// decode_value decodes a value from the JSON nodes. // decode_value decodes a value from the JSON nodes.
fn (mut decoder Decoder) decode_value[T](nodes []Node, val &T) { fn (mut decoder Decoder) decode_value[T](mut val T) ! {
$if val is $option { $if val is $option {
} $else $if T is string { } $else $if T is string {
} $else $if T is $sumtype { } $else $if T is $sumtype {
@ -64,6 +507,10 @@ fn (mut decoder Decoder) decode_value[T](nodes []Node, val &T) {
} $else $if T is $map { } $else $if T is $map {
} $else $if T is $array { } $else $if T is $array {
} $else $if T is $struct { } $else $if T is $struct {
mut nodes := []Node{}
// TODO: needs performance improvements
decoder.fulfill_nodes(mut nodes)
decoder.decode_struct(nodes, val) decoder.decode_struct(nodes, val)
} $else $if T is $enum { } $else $if T is $enum {
} $else $if T is $int { } $else $if T is $int {
@ -81,7 +528,8 @@ fn get_value_kind(value rune) ValueKind {
`t`, `f` { .boolean } `t`, `f` { .boolean }
`{` { .object } `{` { .object }
`[` { .array } `[` { .array }
`0`...`9` { .number } `0`...`9`, `-` { .number }
`n` { .null }
else { .unknown } else { .unknown }
} }
} }

View file

@ -40,3 +40,302 @@ fn test_nodes() {
assert nodes[0].children?[0].key_pos == 10 assert nodes[0].children?[0].key_pos == 10
assert nodes[0].children?[0].children == none assert nodes[0].children?[0].children == none
} }
// test_check_if_json_match verifies that `check_if_json_match` rejects JSON whose
// top-level kind does not fit the requested type, and accepts JSON that does.
fn test_check_if_json_match() {
	// Mismatching kinds: every call must fail with the exact message.
	mut rejected := false

	check_if_json_match[string]('{"key": "value"}') or {
		rejected = true
		assert err.str() == 'Expected string, but got object'
	}
	assert rejected, 'Expected error'

	rejected = false
	check_if_json_match[map[string]string]('"value"') or {
		rejected = true
		assert err.str() == 'Expected object, but got string_'
	}
	assert rejected, 'Expected error'

	rejected = false
	check_if_json_match[[]int]('{"key": "value"}') or {
		rejected = true
		assert err.str() == 'Expected array, but got object'
	}
	assert rejected, 'Expected error'

	rejected = false
	check_if_json_match[string]('[1, 2, 3]') or {
		rejected = true
		assert err.str() == 'Expected string, but got array'
	}
	assert rejected, 'Expected error'

	rejected = false
	check_if_json_match[int]('{"key": "value"}') or {
		rejected = true
		assert err.str() == 'Expected number, but got object'
	}
	assert rejected, 'Expected error'

	rejected = false
	check_if_json_match[bool]('{"key": "value"}') or {
		rejected = true
		assert err.str() == 'Expected boolean, but got object'
	}
	assert rejected, 'Expected error'

	// Matching kinds: none of these calls may fail.
	check_if_json_match[string]('"value"') or { assert false }
	check_if_json_match[map[string]string]('{"key": "value"}') or { assert false }
	check_if_json_match[[]int]('[1, 2, 3]') or { assert false }
	check_if_json_match[string]('"string"') or { assert false }
	check_if_json_match[int]('123') or { assert false }
	check_if_json_match[bool]('true') or { assert false }
	check_if_json_match[bool]('false') or { assert false }

	// TODO: test null
}
// test_check_json_format feeds valid and invalid JSON texts to `check_json_format`.
// Valid texts must pass and leave the checker on the last byte of the input.
// Invalid texts must fail with the exact positional message built by `Decoder.error`:
// the input up to the offending byte, then a line with one space per preceding byte,
// a `^` caret and the reason.
fn test_check_json_format() {
	// primitives, simple objects, nested objects and simple arrays
	valid_jsons := [
		'""',
		'"string"',
		'123',
		'0',
		'true',
		'{}',
		'{"key": null}',
		'{"key": "value"}',
		'{"key": 123}',
		'{"key": true}',
		'{"key": {"key": 123}}',
		'[]',
		'[1, 2, 3]',
		'["a", "b", "c"]',
		'[true, false]',
	]
	for variable in valid_jsons {
		mut checker := Decoder{
			checker_idx: 0
			json:        variable
		}

		checker.check_json_format(variable) or { assert false, err.str() }
		assert checker.checker_idx == checker.json.len - 1, 'Expected to reach the end of the json string ${checker.json}'
	}

	// Nested arrays
	for variable in ['[[1, 2, 3], [4, 5, 6]]'] {
		mut checker := Decoder{
			checker_idx: 0
			json:        variable
		}

		checker.check_json_format(variable) or { assert false, err.str() }
		// assert checker.checker_idx == checker.json.len - 1, 'Expected to reach the end of the json string ${checker.json}'
	}

	// Wrong jsons
	// The caret padding is spelled with `repeat` so that the number of spaces, which
	// equals the index of the offending byte, is explicit.
	json_and_error_message := [
		{
			'json':  ']'
			'error': '\n]\n^ unknown value kind'
		},
		{
			'json':  '}'
			'error': '\n}\n^ unknown value kind'
		},
		{
			'json':  'truely'
			'error': '\ntruel\n' + ' '.repeat(4) +
				'^ invalid value. Unexpected character after boolean end'
		},
		{
			'json':  '0[1]'
			'error': '\n0[\n' + ' '.repeat(1) + '^ invalid number'
		},
		{
			'json':  '[1, 2, g3]'
			'error': '\n[1, 2, g\n' + ' '.repeat(7) + '^ unknown value kind'
		},
		{
			'json':  '[1, 2,, 3]'
			'error': '\n[1, 2,,\n' + ' '.repeat(6) + '^ unknown value kind'
		},
		{
			'json':  '{"key": 123'
			'error': '\n{"key": 123\n' + ' '.repeat(10) + '^ EOF error: braces are not closed'
		},
		{
			'json':  '{"key": 123,'
			'error': '\n{"key": 123,\n' + ' '.repeat(11) + '^ EOF error: braces are not closed'
		},
		{
			'json':  '{"key": 123, "key2": 456,}'
			'error': '\n{"key": 123, "key2": 456,}\n' + ' '.repeat(25) + '^ Expecting object key'
		},
		{
			'json':  '[[1, 2, 3], [4, 5, 6],]'
			'error': '\n[[1, 2, 3], [4, 5, 6],]\n' + ' '.repeat(22) +
				'^ Cannot use `,`, before `]`'
		},
	]

	for json_and_error in json_and_error_message {
		mut has_error := false
		mut checker := Decoder{
			checker_idx: 0
			json:        json_and_error['json']
		}

		checker.check_json_format(json_and_error['json']) or {
			assert err.str() == json_and_error['error']
			has_error = true
		}
		assert has_error, 'Expected error ${json_and_error['error']}'
	}
}
// test_get_value_kind checks the kind reported for each character that can start a
// JSON value, plus one character that cannot.
fn test_get_value_kind() {
	// first_chars[i] must be classified as expected_kinds[i]
	first_chars := [`"`, `t`, `f`, `{`, `[`, `0`, `-`, `n`, `x`]
	expected_kinds := [ValueKind.string_, .boolean, .boolean, .object, .array, .number, .number,
		.null, .unknown]

	for i, first_char in first_chars {
		assert get_value_kind(first_char) == expected_kinds[i]
	}
}
// test_checker_values_info verifies the `values_info` entries (position, length and
// kind) that `check_json_format` records for primitives, objects and arrays.
// Entries are expected in the order in which the values start in the text, so a
// container comes before the values it holds.
fn test_checker_values_info() {
	// Test for string value
	mut checker := Decoder{
		checker_idx: 0
		json:        '"value"'
	}
	checker.check_json_format(checker.json) or { assert false, err.str() }
	assert checker.values_info.len == 1
	assert checker.values_info[0].position == 0
	assert checker.values_info[0].length == 7
	assert checker.values_info[0].value_kind == .string_

	// Test for number value
	checker = Decoder{
		checker_idx: 0
		json:        '123'
	}
	checker.check_json_format(checker.json) or { assert false, err.str() }
	assert checker.values_info.len == 1
	assert checker.values_info[0].position == 0
	assert checker.values_info[0].length == 3
	assert checker.values_info[0].value_kind == .number

	// Test for boolean value
	checker = Decoder{
		checker_idx: 0
		json:        'true'
	}
	checker.check_json_format(checker.json) or { assert false, err.str() }
	assert checker.values_info.len == 1
	assert checker.values_info[0].position == 0
	assert checker.values_info[0].length == 4
	assert checker.values_info[0].value_kind == .boolean

	// Test for null value
	checker = Decoder{
		checker_idx: 0
		json:        'null'
	}
	checker.check_json_format(checker.json) or { assert false, err.str() }
	assert checker.values_info.len == 1
	assert checker.values_info[0].position == 0
	assert checker.values_info[0].length == 4
	assert checker.values_info[0].value_kind == .null

	// Test for object value
	checker = Decoder{
		checker_idx: 0
		json:        '{"key": "value"}'
	}
	checker.check_json_format(checker.json) or { assert false, err.str() }
	assert checker.values_info.len == 3
	assert checker.values_info[0].position == 0
	assert checker.values_info[0].length == 16
	assert checker.values_info[0].value_kind == .object
	assert checker.values_info[1].position == 1
	assert checker.values_info[1].length == 5
	assert checker.values_info[1].value_kind == .string_
	assert checker.values_info[2].position == 8
	assert checker.values_info[2].length == 7
	assert checker.values_info[2].value_kind == .string_

	// Test for nested object value
	// NOTE(review): the literal below has two `{` but only one `}`; the expected
	// lengths (27 and 18) are written for exactly this text. Confirm whether the
	// checker is meant to accept it before changing the literal.
	checker = Decoder{
		checker_idx: 0
		// json: '0<-{1"key1": 9<-{10"key2": 18"value1"}}'
		json: '{"key1": {"key2": "value1"}'
	}
	checker.check_json_format(checker.json) or { assert false, err.str() }
	assert checker.values_info.len == 5
	assert checker.values_info[0].position == 0
	assert checker.values_info[0].length == 27
	assert checker.values_info[0].value_kind == .object
	assert checker.values_info[1].position == 1
	assert checker.values_info[1].length == 6
	assert checker.values_info[1].value_kind == .string_
	assert checker.values_info[2].position == 9
	assert checker.values_info[2].length == 18
	assert checker.values_info[2].value_kind == .object
	assert checker.values_info[3].position == 10
	assert checker.values_info[3].length == 6
	assert checker.values_info[3].value_kind == .string_
	assert checker.values_info[4].position == 18
	assert checker.values_info[4].length == 8
	assert checker.values_info[4].value_kind == .string_

	// Test for array value
	checker = Decoder{
		checker_idx: 0
		json:        '[1, 22, 333]'
	}
	checker.check_json_format(checker.json) or { assert false, err.str() }
	assert checker.values_info.len == 4
	assert checker.values_info[0].position == 0
	assert checker.values_info[0].length == 12
	assert checker.values_info[0].value_kind == .array
	assert checker.values_info[1].position == 1
	assert checker.values_info[1].length == 1
	assert checker.values_info[1].value_kind == .number
	assert checker.values_info[2].position == 4
	assert checker.values_info[2].length == 2
	assert checker.values_info[2].value_kind == .number
	assert checker.values_info[3].position == 8
	assert checker.values_info[3].length == 3
	assert checker.values_info[3].value_kind == .number
}