encoding.iconv: fix iconv type cstrict, add support for LOCAL encoding (#22398)

This commit is contained in:
kbkpbot 2024-10-03 16:27:42 +08:00 committed by GitHub
parent 7b8e059a3c
commit e2425842b2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 51 additions and 19 deletions

View file

@ -16,21 +16,35 @@ fn reverse_u32(src u32) u32 {
// vstring_to_encoding converts the V string `str` to a byte array in the `tocode` encoding.
// `str` is handed to `conv` as UTF-8. `tocode` is upper-cased first, so it is matched case-insensitively.
// The bare names UTF16/UTF32/UTF-16/UTF-32 are rejected with an error; the message asks for an explicit LE/BE variant.
// `LOCAL` is replaced by `ANSI` on Windows and by `UTF-8` on every other target.
// Any error returned by `conv` is propagated to the caller.
// Tip: use `iconv --list` to check for supported encodings.
// NOTE(review): this listing looks like a rendered diff without +/- markers, so near-duplicate
// neighbouring lines below are presumably the before/after versions of one statement - confirm against the commit.
pub fn vstring_to_encoding(str string, tocode string) ![]u8 {
// presumably the pre-change declaration (immutable)
encoding_name := tocode.to_upper()
// `mut` because the LOCAL branch below reassigns the name
mut encoding_name := tocode.to_upper()
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
// NOTE(review): 'UTF16-LE' in this message looks like a typo for 'UTF-16LE' - confirm before changing it
return error('please use UTF16-LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
}
// presumably the pre-change return: passes the caller's `tocode` through unchanged
return conv(tocode, 'UTF-8', str.str, str.len)
// resolve the LOCAL alias with a compile-time `$if` on the target OS
if encoding_name == 'LOCAL' {
$if windows {
encoding_name = 'ANSI'
} $else {
encoding_name = 'UTF-8'
}
}
// convert from UTF-8 using the upper-cased (and possibly LOCAL-resolved) encoding name
return conv(encoding_name, 'UTF-8', str.str, str.len)
}
// encoding_to_vstring converts the given `bytes` from the `fromcode` encoding to a V string (encoded with UTF-8).
// `fromcode` is upper-cased first, so it is matched case-insensitively.
// The bare names UTF16/UTF32/UTF-16/UTF-32 are rejected with an error; the message asks for an explicit LE/BE variant.
// `LOCAL` is replaced by `ANSI` on Windows and by `UTF-8` on every other target.
// Any error returned by `conv` is propagated to the caller.
// Tip: use `iconv --list` to check for supported encodings.
// NOTE(review): this listing looks like a rendered diff without +/- markers, so near-duplicate
// neighbouring lines below are presumably the before/after versions of one statement - confirm against the commit.
pub fn encoding_to_vstring(bytes []u8, fromcode string) !string {
// presumably the pre-change declaration (immutable)
encoding_name := fromcode.to_upper()
// `mut` because the LOCAL branch below reassigns the name
mut encoding_name := fromcode.to_upper()
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
// NOTE(review): 'UTF16-LE' in this message looks like a typo for 'UTF-16LE' - confirm before changing it
return error('please use UTF16-LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
}
// presumably the pre-change conversion: passes the caller's `fromcode` through unchanged
mut dst := conv('UTF-8', fromcode, bytes.data, bytes.len)!
// resolve the LOCAL alias with a compile-time `$if` on the target OS
if encoding_name == 'LOCAL' {
$if windows {
encoding_name = 'ANSI'
} $else {
encoding_name = 'UTF-8'
}
}
// convert to UTF-8 using the upper-cased (and possibly LOCAL-resolved) encoding name
mut dst := conv('UTF-8', encoding_name, bytes.data, bytes.len)!
dst << 0 // add a tail zero, to build a vstring
// NOTE(review): the string is built from a C string pointer, so it presumably ends at the first zero byte - confirm
return unsafe { cstring_to_vstring(dst.data) }
}
@ -43,7 +57,15 @@ pub fn encoding_to_vstring(bytes []u8, fromcode string) !string {
// for utf32be, it will prepend 0x0000FEFF to the `src`
pub fn create_utf_string_with_bom(src []u8, utf_type string) []u8 {
mut clone := src.clone()
match utf_type.to_upper() {
mut encoding_name := utf_type.to_upper()
if encoding_name == 'LOCAL' {
$if windows {
encoding_name = 'ANSI'
} $else {
encoding_name = 'UTF-8'
}
}
match encoding_name {
'UTF8', 'UTF-8' {
clone.prepend([u8(0xEF), 0xBB, 0xBF])
}
@ -73,7 +95,15 @@ pub fn create_utf_string_with_bom(src []u8, utf_type string) []u8 {
@[direct_array_access]
pub fn remove_utf_string_with_bom(src []u8, utf_type string) []u8 {
mut clone := src.clone()
match utf_type.to_upper() {
mut encoding_name := utf_type.to_upper()
if encoding_name == 'LOCAL' {
$if windows {
encoding_name = 'ANSI'
} $else {
encoding_name = 'UTF-8'
}
}
match encoding_name {
'UTF8', 'UTF-8' {
if clone.len > 3 {
if clone[0] == u8(0xEF) && clone[1] == u8(0xBB) && clone[2] == u8(0xBF) {
@ -119,10 +149,6 @@ pub fn remove_utf_string_with_bom(src []u8, utf_type string) []u8 {
// write_file_encoding converts `text` into `encoding` and writes it to the file at the given `path`. If `path` already exists, it will be overwritten.
// For `encoding` in UTF8/UTF16/UTF32, if `bom` is true, a BOM header will also be written to the file.
pub fn write_file_encoding(path string, text string, encoding string, bom bool) ! {
encoding_name := encoding.to_upper()
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
}
encoding_bytes := vstring_to_encoding(text, encoding)!
if bom && encoding.to_upper().starts_with('UTF') {
encoding_bom_bytes := create_utf_string_with_bom(encoding_bytes, encoding)
@ -134,10 +160,6 @@ pub fn write_file_encoding(path string, text string, encoding string, bom bool)
// read_file_encoding reads the file in `path` with `encoding` and returns the contents
pub fn read_file_encoding(path string, encoding string) !string {
encoding_name := encoding.to_upper()
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
}
encoding_bytes := os.read_file_array[u8](path)
encoding_without_bom_bytes := remove_utf_string_with_bom(encoding_bytes, encoding)
return encoding_to_vstring(encoding_without_bom_bytes, encoding)!

View file

@ -5,9 +5,9 @@ module iconv
#include <iconv.h>
#flag darwin -liconv
fn C.iconv_open(tocode &u8, fromcode &u8) voidptr
fn C.iconv_open(tocode charptr, fromcode charptr) voidptr
fn C.iconv_close(cd voidptr) int
fn C.iconv(cd voidptr, inbuf &&u8, inbytesleft &usize, outbuf &&u8, outbytesleft &usize) usize
fn C.iconv(cd voidptr, inbuf &charptr, inbytesleft &usize, outbuf &charptr, outbytesleft &usize) usize
// conv converts a string from the `fromcode` encoding to the `tocode` encoding
@[direct_array_access]
@ -35,7 +35,7 @@ fn conv(tocode string, fromcode string, src &u8, src_len int) ![]u8 {
else {}
}
mut cd := C.iconv_open(dst_encoding.str, src_encoding.str)
mut cd := C.iconv_open(charptr(dst_encoding.str), charptr(src_encoding.str))
if isize(cd) == -1 {
return error('platform can\'t convert from ${src_encoding} to ${dst_encoding}')
}
@ -43,8 +43,8 @@ fn conv(tocode string, fromcode string, src &u8, src_len int) ![]u8 {
mut dst := []u8{len: (src_len + 1) * 4} // this should be enough to hold the dst encoding string
mut src_ptr := &u8(src)
mut dst_ptr := &u8(dst.data)
mut src_ptr := charptr(src)
mut dst_ptr := charptr(dst.data)
mut src_left := usize(src_len)
mut dst_left := usize(dst.len)
res := C.iconv(cd, &src_ptr, &src_left, &dst_ptr, &dst_left)

View file

@ -20,6 +20,11 @@ fn test_vstring_to_encoding() {
abc_utf32be := iconv.vstring_to_encoding('abc', 'UTF-32BE')!
assert abc_utf32be == [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
abc_local := iconv.vstring_to_encoding('abc', 'LOCAL')!
// Windows LOCAL: ANSI encoding
// Linux LOCAL: UTF-8 encoding
assert abc_local == [u8(97), 98, 99]
if abc_not_exist := iconv.vstring_to_encoding('abc', 'encoding_not_exist') {
assert false, 'encoding_not_exist'
}
@ -53,6 +58,11 @@ fn test_encoding_to_vstring() {
'UTF-32BE')!
assert abc_utf32be == 'abc'
abc_local := iconv.encoding_to_vstring([u8(97), 98, 99], 'LOCAL')!
// Windows LOCAL: ANSI encoding
// Linux LOCAL: UTF-8 encoding
assert abc_local == 'abc'
if abc_not_exist := iconv.encoding_to_vstring([u8(97), 98, 99], 'encoding_not_exist') {
assert false, 'encoding_not_exist'
}