mirror of
https://github.com/vlang/v.git
synced 2025-09-13 14:32:26 +03:00
encoding: add an encoding.iconv
module with tests (#22332)
This commit is contained in:
parent
d1606b22b7
commit
747794998b
6 changed files with 1067 additions and 0 deletions
2
.github/workflows/macos_ci.yml
vendored
2
.github/workflows/macos_ci.yml
vendored
|
@ -63,6 +63,8 @@ jobs:
|
|||
echo $VFLAGS
|
||||
v cmd/tools/test_if_v_test_system_works.v
|
||||
./cmd/tools/test_if_v_test_system_works
|
||||
- name: Install iconv for encoding.iconv
|
||||
run: brew install libiconv
|
||||
- name: Test pure V math module
|
||||
run: v -exclude @vlib/math/*.c.v test vlib/math/
|
||||
- name: Self tests
|
||||
|
|
|
@ -280,6 +280,7 @@ pub fn new_test_session(_vargs string, will_compile bool) TestSession {
|
|||
skip_files << 'vlib/net/openssl/openssl_compiles_test.c.v'
|
||||
skip_files << 'vlib/crypto/ecdsa/ecdsa_test.v'
|
||||
skip_files << 'vlib/x/ttf/ttf_test.v'
|
||||
skip_files << 'vlib/encoding/iconv/iconv_test.v' // needs libiconv to be installed
|
||||
}
|
||||
if github_job == 'tests-sanitize-memory-clang' {
|
||||
skip_files << 'vlib/net/openssl/openssl_compiles_test.c.v'
|
||||
|
|
144
vlib/encoding/iconv/iconv.v
Normal file
144
vlib/encoding/iconv/iconv.v
Normal file
|
@ -0,0 +1,144 @@
|
|||
module iconv
|
||||
|
||||
// Module iconv provides functions to convert between vstring(UTF8) and other encodings.
|
||||
import os
|
||||
|
||||
@[inline]
|
||||
fn reverse_u16(src u16) u16 {
|
||||
return u16(src >> 8 | src << 8)
|
||||
}
|
||||
|
||||
@[inline]
|
||||
fn reverse_u32(src u32) u32 {
|
||||
return u32(src >> 24 | ((src >> 8) & 0x0000_FF00) | ((src << 8) & 0x00FF_0000) | src << 24)
|
||||
}
|
||||
|
||||
// vstring_to_encoding convert V string `str` to `tocode` encoding string
|
||||
// tips: use `iconv --list` check for supported encodings
|
||||
pub fn vstring_to_encoding(str string, tocode string) ![]u8 {
|
||||
encoding_name := tocode.to_upper()
|
||||
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
|
||||
return error('please use UTF16-LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
|
||||
}
|
||||
return conv(tocode, 'UTF-8', str.str, str.len)
|
||||
}
|
||||
|
||||
// encoding_to_vstring converts the given `bytes` using `fromcode` encoding, to a V string (encoded with UTF-8)
|
||||
// tips: use `iconv --list` check for supported encodings
|
||||
pub fn encoding_to_vstring(bytes []u8, fromcode string) !string {
|
||||
encoding_name := fromcode.to_upper()
|
||||
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
|
||||
return error('please use UTF16-LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
|
||||
}
|
||||
mut dst := conv('UTF-8', fromcode, bytes.data, bytes.len)!
|
||||
dst << 0 // add a tail zero, to build a vstring
|
||||
return unsafe { cstring_to_vstring(dst.data) }
|
||||
}
|
||||
|
||||
// create_utf_string_with_bom will create a utf8/utf16/utf32 string with BOM header
|
||||
// for utf8, it will prepend 0xEFBBBF to the `src`
|
||||
// for utf16le, it will prepend 0xFFFE to the `src`
|
||||
// for utf16be, it will prepend 0xFEFF to the `src`
|
||||
// for utf32le, it will prepend 0xFFFE0000 to the `src`
|
||||
// for utf32be, it will prepend 0x0000FEFF to the `src`
|
||||
pub fn create_utf_string_with_bom(src []u8, utf_type string) []u8 {
|
||||
mut clone := src.clone()
|
||||
match utf_type.to_upper() {
|
||||
'UTF8', 'UTF-8' {
|
||||
clone.prepend([u8(0xEF), 0xBB, 0xBF])
|
||||
}
|
||||
'UTF16LE', 'UTF-16LE' {
|
||||
clone.prepend([u8(0xFF), 0xFE])
|
||||
}
|
||||
'UTF16BE', 'UTF-16BE' {
|
||||
clone.prepend([u8(0xFE), 0xFF])
|
||||
}
|
||||
'UTF32LE', 'UTF-32LE' {
|
||||
clone.prepend([u8(0xFF), 0xFE, 0, 0])
|
||||
}
|
||||
'UTF32BE', 'UTF-32BE' {
|
||||
clone.prepend([u8(0), 0, 0xFE, 0xFF])
|
||||
}
|
||||
else {}
|
||||
}
|
||||
return clone
|
||||
}
|
||||
|
||||
// remove_utf_string_with_bom will remove a utf8/utf16/utf32 string's BOM header
|
||||
// for utf8, it will remove 0xEFBBBF from the `src`
|
||||
// for utf16le, it will remove 0xFFFE from the `src`
|
||||
// for utf16be, it will remove 0xFEFF from the `src`
|
||||
// for utf32le, it will remove 0xFFFE0000 from the `src`
|
||||
// for utf32be, it will remove 0x0000FEFF from the `src`
|
||||
@[direct_array_access]
|
||||
pub fn remove_utf_string_with_bom(src []u8, utf_type string) []u8 {
|
||||
mut clone := src.clone()
|
||||
match utf_type.to_upper() {
|
||||
'UTF8', 'UTF-8' {
|
||||
if clone.len > 3 {
|
||||
if clone[0] == u8(0xEF) && clone[1] == u8(0xBB) && clone[2] == u8(0xBF) {
|
||||
clone.delete_many(0, 3)
|
||||
}
|
||||
}
|
||||
}
|
||||
'UTF16LE', 'UTF-16LE' {
|
||||
if clone.len > 2 {
|
||||
if clone[0] == u8(0xFF) && clone[1] == u8(0xFE) {
|
||||
clone.delete_many(0, 2)
|
||||
}
|
||||
}
|
||||
}
|
||||
'UTF16BE', 'UTF-16BE' {
|
||||
if clone.len > 2 {
|
||||
if clone[0] == u8(0xFE) && clone[1] == u8(0xFF) {
|
||||
clone.delete_many(0, 2)
|
||||
}
|
||||
}
|
||||
}
|
||||
'UTF32LE', 'UTF-32LE' {
|
||||
if clone.len > 4 {
|
||||
if clone[0] == u8(0xFF) && clone[1] == u8(0xFE) && clone[2] == u8(0)
|
||||
&& clone[3] == u8(0) {
|
||||
clone.delete_many(0, 4)
|
||||
}
|
||||
}
|
||||
}
|
||||
'UTF32BE', 'UTF-32BE' {
|
||||
if clone.len > 4 {
|
||||
if clone[0] == u8(0) && clone[1] == u8(0) && clone[2] == u8(0xFE)
|
||||
&& clone[3] == u8(0xFF) {
|
||||
clone.delete_many(0, 4)
|
||||
}
|
||||
}
|
||||
}
|
||||
else {}
|
||||
}
|
||||
return clone
|
||||
}
|
||||
|
||||
// write_file_encoding write_file convert `text` into `encoding` and writes to a file with the given `path`. If `path` already exists, it will be overwritten.
|
||||
// For `encoding` in UTF8/UTF16/UTF32, if `bom` is true, then a BOM header will write to the file.
|
||||
pub fn write_file_encoding(path string, text string, encoding string, bom bool) ! {
|
||||
encoding_name := encoding.to_upper()
|
||||
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
|
||||
return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
|
||||
}
|
||||
encoding_bytes := vstring_to_encoding(text, encoding)!
|
||||
if bom && encoding.to_upper().starts_with('UTF') {
|
||||
encoding_bom_bytes := create_utf_string_with_bom(encoding_bytes, encoding)
|
||||
os.write_file_array(path, encoding_bom_bytes)!
|
||||
} else {
|
||||
os.write_file_array(path, encoding_bytes)!
|
||||
}
|
||||
}
|
||||
|
||||
// read_file_encoding reads the file in `path` with `encoding` and returns the contents
|
||||
pub fn read_file_encoding(path string, encoding string) !string {
|
||||
encoding_name := encoding.to_upper()
|
||||
if encoding_name in ['UTF16', 'UTF32', 'UTF-16', 'UTF-32']! {
|
||||
return error('please use UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE instead')
|
||||
}
|
||||
encoding_bytes := os.read_file_array[u8](path)
|
||||
encoding_without_bom_bytes := remove_utf_string_with_bom(encoding_bytes, encoding)
|
||||
return encoding_to_vstring(encoding_without_bom_bytes, encoding)!
|
||||
}
|
58
vlib/encoding/iconv/iconv_nix.c.v
Normal file
58
vlib/encoding/iconv/iconv_nix.c.v
Normal file
|
@ -0,0 +1,58 @@
|
|||
module iconv
|
||||
|
||||
// Module iconv provides functions convert between vstring(UTF8) to/from different encodings.
|
||||
|
||||
#include <iconv.h>
|
||||
#flag darwin -liconv
|
||||
|
||||
fn C.iconv_open(tocode &u8, fromcode &u8) voidptr
|
||||
fn C.iconv_close(cd voidptr) int
|
||||
fn C.iconv(cd voidptr, inbuf &&u8, inbytesleft &usize, outbuf &&u8, outbytesleft &usize) usize
|
||||
|
||||
// conv convert `fromcode` encoding string to `tocode` encoding string
|
||||
@[direct_array_access]
|
||||
fn conv(tocode string, fromcode string, src &u8, src_len int) ![]u8 {
|
||||
if src_len < 0 {
|
||||
return error('src length error')
|
||||
}
|
||||
|
||||
mut src_encoding := fromcode.to_upper()
|
||||
mut dst_encoding := tocode.to_upper()
|
||||
|
||||
// As macos-12 has no UTF16LE/UTF16BE/UTF32LE/UTF32BE, change them to UTF-16LE/UTF-16BE/UTF-32LE/UTF-32BE
|
||||
match src_encoding {
|
||||
'UTF16LE' { src_encoding = 'UTF-16LE' }
|
||||
'UTF16BE' { src_encoding = 'UTF-16BE' }
|
||||
'UTF32LE' { src_encoding = 'UTF-32LE' }
|
||||
'UTF32BE' { src_encoding = 'UTF-32BE' }
|
||||
else {}
|
||||
}
|
||||
match dst_encoding {
|
||||
'UTF16LE' { dst_encoding = 'UTF-16LE' }
|
||||
'UTF16BE' { dst_encoding = 'UTF-16BE' }
|
||||
'UTF32LE' { dst_encoding = 'UTF-32LE' }
|
||||
'UTF32BE' { dst_encoding = 'UTF-32BE' }
|
||||
else {}
|
||||
}
|
||||
|
||||
mut cd := C.iconv_open(dst_encoding.str, src_encoding.str)
|
||||
if isize(cd) == -1 {
|
||||
return error('platform can\'t convert from ${src_encoding} to ${dst_encoding}')
|
||||
}
|
||||
defer { C.iconv_close(cd) }
|
||||
|
||||
mut dst := []u8{len: (src_len + 1) * 4} // this should be enough to hold the dst encoding string
|
||||
|
||||
mut src_ptr := &u8(src)
|
||||
mut dst_ptr := &u8(dst.data)
|
||||
mut src_left := usize(src_len)
|
||||
mut dst_left := usize(dst.len)
|
||||
res := C.iconv(cd, &src_ptr, &src_left, &dst_ptr, &dst_left)
|
||||
if res == usize(-1) {
|
||||
return error('convert encoding string fail, iconv return ${res}')
|
||||
}
|
||||
|
||||
// resize dst buf to real length
|
||||
dst.trim(dst.len - int(dst_left))
|
||||
return dst
|
||||
}
|
129
vlib/encoding/iconv/iconv_test.v
Normal file
129
vlib/encoding/iconv/iconv_test.v
Normal file
|
@ -0,0 +1,129 @@
|
|||
import encoding.iconv
|
||||
import os
|
||||
|
||||
fn test_vstring_to_encoding() {
|
||||
empty_utf8 := iconv.vstring_to_encoding('', 'UTF-8')!
|
||||
assert empty_utf8 == []
|
||||
|
||||
abc_utf8 := iconv.vstring_to_encoding('abc', 'UTF-8')!
|
||||
assert abc_utf8 == [u8(97), 98, 99]
|
||||
|
||||
abc_utf16le := iconv.vstring_to_encoding('abc', 'UTF-16LE')!
|
||||
assert abc_utf16le == [u8(97), 0, 98, 0, 99, 0]
|
||||
|
||||
abc_utf16be := iconv.vstring_to_encoding('abc', 'UTF-16BE')!
|
||||
assert abc_utf16be == [u8(0), 97, 0, 98, 0, 99]
|
||||
|
||||
abc_utf32le := iconv.vstring_to_encoding('abc', 'UTF-32LE')!
|
||||
assert abc_utf32le == [u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
|
||||
|
||||
abc_utf32be := iconv.vstring_to_encoding('abc', 'UTF-32BE')!
|
||||
assert abc_utf32be == [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
|
||||
|
||||
if abc_not_exist := iconv.vstring_to_encoding('abc', 'encoding_not_exist') {
|
||||
assert false, 'encoding_not_exist'
|
||||
}
|
||||
|
||||
if ch_str := iconv.vstring_to_encoding('V大法好abc', 'GB2312') {
|
||||
assert ch_str == [u8(86), 180, 243, 183, 168, 186, 195, 97, 98, 99]
|
||||
} else {
|
||||
// some platforms do not support GB2312, skip
|
||||
assert true
|
||||
}
|
||||
}
|
||||
|
||||
fn test_encoding_to_vstring() {
|
||||
empty_utf8 := iconv.encoding_to_vstring([], 'UTF-8')!
|
||||
assert empty_utf8 == ''
|
||||
|
||||
abc_utf8 := iconv.encoding_to_vstring([u8(97), 98, 99], 'UTF-8')!
|
||||
assert abc_utf8 == 'abc'
|
||||
|
||||
abc_utf16le := iconv.encoding_to_vstring([u8(97), 0, 98, 0, 99, 0], 'UTF-16LE')!
|
||||
assert abc_utf16le == 'abc'
|
||||
|
||||
abc_utf16be := iconv.encoding_to_vstring([u8(0), 97, 0, 98, 0, 99], 'UTF-16BE')!
|
||||
assert abc_utf16be == 'abc'
|
||||
|
||||
abc_utf32le := iconv.encoding_to_vstring([u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0],
|
||||
'UTF-32LE')!
|
||||
assert abc_utf32le == 'abc'
|
||||
|
||||
abc_utf32be := iconv.encoding_to_vstring([u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99],
|
||||
'UTF-32BE')!
|
||||
assert abc_utf32be == 'abc'
|
||||
|
||||
if abc_not_exist := iconv.encoding_to_vstring([u8(97), 98, 99], 'encoding_not_exist') {
|
||||
assert false, 'encoding_not_exist'
|
||||
}
|
||||
|
||||
if ch_str := iconv.encoding_to_vstring([u8(86), 180, 243, 183, 168, 186, 195, 97, 98, 99],
|
||||
'GB2312')
|
||||
{
|
||||
assert ch_str == 'V大法好abc'
|
||||
} else {
|
||||
// some platforms do not support GB2312, skip
|
||||
assert true
|
||||
}
|
||||
}
|
||||
|
||||
fn test_create_utf_string_with_bom() {
|
||||
// bug ? vfmt create strange format here
|
||||
// vfmt off
|
||||
assert iconv.create_utf_string_with_bom([u8(97), 98, 99], 'UTF-8') == [u8(0xEF), 0xBB, 0xBF, 97, 98, 99]
|
||||
assert iconv.create_utf_string_with_bom([u8(97), 0, 98, 0, 99, 0], 'UTF-16LE') == [u8(0xFF), 0xFE, 97, 0, 98, 0, 99, 0]
|
||||
assert iconv.create_utf_string_with_bom([u8(0), 97, 0, 98, 0, 99], 'UTF-16BE') == [u8(0xFE), 0xFF, 0, 97, 0, 98, 0, 99]
|
||||
assert iconv.create_utf_string_with_bom([u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0], 'UTF-32LE') == [u8(0xFF), 0xFE, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
|
||||
assert iconv.create_utf_string_with_bom([u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99], 'UTF-32BE') == [u8(0), 0, 0xFE, 0xFF, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
|
||||
// vfmt on
|
||||
}
|
||||
|
||||
fn test_remove_utf_string_with_bom() {
|
||||
// bug ? vfmt create strange format here
|
||||
// vfmt off
|
||||
assert iconv.remove_utf_string_with_bom([u8(0xEF), 0xBB, 0xBF, 97, 98, 99], 'UTF-8') == [u8(97), 98, 99]
|
||||
assert iconv.remove_utf_string_with_bom([u8(0xFF), 0xFE, 97, 0, 98, 0, 99, 0], 'UTF-16LE') == [u8(97), 0, 98, 0, 99, 0]
|
||||
assert iconv.remove_utf_string_with_bom([u8(0xFE), 0xFF, 0, 97, 0, 98, 0, 99], 'UTF-16BE') == [u8(0), 97, 0, 98, 0, 99]
|
||||
assert iconv.remove_utf_string_with_bom([u8(0xFF), 0xFE, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0], 'UTF-32LE') == [u8(97), 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0]
|
||||
assert iconv.remove_utf_string_with_bom([u8(0), 0, 0xFE, 0xFF, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99], 'UTF-32BE') == [u8(0), 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99]
|
||||
// vfmt on
|
||||
}
|
||||
|
||||
fn my_test_read_file_encoding_write_file_encoding(txt string, encoding string, bom bool, bytes []u8) ! {
|
||||
iconv.write_file_encoding('iconv_tmp.txt', txt, encoding, bom)!
|
||||
// read bytes directly from file
|
||||
mut bytes_ref := os.read_file_array[u8]('iconv_tmp.txt')
|
||||
assert bytes_ref == bytes
|
||||
if bom {
|
||||
bytes_ref = iconv.remove_utf_string_with_bom(bytes_ref, encoding)
|
||||
}
|
||||
str_ref := iconv.encoding_to_vstring(bytes_ref, encoding)!
|
||||
assert str_ref.bytes() == txt.bytes()
|
||||
str_conv := iconv.read_file_encoding('iconv_tmp.txt', encoding)!
|
||||
assert str_conv == txt
|
||||
os.rm('iconv_tmp.txt')!
|
||||
}
|
||||
|
||||
fn test_read_file_encoding_write_file_encoding() ! {
|
||||
// vfmt off
|
||||
// UTF-8
|
||||
my_test_read_file_encoding_write_file_encoding('V大法好abc','UTF-8',false,[u8(86), 229, 164, 167, 230, 179, 149, 229, 165, 189, 97, 98, 99])!
|
||||
my_test_read_file_encoding_write_file_encoding('V大法好abc','UTF-8',true,[u8(0xEF), 0xBB, 0xBF, 86, 229, 164, 167, 230, 179, 149, 229, 165, 189, 97, 98, 99])!
|
||||
|
||||
// UTF-16LE
|
||||
my_test_read_file_encoding_write_file_encoding('V大法好abc','UTF-16LE',false,[u8(86), 0, 39, 89, 213, 108, 125, 89, 97, 0, 98, 0, 99, 0])!
|
||||
my_test_read_file_encoding_write_file_encoding('V大法好abc','UTF-16LE',true,[u8(0xFF), 0xFE, 86, 0, 39, 89, 213, 108, 125, 89, 97, 0, 98, 0, 99, 0])!
|
||||
|
||||
// UTF-16BE
|
||||
my_test_read_file_encoding_write_file_encoding('V大法好abc','UTF-16BE',false,[u8(0), 86, 89, 39, 108, 213, 89, 125, 0, 97, 0, 98, 0, 99])!
|
||||
my_test_read_file_encoding_write_file_encoding('V大法好abc','UTF-16BE',true,[u8(0xFE), 0xFF, 0, 86, 89, 39, 108, 213, 89, 125, 0, 97, 0, 98, 0, 99])!
|
||||
|
||||
// UTF-32LE
|
||||
my_test_read_file_encoding_write_file_encoding('V大法好abc','UTF-32LE',false,[u8(86), 0, 0, 0, 39, 89, 0, 0, 213, 108, 0, 0, 125, 89, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0])!
|
||||
my_test_read_file_encoding_write_file_encoding('V大法好abc','UTF-32LE',true,[u8(0xFF), 0xFE, 0, 0, 86, 0, 0, 0, 39, 89, 0, 0, 213, 108, 0, 0, 125, 89, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99, 0, 0, 0])!
|
||||
|
||||
// UTF-32BE
|
||||
my_test_read_file_encoding_write_file_encoding('V大法好abc','UTF-32BE',false,[u8(0), 0, 0, 86, 0, 0, 89, 39, 0, 0, 108, 213, 0, 0, 89, 125, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99])!
|
||||
my_test_read_file_encoding_write_file_encoding('V大法好abc','UTF-32BE',true,[u8(0), 0, 0xFE, 0xFF, 0, 0, 0, 86, 0, 0, 89, 39, 0, 0, 108, 213, 0, 0, 89, 125, 0, 0, 0, 97, 0, 0, 0, 98, 0, 0, 0, 99])!
|
||||
// vfmt on
|
||||
}
|
733
vlib/encoding/iconv/iconv_windows.c.v
Normal file
733
vlib/encoding/iconv/iconv_windows.c.v
Normal file
|
@ -0,0 +1,733 @@
|
|||
module iconv
|
||||
|
||||
// Module iconv provides functions convert between vstring(UTF8) to/from different encodings.
|
||||
// iconv implementation using Win32 API to convert
|
||||
// Idear from https://github.com/win-iconv/win-iconv
|
||||
|
||||
fn C.GetACP() int
|
||||
fn C.GetOEMCP() int
|
||||
fn C.WideCharToMultiByte(codepage u32, dwflags u32, src &u8, src_len int, dst &u8, dst_len int, default_char &u8, used_default_char &bool) int
|
||||
fn C.MultiByteToWideChar(codepage u32, dwflags u32, src &u8, src_len int, dst &u8, dst_len int) int
|
||||
|
||||
struct Codepage_Alias {
|
||||
codepage int
|
||||
name string
|
||||
}
|
||||
|
||||
const codepage_alias = [
|
||||
// NOTE! name field string MUST in uppercase!
|
||||
// vfmt off
|
||||
Codepage_Alias{65001, 'CP65001'},
|
||||
Codepage_Alias{65001, 'UTF8'},
|
||||
Codepage_Alias{65001, 'UTF-8'},
|
||||
|
||||
Codepage_Alias{1200, 'CP1200'},
|
||||
Codepage_Alias{1200, 'UTF16LE'},
|
||||
Codepage_Alias{1200, 'UTF-16LE'},
|
||||
Codepage_Alias{1200, 'UCS2LE'},
|
||||
Codepage_Alias{1200, 'UCS-2LE'},
|
||||
Codepage_Alias{1200, 'UCS-2-INTERNAL'},
|
||||
Codepage_Alias{1200, 'UNICODE'}, // for iconv
|
||||
|
||||
Codepage_Alias{1201, 'CP1201'},
|
||||
Codepage_Alias{1201, 'UTF16BE'},
|
||||
Codepage_Alias{1201, 'UTF-16BE'},
|
||||
Codepage_Alias{1201, 'UCS2BE'},
|
||||
Codepage_Alias{1201, 'UCS-2BE'},
|
||||
Codepage_Alias{1201, 'UNICODEFFFE'},
|
||||
|
||||
Codepage_Alias{12000, 'CP12000'},
|
||||
Codepage_Alias{12000, 'UTF32LE'},
|
||||
Codepage_Alias{12000, 'UTF-32LE'},
|
||||
Codepage_Alias{12000, 'UCS4LE'},
|
||||
Codepage_Alias{12000, 'UCS-4LE'},
|
||||
|
||||
Codepage_Alias{12001, 'CP12001'},
|
||||
Codepage_Alias{12001, 'UTF32BE'},
|
||||
Codepage_Alias{12001, 'UTF-32BE'},
|
||||
Codepage_Alias{12001, 'UCS4BE'},
|
||||
Codepage_Alias{12001, 'UCS-4BE'},
|
||||
|
||||
//#ifndef GLIB_COMPILATION
|
||||
// //
|
||||
// * Default is big endian.
|
||||
// * See rfc2781 4.3 Interpreting text labelled as UTF-16.
|
||||
//
|
||||
// Codepage_Alias{1201, 'UTF16'},
|
||||
// Codepage_Alias{1201, 'UTF-16'},
|
||||
// Codepage_Alias{1201, 'UCS2'},
|
||||
// Codepage_Alias{1201, 'UCS-2'},
|
||||
// Codepage_Alias{12001, 'UTF32'},
|
||||
// Codepage_Alias{12001, 'UTF-32'},
|
||||
// Codepage_Alias{12001, 'UCS-4'},
|
||||
// Codepage_Alias{12001, 'UCS4'},
|
||||
//#else
|
||||
// Default is little endian, because the platform is
|
||||
Codepage_Alias{1200, 'UTF16'},
|
||||
Codepage_Alias{1200, 'UTF-16'},
|
||||
Codepage_Alias{1200, 'UCS2'},
|
||||
Codepage_Alias{1200, 'UCS-2'},
|
||||
Codepage_Alias{12000, 'UTF32'},
|
||||
Codepage_Alias{12000, 'UTF-32'},
|
||||
Codepage_Alias{12000, 'UCS4'},
|
||||
Codepage_Alias{12000, 'UCS-4'},
|
||||
//#endif
|
||||
|
||||
// copy from libiconv `iconv -l`
|
||||
// !IsValidCodePage(367)
|
||||
Codepage_Alias{20127, 'ANSI_X3.4-1968'},
|
||||
Codepage_Alias{20127, 'ANSI_X3.4-1986'},
|
||||
Codepage_Alias{20127, 'ASCII'},
|
||||
Codepage_Alias{20127, 'CP367'},
|
||||
Codepage_Alias{20127, 'IBM367'},
|
||||
Codepage_Alias{20127, 'ISO-IR-6'},
|
||||
Codepage_Alias{20127, 'ISO646-US'},
|
||||
Codepage_Alias{20127, 'ISO_646.IRV:1991'},
|
||||
Codepage_Alias{20127, 'US'},
|
||||
Codepage_Alias{20127, 'US-ASCII'},
|
||||
Codepage_Alias{20127, 'CSASCII'},
|
||||
|
||||
// !IsValidCodePage(819)
|
||||
Codepage_Alias{1252, 'CP819'},
|
||||
Codepage_Alias{1252, 'IBM819'},
|
||||
Codepage_Alias{28591, 'ISO-8859-1'},
|
||||
Codepage_Alias{28591, 'ISO-IR-100'},
|
||||
Codepage_Alias{28591, 'ISO8859-1'},
|
||||
Codepage_Alias{28591, 'ISO_8859-1'},
|
||||
Codepage_Alias{28591, 'ISO_8859-1:1987'},
|
||||
Codepage_Alias{28591, 'L1'},
|
||||
Codepage_Alias{28591, 'LATIN1'},
|
||||
Codepage_Alias{28591, 'CSISOLATIN1'},
|
||||
|
||||
Codepage_Alias{1250, 'CP1250'},
|
||||
Codepage_Alias{1250, 'MS-EE'},
|
||||
Codepage_Alias{1250, 'WINDOWS-1250'},
|
||||
|
||||
Codepage_Alias{1251, 'CP1251'},
|
||||
Codepage_Alias{1251, 'MS-CYRL'},
|
||||
Codepage_Alias{1251, 'WINDOWS-1251'},
|
||||
|
||||
Codepage_Alias{1252, 'CP1252'},
|
||||
Codepage_Alias{1252, 'MS-ANSI'},
|
||||
Codepage_Alias{1252, 'WINDOWS-1252'},
|
||||
|
||||
Codepage_Alias{1253, 'CP1253'},
|
||||
Codepage_Alias{1253, 'MS-GREEK'},
|
||||
Codepage_Alias{1253, 'WINDOWS-1253'},
|
||||
|
||||
Codepage_Alias{1254, 'CP1254'},
|
||||
Codepage_Alias{1254, 'MS-TURK'},
|
||||
Codepage_Alias{1254, 'WINDOWS-1254'},
|
||||
|
||||
Codepage_Alias{1255, 'CP1255'},
|
||||
Codepage_Alias{1255, 'MS-HEBR'},
|
||||
Codepage_Alias{1255, 'WINDOWS-1255'},
|
||||
|
||||
Codepage_Alias{1256, 'CP1256'},
|
||||
Codepage_Alias{1256, 'MS-ARAB'},
|
||||
Codepage_Alias{1256, 'WINDOWS-1256'},
|
||||
|
||||
Codepage_Alias{1257, 'CP1257'},
|
||||
Codepage_Alias{1257, 'WINBALTRIM'},
|
||||
Codepage_Alias{1257, 'WINDOWS-1257'},
|
||||
|
||||
Codepage_Alias{1258, 'CP1258'},
|
||||
Codepage_Alias{1258, 'WINDOWS-1258'},
|
||||
|
||||
Codepage_Alias{850, '850'},
|
||||
Codepage_Alias{850, 'CP850'},
|
||||
Codepage_Alias{850, 'IBM850'},
|
||||
Codepage_Alias{850, 'CSPC850MULTILINGUAL'},
|
||||
|
||||
// !IsValidCodePage(862)
|
||||
Codepage_Alias{862, '862'},
|
||||
Codepage_Alias{862, 'CP862'},
|
||||
Codepage_Alias{862, 'IBM862'},
|
||||
Codepage_Alias{862, 'CSPC862LATINHEBREW'},
|
||||
|
||||
Codepage_Alias{866, '866'},
|
||||
Codepage_Alias{866, 'CP866'},
|
||||
Codepage_Alias{866, 'IBM866'},
|
||||
Codepage_Alias{866, 'CSIBM866'},
|
||||
|
||||
// !IsValidCodePage(154)
|
||||
Codepage_Alias{154, 'CP154'},
|
||||
Codepage_Alias{154, 'CYRILLIC-ASIAN'},
|
||||
Codepage_Alias{154, 'PT154'},
|
||||
Codepage_Alias{154, 'PTCP154'},
|
||||
Codepage_Alias{154, 'CSPTCP154'},
|
||||
|
||||
// !IsValidCodePage(1133)
|
||||
Codepage_Alias{1133, 'CP1133'},
|
||||
Codepage_Alias{1133, 'IBM-CP1133'},
|
||||
|
||||
Codepage_Alias{874, 'CP874'},
|
||||
Codepage_Alias{874, 'WINDOWS-874'},
|
||||
|
||||
// !IsValidCodePage(51932)
|
||||
Codepage_Alias{51932, 'CP51932'},
|
||||
Codepage_Alias{51932, 'MS51932'},
|
||||
Codepage_Alias{51932, 'WINDOWS-51932'},
|
||||
Codepage_Alias{51932, 'EUC-JP'},
|
||||
|
||||
Codepage_Alias{932, 'CP932'},
|
||||
Codepage_Alias{932, 'MS932'},
|
||||
Codepage_Alias{932, 'SHIFFT_JIS'},
|
||||
Codepage_Alias{932, 'SHIFFT_JIS-MS'},
|
||||
Codepage_Alias{932, 'SJIS'},
|
||||
Codepage_Alias{932, 'SJIS-MS'},
|
||||
Codepage_Alias{932, 'SJIS-OPEN'},
|
||||
Codepage_Alias{932, 'SJIS-WIN'},
|
||||
Codepage_Alias{932, 'WINDOWS-31J'},
|
||||
Codepage_Alias{932, 'WINDOWS-932'},
|
||||
Codepage_Alias{932, 'CSWINDOWS31J'},
|
||||
|
||||
Codepage_Alias{50221, 'CP50221'},
|
||||
Codepage_Alias{50221, 'ISO-2022-JP'},
|
||||
Codepage_Alias{50221, 'ISO-2022-JP-MS'},
|
||||
Codepage_Alias{50221, 'ISO2022-JP'},
|
||||
Codepage_Alias{50221, 'ISO2022-JP-MS'},
|
||||
Codepage_Alias{50221, 'MS50221'},
|
||||
Codepage_Alias{50221, 'WINDOWS-50221'},
|
||||
|
||||
Codepage_Alias{936, 'CP936'},
|
||||
Codepage_Alias{936, 'GBK'},
|
||||
Codepage_Alias{936, 'MS936'},
|
||||
Codepage_Alias{936, 'WINDOWS-936'},
|
||||
|
||||
Codepage_Alias{950, 'CP950'},
|
||||
Codepage_Alias{950, 'BIG5'},
|
||||
Codepage_Alias{950, 'BIG5HKSCS'},
|
||||
Codepage_Alias{950, 'BIG5-HKSCS'},
|
||||
|
||||
Codepage_Alias{949, 'CP949'},
|
||||
Codepage_Alias{949, 'UHC'},
|
||||
Codepage_Alias{949, 'EUC-KR'},
|
||||
|
||||
Codepage_Alias{1361, 'CP1361'},
|
||||
Codepage_Alias{1361, 'JOHAB'},
|
||||
|
||||
Codepage_Alias{437, '437'},
|
||||
Codepage_Alias{437, 'CP437'},
|
||||
Codepage_Alias{437, 'IBM437'},
|
||||
Codepage_Alias{437, 'CSPC8CODEPAGE437'},
|
||||
|
||||
Codepage_Alias{737, 'CP737'},
|
||||
|
||||
Codepage_Alias{775, 'CP775'},
|
||||
Codepage_Alias{775, 'IBM775'},
|
||||
Codepage_Alias{775, 'CSPC775BALTIC'},
|
||||
|
||||
Codepage_Alias{852, '852'},
|
||||
Codepage_Alias{852, 'CP852'},
|
||||
Codepage_Alias{852, 'IBM852'},
|
||||
Codepage_Alias{852, 'CSPCP852'},
|
||||
|
||||
// !IsValidCodePage(853)
|
||||
Codepage_Alias{853, 'CP853'},
|
||||
|
||||
Codepage_Alias{855, '855'},
|
||||
Codepage_Alias{855, 'CP855'},
|
||||
Codepage_Alias{855, 'IBM855'},
|
||||
Codepage_Alias{855, 'CSIBM855'},
|
||||
|
||||
Codepage_Alias{857, '857'},
|
||||
Codepage_Alias{857, 'CP857'},
|
||||
Codepage_Alias{857, 'IBM857'},
|
||||
Codepage_Alias{857, 'CSIBM857'},
|
||||
|
||||
// !IsValidCodePage(858)
|
||||
Codepage_Alias{858, 'CP858'},
|
||||
|
||||
Codepage_Alias{860, '860'},
|
||||
Codepage_Alias{860, 'CP860'},
|
||||
Codepage_Alias{860, 'IBM860'},
|
||||
Codepage_Alias{860, 'CSIBM860'},
|
||||
|
||||
Codepage_Alias{861, '861'},
|
||||
Codepage_Alias{861, 'CP-IS'},
|
||||
Codepage_Alias{861, 'CP861'},
|
||||
Codepage_Alias{861, 'IBM861'},
|
||||
Codepage_Alias{861, 'CSIBM861'},
|
||||
|
||||
Codepage_Alias{863, '863'},
|
||||
Codepage_Alias{863, 'CP863'},
|
||||
Codepage_Alias{863, 'IBM863'},
|
||||
Codepage_Alias{863, 'CSIBM863'},
|
||||
|
||||
Codepage_Alias{864, 'CP864'},
|
||||
Codepage_Alias{864, 'IBM864'},
|
||||
Codepage_Alias{864, 'CSIBM864'},
|
||||
|
||||
Codepage_Alias{865, '865'},
|
||||
Codepage_Alias{865, 'CP865'},
|
||||
Codepage_Alias{865, 'IBM865'},
|
||||
Codepage_Alias{865, 'CSIBM865'},
|
||||
|
||||
Codepage_Alias{869, '869'},
|
||||
Codepage_Alias{869, 'CP-GR'},
|
||||
Codepage_Alias{869, 'CP869'},
|
||||
Codepage_Alias{869, 'IBM869'},
|
||||
Codepage_Alias{869, 'CSIBM869'},
|
||||
|
||||
// !IsValidCodePage(1152)
|
||||
Codepage_Alias{1125, 'CP1125'},
|
||||
|
||||
//
|
||||
// * Code Page Identifiers
|
||||
// * https://learn.microsoft.com/en-us/windows/win32/intl/code-page-identifiers
|
||||
|
||||
Codepage_Alias{37, 'IBM037'}, // IBM EBCDIC US-Canada
|
||||
Codepage_Alias{437, 'IBM437'}, // OEM United States
|
||||
Codepage_Alias{500, 'IBM500'}, // IBM EBCDIC International
|
||||
Codepage_Alias{708, 'ASMO-708'}, // Arabic (ASMO 708)
|
||||
// 709 Arabic (ASMO-449+, BCON V4)
|
||||
// 710 Arabic - Transparent Arabic
|
||||
Codepage_Alias{720, 'DOS-720'}, // Arabic (Transparent ASMO); Arabic (DOS)
|
||||
Codepage_Alias{737, 'IBM737'}, // OEM Greek (formerly 437G); Greek (DOS)
|
||||
Codepage_Alias{775, 'IBM775'}, // OEM Baltic; Baltic (DOS)
|
||||
Codepage_Alias{850, 'IBM850'}, // OEM Multilingual Latin 1; Western European (DOS)
|
||||
Codepage_Alias{852, 'IBM852'}, // OEM Latin 2; Central European (DOS)
|
||||
Codepage_Alias{855, 'IBM855'}, // OEM Cyrillic (primarily Russian)
|
||||
Codepage_Alias{857, 'IBM857'}, // OEM Turkish; Turkish (DOS)
|
||||
Codepage_Alias{858, 'IBM00858'}, // OEM Multilingual Latin 1 + Euro symbol
|
||||
Codepage_Alias{860, 'IBM860'}, // OEM Portuguese; Portuguese (DOS)
|
||||
Codepage_Alias{861, 'IBM861'}, // OEM Icelandic; Icelandic (DOS)
|
||||
Codepage_Alias{862, 'DOS-862'}, // OEM Hebrew; Hebrew (DOS)
|
||||
Codepage_Alias{863, 'IBM863'}, // OEM French Canadian; French Canadian (DOS)
|
||||
Codepage_Alias{864, 'IBM864'}, // OEM Arabic; Arabic (864)
|
||||
Codepage_Alias{865, 'IBM865'}, // OEM Nordic; Nordic (DOS)
|
||||
Codepage_Alias{866, 'CP866'}, // OEM Russian; Cyrillic (DOS)
|
||||
Codepage_Alias{869, 'IBM869'}, // OEM Modern Greek; Greek, Modern (DOS)
|
||||
Codepage_Alias{870, 'IBM870'}, // IBM EBCDIC Multilingual/ROECE (Latin 2); IBM EBCDIC Multilingual Latin 2
|
||||
Codepage_Alias{874, 'WINDOWS-874'}, // ANSI/OEM Thai (same as 28605, ISO 8859-15); Thai (Windows)
|
||||
Codepage_Alias{875, 'CP875'}, // IBM EBCDIC Greek Modern
|
||||
Codepage_Alias{932, 'SHIFT_JIS'}, // ANSI/OEM Japanese; Japanese (Shift-JIS)
|
||||
Codepage_Alias{932, 'SHIFT-JIS'}, // alternative name for it
|
||||
Codepage_Alias{936, 'GB2312'}, // ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
|
||||
Codepage_Alias{949, 'KS_C_5601-1987'}, // ANSI/OEM Korean (Unified Hangul Code)
|
||||
Codepage_Alias{950, 'BIG5'}, // ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
|
||||
Codepage_Alias{950, 'BIG5HKSCS'}, // ANSI/OEM Traditional Chinese (Hong Kong SAR); Chinese Traditional (Big5-HKSCS)
|
||||
Codepage_Alias{950, 'BIG5-HKSCS'}, // alternative name for it
|
||||
Codepage_Alias{1026, 'IBM1026'}, // IBM EBCDIC Turkish (Latin 5)
|
||||
Codepage_Alias{1047, 'IBM01047'}, // IBM EBCDIC Latin 1/Open System
|
||||
Codepage_Alias{1140, 'IBM01140'}, // IBM EBCDIC US-Canada (037 + Euro symbol); IBM EBCDIC (US-Canada-Euro)
|
||||
Codepage_Alias{1141, 'IBM01141'}, // IBM EBCDIC Germany (20273 + Euro symbol); IBM EBCDIC (Germany-Euro)
|
||||
Codepage_Alias{1142, 'IBM01142'}, // IBM EBCDIC Denmark-Norway (20277 + Euro symbol); IBM EBCDIC (Denmark-Norway-Euro)
|
||||
Codepage_Alias{1143, 'IBM01143'}, // IBM EBCDIC Finland-Sweden (20278 + Euro symbol); IBM EBCDIC (Finland-Sweden-Euro)
|
||||
Codepage_Alias{1144, 'IBM01144'}, // IBM EBCDIC Italy (20280 + Euro symbol); IBM EBCDIC (Italy-Euro)
|
||||
Codepage_Alias{1145, 'IBM01145'}, // IBM EBCDIC Latin America-Spain (20284 + Euro symbol); IBM EBCDIC (Spain-Euro)
|
||||
Codepage_Alias{1146, 'IBM01146'}, // IBM EBCDIC United Kingdom (20285 + Euro symbol); IBM EBCDIC (UK-Euro)
|
||||
Codepage_Alias{1147, 'IBM01147'}, // IBM EBCDIC France (20297 + Euro symbol); IBM EBCDIC (France-Euro)
|
||||
Codepage_Alias{1148, 'IBM01148'}, // IBM EBCDIC International (500 + Euro symbol); IBM EBCDIC (International-Euro)
|
||||
Codepage_Alias{1149, 'IBM01149'}, // IBM EBCDIC Icelandic (20871 + Euro symbol); IBM EBCDIC (Icelandic-Euro)
|
||||
Codepage_Alias{1200, 'UTF-16'}, // Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
|
||||
Codepage_Alias{1201, 'UNICODEFFFE'}, // Unicode UTF-16, big endian byte order; available only to managed applications
|
||||
Codepage_Alias{1250, 'WINDOWS-1250'}, // ANSI Central European; Central European (Windows)
|
||||
Codepage_Alias{1251, 'WINDOWS-1251'}, // ANSI Cyrillic; Cyrillic (Windows)
|
||||
Codepage_Alias{1252, 'WINDOWS-1252'}, // ANSI Latin 1; Western European (Windows)
|
||||
Codepage_Alias{1253, 'WINDOWS-1253'}, // ANSI Greek; Greek (Windows)
|
||||
Codepage_Alias{1254, 'WINDOWS-1254'}, // ANSI Turkish; Turkish (Windows)
|
||||
Codepage_Alias{1255, 'WINDOWS-1255'}, // ANSI Hebrew; Hebrew (Windows)
|
||||
Codepage_Alias{1256, 'WINDOWS-1256'}, // ANSI Arabic; Arabic (Windows)
|
||||
Codepage_Alias{1257, 'WINDOWS-1257'}, // ANSI Baltic; Baltic (Windows)
|
||||
Codepage_Alias{1258, 'WINDOWS-1258'}, // ANSI/OEM Vietnamese; Vietnamese (Windows)
|
||||
Codepage_Alias{1361, 'JOHAB'}, // Korean (Johab)
|
||||
Codepage_Alias{10000, 'MACINTOSH'}, // MAC Roman; Western European (Mac)
|
||||
Codepage_Alias{10001, 'X-MAC-JAPANESE'}, // Japanese (Mac)
|
||||
Codepage_Alias{10002, 'X-MAC-CHINESETRAD'}, // MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
|
||||
Codepage_Alias{10003, 'X-MAC-KOREAN'}, // Korean (Mac)
|
||||
Codepage_Alias{10004, 'X-MAC-ARABIC'}, // Arabic (Mac)
|
||||
Codepage_Alias{10005, 'X-MAC-HEBREW'}, // Hebrew (Mac)
|
||||
Codepage_Alias{10006, 'X-MAC-GREEK'}, // Greek (Mac)
|
||||
Codepage_Alias{10007, 'X-MAC-CYRILLIC'}, // Cyrillic (Mac)
|
||||
Codepage_Alias{10008, 'X-MAC-CHINESESIMP'}, // MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
|
||||
Codepage_Alias{10010, 'X-MAC-ROMANIAN'}, // Romanian (Mac)
|
||||
Codepage_Alias{10017, 'X-MAC-UKRAINIAN'}, // Ukrainian (Mac)
|
||||
Codepage_Alias{10021, 'X-MAC-THAI'}, // Thai (Mac)
|
||||
Codepage_Alias{10029, 'X-MAC-CE'}, // MAC Latin 2; Central European (Mac)
|
||||
Codepage_Alias{10079, 'X-MAC-ICELANDIC'}, // Icelandic (Mac)
|
||||
Codepage_Alias{10081, 'X-MAC-TURKISH'}, // Turkish (Mac)
|
||||
Codepage_Alias{10082, 'X-MAC-CROATIAN'}, // Croatian (Mac)
|
||||
Codepage_Alias{12000, 'UTF-32'}, // Unicode UTF-32, little endian byte order; available only to managed applications
|
||||
Codepage_Alias{12001, 'UTF-32BE'}, // Unicode UTF-32, big endian byte order; available only to managed applications
|
||||
Codepage_Alias{20000, 'X-CHINESE_CNS'}, // CNS Taiwan; Chinese Traditional (CNS)
|
||||
Codepage_Alias{20001, 'X-CP20001'}, // TCA Taiwan
|
||||
Codepage_Alias{20002, 'X_CHINESE-ETEN'}, // Eten Taiwan; Chinese Traditional (Eten)
|
||||
Codepage_Alias{20003, 'X-CP20003'}, // IBM5550 Taiwan
|
||||
Codepage_Alias{20004, 'X-CP20004'}, // TeleText Taiwan
|
||||
Codepage_Alias{20005, 'X-CP20005'}, // Wang Taiwan
|
||||
Codepage_Alias{20105, 'X-IA5'}, // IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
|
||||
Codepage_Alias{20106, 'X-IA5-GERMAN'}, // IA5 German (7-bit)
|
||||
Codepage_Alias{20107, 'X-IA5-SWEDISH'}, // IA5 Swedish (7-bit)
|
||||
Codepage_Alias{20108, 'X-IA5-NORWEGIAN'}, // IA5 Norwegian (7-bit)
|
||||
Codepage_Alias{20127, 'US-ASCII'}, // US-ASCII (7-bit)
|
||||
Codepage_Alias{20261, 'X-CP20261'}, // T.61
|
||||
Codepage_Alias{20269, 'X-CP20269'}, // ISO 6937 Non-Spacing Accent
|
||||
Codepage_Alias{20273, 'IBM273'}, // IBM EBCDIC Germany
|
||||
Codepage_Alias{20277, 'IBM277'}, // IBM EBCDIC Denmark-Norway
|
||||
Codepage_Alias{20278, 'IBM278'}, // IBM EBCDIC Finland-Sweden
|
||||
Codepage_Alias{20280, 'IBM280'}, // IBM EBCDIC Italy
|
||||
Codepage_Alias{20284, 'IBM284'}, // IBM EBCDIC Latin America-Spain
|
||||
Codepage_Alias{20285, 'IBM285'}, // IBM EBCDIC United Kingdom
|
||||
Codepage_Alias{20290, 'IBM290'}, // IBM EBCDIC Japanese Katakana Extended
|
||||
Codepage_Alias{20297, 'IBM297'}, // IBM EBCDIC France
|
||||
Codepage_Alias{20420, 'IBM420'}, // IBM EBCDIC Arabic
|
||||
Codepage_Alias{20423, 'IBM423'}, // IBM EBCDIC Greek
|
||||
Codepage_Alias{20424, 'IBM424'}, // IBM EBCDIC Hebrew
|
||||
Codepage_Alias{20833, 'X-EBCDIC-KOREANEXTENDED'}, // IBM EBCDIC Korean Extended
|
||||
Codepage_Alias{20838, 'IBM-THAI'}, // IBM EBCDIC Thai
|
||||
Codepage_Alias{20866, 'KOI8-R'}, // Russian (KOI8-R); Cyrillic (KOI8-R)
|
||||
Codepage_Alias{20871, 'IBM871'}, // IBM EBCDIC Icelandic
|
||||
Codepage_Alias{20880, 'IBM880'}, // IBM EBCDIC Cyrillic Russian
|
||||
Codepage_Alias{20905, 'IBM905'}, // IBM EBCDIC Turkish
|
||||
Codepage_Alias{20924, 'IBM00924'}, // IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
|
||||
Codepage_Alias{20932, 'EUC-JP'}, // Japanese (JIS 0208-1990 and 0121-1990)
|
||||
Codepage_Alias{20936, 'X-CP20936'}, // Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
|
||||
Codepage_Alias{20949, 'X-CP20949'}, // Korean Wansung
|
||||
Codepage_Alias{21025, 'CP1025'}, // IBM EBCDIC Cyrillic Serbian-Bulgarian
|
||||
// 21027 (deprecated)
|
||||
Codepage_Alias{21866, 'KOI8-U'}, // Ukrainian (KOI8-U); Cyrillic (KOI8-U)
|
||||
Codepage_Alias{28591, 'ISO-8859-1'}, // ISO 8859-1 Latin 1; Western European (ISO)
|
||||
Codepage_Alias{28591, 'ISO8859-1'}, // ISO 8859-1 Latin 1; Western European (ISO)
|
||||
Codepage_Alias{28591, 'ISO_8859-1'},
|
||||
Codepage_Alias{28591, 'ISO_8859_1'},
|
||||
Codepage_Alias{28592, 'ISO-8859-2'}, // ISO 8859-2 Central European; Central European (ISO)
|
||||
Codepage_Alias{28592, 'ISO8859-2'}, // ISO 8859-2 Central European; Central European (ISO)
|
||||
Codepage_Alias{28592, 'ISO_8859-2'},
|
||||
Codepage_Alias{28592, 'ISO_8859_2'},
|
||||
Codepage_Alias{28593, 'ISO-8859-3'}, // ISO 8859-3 Latin 3
|
||||
Codepage_Alias{28593, 'ISO8859-3'}, // ISO 8859-3 Latin 3
|
||||
Codepage_Alias{28593, 'ISO_8859-3'},
|
||||
Codepage_Alias{28593, 'ISO_8859_3'},
|
||||
Codepage_Alias{28594, 'ISO-8859-4'}, // ISO 8859-4 Baltic
|
||||
Codepage_Alias{28594, 'ISO8859-4'}, // ISO 8859-4 Baltic
|
||||
Codepage_Alias{28594, 'ISO_8859-4'},
|
||||
Codepage_Alias{28594, 'ISO_8859_4'},
|
||||
Codepage_Alias{28595, 'ISO-8859-5'}, // ISO 8859-5 Cyrillic
|
||||
Codepage_Alias{28595, 'ISO8859-5'}, // ISO 8859-5 Cyrillic
|
||||
Codepage_Alias{28595, 'ISO_8859-5'},
|
||||
Codepage_Alias{28595, 'ISO_8859_5'},
|
||||
Codepage_Alias{28596, 'ISO-8859-6'}, // ISO 8859-6 Arabic
|
||||
Codepage_Alias{28596, 'ISO8859-6'}, // ISO 8859-6 Arabic
|
||||
Codepage_Alias{28596, 'ISO_8859-6'},
|
||||
Codepage_Alias{28596, 'ISO_8859_6'},
|
||||
Codepage_Alias{28597, 'ISO-8859-7'}, // ISO 8859-7 Greek
|
||||
Codepage_Alias{28597, 'ISO8859-7'}, // ISO 8859-7 Greek
|
||||
Codepage_Alias{28597, 'ISO_8859-7'},
|
||||
Codepage_Alias{28597, 'ISO_8859_7'},
|
||||
Codepage_Alias{28598, 'ISO-8859-8'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
|
||||
Codepage_Alias{28598, 'ISO8859-8'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
|
||||
Codepage_Alias{28598, 'ISO_8859-8'},
|
||||
Codepage_Alias{28598, 'ISO_8859_8'},
|
||||
Codepage_Alias{28599, 'ISO-8859-9'}, // ISO 8859-9 Turkish
|
||||
Codepage_Alias{28599, 'ISO8859-9'}, // ISO 8859-9 Turkish
|
||||
Codepage_Alias{28599, 'ISO_8859-9'},
|
||||
Codepage_Alias{28599, 'ISO_8859_9'},
|
||||
Codepage_Alias{28603, 'ISO-8859-13'}, // ISO 8859-13 Estonian
|
||||
Codepage_Alias{28603, 'ISO8859-13'}, // ISO 8859-13 Estonian
|
||||
Codepage_Alias{28603, 'ISO_8859-13'},
|
||||
Codepage_Alias{28603, 'ISO_8859_13'},
|
||||
Codepage_Alias{28605, 'ISO-8859-15'}, // ISO 8859-15 Latin 9
|
||||
Codepage_Alias{28605, 'ISO8859-15'}, // ISO 8859-15 Latin 9
|
||||
Codepage_Alias{28605, 'ISO_8859-15'},
|
||||
Codepage_Alias{28605, 'ISO_8859_15'},
|
||||
Codepage_Alias{29001, 'X-EUROPA'}, // Europa 3
|
||||
Codepage_Alias{38598, 'ISO-8859-8-I'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
|
||||
Codepage_Alias{38598, 'ISO8859-8-I'}, // ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
|
||||
Codepage_Alias{38598, 'ISO_8859-8-I'},
|
||||
Codepage_Alias{38598, 'ISO_8859_8-I'},
|
||||
Codepage_Alias{50220, 'ISO-2022-JP'}, // ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
|
||||
Codepage_Alias{50221, 'CSISO2022JP'}, // ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
|
||||
Codepage_Alias{50222, 'ISO-2022-JP'}, // ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
|
||||
Codepage_Alias{50225, 'ISO-2022-KR'}, // ISO 2022 Korean
|
||||
Codepage_Alias{50225, 'ISO2022-KR'}, // ISO 2022 Korean
|
||||
Codepage_Alias{50227, 'X-CP50227'}, // ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
|
||||
// 50229 ISO 2022 Traditional Chinese
|
||||
// 50930 EBCDIC Japanese (Katakana) Extended
|
||||
// 50931 EBCDIC US-Canada and Japanese
|
||||
// 50933 EBCDIC Korean Extended and Korean
|
||||
// 50935 EBCDIC Simplified Chinese Extended and Simplified Chinese
|
||||
// 50936 EBCDIC Simplified Chinese
|
||||
// 50937 EBCDIC US-Canada and Traditional Chinese
|
||||
// 50939 EBCDIC Japanese (Latin) Extended and Japanese
|
||||
Codepage_Alias{51932, 'EUC-JP'}, // EUC Japanese
|
||||
Codepage_Alias{51936, 'EUC-CN'}, // EUC Simplified Chinese; Chinese Simplified (EUC)
|
||||
Codepage_Alias{51949, 'EUC-KR'}, // EUC Korean
|
||||
// 51950 EUC Traditional Chinese
|
||||
Codepage_Alias{52936, 'HZ-GB-2312'}, // HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
|
||||
Codepage_Alias{54936, 'GB18030'}, // Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
|
||||
Codepage_Alias{57002, 'X-ISCII-DE'}, // ISCII Devanagari
|
||||
Codepage_Alias{57003, 'X-ISCII-BE'}, // ISCII Bengali
|
||||
Codepage_Alias{57004, 'X-ISCII-TA'}, // ISCII Tamil
|
||||
Codepage_Alias{57005, 'X-ISCII-TE'}, // ISCII Telugu
|
||||
Codepage_Alias{57006, 'X-ISCII-AS'}, // ISCII Assamese
|
||||
Codepage_Alias{57007, 'X-ISCII-OR'}, // ISCII Oriya
|
||||
Codepage_Alias{57008, 'X-ISCII-KA'}, // ISCII Kannada
|
||||
Codepage_Alias{57009, 'X-ISCII-MA'}, // ISCII Malayalam
|
||||
Codepage_Alias{57010, 'X-ISCII-GU'}, // ISCII Gujarati
|
||||
Codepage_Alias{57011, 'X-ISCII-PA'}, // ISCII Punjabi
|
||||
Codepage_Alias{65000, 'UTF-7'}, // Unicode (UTF-7)
|
||||
Codepage_Alias{65001, 'UTF-8'}, // Unicode (UTF-8)
|
||||
// vfmt on
|
||||
]
|
||||
|
||||
fn name_to_codepage(name string) int {
|
||||
// performance hack
|
||||
if name == 'UTF-8' {
|
||||
return 65001
|
||||
}
|
||||
|
||||
name_upper := name.to_upper()
|
||||
if name_upper == '' || name_upper == 'CP_ACP' || name_upper == 'ANSI' {
|
||||
return C.GetACP()
|
||||
}
|
||||
if name_upper == 'CP_OEMCP' {
|
||||
return C.GetOEMCP()
|
||||
}
|
||||
if name_upper.len < 2 {
|
||||
return -1
|
||||
}
|
||||
if name_upper == 'WCHAR_T' {
|
||||
return 1200
|
||||
}
|
||||
// CP123
|
||||
if name_upper.starts_with('CP') {
|
||||
return name_upper[2..].int()
|
||||
}
|
||||
if name_upper.is_int() {
|
||||
return name_upper.int()
|
||||
}
|
||||
// XX123 for debug
|
||||
if name_upper.starts_with('xx') {
|
||||
return name_upper[2..].int()
|
||||
}
|
||||
|
||||
for x in codepage_alias {
|
||||
if x.name == name_upper {
|
||||
return x.codepage
|
||||
}
|
||||
}
|
||||
return -1
|
||||
}
|
||||
|
||||
// https://www.cnblogs.com/findumars/p/6376034.html
|
||||
@[direct_array_access]
|
||||
fn utf32_to_utf16(src &u8, src_len int, is_src_little_endian bool, is_dst_little_endian bool) ![]u8 {
|
||||
mut dst := []u8{len: src_len}
|
||||
mut sptr := unsafe { &u32(src) }
|
||||
mut dptr := &u16(dst.data)
|
||||
mut src_idx := 0
|
||||
mut dst_idx := 0
|
||||
mut c := u32(0)
|
||||
mut t := u16(0)
|
||||
for {
|
||||
if src_idx == src_len / 4 {
|
||||
break
|
||||
}
|
||||
unsafe {
|
||||
c = sptr[src_idx]
|
||||
}
|
||||
if !is_src_little_endian {
|
||||
c = reverse_u32(c)
|
||||
}
|
||||
src_idx++
|
||||
if c <= 0xFFFF {
|
||||
t = u16(c)
|
||||
if !is_dst_little_endian {
|
||||
t = reverse_u16(t)
|
||||
}
|
||||
unsafe {
|
||||
dptr[dst_idx] = t
|
||||
}
|
||||
dst_idx++
|
||||
} else if c <= 0xEFFFF {
|
||||
t = u16((0xD800 + (c >> 10) - 0x40)) // high
|
||||
|
||||
if !is_dst_little_endian {
|
||||
t = reverse_u16(t)
|
||||
}
|
||||
unsafe {
|
||||
dptr[dst_idx] = t
|
||||
}
|
||||
dst_idx++
|
||||
t = u16(0xDC00 + (c & 0x03FF)) // low
|
||||
if !is_dst_little_endian {
|
||||
t = reverse_u16(t)
|
||||
}
|
||||
unsafe {
|
||||
dptr[dst_idx] = t
|
||||
}
|
||||
dst_idx++
|
||||
} else {
|
||||
return error('invalid UTF-32LE encoding')
|
||||
}
|
||||
}
|
||||
dst.trim(dst_idx * 2)
|
||||
return dst
|
||||
}
|
||||
|
||||
// https://www.cnblogs.com/findumars/p/6376034.html
|
||||
@[direct_array_access]
|
||||
fn utf16_to_utf32(src &u8, src_len int, is_src_little_endian bool, is_dst_little_endian bool) ![]u8 {
|
||||
mut dst := []u8{len: src_len * 2}
|
||||
mut sptr := unsafe { &u16(src) }
|
||||
mut dptr := &u32(dst.data)
|
||||
mut w1 := u16(0)
|
||||
mut w2 := u16(0)
|
||||
mut t := u32(0)
|
||||
mut src_idx := 0
|
||||
mut dst_idx := 0
|
||||
for {
|
||||
if src_idx == src_len / 2 {
|
||||
break
|
||||
}
|
||||
unsafe {
|
||||
w1 = sptr[src_idx]
|
||||
}
|
||||
if !is_src_little_endian {
|
||||
w1 = reverse_u16(w1)
|
||||
}
|
||||
src_idx++
|
||||
if w1 >= 0xD800 && w1 <= 0xDFFF {
|
||||
if w1 < 0xDC00 {
|
||||
if src_idx == src_len / 2 {
|
||||
return error('invalid UTF-16LE encoding')
|
||||
}
|
||||
unsafe {
|
||||
w2 = sptr[src_idx]
|
||||
}
|
||||
if !is_src_little_endian {
|
||||
w2 = reverse_u16(w2)
|
||||
}
|
||||
if w2 >= 0xDC00 && w2 <= 0xDFFF {
|
||||
t = (w2 & 0x03FF) + (((w1 & 0x03FF) + 0x40) << 10)
|
||||
if !is_dst_little_endian {
|
||||
t = reverse_u32(t)
|
||||
}
|
||||
unsafe {
|
||||
dptr[dst_idx] = t
|
||||
}
|
||||
dst_idx++
|
||||
}
|
||||
} else {
|
||||
return error('invalid UTF-16LE encoding')
|
||||
}
|
||||
} else {
|
||||
t = w1
|
||||
if !is_dst_little_endian {
|
||||
t = reverse_u32(t)
|
||||
}
|
||||
unsafe {
|
||||
dptr[dst_idx] = t
|
||||
}
|
||||
dst_idx++
|
||||
}
|
||||
}
|
||||
dst.trim(dst_idx * 4)
|
||||
return dst
|
||||
}
|
||||
|
||||
// conv convert `fromcode` encoding string to `tocode` encoding string
|
||||
@[direct_array_access]
|
||||
fn conv(tocode string, fromcode string, src &u8, src_len int) ![]u8 {
|
||||
if src_len < 0 {
|
||||
return error('src length error')
|
||||
}
|
||||
src_codepage := name_to_codepage(fromcode)
|
||||
dst_codepage := name_to_codepage(tocode)
|
||||
if src_codepage <= 0 {
|
||||
return error('fromcode ${fromcode} does not exist')
|
||||
}
|
||||
if dst_codepage <= 0 {
|
||||
return error('tocode ${tocode} does not exist')
|
||||
}
|
||||
|
||||
if src_codepage == dst_codepage {
|
||||
// clone src
|
||||
mut dst_buf := []u8{len: src_len}
|
||||
unsafe { vmemcpy(dst_buf.data, src, src_len) }
|
||||
return dst_buf
|
||||
}
|
||||
|
||||
mut unicode := []u8{}
|
||||
// src codepage => Unicode
|
||||
match src_codepage {
|
||||
1200 {
|
||||
// src already in Unicode(UTF-16LE) encoding, just clone src
|
||||
unsafe {
|
||||
unicode.grow_len(src_len)
|
||||
vmemcpy(unicode.data, src, src_len)
|
||||
}
|
||||
}
|
||||
1201 {
|
||||
// Windows does not support UTF-16BE
|
||||
// byte swap each 16 bit character element
|
||||
unsafe {
|
||||
unicode.grow_len(src_len)
|
||||
vmemcpy(unicode.data, src, src_len)
|
||||
}
|
||||
mut eptr := &u16(unicode.data)
|
||||
for i in 0 .. src_len / 2 {
|
||||
unsafe {
|
||||
eptr[i] = reverse_u16(eptr[i])
|
||||
}
|
||||
}
|
||||
}
|
||||
12000 {
|
||||
// Windows does not support UTF-32LE
|
||||
unicode = utf32_to_utf16(src, src_len, true, true)!
|
||||
}
|
||||
12001 {
|
||||
// Windows does not support UTF-32BE
|
||||
unicode = utf32_to_utf16(src, src_len, false, true)!
|
||||
}
|
||||
else {
|
||||
char_num := C.MultiByteToWideChar(src_codepage, 0, src, src_len, 0, 0)
|
||||
if char_num == 0 {
|
||||
return error('MultiByteToWideChar fail: src contain zero ${fromcode} character')
|
||||
}
|
||||
unsafe { unicode.grow_len(char_num * 2) } // every char take 2 bytes
|
||||
C.MultiByteToWideChar(src_codepage, 0, src, src_len, unicode.data, unicode.len)
|
||||
}
|
||||
}
|
||||
|
||||
mut dst := []u8{}
|
||||
// Unicode => dst codepage
|
||||
match dst_codepage {
|
||||
1200 {
|
||||
// dst codepage is Unicode, just return unicode
|
||||
return unicode
|
||||
}
|
||||
1201 {
|
||||
// Windows does not support UTF-16BE
|
||||
// byte swap each 16 bit character element
|
||||
mut eptr := &u16(unicode.data)
|
||||
for i in 0 .. unicode.len / 2 {
|
||||
unsafe {
|
||||
eptr[i] = reverse_u16(eptr[i])
|
||||
}
|
||||
}
|
||||
return unicode
|
||||
}
|
||||
12000 {
|
||||
// Windows does not support UTF-32LE
|
||||
dst = utf16_to_utf32(unicode.data, unicode.len, true, true)!
|
||||
}
|
||||
12001 {
|
||||
// Windows does not support UTF-32BE
|
||||
dst = utf16_to_utf32(unicode.data, unicode.len, true, false)!
|
||||
}
|
||||
else {
|
||||
dst_len := C.WideCharToMultiByte(dst_codepage, 0, unicode.data, unicode.len / 2,
|
||||
0, 0, 0, 0)
|
||||
if dst_len == 0 {
|
||||
return error('WideCharToMultiByte fail: src contain zero unicode character')
|
||||
}
|
||||
unsafe { dst.grow_len(dst_len) }
|
||||
C.WideCharToMultiByte(dst_codepage, 0, unicode.data, unicode.len, dst.data,
|
||||
dst.len, 0, 0)
|
||||
}
|
||||
}
|
||||
return dst
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue