math.bits: add asm implementations for some 64 bit ops (#25020)

2025-09-13 14:32:26 +03:00 · 2025-08-11 14:01:49 +08:00 · 2025-08-11 14:01:49 +08:00 · 350747793b
commit 350747793b
parent 60108cdd33
5 changed files with 517 additions and 7 deletions
--- a/vlib/math/bits/bits.amd64.v
+++ b/vlib/math/bits/bits.amd64.v
@ -0,0 +1,100 @@
 // Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
 // Use of this source code is governed by an MIT license
 // that can be found in the LICENSE file.
 module bits
 fn C._umul128(x u64, y u64, result_hi &u64) u64
 fn C._addcarry_u64(carry_in u8, a u64, b u64, out &u64) u8
 fn C._udiv128(hi u64, lo u64, y u64, rem &u64) u64
 // mul_64 returns the 128-bit product of x and y: (hi, lo) = x * y
 // with the product bits' upper half returned in hi and the lower
 // half returned in lo.
 //
 // This function's execution time does not depend on the inputs.
@[inline]
 pub fn mul_64(x u64, y u64) (u64, u64) {
 	mut hi := u64(0)
 	mut lo := u64(0)
 	$if msvc {
 		lo = C._umul128(x, y, &hi)
 		return hi, lo
 	} $else $if amd64 {
 		asm amd64 {
 			mulq rdx
 			; =a (lo)
 			  =d (hi)
 			; a (x)
 			  d (y)
 			; cc
 		}
 		return hi, lo
 	}
 	// cross compile
 	return mul_64_default(x, y)
 }
 // mul_add_64 returns the 128-bit result of x * y + z: (hi, lo) = x * y + z
 // with the result bits' upper half returned in hi and the lower
 // half returned in lo.
@[inline]
 pub fn mul_add_64(x u64, y u64, z u64) (u64, u64) {
 	mut hi := u64(0)
 	mut lo := u64(0)
 	$if msvc {
 		lo = C._umul128(x, y, &hi)
 		carry := C._addcarry_u64(0, lo, z, &lo)
 		hi += carry
 		return hi, lo
 	} $else $if amd64 {
 		asm amd64 {
 			mulq rdx
 			addq rax, z
 			adcq rdx, 0
 			; =a (lo)
 			  =d (hi)
 			; a (x)
 			  d (y)
 			  r (z)
 			; cc
 		}
 		return hi, lo
 	}
 	// cross compile
 	return mul_add_64_default(x, y, z)
 }
 // div_64 returns the quotient and remainder of (hi, lo) divided by y:
 // quo = (hi, lo)/y, rem = (hi, lo)%y with the dividend bits' upper
 // half in parameter hi and the lower half in parameter lo.
 // div_64 panics for y == 0 (division by zero) or y <= hi (quotient overflow).
@[inline]
 pub fn div_64(hi u64, lo u64, y1 u64) (u64, u64) {
 	mut y := y1
 	if y == 0 {
 		panic(overflow_error)
 	}
 	if y <= hi {
 		panic(overflow_error)
 	}
 	mut quo := u64(0)
 	mut rem := u64(0)
 	$if msvc {
 		quo = C._udiv128(hi, lo, y, &rem)
 		return quo, rem
 	} $else $if amd64 {
 		asm amd64 {
 			div y
 			; =a (quo)
 			  =d (rem)
 			; d (hi)
 			  a (lo)
 			  r (y)
 			; cc
 		}
 		return quo, rem
 	}
 	// cross compile
 	return div_64_default(hi, lo, y1)
 }
--- a/vlib/math/bits/bits.arm64.v
+++ b/vlib/math/bits/bits.arm64.v
@ -0,0 +1,55 @@
 // Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
 // Use of this source code is governed by an MIT license
 // that can be found in the LICENSE file.
 module bits
 // mul_64 returns the 128-bit product of x and y: (hi, lo) = x * y
 // with the product bits' upper half returned in hi and the lower
 // half returned in lo.
 //
 // This function's execution time does not depend on the inputs.
@[inline]
 pub fn mul_64(x u64, y u64) (u64, u64) {
 	mut hi := u64(0)
 	mut lo := u64(0)
 	$if arm64 && !tinyc {
 		asm arm64 {
 			mul lo, x, y
 			umulh hi, x, y
 			; =&r (hi)
 			  =&r (lo)
 			; r (x)
 			  r (y)
 			; cc
 		}
 		return hi, lo
 	}
 	// cross compile
 	return mul_64_default(x, y)
 }
 // mul_add_64 returns the 128-bit result of x * y + z: (hi, lo) = x * y + z
 // with the result bits' upper half returned in hi and the lower
 // half returned in lo.
@[inline]
 pub fn mul_add_64(x u64, y u64, z u64) (u64, u64) {
 	mut hi := u64(0)
 	mut lo := u64(0)
 	$if arm64 && !tinyc {
 		asm arm64 {
 			mul lo, x, y
 			umulh hi, x, y
 			adds lo, lo, z
 			adc hi, hi, xzr
 			; =&r (hi)
 			  =&r (lo)
 			; r (x)
 			  r (y)
 			  r (z)
 			; cc
 		}
 		return hi, lo
 	}
 	// cross compile
 	return mul_add_64_default(x, y, z)
 }
--- a/vlib/math/bits/bits.c.v
+++ b/vlib/math/bits/bits.c.v
@ -0,0 +1,186 @@
 // Copyright (c) 2019-2024 Alexander Medvednikov. All rights reserved.
 // Use of this source code is governed by an MIT license
 // that can be found in the LICENSE file.
 module bits
 fn C.__builtin_clz(x u32) int
 fn C.__builtin_clzll(x u64) int
 fn C.__lzcnt(x u32) int
 fn C.__lzcnt64(x u64) int
 // --- LeadingZeros ---
 // leading_zeros_8 returns the number of leading zero bits in x; the result is 8 for x == 0.
@[inline]
 pub fn leading_zeros_8(x u8) int {
 	if x == 0 {
 		return 8
 	}
 	$if msvc {
 		return C.__lzcnt(x) - 24
 	} $else $if !tinyc {
 		return C.__builtin_clz(x) - 24
 	}
 	return leading_zeros_8_default(x)
 }
 // leading_zeros_16 returns the number of leading zero bits in x; the result is 16 for x == 0.
@[inline]
 pub fn leading_zeros_16(x u16) int {
 	if x == 0 {
 		return 16
 	}
 	$if msvc {
 		return C.__lzcnt(x) - 16
 	} $else $if !tinyc {
 		return C.__builtin_clz(x) - 16
 	}
 	return leading_zeros_16_default(x)
 }
 // leading_zeros_32 returns the number of leading zero bits in x; the result is 32 for x == 0.
@[inline]
 pub fn leading_zeros_32(x u32) int {
 	if x == 0 {
 		return 32
 	}
 	$if msvc {
 		return C.__lzcnt(x)
 	} $else $if !tinyc {
 		return C.__builtin_clz(x)
 	}
 	return leading_zeros_32_default(x)
 }
 // leading_zeros_64 returns the number of leading zero bits in x; the result is 64 for x == 0.
@[inline]
 pub fn leading_zeros_64(x u64) int {
 	if x == 0 {
 		return 64
 	}
 	$if msvc {
 		return C.__lzcnt64(x)
 	} $else $if !tinyc {
 		return C.__builtin_clzll(x)
 	}
 	return leading_zeros_64_default(x)
 }
 fn C.__builtin_ctz(x u32) int
 fn C.__builtin_ctzll(x u64) int
 fn C._BitScanForward(pos &int, x u32) u8
 fn C._BitScanForward64(pos &int, x u64) u8
 // --- TrailingZeros ---
 // trailing_zeros_8 returns the number of trailing zero bits in x; the result is 8 for x == 0.
@[inline]
 pub fn trailing_zeros_8(x u8) int {
 	if x == 0 {
 		return 8
 	}
 	$if msvc {
 		mut pos := 0
 		_ := C._BitScanForward(&pos, x)
 		return pos
 	} $else $if !tinyc {
 		return C.__builtin_ctz(x)
 	}
 	return trailing_zeros_8_default(x)
 }
 // trailing_zeros_16 returns the number of trailing zero bits in x; the result is 16 for x == 0.
@[inline]
 pub fn trailing_zeros_16(x u16) int {
 	if x == 0 {
 		return 16
 	}
 	$if msvc {
 		mut pos := 0
 		_ := C._BitScanForward(&pos, x)
 		return pos
 	} $else $if !tinyc {
 		return C.__builtin_ctz(x)
 	}
 	return trailing_zeros_16_default(x)
 }
 // trailing_zeros_32 returns the number of trailing zero bits in x; the result is 32 for x == 0.
@[inline]
 pub fn trailing_zeros_32(x u32) int {
 	if x == 0 {
 		return 32
 	}
 	$if msvc {
 		mut pos := 0
 		_ := C._BitScanForward(&pos, x)
 		return pos
 	} $else $if !tinyc {
 		return C.__builtin_ctz(x)
 	}
 	return trailing_zeros_32_default(x)
 }
 // trailing_zeros_64 returns the number of trailing zero bits in x; the result is 64 for x == 0.
@[inline]
 pub fn trailing_zeros_64(x u64) int {
 	if x == 0 {
 		return 64
 	}
 	$if msvc {
 		mut pos := 0
 		_ := C._BitScanForward64(&pos, x)
 		return pos
 	} $else $if !tinyc {
 		return C.__builtin_ctzll(x)
 	}
 	return trailing_zeros_64_default(x)
 }
 fn C.__builtin_popcount(x u32) int
 fn C.__builtin_popcountll(x u64) int
 fn C.__popcnt(x u32) int
 fn C.__popcnt64(x u64) int
 // --- OnesCount ---
 // ones_count_8 returns the number of one bits ("population count") in x.
@[inline]
 pub fn ones_count_8(x u8) int {
 	$if msvc {
 		return C.__popcnt(x)
 	} $else $if !tinyc {
 		return C.__builtin_popcount(x)
 	}
 	return ones_count_8_default(x)
 }
 // ones_count_16 returns the number of one bits ("population count") in x.
@[inline]
 pub fn ones_count_16(x u16) int {
 	$if msvc {
 		return C.__popcnt(x)
 	} $else $if !tinyc {
 		return C.__builtin_popcount(x)
 	}
 	return ones_count_16_default(x)
 }
 // ones_count_32 returns the number of one bits ("population count") in x.
@[inline]
 pub fn ones_count_32(x u32) int {
 	$if msvc {
 		return C.__popcnt(x)
 	} $else $if !tinyc {
 		return C.__builtin_popcount(x)
 	}
 	return ones_count_32_default(x)
 }
 // ones_count_64 returns the number of one bits ("population count") in x.
@[inline]
 pub fn ones_count_64(x u64) int {
 	$if msvc {
 		return C.__popcnt64(x)
 	} $else $if !tinyc {
 		return C.__builtin_popcountll(x)
 	}
 	return ones_count_64_default(x)
 }
--- a/vlib/math/bits/bits.v
+++ b/vlib/math/bits/bits.v
@ -24,35 +24,69 @@ const m4 = u64(0x0000ffff0000ffff)
 // --- LeadingZeros ---
 // leading_zeros_8 returns the number of leading zero bits in x; the result is 8 for x == 0.
@[inline]
 pub fn leading_zeros_8(x u8) int {
 	return leading_zeros_8_default(x)
 }
@[inline]
 fn leading_zeros_8_default(x u8) int {
 	return 8 - len_8(x)
 }
 // leading_zeros_16 returns the number of leading zero bits in x; the result is 16 for x == 0.
@[inline]
 pub fn leading_zeros_16(x u16) int {
 	return leading_zeros_16_default(x)
 }
@[inline]
 fn leading_zeros_16_default(x u16) int {
 	return 16 - len_16(x)
 }
 // leading_zeros_32 returns the number of leading zero bits in x; the result is 32 for x == 0.
@[inline]
 pub fn leading_zeros_32(x u32) int {
 	return leading_zeros_32_default(x)
 }
@[inline]
 fn leading_zeros_32_default(x u32) int {
 	return 32 - len_32(x)
 }
 // leading_zeros_64 returns the number of leading zero bits in x; the result is 64 for x == 0.
@[inline]
 pub fn leading_zeros_64(x u64) int {
 	return leading_zeros_64_default(x)
 }
@[inline]
 fn leading_zeros_64_default(x u64) int {
 	return 64 - len_64(x)
 }
 // --- TrailingZeros ---
 // trailing_zeros_8 returns the number of trailing zero bits in x; the result is 8 for x == 0.
-@[direct_array_access]
+@[inline]
 pub fn trailing_zeros_8(x u8) int {
 	return trailing_zeros_8_default(x)
 }
@[direct_array_access; inline]
 fn trailing_zeros_8_default(x u8) int {
 	return int(ntz_8_tab[x])
 }
 // trailing_zeros_16 returns the number of trailing zero bits in x; the result is 16 for x == 0.
-@[direct_array_access]
+@[inline]
 pub fn trailing_zeros_16(x u16) int {
 	return trailing_zeros_16_default(x)
 }
@[direct_array_access; inline]
 fn trailing_zeros_16_default(x u16) int {
 	if x == 0 {
 		return 16
 	}
@ -61,8 +95,13 @@ pub fn trailing_zeros_16(x u16) int {
 }
 // trailing_zeros_32 returns the number of trailing zero bits in x; the result is 32 for x == 0.
-@[direct_array_access]
+@[inline]
 pub fn trailing_zeros_32(x u32) int {
 	return trailing_zeros_32_default(x)
 }
@[direct_array_access; inline]
 fn trailing_zeros_32_default(x u32) int {
 	if x == 0 {
 		return 32
 	}
@ -71,8 +110,13 @@ pub fn trailing_zeros_32(x u32) int {
 }
 // trailing_zeros_64 returns the number of trailing zero bits in x; the result is 64 for x == 0.
-@[direct_array_access]
+@[inline]
 pub fn trailing_zeros_64(x u64) int {
 	return trailing_zeros_64_default(x)
 }
@[direct_array_access; inline]
 fn trailing_zeros_64_default(x u64) int {
 	if x == 0 {
 		return 64
 	}
@ -92,26 +136,46 @@ pub fn trailing_zeros_64(x u64) int {
 // --- OnesCount ---
 // ones_count_8 returns the number of one bits ("population count") in x.
-@[direct_array_access]
+@[inline]
 pub fn ones_count_8(x u8) int {
 	return ones_count_8_default(x)
 }
@[direct_array_access; inline]
 fn ones_count_8_default(x u8) int {
 	return int(pop_8_tab[x])
 }
 // ones_count_16 returns the number of one bits ("population count") in x.
-@[direct_array_access]
+@[inline]
 pub fn ones_count_16(x u16) int {
 	return ones_count_16_default(x)
 }
@[direct_array_access; inline]
 fn ones_count_16_default(x u16) int {
 	return int(pop_8_tab[x >> 8] + pop_8_tab[x & u16(0xff)])
 }
 // ones_count_32 returns the number of one bits ("population count") in x.
-@[direct_array_access]
+@[inline]
 pub fn ones_count_32(x u32) int {
 	return ones_count_32_default(x)
 }
@[direct_array_access; inline]
 fn ones_count_32_default(x u32) int {
 	return int(pop_8_tab[x >> 24] + pop_8_tab[(x >> 16) & 0xff] + pop_8_tab[(x >> 8) & 0xff] +
 		pop_8_tab[x & u32(0xff)])
 }
 // ones_count_64 returns the number of one bits ("population count") in x.
@[inline]
 pub fn ones_count_64(x u64) int {
 	return ones_count_64_default(x)
 }
 fn ones_count_64_default(x u64) int {
 	// Implementation: Parallel summing of adjacent bits.
 	// See "Hacker's Delight", Chap. 5: Counting Bits.
 	// The following pattern shows the general approach:
@ -376,7 +440,13 @@ const divide_error = 'Divide Error'
 // half returned in lo.
 //
 // This function's execution time does not depend on the inputs.
@[inline]
 pub fn mul_32(x u32, y u32) (u32, u32) {
 	return mul_32_default(x, y)
 }
@[inline]
 fn mul_32_default(x u32, y u32) (u32, u32) {
 	tmp := u64(x) * u64(y)
 	hi := u32(tmp >> 32)
 	lo := u32(tmp)
@ -388,7 +458,12 @@ pub fn mul_32(x u32, y u32) (u32, u32) {
 // half returned in lo.
 //
 // This function's execution time does not depend on the inputs.
@[inline]
 pub fn mul_64(x u64, y u64) (u64, u64) {
 	return mul_64_default(x, y)
 }
 fn mul_64_default(x u64, y u64) (u64, u64) {
 	x0 := x & mask32
 	x1 := x >> 32
 	y0 := y & mask32
@ -403,12 +478,49 @@ pub fn mul_64(x u64, y u64) (u64, u64) {
 	return hi, lo
 }
 // mul_add_32 returns the 64-bit result of x * y + z: (hi, lo) = x * y + z
 // with the result bits' upper half returned in hi and the lower
 // half returned in lo.
@[inline]
 pub fn mul_add_32(x u32, y u32, z u32) (u32, u32) {
 	return mul_add_32_default(x, y, z)
 }
@[inline]
 fn mul_add_32_default(x u32, y u32, z u32) (u32, u32) {
 	tmp := u64(x) * u64(y) + u64(z)
 	hi := u32(tmp >> 32)
 	lo := u32(tmp)
 	return hi, lo
 }
 // mul_add_64 returns the 128-bit result of x * y + z: (hi, lo) = x * y + z
 // with the result bits' upper half returned in hi and the lower
 // half returned in lo.
@[inline]
 pub fn mul_add_64(x u64, y u64, z u64) (u64, u64) {
 	return mul_add_64_default(x, y, z)
 }
@[inline]
 fn mul_add_64_default(x u64, y u64, z u64) (u64, u64) {
 	h, l := mul_64(x, y)
 	lo := l + z
 	hi := h + u64(lo < l)
 	return hi, lo
 }
 // --- Full-width divide ---
 // div_32 returns the quotient and remainder of (hi, lo) divided by y:
 // quo = (hi, lo)/y, rem = (hi, lo)%y with the dividend bits' upper
 // half in parameter hi and the lower half in parameter lo.
 // div_32 panics for y == 0 (division by zero) or y <= hi (quotient overflow).
@[inline]
 pub fn div_32(hi u32, lo u32, y u32) (u32, u32) {
 	return div_32_default(hi, lo, y)
 }
 fn div_32_default(hi u32, lo u32, y u32) (u32, u32) {
 	if y != 0 && y <= hi {
 		panic(overflow_error)
 	}
@ -422,7 +534,12 @@ pub fn div_32(hi u32, lo u32, y u32) (u32, u32) {
 // quo = (hi, lo)/y, rem = (hi, lo)%y with the dividend bits' upper
 // half in parameter hi and the lower half in parameter lo.
 // div_64 panics for y == 0 (division by zero) or y <= hi (quotient overflow).
@[inline]
 pub fn div_64(hi u64, lo u64, y1 u64) (u64, u64) {
 	return div_64_default(hi, lo, y1)
 }
 fn div_64_default(hi u64, lo u64, y1 u64) (u64, u64) {
 	mut y := y1
 	if y == 0 {
 		panic(overflow_error)
--- a/vlib/math/bits/bits_test.v
+++ b/vlib/math/bits/bits_test.v
@ -17,6 +17,7 @@ fn test_bits() {
 		// C.printf("x:%02x lz: %d cmp: %d\n", i << x, leading_zeros_8(i << x), 7-x)
 		assert leading_zeros_8(u8(u8(i) << x)) == 7 - x
 	}
 	assert leading_zeros_8(0) == 8
 	// 16 bit
 	i = 1
@ -24,6 +25,7 @@ fn test_bits() {
 		// C.printf("x:%04x lz: %d cmp: %d\n", u16(i) << x, leading_zeros_16(u16(i) << x), 15-x)
 		assert leading_zeros_16(u16(i) << x) == 15 - x
 	}
 	assert leading_zeros_16(0) == 16
 	// 32 bit
 	i = 1
@ -31,6 +33,7 @@ fn test_bits() {
 		// C.printf("x:%08x lz: %d cmp: %d\n", u32(i) << x, leading_zeros_32(u32(i) << x), 31-x)
 		assert leading_zeros_32(u32(i) << x) == 31 - x
 	}
 	assert leading_zeros_32(0) == 32
 	// 64 bit
 	i = 1
@ -38,6 +41,39 @@ fn test_bits() {
 		// C.printf("x:%016llx lz: %llu cmp: %d\n", u64(i) << x, leading_zeros_64(u64(i) << x), 63-x)
 		assert leading_zeros_64(u64(i) << x) == 63 - x
 	}
 	assert leading_zeros_64(0) == 64
 	//
 	// --- TrailingZeros ---
 	//
 	// 8 bit
 	i = 1
 	for x in 0 .. 8 {
 		assert trailing_zeros_8(u8(u8(i) << x)) == x
 	}
 	assert trailing_zeros_8(0) == 8
 	// 16 bit
 	i = 1
 	for x in 0 .. 16 {
 		assert trailing_zeros_16(u16(i) << x) == x
 	}
 	assert trailing_zeros_16(0) == 16
 	// 32 bit
 	i = 1
 	for x in 0 .. 32 {
 		assert trailing_zeros_32(u32(i) << x) == x
 	}
 	assert trailing_zeros_32(0) == 32
 	// 64 bit
 	i = 1
 	for x in 0 .. 64 {
 		assert trailing_zeros_64(u64(i) << x) == x
 	}
 	assert trailing_zeros_64(0) == 64
 	//
 	// --- ones_count ---
@ -50,6 +86,8 @@ fn test_bits() {
 		assert ones_count_8(u8(i)) == x
 		i = int(u32(i) << 1) + 1
 	}
 	assert ones_count_8(0) == 0
 	assert ones_count_8(0xFF) == 8
 	// 16 bit
 	i = 0
@ -58,6 +96,8 @@ fn test_bits() {
 		assert ones_count_16(u16(i)) == x
 		i = int(u32(i) << 1) + 1
 	}
 	assert ones_count_16(0) == 0
 	assert ones_count_16(0xFFFF) == 16
 	// 32 bit
 	i = 0
@ -66,6 +106,8 @@ fn test_bits() {
 		assert ones_count_32(u32(i)) == x
 		i = int(u32(i) << 1) + 1
 	}
 	assert ones_count_32(0) == 0
 	assert ones_count_32(0xFFFF_FFFF) == 32
 	// 64 bit
 	i1 = 0
@ -74,6 +116,8 @@ fn test_bits() {
 		assert ones_count_64(i1) == x
 		i1 = (i1 << 1) + 1
 	}
 	assert ones_count_64(0) == 0
 	assert ones_count_64(0xFFFF_FFFF_FFFF_FFFF) == 64
 	//
 	// --- rotate_left/right ---
@ -241,6 +285,9 @@ fn test_bits() {
 		v1 := v0 - 1
 		hi, lo := mul_32(v0, v1)
 		assert (u64(hi) << 32) | (u64(lo)) == u64(v0) * u64(v1)
 		v2 := u32(x)
 		h, l := mul_add_32(v0, v1, v2)
 		assert (u64(h) << 32) | (u64(l)) == u64(v0) * u64(v1) + u64(v2)
 	}
 	// 64 bit
@ -252,6 +299,11 @@ fn test_bits() {
 		// C.printf("v0: %llu v1: %llu [%llu,%llu] tt: %llu\n", v0, v1, hi, lo, (v0 >> 32) * (v1 >> 32))
 		assert (hi & 0xFFFF_FFFF_0000_0000) == (((v0 >> 32) * (v1 >> 32)) & 0xFFFF_FFFF_0000_0000)
 		assert (lo & 0x0000_0000_FFFF_FFFF) == (((v0 & 0x0000_0000_FFFF_FFFF) * (v1 & 0x0000_0000_FFFF_FFFF)) & 0x0000_0000_FFFF_FFFF)
 		v2 := u64(x)
 		h, l := mul_add_64(v0, v1, v2)
 		assert (h & 0xFFFF_FFFF_0000_0000) == (((v0 >> 32) * (v1 >> 32)) & 0xFFFF_FFFF_0000_0000)
 		assert (l & 0x0000_0000_FFFF_FFFF) == ((
 			(v0 & 0x0000_0000_FFFF_FFFF) * (v1 & 0x0000_0000_FFFF_FFFF) + v2) & 0x0000_0000_FFFF_FFFF)
 	}
 	//