examples: add a cpu_features/ folder, with several examples, using SSE and MMX assembly instructions (#22645)

2025-09-13 22:42:26 +03:00 · 2024-11-21 19:20:51 -04:00 · 2024-11-21 19:20:51 -04:00 · 05377f3c03
commit 05377f3c03
parent 27b728b600
7 changed files with 279 additions and 0 deletions
--- a/examples/cpu_features/SSE_and_MMX_Extensions/README.md
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/README.md
@ -0,0 +1,63 @@
+Note: For a deeper study, see https://en.wikibooks.org/wiki/X86_Assembly
+
+# SSE and MMX Extensions
+
+This document provides an overview of the SSE and MMX extensions used in the project.
+
+## Table of Contents
+
+- [Introduction](#introduction)
+- [SSE Extensions](#sse-extensions)
+- [MMX Extensions](#mmx-extensions)
+- [Usage](#usage)
+
+## Introduction
+
+SSE (Streaming SIMD Extensions) and MMX (MultiMedia eXtensions) are instruction sets used to
+enhance the performance of multimedia and signal processing applications.
+
+## SSE Extensions
+
+SSE extensions provide a set of instructions that can handle multiple data with a single
+instruction, improving the performance of applications that require heavy mathematical
+computations.
+
+from: [wikibooks](https://en.wikibooks.org/wiki/X86_Assembly/SSE#SSE_Instruction_Set)
+There are literally hundreds of SSE instructions, some of which are capable of much more than
+simple SIMD arithmetic. For more in-depth references take a look at the resources chapter of this
+book.
+
+You may notice that many floating point SSE instructions end with something like PS or SD. These
+suffixes differentiate between different versions of the operation. The first letter describes
+whether the instruction should be Packed or Scalar. Packed operations are applied to every member
+of the register, while scalar operations are applied to only the first value. For example, in
+pseudo-code, a packed add would be executed as:
+
+```
+v1[0] = v1[0] + v2[0]
+v1[1] = v1[1] + v2[1]
+v1[2] = v1[2] + v2[2]
+v1[3] = v1[3] + v2[3]
+```
+
+While a scalar add would only be:
+
+```
+v1[0] = v1[0] + v2[0]
+```
+
+The second letter refers to the data size: either Single or Double. This simply tells the
+processor whether to use the register as four 32-bit floats or two 64-bit doubles, respectively.
+
+## MMX Extensions
+
+MMX extensions are designed to accelerate multimedia and communication applications by providing
+instructions that can process multiple data elements in parallel.
+
+## Usage
+
+To use these extensions in your project, ensure that your compiler supports them and that you have
+enabled the appropriate flags.
+On Linux, you can run the command `lscpu` and check its `Flags` line to see which of these
+extensions your CPU supports.
+
+Note: the examples here will compile, but will not run, on CPU architectures other than amd64,
+such as ARM or RISC-V.
--- a/examples/cpu_features/SSE_and_MMX_Extensions/mmx.v
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/mmx.v
@ -0,0 +1,39 @@
+// MMX Instruction Set
+// Several suffixes are used to indicate what data size the instruction operates on:
+// Byte (8 bits)
+// Word (16 bits)
+// Double word (32 bits)
+// Quad word (64 bits)
+// For saturating operations, the suffix also signifies the signedness: US for unsigned and S for signed.
+// For example, PSUBUSB subtracts unsigned bytes, while PSUBSW subtracts signed words (both saturate).
+// MMX defined over 40 new instructions, listed below.
+// EMMS, MOVD, MOVQ, PACKSSDW, PACKSSWB, PACKUSWB, PADDB, PADDD, PADDSB, PADDSW, PADDUSB, PADDUSW,
+// PADDW, PAND, PANDN, PCMPEQB, PCMPEQD, PCMPEQW, PCMPGTB, PCMPGTD, PCMPGTW, PMADDWD, PMULHW, PMULLW,
+// POR, PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW, PSUBB, PSUBD, PSUBSB, PSUBSW, PSUBUSB,
+// PSUBUSW, PSUBW, PUNPCKHBW, PUNPCKHDQ, PUNPCKHWD, PUNPCKLBW, PUNPCKLDQ, PUNPCKLWD, PXOR
+
+// add_vectors_mmx adds the 8 bytes at `a` to the 8 bytes at `b`, lane by lane (PADDB, which
+// wraps around on overflow), and stores the 8 sums at `result`.
+// Each pointer must reference at least 8 bytes.
+// EMMS is executed before leaving the block, because the MMX registers alias the x87 FPU
+// register stack, and x87 code that runs later expects that stack to be marked empty.
+// The `memory` clobber tells the compiler that the asm block writes through `result`.
+// The attribute below compiles the function only for amd64, and not with tcc or msvc.
+@[if amd64 && !tinyc && !msvc]
+fn add_vectors_mmx(a &u8, b &u8, result &u8) {
+	unsafe {
+		asm volatile amd64 {
+			movq mm0, [a] // Load 8 bytes from a into MMX register mm0
+			movq mm1, [b] // Load 8 bytes from b into MMX register mm1
+			paddb mm0, mm1 // Add the two vectors using MMX instruction
+			movq [result], mm0 // Store the result back to memory
+			emms // Leave MMX state, so that later x87 FPU code works
+			; ; r (a)
+			  r (b)
+			  r (result)
+			; mm0
+			  mm1
+			  memory
+		}
+	}
+}
+
+// main adds two 8-byte vectors with add_vectors_mmx, prints the sums and checks them.
+fn main() {
+	a := [u8(1), 2, 3, 4, 5, 6, 7, 8]
+	b := [u8(8), 7, 6, 5, 4, 3, 2, 1]
+	// Destination buffer: add_vectors_mmx stores the 8 sums through the pointer to its first element.
+	result := []u8{len: 8}
+	add_vectors_mmx(&a[0], &b[0], &result[0])
+	println(result)
+	// Every lane sums to 9: 1+8, 2+7, ..., 8+1.
+	assert result == [u8(9), 9, 9, 9, 9, 9, 9, 9]
+}
--- a/examples/cpu_features/SSE_and_MMX_Extensions/sse.v
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/sse.v
@ -0,0 +1,37 @@
+// SSE Instruction Set
+// SSE: Added with Pentium III
+// Floating-point Instructions:
+// ADDPS, ADDSS, CMPPS, CMPSS, COMISS, CVTPI2PS, CVTPS2PI, CVTSI2SS, CVTSS2SI, CVTTPS2PI, CVTTSS2SI,
+// DIVPS, DIVSS, LDMXCSR, MAXPS, MAXSS, MINPS, MINSS, MOVAPS, MOVHLPS, MOVHPS, MOVLHPS, MOVLPS,
+// MOVMSKPS, MOVNTPS, MOVSS, MOVUPS, MULPS, MULSS, RCPPS, RCPSS, RSQRTPS, RSQRTSS, SHUFPS, SQRTPS,
+// SQRTSS, STMXCSR, SUBPS, SUBSS, UCOMISS, UNPCKHPS, UNPCKLPS
+//
+// Integer Instructions:
+// ANDNPS, ANDPS, ORPS, PAVGB, PAVGW, PEXTRW, PINSRW, PMAXSW, PMAXUB, PMINSW, PMINUB, PMOVMSKB, PMULHUW, PSADBW, PSHUFW, XORPS
+// The ADDPS instruction adds two vectors of floats using SSE instructions.
+
+// add_vectors_sse adds the 4 floats at `a` to the 4 floats at `b`, lane by lane (ADDPS),
+// and stores the 4 sums at `result`.
+// The unaligned MOVUPS moves are used, so the pointers need no particular alignment,
+// but each one must reference at least 4 f32 values.
+// The `memory` clobber tells the compiler that the asm block writes through `result`.
+// The attribute below compiles the function only for amd64, and not with tcc or msvc.
+@[if amd64 && !tinyc && !msvc]
+fn add_vectors_sse(a &f32, b &f32, result &f32) {
+	unsafe {
+		asm volatile amd64 {
+			movups xmm0, [a] // Load 4 floats from array a into SSE register xmm0
+			movups xmm1, [b] // Load 4 floats from array b into SSE register xmm1
+			addps xmm0, xmm1 // Add the two vectors using SSE instruction
+			movups [result], xmm0 // Store the result back to memory
+			; ; r (a)
+			  r (b)
+			  r (result)
+			; xmm0
+			  xmm1
+			  memory
+		}
+	}
+}
+
+// main adds two 4-float vectors with add_vectors_sse, prints the sums and checks them.
+fn main() {
+	a := [f32(1.0), 2.0, 3.0, 4.0]
+	b := [f32(4.0), 3.0, 2.0, 1.0]
+	// Destination buffer: add_vectors_sse stores the 4 sums through the pointer to its first element.
+	result := []f32{len: 4}
+	add_vectors_sse(&a[0], &b[0], &result[0])
+	println(result)
+	// Every lane sums to 5.0: 1+4, 2+3, 3+2, 4+1.
+	assert result == [f32(5.0), 5.0, 5.0, 5.0]
+}
--- a/examples/cpu_features/SSE_and_MMX_Extensions/sse2.v
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/sse2.v
@ -0,0 +1,42 @@
+// SSE Instruction Set
+// SSE2: Added with Pentium 4
+// Floating-point Instructions:
+// ADDPD, ADDSD, ANDNPD, ANDPD, CMPPD, CMPSD*, COMISD, CVTDQ2PD, CVTDQ2PS, CVTPD2DQ, CVTPD2PI,
+// CVTPD2PS, CVTPI2PD, CVTPS2DQ, CVTPS2PD, CVTSD2SI, CVTSD2SS, CVTSI2SD, CVTSS2SD, CVTTPD2DQ,
+// CVTTPD2PI, CVTTPS2DQ, CVTTSD2SI, DIVPD, DIVSD, MAXPD, MAXSD, MINPD, MINSD, MOVAPD, MOVHPD,
+// MOVLPD, MOVMSKPD, MOVSD*, MOVUPD, MULPD, MULSD, ORPD, SHUFPD, SQRTPD, SQRTSD, SUBPD, SUBSD,
+// UCOMISD, UNPCKHPD, UNPCKLPD, XORPD
+// * CMPSD and MOVSD have the same name as the string instruction mnemonics CMPSD (CMPS) and
+// MOVSD (MOVS); however, the former refer to scalar double-precision floating-points whereas
+// the latter refer to doubleword strings.
+// Integer Instructions:
+// MOVDQ2Q, MOVDQA, MOVDQU, MOVQ2DQ, PADDQ, PSUBQ, PMULUDQ, PSHUFHW, PSHUFLW, PSHUFD, PSLLDQ, PSRLDQ, PUNPCKHQDQ, PUNPCKLQDQ
+// The MULPD instruction multiplies two vectors of doubles using SSE2 instructions.
+
+// multiply_vectors_sse2 multiplies the 2 doubles at `a` by the 2 doubles at `b`, lane by lane
+// (MULPD), and stores the 2 products at `result`.
+// The unaligned MOVUPD moves are used, so the pointers need no particular alignment,
+// but each one must reference at least 2 f64 values.
+// The `memory` clobber tells the compiler that the asm block writes through `result`.
+// The attribute below compiles the function only for amd64, and not with tcc or msvc.
+@[if amd64 && !tinyc && !msvc]
+fn multiply_vectors_sse2(a &f64, b &f64, result &f64) {
+	unsafe {
+		asm volatile amd64 {
+			movupd xmm0, [a] // Load 2 doubles from array a into SSE2 register xmm0
+			movupd xmm1, [b] // Load 2 doubles from array b into SSE2 register xmm1
+			mulpd xmm0, xmm1 // Multiply the two vectors using SSE2 instruction
+			movupd [result], xmm0 // Store the result back to memory
+			; ; r (a)
+			  r (b)
+			  r (result)
+			; xmm0
+			  xmm1
+			  memory
+		}
+	}
+}
+
+// main multiplies two 2-double vectors with multiply_vectors_sse2, prints the products
+// and checks them.
+fn main() {
+	a := [f64(1.5), 2.5]
+	b := [f64(3.5), 4.5]
+	// Destination buffer: multiply_vectors_sse2 stores the 2 products through the pointer
+	// to its first element.
+	result := []f64{len: 2}
+	multiply_vectors_sse2(&a[0], &b[0], &result[0])
+	println(result)
+	// 5.25 = 1.5 * 3.5
+	// 11.25 = 2.5 * 4.5
+	assert result == [f64(5.25), 11.25]
+}
--- a/examples/cpu_features/SSE_and_MMX_Extensions/sse3.v
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/sse3.v
@ -0,0 +1,36 @@
+// SSE Instruction Set
+// SSE3: Added with later Pentium 4
+// ADDSUBPD, ADDSUBPS, HADDPD, HADDPS, HSUBPD, HSUBPS, MOVDDUP, MOVSHDUP, MOVSLDUP
+// The HADDPS instruction performs horizontal addition of two vectors of floats using SSE3
+// instructions.
+
+// horizontal_add_sse3 sums adjacent pairs of floats (HADDPS) and stores 4 results at `result`:
+// result[0] = a[0] + a[1], result[1] = a[2] + a[3],
+// result[2] = b[0] + b[1], result[3] = b[2] + b[3].
+// The unaligned MOVUPS moves are used, because a plain `&f32` is not guaranteed to be
+// 16-byte aligned, and the aligned MOVAPS form faults on such pointers.
+// Each pointer must reference at least 4 f32 values.
+// The `memory` clobber tells the compiler that the asm block writes through `result`.
+// The attribute below compiles the function only for amd64, and not with tcc or msvc.
+@[if amd64 && !tinyc && !msvc]
+fn horizontal_add_sse3(a &f32, b &f32, result &f32) {
+	unsafe {
+		asm volatile amd64 {
+			movups xmm0, [a] // Load 4 floats from array a into SSE register xmm0
+			movups xmm1, [b] // Load 4 floats from array b into SSE register xmm1
+			haddps xmm0, xmm1 // Perform horizontal add of xmm0 and xmm1
+			movups [result], xmm0 // Store the result back to memory
+			; ; r (a)
+			  r (b)
+			  r (result)
+			; xmm0
+			  xmm1
+			  memory
+		}
+	}
+}
+
+// main runs horizontal_add_sse3 on two 4-float vectors, prints the pairwise sums
+// and checks them.
+fn main() {
+	a := [f32(1.0), 2.0, 3.0, 4.0]
+	b := [f32(5.0), 6.0, 7.0, 8.0]
+	// Destination buffer: horizontal_add_sse3 stores 4 floats through the pointer
+	// to its first element.
+	result := []f32{len: 4}
+	horizontal_add_sse3(&a[0], &b[0], &result[0])
+	println(result)
+	// The result should be [3.0, 7.0, 11.0, 15.0] due to horizontal addition:
+	// the pair sums of a come first, followed by the pair sums of b.
+	// 1.0 + 2.0 = 3.0
+	// 3.0 + 4.0 = 7.0
+	// 5.0 + 6.0 = 11.0
+	// 7.0 + 8.0 = 15.0
+	assert result == [f32(3.0), 7.0, 11.0, 15.0]
+}
--- a/examples/cpu_features/SSE_and_MMX_Extensions/sse4_1.v
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/sse4_1.v
@ -0,0 +1,31 @@
+// SSE Instruction Set
+// SSE4.1: Added with later Core 2
+// MPSADBW, PHMINPOSUW, PMULLD, PMULDQ, DPPS, DPPD, BLENDPS, BLENDPD, BLENDVPS, BLENDVPD,
+// PBLENDVB, PBLENDW, PMINSB, PMAXSB, PMINUW, PMAXUW, PMINUD, PMAXUD, PMINSD, PMAXSD, ROUNDPS,
+// ROUNDSS, ROUNDPD, ROUNDSD, INSERTPS, PINSRB, PINSRD, PINSRQ, EXTRACTPS, PEXTRB, PEXTRW,
+// PEXTRD, PEXTRQ, PMOVSXBW, PMOVZXBW, PMOVSXBD, PMOVZXBD, PMOVSXBQ, PMOVZXBQ, PMOVSXWD,
+// PMOVZXWD, PMOVSXWQ, PMOVZXWQ, PMOVSXDQ, PMOVZXDQ, PTEST, PCMPEQQ, PACKUSDW, MOVNTDQA
+
+// round_floats_sse4_1 rounds the 4 floats at `a` to integral values (ROUNDPS) and stores
+// them at `result`.
+// The immediate 0 selects rounding to the nearest integer, with ties going to the even value.
+// The unaligned MOVUPS moves are used, so the pointers need no particular alignment,
+// but each one must reference at least 4 f32 values.
+// The `memory` clobber tells the compiler that the asm block writes through `result`.
+// The attribute below compiles the function only for amd64, and not with tcc or msvc.
+@[if amd64 && !tinyc && !msvc]
+fn round_floats_sse4_1(a &f32, result &f32) {
+	unsafe {
+		asm volatile amd64 {
+			movups xmm0, [a] // Load 4 floats from array a into xmm0
+			roundps xmm0, xmm0, 0 // Round to nearest integer (ties to even)
+			movups [result], xmm0 // Store the result in result array
+			; ; r (a)
+			  r (result)
+			; xmm0
+			  memory
+		}
+	}
+}
+
+// main rounds a 4-float vector with round_floats_sse4_1, prints the rounded values
+// and checks them.
+fn main() {
+	a := [f32(1.2), 2.5, 3.8, 4.4]
+	// Destination buffer: round_floats_sse4_1 stores 4 floats through the pointer
+	// to its first element.
+	result := []f32{len: 4}
+	// Rounding mode 0 corresponds to rounding to the nearest integer,
+	// with ties going to the even value (which is why 2.5 becomes 2.0 below).
+	round_floats_sse4_1(&a[0], &result[0])
+	println(result)
+	// The expected rounded result should be [1.0, 2.0, 4.0, 4.0]
+	assert result == [f32(1.0), 2.0, 4.0, 4.0]
+}
--- a/examples/cpu_features/SSE_and_MMX_Extensions/ssse3.v
+++ b/examples/cpu_features/SSE_and_MMX_Extensions/ssse3.v
@ -0,0 +1,31 @@
+// SSE Instruction Set
+// SSSE3: Added with Xeon 5100 and early Core 2
+// PSIGNW, PSIGND, PSIGNB, PSHUFB, PMULHRSW, PMADDUBSW, PHSUBW, PHSUBSW, PHSUBD, PHADDW, PHADDSW,
+// PHADDD, PALIGNR, PABSW, PABSD, PABSB
+// The PSIGNW instruction negates or leaves elements unchanged based on another vector's signs.
+
+// psignw_example applies PSIGNW to the 8 signed 16-bit integers at `a`, using the 8 integers
+// at `b` as the sign source, and stores the 8 results at `result`.
+// For every lane: a negative b[i] negates a[i], a positive b[i] keeps a[i],
+// and a zero b[i] produces 0.
+// The unaligned MOVDQU moves are used, because a plain `&i16` is not guaranteed to be
+// 16-byte aligned, and the aligned MOVDQA form faults on such pointers.
+// Each pointer must reference at least 8 i16 values.
+// The `memory` clobber tells the compiler that the asm block writes through `result`.
+// The attribute below compiles the function only for amd64, and not with tcc or msvc.
+@[if amd64 && !tinyc && !msvc]
+fn psignw_example(a &i16, b &i16, result &i16) {
+	unsafe {
+		asm volatile amd64 {
+			movdqu xmm0, [a] // Load 8 signed 16-bit integers from array a into xmm0
+			movdqu xmm1, [b] // Load 8 signed 16-bit integers from array b into xmm1
+			psignw xmm0, xmm1 // Adjust the sign of elements in xmm0 based on xmm1
+			movdqu [result], xmm0 // Store the result back to memory
+			; ; r (a)
+			  r (b)
+			  r (result)
+			; xmm0
+			  xmm1
+			  memory
+		}
+	}
+}
+
+// main runs psignw_example on two vectors of 8 signed 16-bit integers, dumps the result
+// and checks it.
+fn main() {
+	a0 := [i16(1), -2, 3, -4, 5, -6, 7, -8]
+	// b0 is negative exactly where a0 is negative, so every negative element of a0 gets negated.
+	b0 := [i16(1), -1, 1, -1, 1, -1, 1, -1]
+	// Destination buffer: psignw_example stores 8 integers through the pointer
+	// to its first element.
+	result0 := []i16{len: 8}
+	psignw_example(&a0[0], &b0[0], &result0[0])
+	dump(result0)
+	// All the elements end up positive.
+	assert result0 == [i16(1), 2, 3, 4, 5, 6, 7, 8]
+}