mirror of
https://github.com/vlang/v.git
synced 2025-09-13 22:42:26 +03:00
examples: add a cpu_features/ folder, with several examples, using SSE and MMX assembly instructions (#22645)
This commit is contained in:
parent
27b728b600
commit
05377f3c03
7 changed files with 279 additions and 0 deletions
63
examples/cpu_features/SSE_and_MMX_Extensions/README.md
Normal file
63
examples/cpu_features/SSE_and_MMX_Extensions/README.md
Normal file
|
@ -0,0 +1,63 @@
|
|||
Note: for a more in-depth study, see https://en.wikibooks.org/wiki/X86_Assembly
|
||||
|
||||
# SSE and MMX Extensions
|
||||
|
||||
This document provides an overview of the SSE and MMX extensions used in the project.
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Introduction](#introduction)
|
||||
- [SSE Extensions](#sse-extensions)
|
||||
- [MMX Extensions](#mmx-extensions)
|
||||
- [Usage](#usage)
|
||||
|
||||
## Introduction
|
||||
|
||||
SSE (Streaming SIMD Extensions) and MMX (MultiMedia eXtensions) are instruction sets used to
|
||||
enhance the performance of multimedia and signal processing applications.
|
||||
|
||||
## SSE Extensions
|
||||
|
||||
SSE extensions provide a set of instructions that can handle multiple data with a single
|
||||
instruction, improving the performance of applications that require heavy mathematical
|
||||
computations.
|
||||
|
||||
The following is quoted from [Wikibooks](https://en.wikibooks.org/wiki/X86_Assembly/SSE#SSE_Instruction_Set):
|
||||
There are literally hundreds of SSE instructions, some of which are capable of much more than
|
||||
simple SIMD arithmetic. For more in-depth references take a look at the resources chapter of this
|
||||
book.
|
||||
|
||||
You may notice that many floating point SSE instructions end with something like PS or SD. These
|
||||
suffixes differentiate between different versions of the operation. The first letter describes
|
||||
whether the instruction should be Packed or Scalar. Packed operations are applied to every member
|
||||
of the register, while scalar operations are applied to only the first value. For example, in
|
||||
pseudo-code, a packed add would be executed as:
|
||||
|
||||
```
|
||||
v1[0] = v1[0] + v2[0]
|
||||
v1[1] = v1[1] + v2[1]
|
||||
v1[2] = v1[2] + v2[2]
|
||||
v1[3] = v1[3] + v2[3]
|
||||
```
|
||||
|
||||
While a scalar add would only be:
|
||||
|
||||
```
|
||||
v1[0] = v1[0] + v2[0]
|
||||
```
|
||||
|
||||
The second letter refers to the data size: either Single or Double. This simply tells the
|
||||
processor whether to use the register as four 32-bit floats or two 64-bit doubles, respectively.
|
||||
|
||||
## MMX Extensions
|
||||
|
||||
MMX extensions are designed to accelerate multimedia and communication applications by providing
|
||||
instructions that can process multiple data elements in parallel.
|
||||
|
||||
## Usage
|
||||
|
||||
To use these extensions in your project, ensure that your compiler supports them and that you have
|
||||
enabled the appropriate flags.
|
||||
On Linux, you can run the command `lscpu` to see which of these extensions your CPU supports (they are listed in the `Flags` field).
|
||||
|
||||
Note: the examples here will compile, but will not run on CPU architectures other than amd64, such as ARM or RISC-V.
|
39
examples/cpu_features/SSE_and_MMX_Extensions/mmx.v
Normal file
39
examples/cpu_features/SSE_and_MMX_Extensions/mmx.v
Normal file
|
@ -0,0 +1,39 @@
|
|||
// MMX Instruction Set
|
||||
// Several suffixes are used to indicate what data size the instruction operates on:
|
||||
// Byte (8 bits)
|
||||
// Word (16 bits)
|
||||
// Double word (32 bits)
|
||||
// Quad word (64 bits)
|
||||
// The signedness of the operation is also signified by the suffix: US for unsigned and S for signed.
|
||||
// For example, PSUBUSB subtracts unsigned bytes, while PSUBSW subtracts signed words (both saturating).
|
||||
// MMX defined over 40 new instructions, listed below.
|
||||
// EMMS, MOVD, MOVQ, PACKSSDW, PACKSSWB, PACKUSWB, PADDB, PADDD, PADDSB, PADDSW, PADDUSB, PADDUSW,
|
||||
// PADDW, PAND, PANDN, PCMPEQB, PCMPEQD, PCMPEQW, PCMPGTB, PCMPGTD, PCMPGTW, PMADDWD, PMULHW, PMULLW,
|
||||
// POR, PSLLD, PSLLQ, PSLLW, PSRAD, PSRAW, PSRLD, PSRLQ, PSRLW, PSUBB, PSUBD, PSUBSB, PSUBSW, PSUBUSB,
|
||||
// PSUBUSW, PSUBW, PUNPCKHBW, PUNPCKHDQ, PUNPCKHWD, PUNPCKLBW, PUNPCKLDQ, PUNPCKLWD, PXOR
|
||||
|
||||
// add_vectors_mmx adds the 8 bytes at `a` to the 8 bytes at `b`, lane by lane, using the
// MMX instruction PADDB (each byte sum wraps around on overflow), and stores the 8 sums
// at `result`. Every pointer must reference at least 8 accessible bytes.
// The call is compiled only for amd64, with C compilers other than tcc and msvc.
@[if amd64 && !tinyc && !msvc]
fn add_vectors_mmx(a &u8, b &u8, result &u8) {
	unsafe {
		asm volatile amd64 {
			movq mm0, [a] // Load 8 bytes from a into MMX register mm0
			movq mm1, [b] // Load 8 bytes from b into MMX register mm1
			paddb mm0, mm1 // Add the 8 byte lanes of mm1 to the ones in mm0
			movq [result], mm0 // Store the 8 sums back to memory
			// The MMX registers alias the x87 FPU stack, so the MMX state has to be
			// emptied here, otherwise later x87 floating point code can misbehave.
			emms
			; ; r (a)
			  r (b)
			  r (result)
			; mm0
			  mm1
			  memory // the block reads and writes memory through the pointers above
		}
	}
}
|
||||
|
||||
// main adds two 8 byte vectors with MMX, prints the sums, and checks them.
fn main() {
	// The two inputs mirror each other, so every lane adds up to 9.
	expected := [u8(9), 9, 9, 9, 9, 9, 9, 9]
	lhs := [u8(1), 2, 3, 4, 5, 6, 7, 8]
	rhs := [u8(8), 7, 6, 5, 4, 3, 2, 1]
	sums := []u8{len: 8}
	add_vectors_mmx(&lhs[0], &rhs[0], &sums[0])
	println(sums)
	assert sums == expected
}
|
37
examples/cpu_features/SSE_and_MMX_Extensions/sse.v
Normal file
37
examples/cpu_features/SSE_and_MMX_Extensions/sse.v
Normal file
|
@ -0,0 +1,37 @@
|
|||
// SSE Instruction Set
|
||||
// SSE: Added with Pentium III
|
||||
// Floating-point Instructions:
|
||||
// ADDPS, ADDSS, CMPPS, CMPSS, COMISS, CVTPI2PS, CVTPS2PI, CVTSI2SS, CVTSS2SI, CVTTPS2PI, CVTTSS2SI,
|
||||
// DIVPS, DIVSS, LDMXCSR, MAXPS, MAXSS, MINPS, MINSS, MOVAPS, MOVHLPS, MOVHPS, MOVLHPS, MOVLPS,
|
||||
// MOVMSKPS, MOVNTPS, MOVSS, MOVUPS, MULPS, MULSS, RCPPS, RCPSS, RSQRTPS, RSQRTSS, SHUFPS, SQRTPS,
|
||||
// SQRTSS, STMXCSR, SUBPS, SUBSS, UCOMISS, UNPCKHPS, UNPCKLPS
|
||||
//
|
||||
// Integer Instructions:
|
||||
// ANDNPS, ANDPS, ORPS, PAVGB, PAVGW, PEXTRW, PINSRW, PMAXSW, PMAXUB, PMINSW, PMINUB, PMOVMSKB, PMULHUW, PSADBW, PSHUFW, XORPS
|
||||
// The ADDPS instruction adds two vectors of floats using SSE instructions.
|
||||
|
||||
// add_vectors_sse adds the 4 floats at `a` to the 4 floats at `b`, element by element,
// using the SSE instruction ADDPS, and stores the 4 sums at `result`.
// Every pointer must reference at least 4 accessible f32 values. The unaligned MOVUPS
// moves are used, so the pointers do not have to be 16-byte aligned.
// The call is compiled only for amd64, with C compilers other than tcc and msvc.
@[if amd64 && !tinyc && !msvc]
fn add_vectors_sse(a &f32, b &f32, result &f32) {
	unsafe {
		asm volatile amd64 {
			movups xmm0, [a] // Load 4 floats from array a into register xmm0
			movups xmm1, [b] // Load 4 floats from array b into register xmm1
			addps xmm0, xmm1 // Add the 4 floats of xmm1 to the ones in xmm0
			movups [result], xmm0 // Store the 4 sums back to memory
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory // the block reads and writes memory through the pointers above
		}
	}
}
|
||||
|
||||
// main adds two vectors of 4 floats with SSE, prints the sums, and checks them.
fn main() {
	// The two inputs mirror each other, so every element adds up to 5.0 .
	expected := [f32(5.0), 5.0, 5.0, 5.0]
	lhs := [f32(1.0), 2.0, 3.0, 4.0]
	rhs := [f32(4.0), 3.0, 2.0, 1.0]
	sums := []f32{len: 4}
	add_vectors_sse(&lhs[0], &rhs[0], &sums[0])
	println(sums)
	assert sums == expected
}
|
42
examples/cpu_features/SSE_and_MMX_Extensions/sse2.v
Normal file
42
examples/cpu_features/SSE_and_MMX_Extensions/sse2.v
Normal file
|
@ -0,0 +1,42 @@
|
|||
// SSE Instruction Set
|
||||
// SSE2: Added with Pentium 4
|
||||
// Floating-point Instructions:
|
||||
// ADDPD, ADDSD, ANDNPD, ANDPD, CMPPD, CMPSD*, COMISD, CVTDQ2PD, CVTDQ2PS, CVTPD2DQ, CVTPD2PI,
|
||||
// CVTPD2PS, CVTPI2PD, CVTPS2DQ, CVTPS2PD, CVTSD2SI, CVTSD2SS, CVTSI2SD, CVTSS2SD, CVTTPD2DQ,
|
||||
// CVTTPD2PI, CVTTPS2DQ, CVTTSD2SI, DIVPD, DIVSD, MAXPD, MAXSD, MINPD, MINSD, MOVAPD, MOVHPD,
|
||||
// MOVLPD, MOVMSKPD, MOVSD*, MOVUPD, MULPD, MULSD, ORPD, SHUFPD, SQRTPD, SQRTSD, SUBPD, SUBSD,
|
||||
// UCOMISD, UNPCKHPD, UNPCKLPD, XORPD
|
||||
// * CMPSD and MOVSD have the same name as the string instruction mnemonics CMPSD (CMPS) and
|
||||
// MOVSD (MOVS); however, the former refer to scalar double-precision floating-points whereas
|
||||
// the latter refer to doubleword strings.
|
||||
// Integer Instructions:
|
||||
// MOVDQ2Q, MOVDQA, MOVDQU, MOVQ2DQ, PADDQ, PSUBQ, PMULUDQ, PSHUFHW, PSHUFLW, PSHUFD, PSLLDQ, PSRLDQ, PUNPCKHQDQ, PUNPCKLQDQ
|
||||
// The MULPD instruction multiplies two vectors of doubles using SSE2 instructions.
|
||||
|
||||
// multiply_vectors_sse2 multiplies the 2 doubles at `a` by the 2 doubles at `b`, element
// by element, using the SSE2 instruction MULPD, and stores the 2 products at `result`.
// Every pointer must reference at least 2 accessible f64 values. The unaligned MOVUPD
// moves are used, so the pointers do not have to be 16-byte aligned.
// The call is compiled only for amd64, with C compilers other than tcc and msvc.
@[if amd64 && !tinyc && !msvc]
fn multiply_vectors_sse2(a &f64, b &f64, result &f64) {
	unsafe {
		asm volatile amd64 {
			movupd xmm0, [a] // Load 2 doubles from array a into register xmm0
			movupd xmm1, [b] // Load 2 doubles from array b into register xmm1
			mulpd xmm0, xmm1 // Multiply the 2 doubles in xmm0 by the ones in xmm1
			movupd [result], xmm0 // Store the 2 products back to memory
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory // the block reads and writes memory through the pointers above
		}
	}
}
|
||||
|
||||
// main multiplies two vectors of 2 doubles with SSE2, prints the products, and checks them.
fn main() {
	factors := [f64(1.5), 2.5]
	multipliers := [f64(3.5), 4.5]
	products := []f64{len: 2}
	multiply_vectors_sse2(&factors[0], &multipliers[0], &products[0])
	println(products)
	// Element 0: 1.5 * 3.5 = 5.25
	// Element 1: 2.5 * 4.5 = 11.25
	assert products == [f64(5.25), 11.25]
}
|
36
examples/cpu_features/SSE_and_MMX_Extensions/sse3.v
Normal file
36
examples/cpu_features/SSE_and_MMX_Extensions/sse3.v
Normal file
|
@ -0,0 +1,36 @@
|
|||
// SSE Instruction Set
|
||||
// SSE3: Added with later Pentium 4
|
||||
// ADDSUBPD, ADDSUBPS, HADDPD, HADDPS, HSUBPD, HSUBPS, MOVDDUP, MOVSHDUP, MOVSLDUP
|
||||
// The HADDPS instruction performs horizontal addition of two vectors of floats using SSE3
|
||||
// instructions.
|
||||
|
||||
// horizontal_add_sse3 sums the adjacent pairs of the 4 floats at `a` and of the 4 floats
// at `b`, using the SSE3 instruction HADDPS, and stores the 4 sums at `result`, as:
// [a[0] + a[1], a[2] + a[3], b[0] + b[1], b[2] + b[3]] .
// Every pointer must reference at least 4 accessible f32 values. The unaligned MOVUPS
// moves are used (MOVAPS faults on addresses that are not 16-byte aligned), so the
// pointers do not have to be 16-byte aligned.
// The call is compiled only for amd64, with C compilers other than tcc and msvc.
@[if amd64 && !tinyc && !msvc]
fn horizontal_add_sse3(a &f32, b &f32, result &f32) {
	unsafe {
		asm volatile amd64 {
			movups xmm0, [a] // Load 4 floats from array a into register xmm0
			movups xmm1, [b] // Load 4 floats from array b into register xmm1
			haddps xmm0, xmm1 // Perform horizontal add of xmm0 and xmm1
			movups [result], xmm0 // Store the 4 pair sums back to memory
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory // the block reads and writes memory through the pointers above
		}
	}
}
|
||||
|
||||
// main runs a horizontal add over two vectors of 4 floats with SSE3, prints the pair sums,
// and checks them.
fn main() {
	first := [f32(1.0), 2.0, 3.0, 4.0]
	second := [f32(5.0), 6.0, 7.0, 8.0]
	pair_sums := []f32{len: 4}
	horizontal_add_sse3(&first[0], &second[0], &pair_sums[0])
	println(pair_sums)
	// Horizontal addition sums the adjacent elements, so [3.0, 7.0, 11.0, 15.0] is expected:
	// 1.0 + 2.0 = 3.0
	// 3.0 + 4.0 = 7.0
	// 5.0 + 6.0 = 11.0
	// 7.0 + 8.0 = 15.0
	assert pair_sums == [f32(3.0), 7.0, 11.0, 15.0]
}
|
31
examples/cpu_features/SSE_and_MMX_Extensions/sse4_1.v
Normal file
31
examples/cpu_features/SSE_and_MMX_Extensions/sse4_1.v
Normal file
|
@ -0,0 +1,31 @@
|
|||
// SSE Instruction Set
|
||||
// SSE4.1: Added with later Core 2
|
||||
// MPSADBW, PHMINPOSUW, PMULLD, PMULDQ, DPPS, DPPD, BLENDPS, BLENDPD, BLENDVPS, BLENDVPD,
|
||||
// PBLENDVB, PBLENDW, PMINSB, PMAXSB, PMINUW, PMAXUW, PMINUD, PMAXUD, PMINSD, PMAXSD, ROUNDPS,
|
||||
// ROUNDSS, ROUNDPD, ROUNDSD, INSERTPS, PINSRB, PINSRD, PINSRQ, EXTRACTPS, PEXTRB, PEXTRW,
|
||||
// PEXTRD, PEXTRQ, PMOVSXBW, PMOVZXBW, PMOVSXBD, PMOVZXBD, PMOVSXBQ, PMOVZXBQ, PMOVSXWD,
|
||||
// PMOVZXWD, PMOVSXWQ, PMOVZXWQ, PMOVSXDQ, PMOVZXDQ, PTEST, PCMPEQQ, PACKUSDW, MOVNTDQA
|
||||
|
||||
// round_floats_sse4_1 rounds each of the 4 floats at `a` to an integral value, using the
// SSE4.1 instruction ROUNDPS, and stores the 4 rounded floats at `result`.
// The immediate 0 selects rounding to the nearest integer, where ties go to the even
// one (so 2.5 becomes 2.0).
// Both pointers must reference at least 4 accessible f32 values. The unaligned MOVUPS
// moves are used, so the pointers do not have to be 16-byte aligned.
// The call is compiled only for amd64, with C compilers other than tcc and msvc.
@[if amd64 && !tinyc && !msvc]
fn round_floats_sse4_1(a &f32, result &f32) {
	unsafe {
		asm volatile amd64 {
			movups xmm0, [a] // Load 4 floats from array a into xmm0
			roundps xmm0, xmm0, 0 // Round to nearest integer (rounding mode 0)
			movups [result], xmm0 // Store the 4 rounded floats in the result array
			; ; r (a)
			  r (result)
			; xmm0
			  memory // the block reads and writes memory through the pointers above
		}
	}
}
|
||||
|
||||
// main rounds a vector of 4 floats with SSE4.1, prints the rounded values, and checks them.
fn main() {
	inputs := [f32(1.2), 2.5, 3.8, 4.4]
	rounded := []f32{len: 4}
	// The function uses rounding mode 0, i.e. it rounds to the nearest integer.
	round_floats_sse4_1(&inputs[0], &rounded[0])
	println(rounded)
	// Expected: 1.2 -> 1.0, 2.5 -> 2.0, 3.8 -> 4.0, 4.4 -> 4.0
	assert rounded == [f32(1.0), 2.0, 4.0, 4.0]
}
|
31
examples/cpu_features/SSE_and_MMX_Extensions/ssse3.v
Normal file
31
examples/cpu_features/SSE_and_MMX_Extensions/ssse3.v
Normal file
|
@ -0,0 +1,31 @@
|
|||
// SSE Instruction Set
|
||||
// SSSE3: Added with Xeon 5100 and early Core 2
|
||||
// PSIGNW, PSIGND, PSIGNB, PSHUFB, PMULHRSW, PMADDUBSW, PHSUBW, PHSUBSW, PHSUBD, PHADDW, PHADDSW,
|
||||
// PHADDD, PALIGNR, PABSW, PABSD, PABSB
|
||||
// The PSIGNW instruction negates or leaves elements unchanged based on another vector's signs.
|
||||
|
||||
// psignw_example applies the SSSE3 instruction PSIGNW to the 8 signed 16-bit integers at
// `a`, controlled by the 8 signed 16-bit integers at `b`, and stores the 8 results at
// `result`: an element of `a` is negated where the matching element of `b` is negative,
// zeroed where it is 0, and kept unchanged where it is positive.
// Every pointer must reference at least 8 accessible i16 values. The unaligned MOVDQU
// moves are used (MOVDQA faults on addresses that are not 16-byte aligned), so the
// pointers do not have to be 16-byte aligned.
// The call is compiled only for amd64, with C compilers other than tcc and msvc.
@[if amd64 && !tinyc && !msvc]
fn psignw_example(a &i16, b &i16, result &i16) {
	unsafe {
		asm volatile amd64 {
			movdqu xmm0, [a] // Load 8 signed 16-bit integers from array a into xmm0
			movdqu xmm1, [b] // Load 8 signed 16-bit integers from array b into xmm1
			psignw xmm0, xmm1 // Adjust the sign of elements in xmm0 based on xmm1
			movdqu [result], xmm0 // Store the 8 results back to memory
			; ; r (a)
			  r (b)
			  r (result)
			; xmm0
			  xmm1
			  memory // the block reads and writes memory through the pointers above
		}
	}
}
|
||||
|
||||
// main applies PSIGNW to a vector of 8 alternating-sign integers, dumps the result, and
// checks it.
fn main() {
	// The signs in b0 match the signs in a0, so every negative element of a0 is negated,
	// and all the results are positive.
	expected0 := [i16(1), 2, 3, 4, 5, 6, 7, 8]
	b0 := [i16(1), -1, 1, -1, 1, -1, 1, -1]
	a0 := [i16(1), -2, 3, -4, 5, -6, 7, -8]
	result0 := []i16{len: 8}
	psignw_example(&a0[0], &b0[0], &result0[0])
	dump(result0)
	assert result0 == expected0
}
|
Loading…
Add table
Add a link
Reference in a new issue