This repository was archived by the owner on Dec 1, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 115
This repository was archived by the owner on Dec 1, 2021. It is now read-only.
best clang and gcc flags and more examples from start to finish #7
Copy link
Copy link
Open
Labels
Description
What are the best clang and gcc flags for getting everything to work well in c2goasm?
I should clarify: I am attempting to reproduce how to build the example in a way that resembles the method used for influxdata.com's Apache Arrow Go implementation:
https://www.influxdata.com/blog/influxdata-apache-arrow-go-implementation/
https://github.com/influxdata/arrow
I answered my own question.
github.com/influxdata/arrow/
holds a thorough example of c2goasm usage.
Here's the output of the influxdata/arrow/ build:
make -B
make[1]: Entering directory '/home/dma2/Code/go/src/github.com/influxdata/arrow/memory'
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -mavx2 -mfma -mllvm -force-vector-width=32 _lib/memory.c -o _lib/memory_avx2.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/memory_avx2.s
c2goasm -a -f -a -f _lib/memory_avx2.s memory_avx2_amd64.s
Processing _lib/memory_avx2.s
Invoking asm2plan9s on memory_avx2_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -msse4 _lib/memory.c -o _lib/memory_sse4.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/memory_sse4.s
c2goasm -a -f -a -f _lib/memory_sse4.s memory_sse4_amd64.s
Processing _lib/memory_sse4.s
Invoking asm2plan9s on memory_sse4_amd64.s
make[1]: Leaving directory '/home/dma2/Code/go/src/github.com/influxdata/arrow/memory'
make[1]: Entering directory '/home/dma2/Code/go/src/github.com/influxdata/arrow/math'
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -mavx2 -mfma -mllvm -force-vector-width=32 _lib/float64.c -o _lib/float64_avx2.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/float64_avx2.s
c2goasm -a -f -a -f _lib/float64_avx2.s float64_avx2_amd64.s
Processing _lib/float64_avx2.s
Invoking asm2plan9s on float64_avx2_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -msse4 _lib/float64.c -o _lib/float64_sse4.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/float64_sse4.s
c2goasm -a -f -a -f _lib/float64_sse4.s float64_sse4_amd64.s
Processing _lib/float64_sse4.s
Invoking asm2plan9s on float64_sse4_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -mavx2 -mfma -mllvm -force-vector-width=32 _lib/int64.c -o _lib/int64_avx2.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/int64_avx2.s
c2goasm -a -f -a -f _lib/int64_avx2.s int64_avx2_amd64.s
Processing _lib/int64_avx2.s
Invoking asm2plan9s on int64_avx2_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -msse4 _lib/int64.c -o _lib/int64_sse4.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/int64_sse4.s
c2goasm -a -f -a -f _lib/int64_sse4.s int64_sse4_amd64.s
Processing _lib/int64_sse4.s
Invoking asm2plan9s on int64_sse4_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -mavx2 -mfma -mllvm -force-vector-width=32 _lib/uint64.c -o _lib/uint64_avx2.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/uint64_avx2.s
c2goasm -a -f -a -f _lib/uint64_avx2.s uint64_avx2_amd64.s
Processing _lib/uint64_avx2.s
Invoking asm2plan9s on uint64_avx2_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -msse4 _lib/uint64.c -o _lib/uint64_sse4.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/uint64_sse4.s
c2goasm -a -f -a -f _lib/uint64_sse4.s uint64_sse4_amd64.s
Processing _lib/uint64_sse4.s
Invoking asm2plan9s on uint64_sse4_amd64.s
Here is an example from start to finish for others to follow:
cd github.com/ermig1979/Simd/prj/cmake
rm CMakeCache.txt
export CC=/usr/bin/clang
export CXX=/usr/bin/clang++
cmake -DTOOLCHAIN="" -DTARGET="" -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON .
make -B
go get -u github.com/minio/c2goasm
go get -u github.com/klauspost/asmfmt/cmd/asmfmt
go get -u github.com/minio/asm2plan9s
cat original.c
#include <x86intrin.h>
#include_next <immintrin.h>
/*
 * MultiplyAndAdd computes result = arg1 * arg2 + arg3 element-wise on one
 * 256-bit vector (eight packed single-precision floats) using a fused
 * multiply-add.
 *
 * arg1, arg2, arg3: inputs, read with the aligned load _mm256_load_ps, so each
 *                   pointer must be 32-byte aligned.
 * result:           output, written with the unaligned store _mm256_storeu_ps,
 *                   so it carries no alignment requirement.
 */
void MultiplyAndAdd(float* arg1, float* arg2, float* arg3, float* result) {
/* Aligned 256-bit loads of the three operands. */
__m256 vec1 = _mm256_load_ps(arg1);
__m256 vec2 = _mm256_load_ps(arg2);
__m256 vec3 = _mm256_load_ps(arg3);
/* Fused multiply-add: (vec1 * vec2) + vec3. */
__m256 res = _mm256_fmadd_ps(vec1, vec2, vec3);
/* Unaligned 256-bit store of the result. */
_mm256_storeu_ps(result, res);
}
cat original.c.plan9s.go
// _MultiplyAndAdd is a bodyless declaration; its implementation is the
// TEXT ·_MultiplyAndAdd symbol in the c2goasm-generated assembly file.
// That assembly reads the arguments by name and offset (vec1+0(FP),
// vec2+8(FP), vec3+16(FP), result+24(FP)), so the parameter names and order
// here should be kept in sync with it.
//
//go:noescape
func _MultiplyAndAdd(vec1, vec2, vec3, result unsafe.Pointer)
func MultiplyAndAdd(someObj Object) {
_MultiplyAndAdd(someObj.GetVec1(), someObj.GetVec2(), someObj.GetVec3(), someObj.GetResult()))
}
1) generate the clang assembler code
/usr/bin/clang -mno-red-zone -mstackrealign -fPIC -mavx2 -mavx512bw -o original.c.s -S original.c
original.c.s has been generated.
2) we need original.c.plan9s.go,
which declares the Go prototypes for the functions defined in original.c.s
3) generate the asm2plan9s assembler code
c2goasm -a original.c.s original.c.plan9s.s
4) original.c.plan9s.s has been generated.
cat original.c.plan9s.s
//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
TEXT ·_MultiplyAndAdd(SB), $0-32
MOVQ vec1+0(FP), DI
MOVQ vec2+8(FP), SI
MOVQ vec3+16(FP), DX
MOVQ result+24(FP), CX
.cfi_startproc
// pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
// movq %rsp, %rbp
.cfi_def_cfa_register %rbp
// andq $-32, %rsp
// subq $384, %rsp
// movq %rdi, 176(%rsp)
// movq %rsi, 168(%rsp)
// movq %rdx, 160(%rsp)
// movq %rcx, 152(%rsp)
// movq 176(%rsp), %rcx
// movq %rcx, 184(%rsp)
// movq 184(%rsp), %rcx
// vmovaps (%rcx), %ymm0
// vmovaps %ymm0, 96(%rsp)
// movq 168(%rsp), %rcx
// movq %rcx, 360(%rsp)
// movq 360(%rsp), %rcx
// vmovaps (%rcx), %ymm0
// vmovaps %ymm0, 64(%rsp)
// movq 160(%rsp), %rcx
// movq %rcx, 352(%rsp)
// movq 352(%rsp), %rcx
// vmovaps (%rcx), %ymm0
// vmovaps %ymm0, 32(%rsp)
// vmovaps 96(%rsp), %ymm0
// vmovaps 64(%rsp), %ymm1
// vmovaps 32(%rsp), %ymm2
// vmovaps %ymm0, 320(%rsp)
// vmovaps %ymm1, 288(%rsp)
// vmovaps %ymm2, 256(%rsp)
// vmovaps 320(%rsp), %ymm0
// vmovaps 288(%rsp), %ymm1
// vmovaps 256(%rsp), %ymm2
// vfmadd213ps %ymm2, %ymm0, %ymm1
// vmovaps %ymm1, (%rsp)
// movq 152(%rsp), %rcx
// vmovaps (%rsp), %ymm0
// movq %rcx, 248(%rsp)
// vmovaps %ymm0, 192(%rsp)
// vmovaps 192(%rsp), %ymm0
// movq 248(%rsp), %rcx
// vmovups %ymm0, (%rcx)
// movq %rbp, %rsp
// popq %rbp
// vzeroupper
// retq