Skip to content
This repository was archived by the owner on Dec 1, 2021. It is now read-only.
This repository was archived by the owner on Dec 1, 2021. It is now read-only.

best clang and gcc flags and more examples from start to finish #7

@omac777

Description

@omac777

What are the best clang and gcc flags for getting everything to work well in c2goasm?

I should clarify: I am attempting to reproduce how to build the example in a way that resembles the method used for influxdata.com's Apache Arrow Go implementation:
https://www.influxdata.com/blog/influxdata-apache-arrow-go-implementation/
https://github.com/influxdata/arrow

I answered my own question.
github.com/influxdata/arrow/
holds a thorough example of c2goasm usage.

Here's the output of the influxdata/arrow/ build:

make -B
make[1]: Entering directory '/home/dma2/Code/go/src/github.com/influxdata/arrow/memory'
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -mavx2 -mfma -mllvm -force-vector-width=32 _lib/memory.c -o _lib/memory_avx2.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/memory_avx2.s
c2goasm -a -f -a -f _lib/memory_avx2.s memory_avx2_amd64.s
Processing _lib/memory_avx2.s
Invoking asm2plan9s on memory_avx2_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -msse4 _lib/memory.c -o _lib/memory_sse4.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/memory_sse4.s
c2goasm -a -f -a -f _lib/memory_sse4.s memory_sse4_amd64.s
Processing _lib/memory_sse4.s
Invoking asm2plan9s on memory_sse4_amd64.s
make[1]: Leaving directory '/home/dma2/Code/go/src/github.com/influxdata/arrow/memory'
make[1]: Entering directory '/home/dma2/Code/go/src/github.com/influxdata/arrow/math'
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -mavx2 -mfma -mllvm -force-vector-width=32 _lib/float64.c -o _lib/float64_avx2.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/float64_avx2.s
c2goasm -a -f -a -f _lib/float64_avx2.s float64_avx2_amd64.s
Processing _lib/float64_avx2.s
Invoking asm2plan9s on float64_avx2_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -msse4 _lib/float64.c -o _lib/float64_sse4.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/float64_sse4.s
c2goasm -a -f -a -f _lib/float64_sse4.s float64_sse4_amd64.s
Processing _lib/float64_sse4.s
Invoking asm2plan9s on float64_sse4_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -mavx2 -mfma -mllvm -force-vector-width=32 _lib/int64.c -o _lib/int64_avx2.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/int64_avx2.s
c2goasm -a -f -a -f _lib/int64_avx2.s int64_avx2_amd64.s
Processing _lib/int64_avx2.s
Invoking asm2plan9s on int64_avx2_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -msse4 _lib/int64.c -o _lib/int64_sse4.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/int64_sse4.s
c2goasm -a -f -a -f _lib/int64_sse4.s int64_sse4_amd64.s
Processing _lib/int64_sse4.s
Invoking asm2plan9s on int64_sse4_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -mavx2 -mfma -mllvm -force-vector-width=32 _lib/uint64.c -o _lib/uint64_avx2.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/uint64_avx2.s
c2goasm -a -f -a -f _lib/uint64_avx2.s uint64_avx2_amd64.s
Processing _lib/uint64_avx2.s
Invoking asm2plan9s on uint64_avx2_amd64.s
clang -S -target x86_64-unknown-none -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -O3 -fno-builtin -ffast-math -fno-jump-tables -I_lib -msse4 _lib/uint64.c -o _lib/uint64_sse4.s ; perl -i -pe 's/(ro[rl]\s+\w{2,3})$/\1, 1/' _lib/uint64_sse4.s
c2goasm -a -f -a -f _lib/uint64_sse4.s uint64_sse4_amd64.s
Processing _lib/uint64_sse4.s
Invoking asm2plan9s on uint64_sse4_amd64.s

Here is an example from start to finish for others to follow:

cd github.com/ermig1979/Simd/prj/cmake
rm CMakeCache.txt
export CC=/usr/bin/clang
export CXX=/usr/bin/clang++
cmake -DTOOLCHAIN="" -DTARGET="" -DCMAKE_VERBOSE_MAKEFILE:BOOL=ON .
make -B

go get -u github.com/minio/c2goasm
go get -u github.com/klauspost/asmfmt/cmd/asmfmt
go get -u github.com/minio/asm2plan9s

cat original.c

#include <x86intrin.h>
#include_next <immintrin.h>

/*
 * MultiplyAndAdd: load one __m256 vector from each of arg1, arg2 and arg3,
 * combine them with _mm256_fmadd_ps, and store the resulting vector to
 * `result`.
 */
void MultiplyAndAdd(float* arg1, float* arg2, float* arg3, float* result) {
    /* NOTE(review): the three inputs are read with _mm256_load_ps (emitted as
     * vmovaps in the generated listing), while the output is written with the
     * storeu variant (vmovups). Presumably the inputs must be 32-byte aligned
     * -- confirm against the callers. */
    __m256 vec1 = _mm256_load_ps(arg1);
    __m256 vec2 = _mm256_load_ps(arg2);
    __m256 vec3 = _mm256_load_ps(arg3);
    /* Fused multiply-add of the three vectors (vfmadd213ps in the listing);
     * per the intrinsic's documentation this is vec1 * vec2 + vec3. */
    __m256 res  = _mm256_fmadd_ps(vec1, vec2, vec3);
    _mm256_storeu_ps(result, res);
}

cat original.c.plan9s.go
// _MultiplyAndAdd has no Go body; the matching symbol is the
// TEXT ·_MultiplyAndAdd(SB) routine in original.c.plan9s.s. The parameter
// names here match the vec1/vec2/vec3/result frame references used there.
//go:noescape
func _MultiplyAndAdd(vec1, vec2, vec3, result unsafe.Pointer)

func MultiplyAndAdd(someObj Object) {

	_MultiplyAndAdd(someObj.GetVec1(), someObj.GetVec2(), someObj.GetVec3(), someObj.GetResult()))
}

1) generate the clang assembler code

/usr/bin/clang -mno-red-zone -mstackrealign -fPIC -mavx2 -mavx512bw -o original.c.s -S original.c

original.c.s has been generated.

2) we need original.c.plan9s.go,
   which declares the Go prototype for the function defined in original.c.s

3) generate the asm2plan9s assembler code

c2goasm -a original.c.s original.c.plan9s.s

4) original.c.plan9s.s has been generated.

cat original.c.plan9s.s
//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

TEXT ·_MultiplyAndAdd(SB), $0-32

    MOVQ vec1+0(FP), DI
    MOVQ vec2+8(FP), SI
    MOVQ vec3+16(FP), DX
    MOVQ result+24(FP), CX

	.cfi_startproc
                                 // pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset %rbp, -16
                                 // movq	%rsp, %rbp
	.cfi_def_cfa_register %rbp
                                 // andq	$-32, %rsp
                                 // subq	$384, %rsp
                                 // movq	%rdi, 176(%rsp)
                                 // movq	%rsi, 168(%rsp)
                                 // movq	%rdx, 160(%rsp)
                                 // movq	%rcx, 152(%rsp)
                                 // movq	176(%rsp), %rcx
                                 // movq	%rcx, 184(%rsp)
                                 // movq	184(%rsp), %rcx
                                 // vmovaps	(%rcx), %ymm0
                                 // vmovaps	%ymm0, 96(%rsp)
                                 // movq	168(%rsp), %rcx
                                 // movq	%rcx, 360(%rsp)
                                 // movq	360(%rsp), %rcx
                                 // vmovaps	(%rcx), %ymm0
                                 // vmovaps	%ymm0, 64(%rsp)
                                 // movq	160(%rsp), %rcx
                                 // movq	%rcx, 352(%rsp)
                                 // movq	352(%rsp), %rcx
                                 // vmovaps	(%rcx), %ymm0
                                 // vmovaps	%ymm0, 32(%rsp)
                                 // vmovaps	96(%rsp), %ymm0
                                 // vmovaps	64(%rsp), %ymm1
                                 // vmovaps	32(%rsp), %ymm2
                                 // vmovaps	%ymm0, 320(%rsp)
                                 // vmovaps	%ymm1, 288(%rsp)
                                 // vmovaps	%ymm2, 256(%rsp)
                                 // vmovaps	320(%rsp), %ymm0
                                 // vmovaps	288(%rsp), %ymm1
                                 // vmovaps	256(%rsp), %ymm2
                                 // vfmadd213ps	%ymm2, %ymm0, %ymm1
                                 // vmovaps	%ymm1, (%rsp)
                                 // movq	152(%rsp), %rcx
                                 // vmovaps	(%rsp), %ymm0
                                 // movq	%rcx, 248(%rsp)
                                 // vmovaps	%ymm0, 192(%rsp)
                                 // vmovaps	192(%rsp), %ymm0
                                 // movq	248(%rsp), %rcx
                                 // vmovups	%ymm0, (%rcx)
                                 // movq	%rbp, %rsp
                                 // popq	%rbp
                                 // vzeroupper
                                 // retq
				 

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions