diff --git a/hash.go b/hash.go
index 9223c27..f350440 100644
--- a/hash.go
+++ b/hash.go
@@ -25,11 +25,9 @@ package gohashtree
 
 import (
 	"fmt"
-	"reflect"
-	"unsafe"
 )
 
-func _hash(digests *byte, p [][32]byte, count uint32)
+func _hash(digests *byte, p *byte, count uint32)
 
 func Hash(digests [][32]byte, chunks [][32]byte) error {
 	if len(chunks) == 0 {
@@ -43,7 +41,7 @@ func Hash(digests [][32]byte, chunks [][32]byte) error {
 		return fmt.Errorf("not enough digest length, need at least %v, got %v", len(chunks)/2, len(digests))
 	}
 	if supportedCPU {
-		_hash(&digests[0][0], chunks, uint32(len(chunks)/2))
+		_hash(&digests[0][0], &chunks[0][0], uint32(len(chunks)/2))
 	} else {
 		sha256_1_generic(digests, chunks)
 	}
@@ -51,7 +49,7 @@ func Hash(digests [][32]byte, chunks [][32]byte) error {
 }
 
 func HashChunks(digests [][32]byte, chunks [][32]byte) {
-	_hash(&digests[0][0], chunks, uint32(len(chunks)/2))
+	_hash(&digests[0][0], &chunks[0][0], uint32(len(chunks)/2))
 }
 
 func HashByteSlice(digests []byte, chunks []byte) error {
@@ -69,18 +67,23 @@ func HashByteSlice(digests []byte, chunks []byte) error {
 	}
-	// We use an unsafe pointer to cast []byte to [][32]byte. The length and
-	// capacity of the slice need to be divided accordingly by 32.
-	header := *(*reflect.SliceHeader)(unsafe.Pointer(&chunks))
-	header.Len <<= 5
-	header.Cap <<= 5
-	chunkedChunks := *(*[][32]byte)(unsafe.Pointer(&header))
-
+	// The assembly routine takes raw pointers to the first byte of each
+	// slice, so no conversion from []byte to [][32]byte is needed there.
 	if supportedCPU {
-		_hash(&digests[0], chunkedChunks, uint32(len(chunks)/64))
+		_hash(&digests[0], &chunks[0], uint32(len(chunks)/64))
 	} else {
-		headerDigest := *(*reflect.SliceHeader)(unsafe.Pointer(&digests))
-		headerDigest.Len <<= 5
-		headerDigest.Cap <<= 5
-		chunkedDigest := *(*[][32]byte)(unsafe.Pointer(&headerDigest))
+		// The generic implementation works on [][32]byte, so stage the
+		// input and the output in freshly allocated 32-byte-chunked copies.
+		chunkedChunks := make([][32]byte, len(chunks)/32)
+		for i := range chunkedChunks {
+			copy(chunkedChunks[i][:], chunks[32*i:32*i+32])
+		}
+		chunkedDigest := make([][32]byte, len(digests)/32)
+		for i := range chunkedDigest {
+			copy(chunkedDigest[i][:], digests[32*i:32*i+32])
+		}
 		sha256_1_generic(chunkedDigest, chunkedChunks)
+		// chunkedDigest does not alias digests; copy the results back.
+		for i := range chunkedDigest {
+			copy(digests[32*i:32*i+32], chunkedDigest[i][:])
+		}
 	}
 	return nil
diff --git a/hash_amd64.s b/hash_amd64.s
index a107304..dce336e 100644
--- a/hash_amd64.s
+++ b/hash_amd64.s
@@ -783,8 +783,8 @@ TEXT ·_hash(SB), 0, $928-36
 	JE avx2
 
 	MOVQ digests+0(FP), OUTPUT_PTR // digests *[][32]byte
-	MOVQ p_base+8(FP), DATA_PTR // p [][32]byte
-	MOVL count+32(FP), NUM_BLKS // NUM_BLKS uint32
+	MOVQ p+8(FP), DATA_PTR // p *[][32]byte or *[]byte
+	MOVL count+16(FP), NUM_BLKS // NUM_BLKS uint32
 
 avx1:
 	CMPL NUM_BLKS, $4
@@ -1314,9 +1314,9 @@ sha256_1_avx_epilog:
 
 // 8 blocks at a time with AVX2
 avx2:
-	MOVL count+32(FP), NUM_BLKS // NUMBLKS uint32
-	MOVQ digests+0(FP), OUTPUT_PTR // digests *[][32]byte
-	MOVQ p_base+8(FP), DATA_PTR // p [][32]byte
+	MOVL count+16(FP), NUM_BLKS // NUMBLKS uint32
+	MOVQ digests+0(FP), OUTPUT_PTR // digests *[][32]byte or *[]byte
+	MOVQ p+8(FP), DATA_PTR // p *[][32]byte or p *[]byte
 
 sha256_8_avx2_loop:
 	CMPL NUM_BLKS, $8
@@ -1591,8 +1591,8 @@ sha256_8_avx2_loop:
 // AVX 512 section
 avx512:
 	MOVQ digests+0(FP), OUTPUT_PTR
-	MOVQ p_base+8(FP), DATA_PTR
-	MOVL count+32(FP), NUM_BLKS
+	MOVQ p+8(FP), DATA_PTR
+	MOVL count+16(FP), NUM_BLKS
 
 	MOVQ $_DIGEST_16<>(SB), DIGESTAVX512
 	MOVQ $_PADDING_16<>(SB), PADDINGAVX512
@@ -2046,9 +2046,9 @@ avx512_loop:
 
 // SHA-ni section
 shani:
-	MOVQ digests+0(FP), OUTPUT_PTR // digests *[][32]byte
-	MOVQ p_base+8(FP), DATA_PTR // p [][32]byte
-	MOVL count+32(FP), NUM_BLKS // NUM_BLKS uint32
+	MOVQ digests+0(FP), OUTPUT_PTR // digests *[][32]byte or *[]byte
+	MOVQ p+8(FP), DATA_PTR // p *[][32]byte or *[]byte
+	MOVL count+16(FP), NUM_BLKS // NUM_BLKS uint32
 
 	// Golang assembly does not guarantee stack aligned at 16 bytes
 	MOVQ SP, SAVE_SP
diff --git a/hash_arm64.s b/hash_arm64.s
index dc5c7c5..36dba36 100644
--- a/hash_arm64.s
+++ b/hash_arm64.s
@@ -461,8 +461,8 @@ Copied parts are
 
 TEXT ·_hash(SB), 0, $1024-36
 	MOVD digests+0(FP), OUTPUT_PTR
-	MOVD p_base+8(FP), DATA_PTR
-	MOVWU count+32(FP), NUM_BLKS
+	MOVD p+8(FP), DATA_PTR
+	MOVWU count+16(FP), NUM_BLKS
 
 	MOVBU ·hasShani(SB), check_shani
 	CBNZ check_shani, shani
diff --git a/hash_test.go b/hash_test.go
index 53dbb69..8d2e814 100644
--- a/hash_test.go
+++ b/hash_test.go
@@ -316,6 +316,15 @@ func BenchmarkHash_1(b *testing.B) {
 	}
 }
 
+func BenchmarkHash_slice_1(b *testing.B) {
+	chunks := make([]byte, 64)
+	digests := make([]byte, 32)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gohashtree.HashByteSlice(digests, chunks)
+	}
+}
+
 func BenchmarkHash_4_minio(b *testing.B) {
 	chunks := [64 * 4]byte{'A'}
 	digests := make([][32]byte, 4)
@@ -336,6 +345,15 @@ func BenchmarkHash_4(b *testing.B) {
 	}
 }
 
+func BenchmarkHash_slice_4(b *testing.B) {
+	chunks := make([]byte, 8*32)
+	digests := make([]byte, 4*32)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gohashtree.HashByteSlice(digests, chunks)
+	}
+}
+
 func BenchmarkHash_8_minio(b *testing.B) {
 	chunks := [64 * 8]byte{'A'}
 	digests := make([][32]byte, 8)
@@ -356,6 +374,15 @@ func BenchmarkHash_8(b *testing.B) {
 	}
 }
 
+func BenchmarkHash_slice_8(b *testing.B) {
+	chunks := make([]byte, 16*32)
+	digests := make([]byte, 8*32)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gohashtree.HashByteSlice(digests, chunks)
+	}
+}
+
 func BenchmarkHash_16_minio(b *testing.B) {
 	chunks := [64 * 16]byte{'A'}
 	digests := make([][32]byte, 16)
@@ -376,6 +403,15 @@ func BenchmarkHash_16(b *testing.B) {
 	}
 }
 
+func BenchmarkHash_slice_16(b *testing.B) {
+	chunks := make([]byte, 32*32)
+	digests := make([]byte, 16*32)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gohashtree.HashByteSlice(digests, chunks)
+	}
+}
+
 func BenchmarkHashLargeList_minio(b *testing.B) {
 	balances := make([][32]byte, 400000)
 	for i := 0; i < len(balances); i++ {
@@ -402,3 +438,15 @@ func BenchmarkHashList(b *testing.B) {
 		gohashtree.Hash(digests, balances)
 	}
 }
+
+func BenchmarkHashList_slice(b *testing.B) {
+	balances := make([]byte, 400000*32)
+	for i := 0; i < len(balances); i += 32 {
+		balances[i] = byte('A')
+	}
+	digests := make([]byte, 200000*32)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		gohashtree.HashByteSlice(digests, balances)
+	}
+}