diff --git a/README b/README index 1b43ba22d..2dd93ac58 100644 --- a/README +++ b/README @@ -42,7 +42,7 @@ Architecture-specific notes: To use NEON instructions, add "-mfpu=neon" to CFLAGS. x86: The miner checks for SSE2 instructions support at runtime, and uses them if they are available. - x86-64: The miner can take advantage of AVX and XOP instructions, + x86-64: The miner can take advantage of AVX, AVX2 and XOP instructions, but only if both the CPU and the operating system support them. * Linux supports AVX starting from kernel version 2.6.30. * FreeBSD supports AVX starting with 9.1-RELEASE. @@ -50,7 +50,7 @@ Architecture-specific notes: * Windows supports AVX starting from Windows 7 SP1 and Windows Server 2008 R2 SP1. The configure script outputs a warning if the assembler - cannot compile AVX or XOP instructions. In that case, the miner + doesn't support some instruction sets. In that case, the miner can still be built, but unavailable optimizations are left off. Usage instructions: Run "minerd --help" to see options. diff --git a/configure.ac b/configure.ac index f5260e991..6663ae06c 100644 --- a/configure.ac +++ b/configure.ac @@ -77,6 +77,14 @@ then AC_MSG_RESULT(no) AC_MSG_WARN([The assembler does not support the XOP instruction set.]) ) + AC_MSG_CHECKING(whether we can compile AVX2 code) + AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,[asm ("vpaddd %ymm0, %ymm1, %ymm2");])], + AC_DEFINE(USE_AVX2, 1, [Define to 1 if AVX2 assembly is available.]) + AC_MSG_RESULT(yes) + , + AC_MSG_RESULT(no) + AC_MSG_WARN([The assembler does not support the AVX2 instruction set.]) + ) , AC_MSG_RESULT(no) AC_MSG_WARN([The assembler does not support the AVX instruction set.]) diff --git a/cpu-miner.c b/cpu-miner.c index 7317692cc..e61eff8bb 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -668,7 +668,7 @@ static void *miner_thread(void *userdata) int thr_id = mythr->id; struct work work; uint32_t max_nonce; - uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x10; + uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; unsigned char *scratchbuf = NULL; char s[16]; int i; diff --git a/miner.h b/miner.h index 867a1e8a1..4aace442f 100644 --- a/miner.h +++ b/miner.h @@ -141,6 +141,13 @@ void sha256_init_4way(uint32_t *state); void sha256_transform_4way(uint32_t *state, const uint32_t *block, int swap); #endif +#if defined(__x86_64__) && defined(USE_AVX2) +#define HAVE_SHA256_8WAY 1 +int sha256_use_8way(); +void sha256_init_8way(uint32_t *state); +void sha256_transform_8way(uint32_t *state, const uint32_t *block, int swap); +#endif + extern int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done); diff --git a/scrypt-x64.S b/scrypt-x64.S index 0b935c582..ab1f3ed5f 100644 --- a/scrypt-x64.S +++ b/scrypt-x64.S @@ -1,5 +1,5 @@ /* - * Copyright 2011-2012 pooler@litecoinpool.org + * Copyright 2011-2013 pooler@litecoinpool.org * All rights reserved. 
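The README change above promises that AVX2 is used only when both the CPU and the operating system support it; the corresponding runtime test appears further down in scrypt_best_throughput as CPUID and XGETBV checks. A minimal C sketch of the same test, assuming GCC's <cpuid.h> and an assembler that accepts xgetbv (the function name is illustrative; the patch itself performs this check in assembly):

#include <cpuid.h>
#include <stdint.h>

static int cpu_has_usable_avx2(void)
{
	unsigned int eax, ebx, ecx, edx;
	uint32_t lo, hi;

	/* CPUID leaf 1: ECX bit 27 = OSXSAVE, bit 28 = AVX */
	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return 0;
	if ((ecx & 0x18000000) != 0x18000000)
		return 0;
	/* CPUID leaf 7, subleaf 0: EBX bit 5 = AVX2 */
	if (__get_cpuid_max(0, 0) < 7)
		return 0;
	__cpuid_count(7, 0, eax, ebx, ecx, edx);
	if (!(ebx & 0x00000020))
		return 0;
	/* XGETBV(0): bits 1-2 set means the OS saves XMM and YMM state */
	__asm__ volatile ("xgetbv" : "=a" (lo), "=d" (hi) : "c" (0));
	(void)hi;
	return (lo & 6) == 6;
}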
* * Redistribution and use in source and binary forms, with or without @@ -39,6 +39,30 @@ scrypt_best_throughput: _scrypt_best_throughput: pushq %rbx +#if defined(USE_AVX2) + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne scrypt_best_throughput_no_avx2 + /* Check for AVX2 support */ + movl $7, %eax + xorl %ecx, %ecx + cpuid + andl $0x00000020, %ebx + cmpl $0x00000020, %ebx + jne scrypt_best_throughput_no_avx2 + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne scrypt_best_throughput_no_avx2 + movl $6, %eax + jmp scrypt_best_throughput_exit +scrypt_best_throughput_no_avx2: +#endif /* Check for AuthenticAMD */ xorq %rax, %rax cpuid @@ -2239,4 +2263,617 @@ scrypt_core_3way_xmm_loop2: scrypt_core_3way_cleanup ret + +#if defined(USE_AVX2) + +.macro salsa8_core_6way_avx2_doubleround + vpaddd %ymm0, %ymm1, %ymm4 + vpaddd %ymm8, %ymm9, %ymm6 + vpaddd %ymm12, %ymm13, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm0, %ymm4 + vpaddd %ymm11, %ymm8, %ymm6 + vpaddd %ymm15, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld $9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm3, %ymm4 + vpaddd %ymm10, %ymm11, %ymm6 + vpaddd %ymm14, %ymm15, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm3, %ymm3 + vpshufd $0x93, %ymm11, %ymm11 + vpshufd $0x93, %ymm15, %ymm15 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm2, %ymm4 + vpaddd %ymm9, %ymm10, %ymm6 + vpaddd %ymm13, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpshufd $0x4e, %ymm14, %ymm14 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 + + vpaddd %ymm0, %ymm3, %ymm4 + vpaddd %ymm8, %ymm11, %ymm6 + vpaddd %ymm12, %ymm15, %ymm7 + vpslld $7, %ymm4, %ymm5 + vpsrld $25, %ymm4, %ymm4 + vpshufd $0x39, %ymm1, %ymm1 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm4, %ymm1, %ymm1 + vpslld $7, %ymm6, %ymm5 + vpsrld $25, %ymm6, %ymm6 + vpshufd $0x39, %ymm9, %ymm9 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm9, %ymm9 + vpslld $7, %ymm7, %ymm5 + vpsrld $25, %ymm7, %ymm7 + vpshufd $0x39, %ymm13, %ymm13 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm7, %ymm13, %ymm13 + + vpaddd %ymm1, %ymm0, %ymm4 + vpaddd %ymm9, %ymm8, %ymm6 + vpaddd %ymm13, %ymm12, %ymm7 + vpslld $9, %ymm4, %ymm5 + vpsrld $23, %ymm4, %ymm4 + vpxor %ymm5, %ymm2, %ymm2 + vpxor %ymm4, %ymm2, %ymm2 + vpslld 
$9, %ymm6, %ymm5 + vpsrld $23, %ymm6, %ymm6 + vpxor %ymm5, %ymm10, %ymm10 + vpxor %ymm6, %ymm10, %ymm10 + vpslld $9, %ymm7, %ymm5 + vpsrld $23, %ymm7, %ymm7 + vpxor %ymm5, %ymm14, %ymm14 + vpxor %ymm7, %ymm14, %ymm14 + + vpaddd %ymm2, %ymm1, %ymm4 + vpaddd %ymm10, %ymm9, %ymm6 + vpaddd %ymm14, %ymm13, %ymm7 + vpslld $13, %ymm4, %ymm5 + vpsrld $19, %ymm4, %ymm4 + vpshufd $0x93, %ymm1, %ymm1 + vpshufd $0x93, %ymm9, %ymm9 + vpshufd $0x93, %ymm13, %ymm13 + vpxor %ymm5, %ymm3, %ymm3 + vpxor %ymm4, %ymm3, %ymm3 + vpslld $13, %ymm6, %ymm5 + vpsrld $19, %ymm6, %ymm6 + vpxor %ymm5, %ymm11, %ymm11 + vpxor %ymm6, %ymm11, %ymm11 + vpslld $13, %ymm7, %ymm5 + vpsrld $19, %ymm7, %ymm7 + vpxor %ymm5, %ymm15, %ymm15 + vpxor %ymm7, %ymm15, %ymm15 + + vpaddd %ymm3, %ymm2, %ymm4 + vpaddd %ymm11, %ymm10, %ymm6 + vpaddd %ymm15, %ymm14, %ymm7 + vpslld $18, %ymm4, %ymm5 + vpsrld $14, %ymm4, %ymm4 + vpshufd $0x4e, %ymm2, %ymm2 + vpshufd $0x4e, %ymm10, %ymm10 + vpxor %ymm5, %ymm0, %ymm0 + vpxor %ymm4, %ymm0, %ymm0 + vpslld $18, %ymm6, %ymm5 + vpsrld $14, %ymm6, %ymm6 + vpshufd $0x4e, %ymm14, %ymm14 + vpshufd $0x39, %ymm11, %ymm11 + vpxor %ymm5, %ymm8, %ymm8 + vpxor %ymm6, %ymm8, %ymm8 + vpslld $18, %ymm7, %ymm5 + vpsrld $14, %ymm7, %ymm7 + vpshufd $0x39, %ymm3, %ymm3 + vpshufd $0x39, %ymm15, %ymm15 + vpxor %ymm5, %ymm12, %ymm12 + vpxor %ymm7, %ymm12, %ymm12 +.endm + +.macro salsa8_core_6way_avx2 + salsa8_core_6way_avx2_doubleround + salsa8_core_6way_avx2_doubleround + salsa8_core_6way_avx2_doubleround + salsa8_core_6way_avx2_doubleround +.endm + + .text + .p2align 6 + .globl scrypt_core_6way + .globl _scrypt_core_6way +scrypt_core_6way: +_scrypt_core_6way: + pushq %rbx + pushq %rbp +#if defined(WIN64) + subq $176, %rsp + vmovdqa %xmm6, 8(%rsp) + vmovdqa %xmm7, 24(%rsp) + vmovdqa %xmm8, 40(%rsp) + vmovdqa %xmm9, 56(%rsp) + vmovdqa %xmm10, 72(%rsp) + vmovdqa %xmm11, 88(%rsp) + vmovdqa %xmm12, 104(%rsp) + vmovdqa %xmm13, 120(%rsp) + vmovdqa %xmm14, 136(%rsp) + vmovdqa %xmm15, 152(%rsp) + pushq %rdi + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi +#endif + movq %rsp, %rdx + subq $768, %rsp + andq $-128, %rsp + +.macro scrypt_core_6way_cleanup + movq %rdx, %rsp +#if defined(WIN64) + popq %rsi + popq %rdi + vmovdqa 8(%rsp), %xmm6 + vmovdqa 24(%rsp), %xmm7 + vmovdqa 40(%rsp), %xmm8 + vmovdqa 56(%rsp), %xmm9 + vmovdqa 72(%rsp), %xmm10 + vmovdqa 88(%rsp), %xmm11 + vmovdqa 104(%rsp), %xmm12 + vmovdqa 120(%rsp), %xmm13 + vmovdqa 136(%rsp), %xmm14 + vmovdqa 152(%rsp), %xmm15 + addq $176, %rsp +#endif + popq %rbp + popq %rbx +.endm + +.macro scrypt_shuffle_pack2 src, so, dest, do + vmovdqa \so+0*16(\src), %xmm0 + vmovdqa \so+1*16(\src), %xmm1 + vmovdqa \so+2*16(\src), %xmm2 + vmovdqa \so+3*16(\src), %xmm3 + vinserti128 $1, \so+128+0*16(\src), %ymm0, %ymm0 + vinserti128 $1, \so+128+1*16(\src), %ymm1, %ymm1 + vinserti128 $1, \so+128+2*16(\src), %ymm2, %ymm2 + vinserti128 $1, \so+128+3*16(\src), %ymm3, %ymm3 + vpblendd $0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %ymm0, \do+0*32(\dest) + vmovdqa %ymm1, \do+1*32(\dest) + vmovdqa %ymm2, \do+2*32(\dest) + vmovdqa %ymm3, \do+3*32(\dest) +.endm + +.macro scrypt_shuffle_unpack2 src, so, dest, do + vmovdqa \so+0*32(\src), %ymm0 + vmovdqa \so+1*32(\src), %ymm1 + vmovdqa \so+2*32(\src), %ymm2 + vmovdqa \so+3*32(\src), %ymm3 + vpblendd 
$0x33, %ymm0, %ymm2, %ymm4 + vpblendd $0xcc, %ymm1, %ymm3, %ymm5 + vpblendd $0x33, %ymm2, %ymm0, %ymm6 + vpblendd $0xcc, %ymm3, %ymm1, %ymm7 + vpblendd $0x55, %ymm7, %ymm6, %ymm3 + vpblendd $0x55, %ymm6, %ymm5, %ymm2 + vpblendd $0x55, %ymm5, %ymm4, %ymm1 + vpblendd $0x55, %ymm4, %ymm7, %ymm0 + vmovdqa %xmm0, \do+0*16(\dest) + vmovdqa %xmm1, \do+1*16(\dest) + vmovdqa %xmm2, \do+2*16(\dest) + vmovdqa %xmm3, \do+3*16(\dest) + vextracti128 $1, %ymm0, \do+128+0*16(\dest) + vextracti128 $1, %ymm1, \do+128+1*16(\dest) + vextracti128 $1, %ymm2, \do+128+2*16(\dest) + vextracti128 $1, %ymm3, \do+128+3*16(\dest) +.endm + +scrypt_core_6way_avx2: + scrypt_shuffle_pack2 %rdi, 0*256+0, %rsp, 0*128 + scrypt_shuffle_pack2 %rdi, 0*256+64, %rsp, 1*128 + scrypt_shuffle_pack2 %rdi, 1*256+0, %rsp, 2*128 + scrypt_shuffle_pack2 %rdi, 1*256+64, %rsp, 3*128 + scrypt_shuffle_pack2 %rdi, 2*256+0, %rsp, 4*128 + scrypt_shuffle_pack2 %rdi, 2*256+64, %rsp, 5*128 + + vmovdqa 0*256+4*32(%rsp), %ymm0 + vmovdqa 0*256+5*32(%rsp), %ymm1 + vmovdqa 0*256+6*32(%rsp), %ymm2 + vmovdqa 0*256+7*32(%rsp), %ymm3 + vmovdqa 1*256+4*32(%rsp), %ymm8 + vmovdqa 1*256+5*32(%rsp), %ymm9 + vmovdqa 1*256+6*32(%rsp), %ymm10 + vmovdqa 1*256+7*32(%rsp), %ymm11 + vmovdqa 2*256+4*32(%rsp), %ymm12 + vmovdqa 2*256+5*32(%rsp), %ymm13 + vmovdqa 2*256+6*32(%rsp), %ymm14 + vmovdqa 2*256+7*32(%rsp), %ymm15 + + movq %rsi, %rbx + leaq 6*131072(%rsi), %rax +scrypt_core_6way_avx2_loop1: + vmovdqa %ymm0, 0*256+4*32(%rbx) + vmovdqa %ymm1, 0*256+5*32(%rbx) + vmovdqa %ymm2, 0*256+6*32(%rbx) + vmovdqa %ymm3, 0*256+7*32(%rbx) + vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 + vmovdqa %ymm8, 1*256+4*32(%rbx) + vmovdqa %ymm9, 1*256+5*32(%rbx) + vmovdqa %ymm10, 1*256+6*32(%rbx) + vmovdqa %ymm11, 1*256+7*32(%rbx) + vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 + vmovdqa %ymm12, 2*256+4*32(%rbx) + vmovdqa %ymm13, 2*256+5*32(%rbx) + vmovdqa %ymm14, 2*256+6*32(%rbx) + vmovdqa %ymm15, 2*256+7*32(%rbx) + vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 + vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rbx) + vmovdqa %ymm1, 0*256+1*32(%rbx) + vmovdqa %ymm2, 0*256+2*32(%rbx) + vmovdqa %ymm3, 0*256+3*32(%rbx) + vmovdqa %ymm8, 1*256+0*32(%rbx) + vmovdqa %ymm9, 1*256+1*32(%rbx) + vmovdqa %ymm10, 1*256+2*32(%rbx) + vmovdqa %ymm11, 1*256+3*32(%rbx) + vmovdqa %ymm12, 2*256+0*32(%rbx) + vmovdqa %ymm13, 2*256+1*32(%rbx) + vmovdqa %ymm14, 2*256+2*32(%rbx) + vmovdqa %ymm15, 2*256+3*32(%rbx) + + salsa8_core_6way_avx2 + vpaddd 0*256+0*32(%rbx), %ymm0, %ymm0 + vpaddd 0*256+1*32(%rbx), %ymm1, %ymm1 + vpaddd 0*256+2*32(%rbx), %ymm2, %ymm2 + vpaddd 0*256+3*32(%rbx), %ymm3, %ymm3 + vpaddd 1*256+0*32(%rbx), %ymm8, %ymm8 + vpaddd 1*256+1*32(%rbx), %ymm9, %ymm9 + vpaddd 1*256+2*32(%rbx), %ymm10, %ymm10 + vpaddd 1*256+3*32(%rbx), %ymm11, %ymm11 + vpaddd 2*256+0*32(%rbx), %ymm12, %ymm12 + vpaddd 2*256+1*32(%rbx), %ymm13, %ymm13 + vpaddd 2*256+2*32(%rbx), %ymm14, %ymm14 + vpaddd 2*256+3*32(%rbx), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa 
%ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) + + vpxor 0*256+4*32(%rbx), %ymm0, %ymm0 + vpxor 0*256+5*32(%rbx), %ymm1, %ymm1 + vpxor 0*256+6*32(%rbx), %ymm2, %ymm2 + vpxor 0*256+7*32(%rbx), %ymm3, %ymm3 + vpxor 1*256+4*32(%rbx), %ymm8, %ymm8 + vpxor 1*256+5*32(%rbx), %ymm9, %ymm9 + vpxor 1*256+6*32(%rbx), %ymm10, %ymm10 + vpxor 1*256+7*32(%rbx), %ymm11, %ymm11 + vpxor 2*256+4*32(%rbx), %ymm12, %ymm12 + vpxor 2*256+5*32(%rbx), %ymm13, %ymm13 + vpxor 2*256+6*32(%rbx), %ymm14, %ymm14 + vpxor 2*256+7*32(%rbx), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + salsa8_core_6way_avx2 + vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 + vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 + vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 + vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3 + vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8 + vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9 + vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10 + vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11 + vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12 + vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13 + vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14 + vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15 + + addq $6*128, %rbx + cmpq %rax, %rbx + jne scrypt_core_6way_avx2_loop1 + + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + + movq $1024, %rcx +scrypt_core_6way_avx2_loop2: + vmovd %xmm0, %ebp + vmovd %xmm8, %ebx + vmovd %xmm12, %eax + vextracti128 $1, %ymm0, %xmm4 + vextracti128 $1, %ymm8, %xmm5 + vextracti128 $1, %ymm12, %xmm6 + vmovd %xmm4, %r8d + vmovd %xmm5, %r9d + vmovd %xmm6, %r10d + vpxor 0*256+0*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+1*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+2*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+3*32(%rsp), %ymm3, %ymm3 + vpxor 1*256+0*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+1*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+2*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+3*32(%rsp), %ymm11, %ymm11 + vpxor 2*256+0*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+1*32(%rsp), %ymm13, %ymm13 + vpxor 2*256+2*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+3*32(%rsp), %ymm15, %ymm15 + andl $1023, %ebp + leaq 0(%rbp, %rbp, 2), %rbp + shll $8, %ebp + andl $1023, %ebx + leaq 1(%rbx, %rbx, 2), %rbx + shll $8, %ebx + andl $1023, %eax + leaq 2(%rax, %rax, 2), %rax + shll $8, %eax + andl $1023, %r8d + leaq 0(%r8, %r8, 2), %r8 + shll $8, %r8d + andl $1023, %r9d + leaq 1(%r9, %r9, 2), %r9 + shll $8, %r9d + andl $1023, %r10d + leaq 2(%r10, %r10, 2), %r10 + shll $8, %r10d + vmovdqa 0*32(%rsi, %rbp), %xmm4 + vinserti128 $1, 0*32+16(%rsi, %r8), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rbp), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r8), %ymm5, %ymm5 + vmovdqa 2*32(%rsi, %rbp), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r8), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rbp), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r8), %ymm7, %ymm7 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm5, 
%ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqa 0*32(%rsi, %rbx), %xmm4 + vinserti128 $1, 0*32+16(%rsi, %r9), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rbx), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r9), %ymm5, %ymm5 + vmovdqa 2*32(%rsi, %rbx), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r9), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rbx), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r9), %ymm7, %ymm7 + vpxor %ymm4, %ymm8, %ymm8 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm10, %ymm10 + vpxor %ymm7, %ymm11, %ymm11 + vmovdqa 0*32(%rsi, %rax), %xmm4 + vinserti128 $1, 0*32+16(%rsi, %r10), %ymm4, %ymm4 + vmovdqa 1*32(%rsi, %rax), %xmm5 + vinserti128 $1, 1*32+16(%rsi, %r10), %ymm5, %ymm5 + vmovdqa 2*32(%rsi, %rax), %xmm6 + vinserti128 $1, 2*32+16(%rsi, %r10), %ymm6, %ymm6 + vmovdqa 3*32(%rsi, %rax), %xmm7 + vinserti128 $1, 3*32+16(%rsi, %r10), %ymm7, %ymm7 + vpxor %ymm4, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm6, %ymm14, %ymm14 + vpxor %ymm7, %ymm15, %ymm15 + + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa %ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) + salsa8_core_6way_avx2 + vpaddd 0*256+0*32(%rsp), %ymm0, %ymm0 + vpaddd 0*256+1*32(%rsp), %ymm1, %ymm1 + vpaddd 0*256+2*32(%rsp), %ymm2, %ymm2 + vpaddd 0*256+3*32(%rsp), %ymm3, %ymm3 + vpaddd 1*256+0*32(%rsp), %ymm8, %ymm8 + vpaddd 1*256+1*32(%rsp), %ymm9, %ymm9 + vpaddd 1*256+2*32(%rsp), %ymm10, %ymm10 + vpaddd 1*256+3*32(%rsp), %ymm11, %ymm11 + vpaddd 2*256+0*32(%rsp), %ymm12, %ymm12 + vpaddd 2*256+1*32(%rsp), %ymm13, %ymm13 + vpaddd 2*256+2*32(%rsp), %ymm14, %ymm14 + vpaddd 2*256+3*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+0*32(%rsp) + vmovdqa %ymm1, 0*256+1*32(%rsp) + vmovdqa %ymm2, 0*256+2*32(%rsp) + vmovdqa %ymm3, 0*256+3*32(%rsp) + vmovdqa %ymm8, 1*256+0*32(%rsp) + vmovdqa %ymm9, 1*256+1*32(%rsp) + vmovdqa %ymm10, 1*256+2*32(%rsp) + vmovdqa %ymm11, 1*256+3*32(%rsp) + vmovdqa %ymm12, 2*256+0*32(%rsp) + vmovdqa %ymm13, 2*256+1*32(%rsp) + vmovdqa %ymm14, 2*256+2*32(%rsp) + vmovdqa %ymm15, 2*256+3*32(%rsp) + + vmovdqa 4*32(%rsi, %rbp), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r8), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rbp), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r8), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rbp), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r8), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rbp), %xmm7 + vinserti128 $1, 7*32+16(%rsi, %r8), %ymm7, %ymm7 + vpxor %ymm4, %ymm0, %ymm0 + vpxor %ymm5, %ymm1, %ymm1 + vpxor %ymm6, %ymm2, %ymm2 + vpxor %ymm7, %ymm3, %ymm3 + vmovdqa 4*32(%rsi, %rbx), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r9), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rbx), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r9), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rbx), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r9), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rbx), %xmm7 + vinserti128 $1, 7*32+16(%rsi, %r9), %ymm7, %ymm7 + vpxor %ymm4, %ymm8, %ymm8 + vpxor %ymm5, %ymm9, %ymm9 + vpxor %ymm6, %ymm10, %ymm10 + vpxor %ymm7, %ymm11, %ymm11 + vmovdqa 4*32(%rsi, %rax), %xmm4 + vinserti128 $1, 4*32+16(%rsi, %r10), %ymm4, %ymm4 + vmovdqa 5*32(%rsi, %rax), %xmm5 + vinserti128 $1, 5*32+16(%rsi, %r10), %ymm5, %ymm5 + vmovdqa 6*32(%rsi, %rax), %xmm6 + vinserti128 $1, 6*32+16(%rsi, %r10), %ymm6, %ymm6 + vmovdqa 7*32(%rsi, %rax), %xmm7 + vinserti128 $1, 
7*32+16(%rsi, %r10), %ymm7, %ymm7 + vpxor %ymm4, %ymm12, %ymm12 + vpxor %ymm5, %ymm13, %ymm13 + vpxor %ymm6, %ymm14, %ymm14 + vpxor %ymm7, %ymm15, %ymm15 + vpxor 0*256+4*32(%rsp), %ymm0, %ymm0 + vpxor 0*256+5*32(%rsp), %ymm1, %ymm1 + vpxor 0*256+6*32(%rsp), %ymm2, %ymm2 + vpxor 0*256+7*32(%rsp), %ymm3, %ymm3 + vpxor 1*256+4*32(%rsp), %ymm8, %ymm8 + vpxor 1*256+5*32(%rsp), %ymm9, %ymm9 + vpxor 1*256+6*32(%rsp), %ymm10, %ymm10 + vpxor 1*256+7*32(%rsp), %ymm11, %ymm11 + vpxor 2*256+4*32(%rsp), %ymm12, %ymm12 + vpxor 2*256+5*32(%rsp), %ymm13, %ymm13 + vpxor 2*256+6*32(%rsp), %ymm14, %ymm14 + vpxor 2*256+7*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + salsa8_core_6way_avx2 + vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 + vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 + vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 + vpaddd 0*256+7*32(%rsp), %ymm3, %ymm3 + vpaddd 1*256+4*32(%rsp), %ymm8, %ymm8 + vpaddd 1*256+5*32(%rsp), %ymm9, %ymm9 + vpaddd 1*256+6*32(%rsp), %ymm10, %ymm10 + vpaddd 1*256+7*32(%rsp), %ymm11, %ymm11 + vpaddd 2*256+4*32(%rsp), %ymm12, %ymm12 + vpaddd 2*256+5*32(%rsp), %ymm13, %ymm13 + vpaddd 2*256+6*32(%rsp), %ymm14, %ymm14 + vpaddd 2*256+7*32(%rsp), %ymm15, %ymm15 + vmovdqa %ymm0, 0*256+4*32(%rsp) + vmovdqa %ymm1, 0*256+5*32(%rsp) + vmovdqa %ymm2, 0*256+6*32(%rsp) + vmovdqa %ymm3, 0*256+7*32(%rsp) + vmovdqa %ymm8, 1*256+4*32(%rsp) + vmovdqa %ymm9, 1*256+5*32(%rsp) + vmovdqa %ymm10, 1*256+6*32(%rsp) + vmovdqa %ymm11, 1*256+7*32(%rsp) + vmovdqa %ymm12, 2*256+4*32(%rsp) + vmovdqa %ymm13, 2*256+5*32(%rsp) + vmovdqa %ymm14, 2*256+6*32(%rsp) + vmovdqa %ymm15, 2*256+7*32(%rsp) + + subq $1, %rcx + ja scrypt_core_6way_avx2_loop2 + + scrypt_shuffle_unpack2 %rsp, 0*128, %rdi, 0*256+0 + scrypt_shuffle_unpack2 %rsp, 1*128, %rdi, 0*256+64 + scrypt_shuffle_unpack2 %rsp, 2*128, %rdi, 1*256+0 + scrypt_shuffle_unpack2 %rsp, 3*128, %rdi, 1*256+64 + scrypt_shuffle_unpack2 %rsp, 4*128, %rdi, 2*256+0 + scrypt_shuffle_unpack2 %rsp, 5*128, %rdi, 2*256+64 + + scrypt_core_6way_cleanup + ret + +#endif /* USE_AVX2 */ + #endif diff --git a/scrypt.c b/scrypt.c index bc4f512ed..06fd76a19 100644 --- a/scrypt.c +++ b/scrypt.c @@ -1,5 +1,5 @@ /* - * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2012 pooler + * Copyright 2009 Colin Percival, 2011 ArtForz, 2011-2013 pooler * All rights reserved. 
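The scrypt_core_6way routine added above runs six interleaved lanes of the same sequential memory-hard loop that the portable scrypt_core in this file performs for a single hash: 1024 iterations that fill a 128 KiB scratchpad per lane, followed by 1024 data-dependent reads back from it. Per lane it does the work of the reference sketch below (the _ref name is illustrative; xor_salsa8 is the scalar helper already defined in scrypt.c):

#include <stdint.h>
#include <string.h>

void xor_salsa8(uint32_t B[16], const uint32_t Bx[16]);	/* scalar helper in scrypt.c */

static void scrypt_core_ref(uint32_t X[32], uint32_t *V)
{
	uint32_t i, j, k;

	/* First loop: write X into the scratchpad, then mix it with Salsa20/8 */
	for (i = 0; i < 1024; i++) {
		memcpy(&V[i * 32], X, 128);
		xor_salsa8(&X[0], &X[16]);
		xor_salsa8(&X[16], &X[0]);
	}
	/* Second loop: XOR in a scratchpad entry selected by X[16], then mix again */
	for (i = 0; i < 1024; i++) {
		j = 32 * (X[16] & 1023);
		for (k = 0; k < 32; k++)
			X[k] ^= V[j + k];
		xor_salsa8(&X[0], &X[16]);
		xor_salsa8(&X[16], &X[0]);
	}
}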
* * Redistribution and use in source and binary forms, with or without @@ -256,6 +256,128 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, #endif /* HAVE_SHA256_4WAY */ +#ifdef HAVE_SHA256_8WAY + +static const uint32_t finalblk_8way[8 * 16] __attribute__((aligned(32))) = { + 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, 0x00000001, + 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620, 0x00000620 +}; + +static inline void HMAC_SHA256_80_init_8way(const uint32_t *key, + uint32_t *tstate, uint32_t *ostate) +{ + uint32_t ihash[8 * 8] __attribute__((aligned(32))); + uint32_t pad[8 * 16] __attribute__((aligned(32))); + int i; + + /* tstate is assumed to contain the midstate of key */ + memcpy(pad, key + 8 * 16, 8 * 16); + for (i = 0; i < 8; i++) + pad[8 * 4 + i] = 0x80000000; + memset(pad + 8 * 5, 0x00, 8 * 40); + for (i = 0; i < 8; i++) + pad[8 * 15 + i] = 0x00000280; + sha256_transform_8way(tstate, pad, 0); + memcpy(ihash, tstate, 8 * 32); + + sha256_init_8way(ostate); + for (i = 0; i < 8 * 8; i++) + pad[i] = ihash[i] ^ 0x5c5c5c5c; + for (; i < 8 * 16; i++) + pad[i] = 0x5c5c5c5c; + sha256_transform_8way(ostate, pad, 0); + + sha256_init_8way(tstate); + for (i = 0; i < 8 * 8; i++) + pad[i] = ihash[i] ^ 0x36363636; + for (; i < 8 * 16; i++) + pad[i] = 0x36363636; + sha256_transform_8way(tstate, pad, 0); +} + +static inline void PBKDF2_SHA256_80_128_8way(const uint32_t *tstate, + const uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t istate[8 * 8] __attribute__((aligned(32))); + uint32_t ostate2[8 * 8] __attribute__((aligned(32))); + uint32_t ibuf[8 * 16] __attribute__((aligned(32))); + uint32_t obuf[8 * 16] __attribute__((aligned(32))); + int i, j; + + memcpy(istate, tstate, 8 * 32); + sha256_transform_8way(istate, salt, 0); + + memcpy(ibuf, salt + 8 * 16, 8 * 16); + for (i = 0; i < 8; i++) + ibuf[8 * 5 + i] = 0x80000000; + memset(ibuf + 8 * 6, 0x00, 8 * 36); + for (i = 0; i < 8; i++) + ibuf[8 * 15 + i] = 0x000004a0; + + for (i = 0; i < 8; i++) + obuf[8 * 8 + i] = 0x80000000; + memset(obuf + 8 * 9, 0x00, 8 * 24); + for (i = 0; i < 8; i++) + 
obuf[8 * 15 + i] = 0x00000300; + + for (i = 0; i < 4; i++) { + memcpy(obuf, istate, 8 * 32); + ibuf[8 * 4 + 0] = i + 1; + ibuf[8 * 4 + 1] = i + 1; + ibuf[8 * 4 + 2] = i + 1; + ibuf[8 * 4 + 3] = i + 1; + ibuf[8 * 4 + 4] = i + 1; + ibuf[8 * 4 + 5] = i + 1; + ibuf[8 * 4 + 6] = i + 1; + ibuf[8 * 4 + 7] = i + 1; + sha256_transform_8way(obuf, ibuf, 0); + + memcpy(ostate2, ostate, 8 * 32); + sha256_transform_8way(ostate2, obuf, 0); + for (j = 0; j < 8 * 8; j++) + output[8 * 8 * i + j] = swab32(ostate2[j]); + } +} + +static inline void PBKDF2_SHA256_128_32_8way(uint32_t *tstate, + uint32_t *ostate, const uint32_t *salt, uint32_t *output) +{ + uint32_t buf[8 * 16] __attribute__((aligned(32))); + int i; + + sha256_transform_8way(tstate, salt, 1); + sha256_transform_8way(tstate, salt + 8 * 16, 1); + sha256_transform_8way(tstate, finalblk_8way, 0); + + memcpy(buf, tstate, 8 * 32); + for (i = 0; i < 8; i++) + buf[8 * 8 + i] = 0x80000000; + memset(buf + 8 * 9, 0x00, 8 * 24); + for (i = 0; i < 8; i++) + buf[8 * 15 + i] = 0x00000300; + sha256_transform_8way(ostate, buf, 0); + + for (i = 0; i < 8 * 8; i++) + output[i] = swab32(ostate[i]); +} + +#endif /* HAVE_SHA256_8WAY */ + + #if defined(__x86_64__) #define SCRYPT_MAX_WAYS 12 @@ -263,6 +385,12 @@ static inline void PBKDF2_SHA256_128_32_4way(uint32_t *tstate, int scrypt_best_throughput(); void scrypt_core(uint32_t *X, uint32_t *V); void scrypt_core_3way(uint32_t *X, uint32_t *V); +#if defined(USE_AVX2) +#undef SCRYPT_MAX_WAYS +#define SCRYPT_MAX_WAYS 24 +#define HAVE_SCRYPT_6WAY 1 +void scrypt_core_6way(uint32_t *X, uint32_t *V); +#endif #elif defined(__i386__) @@ -410,47 +538,32 @@ static void scrypt_1024_1_1_256_4way(const uint32_t *input, uint32_t W[4 * 32] __attribute__((aligned(128))); uint32_t X[4 * 32] __attribute__((aligned(128))); uint32_t *V; - int i; + int i, k; V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - for (i = 0; i < 20; i++) { - W[4 * i + 0] = input[0 * 20 + i]; - W[4 * i + 1] = input[1 * 20 + i]; - W[4 * i + 2] = input[2 * 20 + i]; - W[4 * i + 3] = input[3 * 20 + i]; - } - for (i = 0; i < 8; i++) { - tstate[4 * i + 0] = midstate[i]; - tstate[4 * i + 1] = midstate[i]; - tstate[4 * i + 2] = midstate[i]; - tstate[4 * i + 3] = midstate[i]; - } + for (i = 0; i < 20; i++) + for (k = 0; k < 4; k++) + W[4 * i + k] = input[k * 20 + i]; + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + tstate[4 * i + k] = midstate[i]; HMAC_SHA256_80_init_4way(W, tstate, ostate); PBKDF2_SHA256_80_128_4way(tstate, ostate, W, W); - for (i = 0; i < 32; i++) { - X[0 * 32 + i] = W[4 * i + 0]; - X[1 * 32 + i] = W[4 * i + 1]; - X[2 * 32 + i] = W[4 * i + 2]; - X[3 * 32 + i] = W[4 * i + 3]; - } + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + X[k * 32 + i] = W[4 * i + k]; scrypt_core(X + 0 * 32, V); scrypt_core(X + 1 * 32, V); scrypt_core(X + 2 * 32, V); scrypt_core(X + 3 * 32, V); - for (i = 0; i < 32; i++) { - W[4 * i + 0] = X[0 * 32 + i]; - W[4 * i + 1] = X[1 * 32 + i]; - W[4 * i + 2] = X[2 * 32 + i]; - W[4 * i + 3] = X[3 * 32 + i]; - } + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + W[4 * i + k] = X[k * 32 + i]; PBKDF2_SHA256_128_32_4way(tstate, ostate, W, W); - for (i = 0; i < 8; i++) { - output[0 * 8 + i] = W[4 * i + 0]; - output[1 * 8 + i] = W[4 * i + 1]; - output[2 * 8 + i] = W[4 * i + 2]; - output[3 * 8 + i] = W[4 * i + 3]; - } + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + output[k * 8 + i] = W[4 * i + k]; } #endif /* HAVE_SHA256_4WAY */ @@ -491,68 +604,97 @@ static void scrypt_1024_1_1_256_12way(const uint32_t 
*input, uint32_t W[12 * 32] __attribute__((aligned(128))); uint32_t X[12 * 32] __attribute__((aligned(128))); uint32_t *V; - int i, j; + int i, j, k; V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); - for (j = 0; j < 3; j++) { - for (i = 0; i < 20; i++) { - W[128 * j + 4 * i + 0] = input[80 * j + 0 * 20 + i]; - W[128 * j + 4 * i + 1] = input[80 * j + 1 * 20 + i]; - W[128 * j + 4 * i + 2] = input[80 * j + 2 * 20 + i]; - W[128 * j + 4 * i + 3] = input[80 * j + 3 * 20 + i]; - } - } - for (j = 0; j < 3; j++) { - for (i = 0; i < 8; i++) { - tstate[32 * j + 4 * i + 0] = midstate[i]; - tstate[32 * j + 4 * i + 1] = midstate[i]; - tstate[32 * j + 4 * i + 2] = midstate[i]; - tstate[32 * j + 4 * i + 3] = midstate[i]; - } - } + for (j = 0; j < 3; j++) + for (i = 0; i < 20; i++) + for (k = 0; k < 4; k++) + W[128 * j + 4 * i + k] = input[80 * j + k * 20 + i]; + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + tstate[32 * j + 4 * i + k] = midstate[i]; HMAC_SHA256_80_init_4way(W + 0, tstate + 0, ostate + 0); HMAC_SHA256_80_init_4way(W + 128, tstate + 32, ostate + 32); HMAC_SHA256_80_init_4way(W + 256, tstate + 64, ostate + 64); PBKDF2_SHA256_80_128_4way(tstate + 0, ostate + 0, W + 0, W + 0); PBKDF2_SHA256_80_128_4way(tstate + 32, ostate + 32, W + 128, W + 128); PBKDF2_SHA256_80_128_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) { - for (i = 0; i < 32; i++) { - X[128 * j + 0 * 32 + i] = W[128 * j + 4 * i + 0]; - X[128 * j + 1 * 32 + i] = W[128 * j + 4 * i + 1]; - X[128 * j + 2 * 32 + i] = W[128 * j + 4 * i + 2]; - X[128 * j + 3 * 32 + i] = W[128 * j + 4 * i + 3]; - } - } + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + X[128 * j + k * 32 + i] = W[128 * j + 4 * i + k]; scrypt_core_3way(X + 0 * 96, V); scrypt_core_3way(X + 1 * 96, V); scrypt_core_3way(X + 2 * 96, V); scrypt_core_3way(X + 3 * 96, V); - for (j = 0; j < 3; j++) { - for (i = 0; i < 32; i++) { - W[128 * j + 4 * i + 0] = X[128 * j + 0 * 32 + i]; - W[128 * j + 4 * i + 1] = X[128 * j + 1 * 32 + i]; - W[128 * j + 4 * i + 2] = X[128 * j + 2 * 32 + i]; - W[128 * j + 4 * i + 3] = X[128 * j + 3 * 32 + i]; - } - } + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 4; k++) + W[128 * j + 4 * i + k] = X[128 * j + k * 32 + i]; PBKDF2_SHA256_128_32_4way(tstate + 0, ostate + 0, W + 0, W + 0); PBKDF2_SHA256_128_32_4way(tstate + 32, ostate + 32, W + 128, W + 128); PBKDF2_SHA256_128_32_4way(tstate + 64, ostate + 64, W + 256, W + 256); - for (j = 0; j < 3; j++) { - for (i = 0; i < 8; i++) { - output[32 * j + 0 * 8 + i] = W[128 * j + 4 * i + 0]; - output[32 * j + 1 * 8 + i] = W[128 * j + 4 * i + 1]; - output[32 * j + 2 * 8 + i] = W[128 * j + 4 * i + 2]; - output[32 * j + 3 * 8 + i] = W[128 * j + 4 * i + 3]; - } - } + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 4; k++) + output[32 * j + k * 8 + i] = W[128 * j + 4 * i + k]; } #endif /* HAVE_SHA256_4WAY */ #endif /* HAVE_SCRYPT_3WAY */ +#ifdef HAVE_SCRYPT_6WAY +static void scrypt_1024_1_1_256_24way(const uint32_t *input, + uint32_t *output, uint32_t *midstate, unsigned char *scratchpad) +{ + uint32_t tstate[24 * 8] __attribute__((aligned(128))); + uint32_t ostate[24 * 8] __attribute__((aligned(128))); + uint32_t W[24 * 32] __attribute__((aligned(128))); + uint32_t X[24 * 32] __attribute__((aligned(128))); + uint32_t *V; + int i, j, k; + + V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63)); + + for (j = 0; j < 3; j++) + for (i = 0; i < 20; i++) + for 
(k = 0; k < 8; k++) + W[8 * 32 * j + 8 * i + k] = input[8 * 20 * j + k * 20 + i]; + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 8; k++) + tstate[8 * 8 * j + 8 * i + k] = midstate[i]; + HMAC_SHA256_80_init_8way(W + 0, tstate + 0, ostate + 0); + HMAC_SHA256_80_init_8way(W + 256, tstate + 64, ostate + 64); + HMAC_SHA256_80_init_8way(W + 512, tstate + 128, ostate + 128); + PBKDF2_SHA256_80_128_8way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_80_128_8way(tstate + 64, ostate + 64, W + 256, W + 256); + PBKDF2_SHA256_80_128_8way(tstate + 128, ostate + 128, W + 512, W + 512); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 8; k++) + X[8 * 32 * j + k * 32 + i] = W[8 * 32 * j + 8 * i + k]; + scrypt_core_6way(X + 0 * 32, V); + scrypt_core_6way(X + 6 * 32, V); + scrypt_core_6way(X + 12 * 32, V); + scrypt_core_6way(X + 18 * 32, V); + for (j = 0; j < 3; j++) + for (i = 0; i < 32; i++) + for (k = 0; k < 8; k++) + W[8 * 32 * j + 8 * i + k] = X[8 * 32 * j + k * 32 + i]; + PBKDF2_SHA256_128_32_8way(tstate + 0, ostate + 0, W + 0, W + 0); + PBKDF2_SHA256_128_32_8way(tstate + 64, ostate + 64, W + 256, W + 256); + PBKDF2_SHA256_128_32_8way(tstate + 128, ostate + 128, W + 512, W + 512); + for (j = 0; j < 3; j++) + for (i = 0; i < 8; i++) + for (k = 0; k < 8; k++) + output[8 * 8 * j + k * 8 + i] = W[8 * 32 * j + 8 * i + k]; +} +#endif /* HAVE_SCRYPT_6WAY */ + int scanhash_scrypt(int thr_id, uint32_t *pdata, unsigned char *scratchbuf, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) @@ -589,6 +731,11 @@ int scanhash_scrypt(int thr_id, uint32_t *pdata, scrypt_1024_1_1_256_12way(data, hash, midstate, scratchbuf); else #endif +#if defined(HAVE_SCRYPT_6WAY) + if (throughput == 24) + scrypt_1024_1_1_256_24way(data, hash, midstate, scratchbuf); + else +#endif #if defined(HAVE_SCRYPT_3WAY) if (throughput == 3) scrypt_1024_1_1_256_3way(data, hash, midstate, scratchbuf); diff --git a/sha2-x64.S b/sha2-x64.S index 1fe87a652..a1581dd9d 100644 --- a/sha2-x64.S +++ b/sha2-x64.S @@ -1,5 +1,5 @@ /* - * Copyright 2012 pooler@litecoinpool.org + * Copyright 2012-2013 pooler@litecoinpool.org * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -107,6 +107,102 @@ sha256d_4preext2_30: .long 0x00400022, 0x00400022, 0x00400022, 0x00400022 +#ifdef USE_AVX2 + + .data + .p2align 7 +sha256_8h: + .long 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667 + .long 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85, 0xbb67ae85 + .long 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372, 0x3c6ef372 + .long 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a, 0xa54ff53a + .long 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f, 0x510e527f + .long 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c, 0x9b05688c + .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab + .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 + + .data + .p2align 7 +sha256_8k: + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491, 0x71374491 + .long 0xb5c0fbcf, 
0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be, 0x243185be + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967, 0x14292967 + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354, 
0x650a7354, 0x650a7354, 0x650a7354 + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 + + .data + .p2align 6 +sha256d_8preext2_17: + .long 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000, 0x00a00000 +sha256d_8preext2_23: + .long 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000, 0x11002000 +sha256d_8preext2_24: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 +sha256d_8preext2_30: + .long 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022, 0x00400022 + +#endif /* USE_AVX2 */ + + .text .p2align 6 .globl sha256_init_4way @@ -139,6 +235,40 @@ _sha256_init_4way: ret +#ifdef USE_AVX2 + .text + .p2align 6 + .globl sha256_init_8way + .globl _sha256_init_8way 
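sha256_init_8way, defined next, broadcasts each of the eight SHA-256 initial state words across the eight interleaved lanes, so word i of lane k lands at index 8*i + k (one 32-byte row per word). A plain C picture of the state it produces, using the standard SHA-256 IV that sha256_8h above replicates (the _ref name is illustrative):

#include <stdint.h>

static const uint32_t sha256_iv[8] = {
	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
	0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
};

static void sha256_init_8way_ref(uint32_t state[8 * 8])
{
	int i, k;

	for (i = 0; i < 8; i++)
		for (k = 0; k < 8; k++)
			state[8 * i + k] = sha256_iv[i];
}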
+sha256_init_8way: +_sha256_init_8way: +#if defined(WIN64) + pushq %rdi + movq %rcx, %rdi +#endif + vpbroadcastd sha256_4h+0(%rip), %ymm0 + vpbroadcastd sha256_4h+16(%rip), %ymm1 + vpbroadcastd sha256_4h+32(%rip), %ymm2 + vpbroadcastd sha256_4h+48(%rip), %ymm3 + vmovdqu %ymm0, 0*32(%rdi) + vmovdqu %ymm1, 1*32(%rdi) + vmovdqu %ymm2, 2*32(%rdi) + vmovdqu %ymm3, 3*32(%rdi) + vpbroadcastd sha256_4h+64(%rip), %ymm0 + vpbroadcastd sha256_4h+80(%rip), %ymm1 + vpbroadcastd sha256_4h+96(%rip), %ymm2 + vpbroadcastd sha256_4h+112(%rip), %ymm3 + vmovdqu %ymm0, 4*32(%rdi) + vmovdqu %ymm1, 5*32(%rdi) + vmovdqu %ymm2, 6*32(%rdi) + vmovdqu %ymm3, 7*32(%rdi) +#if defined(WIN64) + popq %rdi +#endif + ret +#endif /* USE_AVX2 */ + + .macro sha256_sse2_extend_round i movdqa (\i-15)*16(%rax), %xmm0 movdqa %xmm0, %xmm2 @@ -441,6 +571,143 @@ _sha256_init_4way: #endif /* USE_AVX */ +#if defined(USE_AVX2) + +.macro sha256_avx2_extend_round i + vmovdqa (\i-15)*32(%rax), %ymm0 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm0 + vpsrld $4, %ymm0, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpaddd (\i-16)*32(%rax), %ymm0, %ymm0 + vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpsrld $10, %ymm3, %ymm3 + vpsrld $7, %ymm3, %ymm1 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm2, %ymm3, %ymm3 + vpsrld $2, %ymm1, %ymm1 + vpslld $2, %ymm2, %ymm2 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm2, %ymm3, %ymm3 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, \i*32(%rax) +.endm + +.macro sha256_avx2_extend_doubleround i + vmovdqa (\i-15)*32(%rax), %ymm0 + vmovdqa (\i-14)*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd (\i-16)*32(%rax), %ymm8, %ymm0 + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + + vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 + vpaddd (\i-6)*32(%rax), %ymm4, %ymm4 + + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, \i*32(%rax) + vmovdqa %ymm7, (\i+1)*32(%rax) +.endm + +.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 + vpaddd 32*(\i)(%rax), \r0, %ymm6 + vpaddd 32*(\i)(%rcx), %ymm6, %ymm6 + + vpandn \r1, \r3, %ymm1 + vpand \r3, \r2, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $7, \r3, %ymm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, \r0, \r0 + vpaddd \r0, %ymm6, %ymm6 + vpaddd %ymm6, \r4, \r0 + + vpand 
\r6, \r5, %ymm2 + vpand \r7, \r5, \r4 + vpand \r7, \r6, %ymm1 + vpxor \r4, %ymm1, %ymm1 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + + vpslld $10, \r7, %ymm2 + vpsrld $2, \r7, \r4 + vpsrld $11, \r4, %ymm1 + vpxor %ymm2, \r4, \r4 + vpxor %ymm1, \r4, \r4 + vpslld $9, %ymm2, %ymm2 + vpsrld $9, %ymm1, %ymm1 + vpxor %ymm2, \r4, \r4 + vpxor %ymm1, \r4, \r4 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm2, \r4, \r4 + vpaddd %ymm6, \r4, \r4 +.endm + +.macro sha256_avx2_main_quadround i + sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 + sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 +.endm + +#endif /* USE_AVX2 */ + + #if defined(USE_XOP) .macro sha256_xop_extend_round i @@ -955,6 +1222,201 @@ sha256_transform_4way_finish: popq %rdi #endif ret + + +#ifdef USE_AVX2 + + .text + .p2align 6 +sha256_transform_8way_core_avx2: + leaq 8*64(%rsp), %rax + vmovdqa -2*32(%rax), %ymm3 + vmovdqa -1*32(%rax), %ymm7 + sha256_avx2_extend_doubleround 0 + sha256_avx2_extend_doubleround 2 + sha256_avx2_extend_doubleround 4 + sha256_avx2_extend_doubleround 6 + sha256_avx2_extend_doubleround 8 + sha256_avx2_extend_doubleround 10 + sha256_avx2_extend_doubleround 12 + sha256_avx2_extend_doubleround 14 + sha256_avx2_extend_doubleround 16 + sha256_avx2_extend_doubleround 18 + sha256_avx2_extend_doubleround 20 + sha256_avx2_extend_doubleround 22 + sha256_avx2_extend_doubleround 24 + sha256_avx2_extend_doubleround 26 + sha256_avx2_extend_doubleround 28 + sha256_avx2_extend_doubleround 30 + sha256_avx2_extend_doubleround 32 + sha256_avx2_extend_doubleround 34 + sha256_avx2_extend_doubleround 36 + sha256_avx2_extend_doubleround 38 + sha256_avx2_extend_doubleround 40 + sha256_avx2_extend_doubleround 42 + sha256_avx2_extend_doubleround 44 + sha256_avx2_extend_doubleround 46 + vmovdqu 0*32(%rdi), %ymm7 + vmovdqu 1*32(%rdi), %ymm5 + vmovdqu 2*32(%rdi), %ymm4 + vmovdqu 3*32(%rdi), %ymm3 + vmovdqu 4*32(%rdi), %ymm0 + vmovdqu 5*32(%rdi), %ymm8 + vmovdqu 6*32(%rdi), %ymm9 + vmovdqu 7*32(%rdi), %ymm10 + movq %rsp, %rax + leaq sha256_8k(%rip), %rcx + sha256_avx2_main_quadround 0 + sha256_avx2_main_quadround 4 + sha256_avx2_main_quadround 8 + sha256_avx2_main_quadround 12 + sha256_avx2_main_quadround 16 + sha256_avx2_main_quadround 20 + sha256_avx2_main_quadround 24 + sha256_avx2_main_quadround 28 + sha256_avx2_main_quadround 32 + sha256_avx2_main_quadround 36 + sha256_avx2_main_quadround 40 + sha256_avx2_main_quadround 44 + sha256_avx2_main_quadround 48 + sha256_avx2_main_quadround 52 + sha256_avx2_main_quadround 56 + sha256_avx2_main_quadround 60 + jmp sha256_transform_8way_finish + +.macro p2bswap_avx2_rsi_rsp i + vmovdqu \i*32(%rsi), %ymm0 + vmovdqu (\i+1)*32(%rsi), %ymm2 + vpshuflw $0xb1, %ymm0, %ymm0 + vpshuflw $0xb1, %ymm2, %ymm2 + vpshufhw $0xb1, %ymm0, %ymm0 + vpshufhw $0xb1, %ymm2, %ymm2 + vpsrlw $8, %ymm0, %ymm1 + vpsrlw $8, %ymm2, %ymm3 + vpsllw $8, %ymm0, %ymm0 + vpsllw $8, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm3, %ymm2, %ymm2 + vmovdqa %ymm0, \i*32(%rsp) + vmovdqa %ymm2, (\i+1)*32(%rsp) +.endm + + .text + .p2align 6 + .globl sha256_transform_8way + .globl _sha256_transform_8way +sha256_transform_8way: +_sha256_transform_8way: +#if defined(WIN64) + pushq %rdi + subq $96, %rsp + vmovdqa %xmm6, 0(%rsp) + vmovdqa %xmm7, 16(%rsp) + vmovdqa %xmm8, 32(%rsp) + 
vmovdqa %xmm9, 48(%rsp) + vmovdqa %xmm10, 64(%rsp) + vmovdqa %xmm11, 80(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx +#endif + movq %rsp, %r8 + subq $64*32, %rsp + andq $-128, %rsp + + testq %rdx, %rdx + jnz sha256_transform_8way_swap + + vmovdqu 0*32(%rsi), %ymm0 + vmovdqu 1*32(%rsi), %ymm1 + vmovdqu 2*32(%rsi), %ymm2 + vmovdqu 3*32(%rsi), %ymm3 + vmovdqu 4*32(%rsi), %ymm4 + vmovdqu 5*32(%rsi), %ymm5 + vmovdqu 6*32(%rsi), %ymm6 + vmovdqu 7*32(%rsi), %ymm7 + vmovdqa %ymm0, 0*32(%rsp) + vmovdqa %ymm1, 1*32(%rsp) + vmovdqa %ymm2, 2*32(%rsp) + vmovdqa %ymm3, 3*32(%rsp) + vmovdqa %ymm4, 4*32(%rsp) + vmovdqa %ymm5, 5*32(%rsp) + vmovdqa %ymm6, 6*32(%rsp) + vmovdqa %ymm7, 7*32(%rsp) + vmovdqu 8*32(%rsi), %ymm0 + vmovdqu 9*32(%rsi), %ymm1 + vmovdqu 10*32(%rsi), %ymm2 + vmovdqu 11*32(%rsi), %ymm3 + vmovdqu 12*32(%rsi), %ymm4 + vmovdqu 13*32(%rsi), %ymm5 + vmovdqu 14*32(%rsi), %ymm6 + vmovdqu 15*32(%rsi), %ymm7 + vmovdqa %ymm0, 8*32(%rsp) + vmovdqa %ymm1, 9*32(%rsp) + vmovdqa %ymm2, 10*32(%rsp) + vmovdqa %ymm3, 11*32(%rsp) + vmovdqa %ymm4, 12*32(%rsp) + vmovdqa %ymm5, 13*32(%rsp) + vmovdqa %ymm6, 14*32(%rsp) + vmovdqa %ymm7, 15*32(%rsp) + jmp sha256_transform_8way_core_avx2 + + .p2align 6 +sha256_transform_8way_swap: + p2bswap_avx2_rsi_rsp 0 + p2bswap_avx2_rsi_rsp 2 + p2bswap_avx2_rsi_rsp 4 + p2bswap_avx2_rsi_rsp 6 + p2bswap_avx2_rsi_rsp 8 + p2bswap_avx2_rsi_rsp 10 + p2bswap_avx2_rsi_rsp 12 + p2bswap_avx2_rsi_rsp 14 + jmp sha256_transform_8way_core_avx2 + + .p2align 6 +sha256_transform_8way_finish: + vmovdqu 0*32(%rdi), %ymm2 + vmovdqu 1*32(%rdi), %ymm6 + vmovdqu 2*32(%rdi), %ymm11 + vmovdqu 3*32(%rdi), %ymm1 + vpaddd %ymm2, %ymm7, %ymm7 + vpaddd %ymm6, %ymm5, %ymm5 + vpaddd %ymm11, %ymm4, %ymm4 + vpaddd %ymm1, %ymm3, %ymm3 + vmovdqu 4*32(%rdi), %ymm2 + vmovdqu 5*32(%rdi), %ymm6 + vmovdqu 6*32(%rdi), %ymm11 + vmovdqu 7*32(%rdi), %ymm1 + vpaddd %ymm2, %ymm0, %ymm0 + vpaddd %ymm6, %ymm8, %ymm8 + vpaddd %ymm11, %ymm9, %ymm9 + vpaddd %ymm1, %ymm10, %ymm10 + + vmovdqu %ymm7, 0*32(%rdi) + vmovdqu %ymm5, 1*32(%rdi) + vmovdqu %ymm4, 2*32(%rdi) + vmovdqu %ymm3, 3*32(%rdi) + vmovdqu %ymm0, 4*32(%rdi) + vmovdqu %ymm8, 5*32(%rdi) + vmovdqu %ymm9, 6*32(%rdi) + vmovdqu %ymm10, 7*32(%rdi) + + movq %r8, %rsp +#if defined(WIN64) + popq %rsi + vmovdqa 0(%rsp), %xmm6 + vmovdqa 16(%rsp), %xmm7 + vmovdqa 32(%rsp), %xmm8 + vmovdqa 48(%rsp), %xmm9 + vmovdqa 64(%rsp), %xmm10 + vmovdqa 80(%rsp), %xmm11 + addq $96, %rsp + popq %rdi +#endif + ret + +#endif /* USE_AVX2 */ .data @@ -2604,4 +3066,596 @@ sha256_use_4way_done: movl $1, %eax ret + +#if defined(USE_AVX2) + + .text + .p2align 6 + .globl sha256d_ms_8way + .globl _sha256d_ms_8way +sha256d_ms_8way: +_sha256d_ms_8way: +sha256d_ms_8way_avx2: +#if defined(WIN64) + pushq %rdi + subq $80, %rsp + vmovdqa %xmm6, 0(%rsp) + vmovdqa %xmm7, 16(%rsp) + vmovdqa %xmm8, 32(%rsp) + vmovdqa %xmm9, 48(%rsp) + vmovdqa %xmm10, 64(%rsp) + pushq %rsi + movq %rcx, %rdi + movq %rdx, %rsi + movq %r8, %rdx + movq %r9, %rcx +#endif + pushq %rbp + movq %rsp, %rbp + subq $64*32, %rsp + andq $-128, %rsp + + leaq 16*32(%rsi), %rax + +sha256d_ms_8way_avx2_extend_loop1: + vmovdqa 3*32(%rsi), %ymm0 + vmovdqa 2*32(%rax), %ymm3 + vmovdqa 3*32(%rax), %ymm7 + vmovdqa %ymm3, 2*32(%rsp) + vmovdqa %ymm7, 3*32(%rsp) + vpaddd %ymm0, %ymm7, %ymm7 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm0 + vpsrld $4, %ymm0, %ymm1 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm0, %ymm0 + vpxor %ymm2, %ymm0, %ymm0 
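sha256_transform_8way above follows the same convention as the existing sha256_transform_4way: a non-zero third argument requests a byte swap, and the p2bswap_avx2_rsi_rsp macro performs that swap while copying the interleaved input block onto the stack. In scalar terms the two load paths reduce to the sketch below (load_block_8way is an illustrative name; the inline byte swap is equivalent to the swab32 helper in miner.h):

#include <stdint.h>

static inline uint32_t bswap32_ref(uint32_t v)	/* same result as swab32() */
{
	return (v << 24) | ((v << 8) & 0x00ff0000) |
	       ((v >> 8) & 0x0000ff00) | (v >> 24);
}

static void load_block_8way(uint32_t W[8 * 16], const uint32_t *block, int swap)
{
	int i;

	for (i = 0; i < 8 * 16; i++)
		W[i] = swap ? bswap32_ref(block[i]) : block[i];
}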
+ vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, 2*32(%rax) + vmovdqa %ymm7, 3*32(%rax) + + vmovdqa 4*32(%rax), %ymm0 + vmovdqa %ymm0, 4*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vmovdqa %ymm3, 4*32(%rax) + vmovdqa %ymm7, 5*32(%rax) + + vmovdqa 6*32(%rax), %ymm0 + vmovdqa 7*32(%rax), %ymm4 + vmovdqa %ymm0, 6*32(%rsp) + vmovdqa %ymm4, 7*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 6*32(%rax) + vmovdqa %ymm7, 7*32(%rax) + + vmovdqa 8*32(%rax), %ymm0 + vmovdqa 2*32(%rax), %ymm4 + vmovdqa %ymm0, 8*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 8*32(%rax) + vmovdqa %ymm7, 9*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 3*32(%rax), %ymm3, %ymm3 + vpaddd 4*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 10*32(%rax) + vmovdqa %ymm7, 11*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 5*32(%rax), %ymm3, %ymm3 + vpaddd 6*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 12*32(%rax) + vmovdqa %ymm7, 13*32(%rax) + + vmovdqa 14*32(%rax), %ymm0 + vmovdqa 15*32(%rax), %ymm4 + vmovdqa %ymm0, 14*32(%rsp) + vmovdqa %ymm4, 15*32(%rsp) + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + 
vpsrld $10, %ymm7, %ymm7 + vpaddd 7*32(%rax), %ymm0, %ymm0 + vpaddd 8*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 14*32(%rax) + vmovdqa %ymm7, 15*32(%rax) + +sha256d_ms_8way_avx2_extend_loop2: + sha256_avx2_extend_doubleround 16 + sha256_avx2_extend_doubleround 18 + sha256_avx2_extend_doubleround 20 + sha256_avx2_extend_doubleround 22 + sha256_avx2_extend_doubleround 24 + sha256_avx2_extend_doubleround 26 + sha256_avx2_extend_doubleround 28 + sha256_avx2_extend_doubleround 30 + sha256_avx2_extend_doubleround 32 + sha256_avx2_extend_doubleround 34 + sha256_avx2_extend_doubleround 36 + sha256_avx2_extend_doubleround 38 + sha256_avx2_extend_doubleround 40 + sha256_avx2_extend_doubleround 42 + jz sha256d_ms_8way_avx2_extend_coda2 + sha256_avx2_extend_doubleround 44 + sha256_avx2_extend_doubleround 46 + + vmovdqa 0(%rcx), %ymm7 + vmovdqa 32(%rcx), %ymm8 + vmovdqa 64(%rcx), %ymm9 + vmovdqa 96(%rcx), %ymm10 + vmovdqa 128(%rcx), %ymm0 + vmovdqa 160(%rcx), %ymm5 + vmovdqa 192(%rcx), %ymm4 + vmovdqa 224(%rcx), %ymm3 + + movq %rsi, %rax + leaq sha256_8k(%rip), %rcx + jmp sha256d_ms_8way_avx2_main_loop1 + +sha256d_ms_8way_avx2_main_loop2: + sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 +sha256d_ms_8way_avx2_main_loop1: + sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 + sha256_avx2_main_quadround 4 + sha256_avx2_main_quadround 8 + sha256_avx2_main_quadround 12 + sha256_avx2_main_quadround 16 + sha256_avx2_main_quadround 20 + sha256_avx2_main_quadround 24 + sha256_avx2_main_quadround 28 + sha256_avx2_main_quadround 32 + sha256_avx2_main_quadround 36 + sha256_avx2_main_quadround 40 + sha256_avx2_main_quadround 44 + sha256_avx2_main_quadround 48 + sha256_avx2_main_quadround 52 + sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + jz sha256d_ms_8way_avx2_finish + sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 + sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 + sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 + sha256_avx2_main_quadround 60 + + vmovdqa 2*32(%rsp), %ymm1 + vmovdqa 3*32(%rsp), %ymm2 + vmovdqa 4*32(%rsp), %ymm6 + vmovdqa %ymm1, 18*32(%rsi) + vmovdqa %ymm2, 19*32(%rsi) + vmovdqa %ymm6, 20*32(%rsi) + vmovdqa 6*32(%rsp), %ymm1 + vmovdqa 7*32(%rsp), %ymm2 + vmovdqa 8*32(%rsp), %ymm6 + vmovdqa %ymm1, 22*32(%rsi) + vmovdqa %ymm2, 23*32(%rsi) + vmovdqa %ymm6, 24*32(%rsi) + vmovdqa 14*32(%rsp), %ymm1 + vmovdqa 15*32(%rsp), %ymm2 + vmovdqa %ymm1, 30*32(%rsi) + vmovdqa %ymm2, 31*32(%rsi) + + vpaddd 0(%rdx), %ymm7, %ymm7 + vpaddd 32(%rdx), %ymm5, %ymm5 + vpaddd 64(%rdx), %ymm4, %ymm4 + vpaddd 96(%rdx), %ymm3, %ymm3 + vpaddd 128(%rdx), %ymm0, %ymm0 + vpaddd 160(%rdx), %ymm8, %ymm8 + vpaddd 192(%rdx), %ymm9, %ymm9 + vpaddd 224(%rdx), %ymm10, %ymm10 + + vmovdqa %ymm7, 0(%rsp) + vmovdqa %ymm5, 32(%rsp) + vmovdqa 
%ymm4, 64(%rsp) + vmovdqa %ymm3, 96(%rsp) + vmovdqa %ymm0, 128(%rsp) + vmovdqa %ymm8, 160(%rsp) + vmovdqa %ymm9, 192(%rsp) + vmovdqa %ymm10, 224(%rsp) + + vpxor %ymm0, %ymm0, %ymm0 + movq $0x8000000000000100, %rax + vmovd %rax, %xmm1 + vinserti128 $1, %xmm1, %ymm1, %ymm1 + vpshufd $0x55, %ymm1, %ymm2 + vpshufd $0x00, %ymm1, %ymm1 + vmovdqa %ymm2, 8*32(%rsp) + vmovdqa %ymm0, 9*32(%rsp) + vmovdqa %ymm0, 10*32(%rsp) + vmovdqa %ymm0, 11*32(%rsp) + vmovdqa %ymm0, 12*32(%rsp) + vmovdqa %ymm0, 13*32(%rsp) + vmovdqa %ymm0, 14*32(%rsp) + vmovdqa %ymm1, 15*32(%rsp) + + leaq 16*32(%rsp), %rax + cmpq %rax, %rax + + vmovdqa -15*32(%rax), %ymm0 + vmovdqa -14*32(%rax), %ymm4 + vpslld $14, %ymm0, %ymm2 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm0, %ymm8 + vpsrld $3, %ymm4, %ymm4 + vpsrld $7, %ymm0, %ymm1 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpsrld $11, %ymm1, %ymm1 + vpsrld $11, %ymm5, %ymm5 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpslld $11, %ymm2, %ymm2 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm2, %ymm8, %ymm8 + vpxor %ymm6, %ymm4, %ymm4 + vpaddd %ymm0, %ymm4, %ymm4 + vpaddd -16*32(%rax), %ymm8, %ymm3 + vpaddd sha256d_8preext2_17(%rip), %ymm4, %ymm7 + vmovdqa %ymm3, 0*32(%rax) + vmovdqa %ymm7, 1*32(%rax) + + sha256_avx2_extend_doubleround 2 + sha256_avx2_extend_doubleround 4 + + vmovdqa -9*32(%rax), %ymm0 + vpslld $14, %ymm0, %ymm2 + vpsrld $3, %ymm0, %ymm8 + vpsrld $7, %ymm0, %ymm1 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpsrld $11, %ymm1, %ymm1 + vpslld $11, %ymm2, %ymm2 + vpxor %ymm1, %ymm8, %ymm8 + vpxor %ymm2, %ymm8, %ymm8 + vpaddd sha256d_8preext2_23(%rip), %ymm0, %ymm4 + vpaddd -10*32(%rax), %ymm8, %ymm0 + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd -1*32(%rax), %ymm0, %ymm0 + vpaddd 0*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 6*32(%rax) + vmovdqa %ymm7, 7*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd sha256d_8preext2_24(%rip), %ymm3, %ymm3 + vpaddd 1*32(%rax), %ymm3, %ymm3 + vpaddd 2*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 8*32(%rax) + vmovdqa %ymm7, 9*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, 
%ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 3*32(%rax), %ymm3, %ymm3 + vpaddd 4*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 10*32(%rax) + vmovdqa %ymm7, 11*32(%rax) + + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd 5*32(%rax), %ymm3, %ymm3 + vpaddd 6*32(%rax), %ymm7, %ymm7 + vmovdqa %ymm3, 12*32(%rax) + vmovdqa %ymm7, 13*32(%rax) + + vmovdqa sha256d_8preext2_30(%rip), %ymm0 + vmovdqa 0*32(%rax), %ymm4 + vpslld $14, %ymm4, %ymm6 + vpsrld $3, %ymm4, %ymm4 + vpsrld $4, %ymm4, %ymm5 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpsrld $11, %ymm5, %ymm5 + vpslld $11, %ymm6, %ymm6 + vpxor %ymm5, %ymm4, %ymm4 + vpxor %ymm6, %ymm4, %ymm4 + vpaddd -1*32(%rax), %ymm4, %ymm4 + vpslld $13, %ymm3, %ymm2 + vpslld $13, %ymm7, %ymm6 + vpsrld $10, %ymm3, %ymm3 + vpsrld $10, %ymm7, %ymm7 + vpaddd 7*32(%rax), %ymm0, %ymm0 + vpaddd 8*32(%rax), %ymm4, %ymm4 + vpsrld $7, %ymm3, %ymm1 + vpsrld $7, %ymm7, %ymm5 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpsrld $2, %ymm1, %ymm1 + vpsrld $2, %ymm5, %ymm5 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpslld $2, %ymm2, %ymm2 + vpslld $2, %ymm6, %ymm6 + vpxor %ymm1, %ymm3, %ymm3 + vpxor %ymm5, %ymm7, %ymm7 + vpxor %ymm2, %ymm3, %ymm3 + vpxor %ymm6, %ymm7, %ymm7 + vpaddd %ymm0, %ymm3, %ymm3 + vpaddd %ymm4, %ymm7, %ymm7 + vmovdqa %ymm3, 14*32(%rax) + vmovdqa %ymm7, 15*32(%rax) + + jmp sha256d_ms_8way_avx2_extend_loop2 + +sha256d_ms_8way_avx2_extend_coda2: + sha256_avx2_extend_round 44 + + vmovdqa sha256_8h+0(%rip), %ymm7 + vmovdqa sha256_8h+32(%rip), %ymm5 + vmovdqa sha256_8h+64(%rip), %ymm4 + vmovdqa sha256_8h+96(%rip), %ymm3 + vmovdqa sha256_8h+128(%rip), %ymm0 + vmovdqa sha256_8h+160(%rip), %ymm8 + vmovdqa sha256_8h+192(%rip), %ymm9 + vmovdqa sha256_8h+224(%rip), %ymm10 + + movq %rsp, %rax + leaq sha256_8k(%rip), %rcx + jmp sha256d_ms_8way_avx2_main_loop2 + +.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4 + vpaddd 32*\i(%rax), \r0, %ymm6 + vpaddd 32*\i(%rcx), %ymm6, %ymm6 + vpandn \r1, \r3, %ymm1 + vpand \r3, \r2, %ymm2 + vpxor %ymm2, %ymm1, %ymm1 + vpaddd %ymm1, %ymm6, %ymm6 + vpslld $7, \r3, %ymm1 + vpsrld $6, \r3, \r0 + vpsrld $5, \r0, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $14, %ymm1, %ymm1 + vpsrld $14, %ymm2, %ymm2 + vpxor %ymm1, \r0, \r0 + vpxor %ymm2, \r0, \r0 + vpslld $5, %ymm1, %ymm1 + vpxor %ymm1, \r0, \r0 + vpaddd \r0, %ymm6, %ymm6 + vpaddd %ymm6, \r4, \r0 +.endm + +sha256d_ms_8way_avx2_finish: + sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4 + sha256_avx2_main_round_red 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5 + sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7 + sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3 + + vpaddd sha256_8h+224(%rip), %ymm10, %ymm10 + vmovdqa %ymm10, 224(%rdi) + + movq %rbp, %rsp + popq %rbp +#if defined(WIN64) + popq %rsi + vmovdqa 0(%rsp), %xmm6 + vmovdqa 16(%rsp), %xmm7 + vmovdqa 32(%rsp), %xmm8 + vmovdqa 48(%rsp), %xmm9 + vmovdqa 64(%rsp), %xmm10 + addq $80, %rsp + popq %rdi +#endif + ret + + + .text + .p2align 6 + .globl sha256_use_8way + .globl _sha256_use_8way 
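+/* Editorial comment: sha256_use_8way reports whether 8-way SHA-256 can run.
+ * It checks CPUID.1:ECX for AVX and OSXSAVE, CPUID.7:EBX bit 5 for AVX2,
+ * and XGETBV (XCR0) for OS-enabled XMM/YMM state; returns 1 if all pass,
+ * 0 otherwise. */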
+sha256_use_8way: +_sha256_use_8way: + pushq %rbx + + /* Check for AVX and OSXSAVE support */ + movl $1, %eax + cpuid + andl $0x18000000, %ecx + cmpl $0x18000000, %ecx + jne sha256_use_8way_no + /* Check for AVX2 support */ + movl $7, %eax + xorl %ecx, %ecx + cpuid + andl $0x00000020, %ebx + cmpl $0x00000020, %ebx + jne sha256_use_8way_no + /* Check for XMM and YMM state support */ + xorl %ecx, %ecx + xgetbv + andl $0x00000006, %eax + cmpl $0x00000006, %eax + jne sha256_use_8way_no + +sha256_use_8way_yes: + movl $1, %eax + jmp sha256_use_8way_done + +sha256_use_8way_no: + xorl %eax, %eax + +sha256_use_8way_done: + popq %rbx + ret + +#endif /* USE_AVX2 */ + #endif diff --git a/sha2.c b/sha2.c index 76b6ef348..817473586 100644 --- a/sha2.c +++ b/sha2.c @@ -522,6 +522,65 @@ static inline int scanhash_sha256d_4way(int thr_id, uint32_t *pdata, #endif /* HAVE_SHA256_4WAY */ +#ifdef HAVE_SHA256_8WAY + +void sha256d_ms_8way(uint32_t *hash, uint32_t *data, + const uint32_t *midstate, const uint32_t *prehash); + +static inline int scanhash_sha256d_8way(int thr_id, uint32_t *pdata, + const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t data[8 * 64] __attribute__((aligned(128))); + uint32_t hash[8 * 8] __attribute__((aligned(32))); + uint32_t midstate[8 * 8] __attribute__((aligned(32))); + uint32_t prehash[8 * 8] __attribute__((aligned(32))); + uint32_t n = pdata[19] - 1; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + int i, j; + + memcpy(data, pdata + 16, 64); + sha256d_preextend(data); + for (i = 31; i >= 0; i--) + for (j = 0; j < 8; j++) + data[i * 8 + j] = data[i]; + + sha256_init(midstate); + sha256_transform(midstate, pdata, 0); + memcpy(prehash, midstate, 32); + sha256d_prehash(prehash, pdata + 16); + for (i = 7; i >= 0; i--) { + for (j = 0; j < 8; j++) { + midstate[i * 8 + j] = midstate[i]; + prehash[i * 8 + j] = prehash[i]; + } + } + + do { + for (i = 0; i < 8; i++) + data[8 * 3 + i] = ++n; + + sha256d_ms_8way(hash, data, midstate, prehash); + + for (i = 0; i < 8; i++) { + if (swab32(hash[8 * 7 + i]) <= Htarg) { + pdata[19] = data[8 * 3 + i]; + sha256d_80_swap(hash, pdata); + if (fulltest(hash, ptarget)) { + *hashes_done = n - first_nonce + 1; + return 1; + } + } + } + } while (n < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = n - first_nonce + 1; + pdata[19] = n; + return 0; +} + +#endif /* HAVE_SHA256_8WAY */ + int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { @@ -533,6 +592,11 @@ int scanhash_sha256d(int thr_id, uint32_t *pdata, const uint32_t *ptarget, const uint32_t first_nonce = pdata[19]; const uint32_t Htarg = ptarget[7]; +#ifdef HAVE_SHA256_8WAY + if (sha256_use_8way()) + return scanhash_sha256d_8way(thr_id, pdata, ptarget, + max_nonce, hashes_done); +#endif #ifdef HAVE_SHA256_4WAY if (sha256_use_4way()) return scanhash_sha256d_4way(thr_id, pdata, ptarget,