diff --git a/.gitignore b/.gitignore
index 904341b936..daa3432195 100644
--- a/.gitignore
+++ b/.gitignore
@@ -68,3 +68,4 @@
 compat/curl-for-windows/
 *.cudafe1.c
 *.cudafe2.c
+*.bak
diff --git a/Algo256/blake2s.cu b/Algo256/blake2s.cu
index 0b4bbe0428..29d28f4916 100644
--- a/Algo256/blake2s.cu
+++ b/Algo256/blake2s.cu
@@ -34,18 +34,6 @@ uint32_t ROL16(const uint32_t a) {
 #define ROL16(u) (u << 16)
 #endif
 
-__device__ __forceinline__
-uint32_t xor3x(uint32_t a, uint32_t b, uint32_t c)
-{
-	uint32_t result;
-#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
-	asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA
-#else
-	result = a^b^c;
-#endif
-	return result;
-}
-
 static const uint32_t blake2s_IV[8] = {
 	0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
 	0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
@@ -562,4 +550,3 @@ extern "C" void free_blake2s(int thr_id)
 
 	cudaDeviceSynchronize();
 }
-
diff --git a/Algo256/bmw512.cu b/Algo256/bmw512.cu
new file mode 100644
index 0000000000..9567369432
--- /dev/null
+++ b/Algo256/bmw512.cu
@@ -0,0 +1,130 @@
+/**
+ * BMW512
+ */
+extern "C" {
+#include "sph/sph_bmw.h"
+}
+#include "miner.h"
+#include "cuda_helper.h"
+#include <memory.h> // header name lost in extraction; restored for memcpy()
+
+#define NBN 2
+
+static uint32_t *d_resNonce[MAX_GPUS];
+static uint32_t *h_resNonce[MAX_GPUS];
+
+extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_bmw512_cpu_setBlock_80(void *pdata);
+void quark_bmw512_cpu_hash_80_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_resNonce, const uint64_t target);
+
+extern "C" void bmw512_hash(void *state, const void *input)
+{
+	sph_bmw512_context ctx_bmw;
+	unsigned char hash[64];
+
+	sph_bmw512_init(&ctx_bmw);
+	sph_bmw512(&ctx_bmw, input, 80);
+	sph_bmw512_close(&ctx_bmw, hash);
+
+	memcpy(state, hash, 32);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+
+extern "C" int scanhash_bmw512(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	uint32_t endiandata[20];
+
+	if (opt_benchmark)
+		ptarget[7] = 0x00ff;
+
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	uint32_t throughput = cuda_default_throughput(thr_id, 1 << 28);
+	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)));
+		h_resNonce[thr_id] = (uint32_t*) malloc(NBN * sizeof(uint32_t));
+		if(h_resNonce[thr_id] == NULL){
+			gpulog(LOG_ERR,thr_id,"Host memory allocation failed");
+			exit(EXIT_FAILURE);
+		}
+		quark_bmw512_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
+
+		init[thr_id] = true;
+	}
+
+	quark_bmw512_cpu_setBlock_80((void*)endiandata);
+	cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
+	*hashes_done = 0;
+
+	do {
+		quark_bmw512_cpu_hash_80_final(thr_id, throughput, pdata[19], d_resNonce[thr_id], *(uint64_t*)&ptarget[6]);
+		cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+		*hashes_done += throughput;
+
+		if (h_resNonce[thr_id][0] != UINT32_MAX) {
+			const uint32_t Htarg = ptarget[7];
+			const uint32_t startNounce = pdata[19];
+			uint32_t _ALIGN(64) vhash[8];
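+			// CPU-side validation: re-encode the winning nonce into the
+			// header and re-hash with bmw512_hash before submitting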
+			be32enc(&endiandata[19], startNounce + h_resNonce[thr_id][0]);
+			bmw512_hash(vhash, endiandata);
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[0] = startNounce + h_resNonce[thr_id][0];
+				work_set_target_ratio(work, vhash);
+				if (h_resNonce[thr_id][1] != UINT32_MAX) {
+					uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[thr_id][1];
+					be32enc(&endiandata[19], secNonce);
+					bmw512_hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpu_increment_reject(thr_id);
+				if (!opt_quiet)
+					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				cudaMemset(d_resNonce[thr_id], 0xff, 2*sizeof(uint32_t));
+				pdata[19] = startNounce + h_resNonce[thr_id][0] + 1;
+				continue;
+			}
+		}
+
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
+	return 0;
+}
+
+extern "C" void free_bmw512(int thr_id)
+{
+	if (!init[thr_id]) return;
+
+	cudaSetDevice(device_map[thr_id]);
+
+	free(h_resNonce[thr_id]);
+	cudaFree(d_resNonce[thr_id]);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
\ No newline at end of file
diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu
index 418ca07ec5..ce9ff99ea5 100644
--- a/Algo256/cuda_blake256.cu
+++ b/Algo256/cuda_blake256.cu
@@ -20,12 +20,12 @@ __device__ uint32_t __byte_perm(uint32_t a, uint32_t b, uint32_t c);
 #define UINT2(x,y) make_uint2(x,y)
 
-__device__ __inline__ uint2 ROR8(const uint2 a) {
+/*__device__ __inline__ uint2 ROR8(const uint2 a) {
 	uint2 result;
 	result.x = __byte_perm(a.y, a.x, 0x0765);
 	result.y = __byte_perm(a.x, a.y, 0x0765);
 	return result;
-}
+}*/
 
 static __device__ uint64_t cuda_swab32ll(uint64_t x) {
 	return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
 }
diff --git a/Algo256/cuda_keccak256.cu b/Algo256/cuda_keccak256.cu
index 7e87bb2860..9ae67c6046 100644
--- a/Algo256/cuda_keccak256.cu
+++ b/Algo256/cuda_keccak256.cu
@@ -32,18 +32,6 @@ __constant__ uint2 keccak_round_constants[24] = {
 	{ 0x80008081, 0x80000000 }, { 0x00008080, 0x80000000 },
 	{ 0x80000001, 0x00000000 }, { 0x80008008, 0x80000000 }
 };
 
-__device__ __forceinline__
-uint2 xor3x(const uint2 a,const uint2 b,const uint2 c) {
-	uint2 result;
-#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
-	asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA
-	asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA
-#else
-	result = a^b^c;
-#endif
-	return result;
-}
-
 __device__ __forceinline__
 uint2 chi(const uint2 a,const uint2 b,const uint2 c) { // keccak chi
 	uint2 result;
diff --git a/Algo256/cuda_keccak256_sha3d.cu b/Algo256/cuda_keccak256_sha3d.cu
new file mode 100644
index 0000000000..debe4577a3
--- /dev/null
+++ b/Algo256/cuda_keccak256_sha3d.cu
@@ -0,0 +1,310 @@
+#include "miner.h"
+
+extern "C" {
+#include <stdint.h>	// header names lost in extraction; restored to match cuda_keccak256.cu
+#include <memory.h>
+}
+
+#include "cuda_helper.h"
+
+static const uint64_t host_keccak_round_constants[24] = {
+	0x0000000000000001ull, 0x0000000000008082ull,
+	0x800000000000808aull, 0x8000000080008000ull,
+	0x000000000000808bull, 0x0000000080000001ull,
+	0x8000000080008081ull, 0x8000000000008009ull,
+	0x000000000000008aull,
0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +static uint32_t *d_KNonce[MAX_GPUS]; + +__constant__ uint32_t pTarget[8]; +__constant__ uint64_t keccak_round_constants[24]; +__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?) + +#if __CUDA_ARCH__ >= 350 +__device__ __forceinline__ +static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants) +{ + size_t i; + uint2 t[5], u[5], v, w; + + #pragma unroll + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROL2(t[1], 1); + u[1] = t[0] ^ ROL2(t[2], 1); + u[2] = t[1] ^ ROL2(t[3], 1); + u[3] = t[2] ^ ROL2(t[4], 1); + u[4] = t[3] ^ ROL2(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[1]; + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL2(s[19], 8); + s[19] = ROL2(s[23], 56); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; + v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= vectorize(keccak_round_constants[i]); + } +} +#else + +__device__ __forceinline__ +static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants) +{ + size_t i; + uint64_t t[5], u[5], v, w; + + /* absorb input */ + + for (i = 0; i < 24; i++) { + /* theta: c = a[0,i] ^ a[1,i] ^ .. 
a[4,i] */ + t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; + t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; + t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; + t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; + t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; + + /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ + u[0] = t[4] ^ ROTL64(t[1], 1); + u[1] = t[0] ^ ROTL64(t[2], 1); + u[2] = t[1] ^ ROTL64(t[3], 1); + u[3] = t[2] ^ ROTL64(t[4], 1); + u[4] = t[3] ^ ROTL64(t[0], 1); + + /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ + s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; + s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; + s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; + s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; + s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[ 1]; + s[ 1] = ROTL64(s[ 6], 44); + s[ 6] = ROTL64(s[ 9], 20); + s[ 9] = ROTL64(s[22], 61); + s[22] = ROTL64(s[14], 39); + s[14] = ROTL64(s[20], 18); + s[20] = ROTL64(s[ 2], 62); + s[ 2] = ROTL64(s[12], 43); + s[12] = ROTL64(s[13], 25); + s[13] = ROTL64(s[19], 8); + s[19] = ROTL64(s[23], 56); + s[23] = ROTL64(s[15], 41); + s[15] = ROTL64(s[ 4], 27); + s[ 4] = ROTL64(s[24], 14); + s[24] = ROTL64(s[21], 2); + s[21] = ROTL64(s[ 8], 55); + s[ 8] = ROTL64(s[16], 45); + s[16] = ROTL64(s[ 5], 36); + s[ 5] = ROTL64(s[ 3], 28); + s[ 3] = ROTL64(s[18], 21); + s[18] = ROTL64(s[17], 15); + s[17] = ROTL64(s[11], 10); + s[11] = ROTL64(s[ 7], 6); + s[ 7] = ROTL64(s[10], 3); + s[10] = ROTL64( v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; + v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; + v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; + v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; + v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } +} +#endif + +__global__ __launch_bounds__(128,5) +void keccak256_sha3d_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t nounce = startNounce + thread; + +#if __CUDA_ARCH__ >= 350 + uint2 keccak_gpu_state[25]; + #pragma unroll 25 + for (int i=0; i<25; i++) { + if (i<9) keccak_gpu_state[i] = vectorize(c_PaddedMessage80[i]); + else keccak_gpu_state[i] = make_uint2(0, 0); + } + + keccak_gpu_state[9]= vectorize(c_PaddedMessage80[9]); + keccak_gpu_state[9].y = cuda_swab32(nounce); + keccak_gpu_state[10] = make_uint2(0x6, 0); + keccak_gpu_state[16] = make_uint2(0, 0x80000000); + + keccak_blockv35(keccak_gpu_state,keccak_round_constants); + // Output is 256 bits = 32 bytes. 
+		// So, only keep the first 32 bytes
+		for (int i = 4; i<25; i++) {
+			keccak_gpu_state[i] = make_uint2(0, 0);
+		}
+		keccak_gpu_state[4] = make_uint2(0x6, 0);
+		keccak_gpu_state[16] = make_uint2(0, 0x80000000);
+		keccak_blockv35(keccak_gpu_state,keccak_round_constants);
+		if (devectorize(keccak_gpu_state[3]) <= ((uint64_t*)pTarget)[3]) { resNounce[0] = nounce; }
+#else
+		uint64_t keccak_gpu_state[25];
+		#pragma unroll 25
+		for (int i=0; i<25; i++) {
+			if (i<9) keccak_gpu_state[i] = c_PaddedMessage80[i];
+			else keccak_gpu_state[i] = 0;
+		}
+		keccak_gpu_state[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
+		// note: this pre-SM3.5 fallback keeps the legacy keccak256 0x01 padding
+		// and a single permutation pass, unlike the double-hash path above
+		keccak_gpu_state[10] = 0x0000000000000001;
+		keccak_gpu_state[16] = 0x8000000000000000;
+
+		keccak_blockv30(keccak_gpu_state, keccak_round_constants);
+		if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) { resNounce[0] = nounce; }
+#endif
+	}
+}
+
+__host__
+void keccak256_sha3d_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, int order)
+{
+	cudaMemset(d_KNonce[thr_id], 0xff, 2*sizeof(uint32_t));
+	const uint32_t threadsperblock = 128;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	size_t shared_size = 0;
+
+	// launch configuration was eaten as angle-bracket markup; restored from
+	// the grid/block/shared_size variables defined just above
+	keccak256_sha3d_gpu_hash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_KNonce[thr_id]);
+
+	cudaMemcpy(resNonces, d_KNonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaThreadSynchronize();
+}
+
+#if 0
+__global__ __launch_bounds__(256,3)
+void keccak256_sha3d_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+#if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */
+		uint2 keccak_gpu_state[25];
+		#pragma unroll 25
+		for (int i = 0; i<25; i++) {
+			if (i<4) keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
+			else keccak_gpu_state[i] = make_uint2(0, 0);
+		}
+		keccak_gpu_state[4] = make_uint2(1, 0);
+		keccak_gpu_state[16] = make_uint2(0, 0x80000000);
+		keccak_blockv35(keccak_gpu_state, keccak_round_constants);
+
+		#pragma unroll 4
+		for (int i=0; i<4; i++)
+			outputHash[i*threads+thread] = devectorize(keccak_gpu_state[i]);
+#else
+		uint64_t keccak_gpu_state[25];
+		#pragma unroll 25
+		for (int i = 0; i<25; i++) {
+			if (i<4)
+				keccak_gpu_state[i] = outputHash[i*threads+thread];
+			else
+				keccak_gpu_state[i] = 0;
+		}
+		keccak_gpu_state[4] = 0x0000000000000001;
+		keccak_gpu_state[16] = 0x8000000000000000;
+
+		keccak_blockv30(keccak_gpu_state, keccak_round_constants);
+		#pragma unroll 4
+		for (int i = 0; i<4; i++)
+			outputHash[i*threads + thread] = keccak_gpu_state[i];
+#endif
+	}
+}
+
+__host__
+void keccak256_sha3d_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
+{
+	const uint32_t threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	keccak256_sha3d_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
+	MyStreamSynchronize(NULL, order, thr_id);
+}
+#endif
+
+__host__
+void keccak256_sha3d_setBlock_80(void *pdata,const void *pTargetIn)
+{
+	unsigned char PaddedMessage[80];
+	memcpy(PaddedMessage, pdata, 80);
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
+}
+
+__host__
+void keccak256_sha3d_init(int thr_id, uint32_t threads)
+{
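+	// one-time per-GPU setup: upload the 24 Keccak round constants to
+	// constant memory and allocate room for two candidate nonces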
CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants, + sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], 2*sizeof(uint32_t))); +} + +__host__ +void keccak256_sha3d_free(int thr_id) +{ + cudaFree(d_KNonce[thr_id]); +} diff --git a/Algo256/sha3d.cu b/Algo256/sha3d.cu new file mode 100644 index 0000000000..ee7e022d1b --- /dev/null +++ b/Algo256/sha3d.cu @@ -0,0 +1,178 @@ +/* + * SHA3D + * + */ + +extern "C" +{ +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_sha3d.h" + +#include "miner.h" +} + +#include "cuda_helper.h" + +// SM5+ cuda +extern void keccak256_cpu_init(int thr_id); +extern void keccak256_cpu_free(int thr_id); +extern void keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces, const uint2 highTarget); +extern void keccak256_setBlock_80(uint64_t *endiandata); +extern void keccak256_setOutput(int thr_id); + +// compat +extern void keccak256_sha3d_init(int thr_id, uint32_t threads); +extern void keccak256_sha3d_free(int thr_id); +extern void keccak256_sha3d_setBlock_80(void *pdata, const void *ptarget); +extern void keccak256_sha3d_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces, int order); + +// CPU Hash +extern "C" void sha3d_hash(void *state, const void *input) +{ + uint32_t _ALIGN(64) buffer[16], hash[16]; + sph_keccak_context ctx_keccak; + + sph_sha3d256_init(&ctx_keccak); + sph_sha3d256 (&ctx_keccak, input, 80); + sph_sha3d256_close(&ctx_keccak, (void*) buffer); + sph_sha3d256_init(&ctx_keccak); + sph_sha3d256 (&ctx_keccak, buffer, 32); + sph_sha3d256_close(&ctx_keccak, (void*) hash); + + memcpy(state, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +extern "C" int scanhash_sha3d(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + uint32_t throughput; + uint32_t intensity = 23; + if(!use_compat_kernels[thr_id]) { + if (strstr(device_name[dev_id], "GTX 1070")) intensity = 25; + if (strstr(device_name[dev_id], "GTX 1080")) intensity = 26; + } + throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = true; + + if(!use_compat_kernels[thr_id]) { + keccak256_cpu_init(thr_id); + } else { + // really useful ? 
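+		// use_compat_kernels is forced to true above, so this branch is
+		// always taken and the generic sha3d kernels run on every arch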
+ keccak256_sha3d_init(thr_id, throughput); + } + + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + init[thr_id] = true; + } + + for (int k=0; k < 19; k++) { + be32enc(&endiandata[k], pdata[k]); + } + + const uint2 highTarget = make_uint2(ptarget[6], ptarget[7]); + if(use_compat_kernels[thr_id]) + keccak256_sha3d_setBlock_80((void*)endiandata, ptarget); + else { + keccak256_setBlock_80((uint64_t*)endiandata); + keccak256_setOutput(thr_id); + } + + do { + int order = 0; + + *hashes_done = pdata[19] - first_nonce + throughput; + + if(use_compat_kernels[thr_id]) + keccak256_sha3d_hash_80(thr_id, throughput, pdata[19], work->nonces, order++); + else { + keccak256_cpu_hash_80(thr_id, throughput, pdata[19], work->nonces, highTarget); + } + + if (work->nonces[0] != UINT32_MAX && bench_algo < 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + + be32enc(&endiandata[19], work->nonces[0]); + sha3d_hash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (!use_compat_kernels[thr_id] && work->nonces[1] != UINT32_MAX) { + be32enc(&endiandata[19], work->nonces[1]); + sha3d_hash(vhash, endiandata); + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work->valid_nonces++; + bn_set_target_ratio(work, vhash, 1); + } + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + //keccak256_setOutput(thr_id); + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_sha3d(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + if(!use_compat_kernels[thr_id]) + keccak256_cpu_free(thr_id); + else { + keccak256_sha3d_free(thr_id); + } + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/Makefile.am b/Makefile.am index 8ba3aa7590..4b3edc8d33 100644 --- a/Makefile.am +++ b/Makefile.am @@ -41,14 +41,14 @@ ccminer_SOURCES = elist.h miner.h compat.h \ myriadgroestl.cpp cuda_myriadgroestl.cu \ lyra2/Lyra2.c lyra2/Sponge.c \ lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \ - lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ + lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \ lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \ Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \ Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu \ - Algo256/cuda_keccak256_sm3.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ + Algo256/cuda_keccak256_sm3.cu Algo256/cuda_keccak256_sha3d.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \ Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \ Algo256/blake2s.cu sph/blake2s.c \ - Algo256/bmw.cu Algo256/cuda_bmw.cu \ + Algo256/bmw.cu Algo256/bmw512.cu Algo256/cuda_bmw.cu \ crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \ crypto/cryptolight.cu crypto/cryptolight-core.cu crypto/cryptolight-cpu.cpp \ crypto/cryptonight.cu crypto/cryptonight-core.cu crypto/cryptonight-extra.cu \ @@ -57,7 +57,7 @@ ccminer_SOURCES = 
elist.h miner.h compat.h \ JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \ quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \ quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \ - quark/nist5.cu \ + quark/animecoin.cu quark/nist5.cu \ quark/quarkcoin.cu quark/cuda_quark_compactionTest.cu \ neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \ pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \ @@ -66,24 +66,28 @@ ccminer_SOURCES = elist.h miner.h compat.h \ sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \ sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ - sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \ + sph/hamsi.c sph/hamsi_helper.c sph/streebog.c sph/tiger.c \ sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \ - sph/ripemd.c sph/sph_sha2.c \ + sph/ripemd.c sph/sph_sha2.c sph/sha3d.c \ polytimos.cu \ lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ tribus/tribus.cu tribus/cuda_echo512_final.cu \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ - x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ + x11/cuda_x11_shavite512.cu x11/cuda_x11_shavite512_sp.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \ + x16/x16r.cu x16/x16rv2.cu x16/x16s.cu x16/cuda_x16_echo512.cu x16/cuda_x16_fugue512.cu \ + x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \ + x16/cuda_x16_echo512_64.cu \ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ x11/phi.cu x11/cuda_streebog_maxwell.cu \ - x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu x11/0x10.cu - + x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu x11/0x10.cu \ + x21/x21s.cu x21/cuda_sha256.cu x21/cuda_tiger192.cu + # scrypt ccminer_SOURCES += scrypt.cpp scrypt-jane.cpp \ scrypt/blake.cu scrypt/keccak.cu scrypt/sha256.cu \ diff --git a/README.txt b/README.txt index 61a1eaf36f..ebe9df092c 100644 --- a/README.txt +++ b/README.txt @@ -599,4 +599,4 @@ With kind regards, Christian Buchner ( Christian.Buchner@gmail.com ) Christian H. 
( Chris84 ) - Tanguy Pruvot ( tpruvot@github ) + Tanguy Pruvot ( tpruvot@github ) \ No newline at end of file diff --git a/algos.h b/algos.h index 2e9cbfe9ff..2b7619fd23 100644 --- a/algos.h +++ b/algos.h @@ -5,10 +5,12 @@ #include "compat.h" enum sha_algos { - ALGO_BLAKECOIN = 0, + ALGO_ANIME = 0, + ALGO_BLAKECOIN, ALGO_BLAKE, ALGO_BLAKE2S, ALGO_BMW, + ALGO_BMW512, ALGO_BASTION, ALGO_C11, ALGO_CRYPTOLIGHT, @@ -19,6 +21,7 @@ enum sha_algos { ALGO_EQUIHASH, ALGO_FRESH, ALGO_FUGUE256, /* Fugue256 */ + ALGO_GOSTCOIN, ALGO_GROESTL, ALGO_HEAVY, /* Heavycoin hash */ ALGO_HMQ1725, @@ -37,6 +40,7 @@ enum sha_algos { ALGO_MJOLLNIR, /* Hefty hash */ ALGO_MYR_GR, ALGO_NEOSCRYPT, + ALGO_XAYA, ALGO_NIST5, ALGO_PENTABLAKE, ALGO_PHI, @@ -47,6 +51,7 @@ enum sha_algos { ALGO_SCRYPT_JANE, ALGO_SHA256D, ALGO_SHA256T, + ALGO_SHA3D, ALGO_SIA, ALGO_SIB, ALGO_SKEIN, @@ -62,7 +67,12 @@ enum sha_algos { ALGO_X13, ALGO_X14, ALGO_X15, + ALGO_X16R, + ALGO_X16RT, + ALGO_X16RV2, + ALGO_X16S, ALGO_X17, + ALGO_X21S, ALGO_VANILLA, ALGO_VELTOR, ALGO_WHIRLCOIN, @@ -77,10 +87,12 @@ enum sha_algos { extern volatile enum sha_algos opt_algo; static const char *algo_names[] = { + "anime", "blakecoin", "blake", "blake2s", "bmw", + "bmw512", "bastion", "c11", "cryptolight", @@ -91,6 +103,7 @@ static const char *algo_names[] = { "equihash", "fresh", "fugue256", + "gostcoin", "groestl", "heavy", "hmq1725", @@ -109,6 +122,7 @@ static const char *algo_names[] = { "mjollnir", "myr-gr", "neoscrypt", + "xaya", "nist5", "penta", "phi", @@ -119,6 +133,7 @@ static const char *algo_names[] = { "scrypt-jane", "sha256d", "sha256t", + "sha3d", "sia", "sib", "skein", @@ -134,7 +149,12 @@ static const char *algo_names[] = { "x13", "x14", "x15", + "x16r", + "x16rt", + "x16rv2", + "x16s", "x17", + "x21s", "vanilla", "veltor", "whirlcoin", @@ -187,6 +207,8 @@ static inline int algo_to_int(char* arg) i = ALGO_SHA256D; else if (!strcasecmp("sha256", arg)) i = ALGO_SHA256D; + else if (!strcasecmp("gostcoin", arg)) + i = ALGO_GOSTCOIN; else if (!strcasecmp("thorsriddle", arg)) i = ALGO_VELTOR; else if (!strcasecmp("timetravel10", arg)) @@ -195,6 +217,8 @@ static inline int algo_to_int(char* arg) i = ALGO_WHIRLPOOL; else if (!strcasecmp("ziftr", arg)) i = ALGO_ZR5; + else if (!strcasecmp("neoscrypt-xaya", arg)) + i = ALGO_XAYA; else i = -1; } diff --git a/allium.cu b/allium.cu index ef078ac6cc..65dbbe3a77 100644 --- a/allium.cu +++ b/allium.cu @@ -1,9 +1,9 @@ extern "C" { #include "sph/sph_blake.h" -#include "sph/sph_groestl.h" -#include "sph/sph_skein.h" #include "sph/sph_keccak.h" #include "sph/sph_cubehash.h" +#include "sph/sph_skein.h" +#include "sph/sph_groestl.h" #include "lyra2/Lyra2.h" } @@ -12,7 +12,6 @@ extern "C" { static uint64_t* d_hash[MAX_GPUS]; static uint64_t* d_matrix[MAX_GPUS]; -static uint64_t* g_pad[MAX_GPUS]; extern void blake256_cpu_init(int thr_id, uint32_t threads); extern void blake256_cpu_setBlock_80(uint32_t *pdata); @@ -25,12 +24,13 @@ extern void blake256_cpu_setBlock_80(uint32_t *pdata); extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); + +extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); + extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); -extern void 
lyra2_cpu_init_high_end(int thr_id, uint32_t threads, uint64_t *g_pad); -extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti, uint32_t high_end); -extern void lyra2_cpu_hash_32_fancyIX(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, uint64_t *g_pad, bool gtx750ti, uint32_t high_end); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti); extern void groestl256_cpu_init(int thr_id, uint32_t threads); extern void groestl256_cpu_free(int thr_id); @@ -38,7 +38,6 @@ extern void groestl256_setTarget(const void *ptarget); extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order); extern uint32_t groestl256_getSecNonce(int thr_id, int num); -extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); extern "C" void allium_hash(void *state, const void *input) { @@ -46,9 +45,9 @@ extern "C" void allium_hash(void *state, const void *input) sph_blake256_context ctx_blake; sph_keccak256_context ctx_keccak; + sph_cubehash256_context ctx_cube; sph_skein256_context ctx_skein; sph_groestl256_context ctx_groestl; - sph_cubehash256_context ctx_cube; sph_blake256_set_rounds(14); @@ -89,15 +88,18 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce const uint32_t first_nonce = pdata[19]; if (opt_benchmark) - ptarget[7] = 0x0400; + ptarget[7] = 0x00ff; static __thread bool gtx750ti; - static __thread uint32_t high_end; if (!init[thr_id]) { int dev_id = device_map[thr_id]; cudaSetDevice(dev_id); - CUDA_LOG_ERROR(); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; if (device_sm[device_map[thr_id]] == 500) intensity = 15; @@ -110,17 +112,6 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce if (strstr(props.name, "750 Ti")) gtx750ti = true; else gtx750ti = false; - if (strstr(props.name, "1080") || - strstr(props.name, "1070")) high_end = 1; - if (strstr(props.name, "3090") || - strstr(props.name, "3080") || - strstr(props.name, "3070") || - strstr(props.name, "3060") || - strstr(props.name, "A4000") || - strstr(props.name, "A5000") || - strstr(props.name, "A6000")) high_end = 2; - else high_end = 0; - gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); blake256_cpu_init(thr_id, throughput); @@ -134,11 +125,6 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce size_t matrix_sz = device_sm[dev_id] > 500 ? 
sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); - if (high_end == 1) { - size_t pad_sz = sizeof(uint64_t) * 8 * 8 * 3 * 4; - CUDA_SAFE_CALL(cudaMalloc(&g_pad[thr_id], pad_sz * throughput)); - lyra2_cpu_init_high_end(thr_id, throughput, g_pad[thr_id]); - } } CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); @@ -147,7 +133,7 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce } uint32_t _ALIGN(128) endiandata[20]; - for (int k = 0; k < 20; k++) + for (int k=0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); blake256_cpu_setBlock_80(pdata); @@ -156,14 +142,12 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce do { int order = 0; + //blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - lyra2_cpu_hash_32_fancyIX(thr_id, throughput, pdata[19], d_hash[thr_id], g_pad[thr_id], gtx750ti, high_end); - + lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti); cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - - lyra2_cpu_hash_32_fancyIX(thr_id, throughput, pdata[19], d_hash[thr_id], g_pad[thr_id], gtx750ti, high_end); - + lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti); skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); *hashes_done = pdata[19] - first_nonce + throughput; @@ -187,8 +171,7 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce bn_set_target_ratio(work, vhash, 1); work->valid_nonces++; pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; - } - else { + } else { pdata[19] = work->nonces[0] + 1; // cursor } return work->valid_nonces; @@ -196,7 +179,7 @@ extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce else if (vhash[7] > Htarg) { gpu_increment_reject(thr_id); if (!opt_quiet) - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); pdata[19] = work->nonces[0] + 1; continue; } @@ -224,9 +207,6 @@ extern "C" void free_allium(int thr_id) cudaFree(d_hash[thr_id]); cudaFree(d_matrix[thr_id]); - if (g_pad[thr_id] != NULL) { - cudaFree(g_pad[thr_id]); - } //keccak256_sm3_free(thr_id); groestl256_cpu_free(thr_id); diff --git a/bench.cpp b/bench.cpp index 23f547e2cc..573d759d89 100644 --- a/bench.cpp +++ b/bench.cpp @@ -56,6 +56,7 @@ void algo_free_all(int thr_id) free_blake256(thr_id); free_blake2s(thr_id); free_bmw(thr_id); + free_bmw512(thr_id); free_c11(thr_id); free_cryptolight(thr_id); free_cryptonight(thr_id); @@ -90,6 +91,7 @@ void algo_free_all(int thr_id) free_skunk(thr_id); free_sha256d(thr_id); free_sha256t(thr_id); + free_sha3d(thr_id); free_sia(thr_id); free_sib(thr_id); free_s3(thr_id); @@ -103,7 +105,11 @@ void algo_free_all(int thr_id) free_x13(thr_id); free_x14(thr_id); free_x15(thr_id); + free_x16r(thr_id); + free_x16rv2(thr_id); + free_x16s(thr_id); free_x17(thr_id); + free_x21s(thr_id); free_zr5(thr_id); free_timetravel(thr_id); free_tribus(thr_id); @@ -135,6 +141,7 @@ bool bench_algo_switch_next(int thr_id) if (algo == ALGO_CRYPTOLIGHT) algo++; if (algo == ALGO_CRYPTONIGHT) algo++; if (algo == 
ALGO_WILDKECCAK) algo++;
+	if (algo == ALGO_ANIME) algo++; // to fix
 	if (algo == ALGO_QUARK) algo++; // to fix
 	if (algo == ALGO_LBRY && CUDART_VERSION < 7000) algo++;
diff --git a/ccminer-cuda10.sln b/ccminer-cuda10.sln
new file mode 100644
index 0000000000..86cf17045d
--- /dev/null
+++ b/ccminer-cuda10.sln
@@ -0,0 +1,27 @@
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2013
+VisualStudioVersion = 12.0.21005.1
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ccminer", "ccminer-cuda10.vcxproj", "{36DC07F9-A4A6-4877-A146-1B960083CF6F}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Debug|x64 = Debug|x64
+		Release|Win32 = Release|Win32
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|Win32.ActiveCfg = Debug|Win32
+		{36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|Win32.Build.0 = Debug|Win32
+		{36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|x64.ActiveCfg = Debug|x64
+		{36DC07F9-A4A6-4877-A146-1B960083CF6F}.Debug|x64.Build.0 = Debug|x64
+		{36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|Win32.ActiveCfg = Release|x64
+		{36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|Win32.Build.0 = Release|x64
+		{36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|x64.ActiveCfg = Release|x64
+		{36DC07F9-A4A6-4877-A146-1B960083CF6F}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/ccminer-cuda10.vcxproj b/ccminer-cuda10.vcxproj
new file mode 100644
index 0000000000..206c644500
--- /dev/null
+++ b/ccminer-cuda10.vcxproj
@@ -0,0 +1,640 @@
[MSBuild XML markup lost in extraction; only text content survived. Recoverable settings: VS2013 project (PlatformToolset v120, GUID {36DC07F9-A4A6-4877-A146-1B960083CF6F}, Windows SDK 10.0.17763.0) with Debug/Release configurations for Win32 and x64; preprocessor defines WIN32;_CONSOLE;_CRT_SECURE_NO_WARNINGS;CURL_STATICLIB;USE_WRAPNVML;SCRYPT_KECCAK512;SCRYPT_CHACHA;SCRYPT_CHOOSE_COMPILETIME; include/library paths under compat\ (curl-for-windows, jansson, getopt, pthreads, nvapi) plus $(CudaToolkitIncludeDir)/$(CudaToolkitLibDir); linker inputs cudart_static.lib, cuda.lib, pthreadVC2.lib, libcurl, openssl, zlib, ws2_32, Wldap32, nvapi; CUDA code generation compute_50/52/61/75 for Release builds with per-file maxrregcount values (48-255) and --ptxas-options / --Wno-deprecated-gpu-targets / -allow-unsupported-compiler flags.]
\ No newline at end of file
diff --git a/ccminer-cuda10.vcxproj.filters b/ccminer-cuda10.vcxproj.filters
new file mode 100644
index 0000000000..f2c2f26cec
--- /dev/null
+++ b/ccminer-cuda10.vcxproj.filters
@@ -0,0 +1,1052 @@
[MSBuild XML markup lost in extraction. The file assigns every source to a Visual Studio filter folder: Source Files (core .cpp), Source Files\sph, Source Files\jansson, Source Files\getopt, Source Files\neoscrypt, Source Files\sia, Source Files\equi, Source Files\crypto{,\xmr,\bbr}, and Source Files\CUDA subfolders (JHA, quark, qubit, heavy, lyra2, scrypt, sha256, lbry, skunk, tribus, x11, x13, x15, x16, x17, x21, xmr, Algo256), plus Header Files{,\compat,\compat\nvapi,\sph,\lyra2,\CUDA} and Ressources; the new x16/x21 kernels are included in the mappings.]
\ No newline at end of file
diff --git a/ccminer.cpp b/ccminer.cpp
index 8cc86cfafc..6c3b6dfea5 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -44,6 +44,7 @@
 #include "algos.h"
 #include "sia/sia-rpc.h"
 #include "crypto/xmr-rpc.h"
+#include "sph/sph_sha3d.h"
 #include "equi/equihash.h"
 #include
@@ -236,68 +237,79 @@ static char const usage[] = "\
 Usage: " PROGRAM_NAME " [OPTIONS]\n\
 Options:\n\
   -a, --algo=ALGO       specify the hash algorithm to use\n\
+			0x10        ChainOX\n\
 			allium      Lyra2 blake2s\n\
+			anime       Animecoin\n\
 			heavyhash   oBTC coin\n\
-			bastion     Hefty bastion\n\
-			bitcore     Timetravel-10\n\
-			blake       Blake 256 (SFR)\n\
-			blake2s     Blake2-S 256 (NEVA)\n\
-			blakecoin   Fast Blake 256 (8 rounds)\n\
-			bmw         BMW 256\n\
-			cryptolight AEON cryptonight (MEM/2)\n\
-			cryptonight XMR cryptonight\n\
-			c11/flax    X11 variant\n\
-			decred      Decred Blake256\n\
-			deep        Deepcoin\n\
-			equihash    Zcash Equihash\n\
-			dmd-gr      Diamond-Groestl\n\
-			fresh       Freshcoin (shavite 80)\n\
-			fugue256    Fuguecoin\n\
-			groestl     Groestlcoin\n"
+			bastion     Hefty bastion\n\
+			bitcore     Timetravel-10\n\
+			blake       Blake 256 (SFR)\n\
+			blake2s     Blake2-S 256 (NEVA)\n\
+			blakecoin   Fast Blake 256 (8 rounds)\n\
+			bmw         BMW 256\n\
+			bmw512      BMW 512\n\
+			cryptolight AEON cryptonight (MEM/2)\n\
+			cryptonight XMR cryptonight\n\
+			c11/flax    X11 variant\n\
+			decred      Decred Blake256\n\
+			deep        Deepcoin\n\
+			equihash    Zcash Equihash\n\
+			dmd-gr      Diamond-Groestl\n\
+			fresh       Freshcoin (shavite 80)\n\
+			fugue256    Fuguecoin\n\
+			gostcoin    Double GOST R 34.11\n\
+			groestl     Groestlcoin\n"
 #ifdef WITH_HEAVY_ALGO
 "			heavy       Heavycoin\n"
 #endif
-"			hmq1725     Doubloons / Espers\n\
-			jackpot     JHA v8\n\
-			keccak      Deprecated Keccak-256\n\
-			keccakc     Keccak-256 (CreativeCoin)\n\
-			lbry        LBRY Credits (Sha/Ripemd)\n\
-			luffa       Joincoin\n\
-			lyra2       CryptoCoin\n\
-			lyra2v2     VertCoin\n\
-			lyra2z      ZeroCoin (3rd impl)\n\
-			myr-gr      Myriad-Groestl\n\
-			neoscrypt   FeatherCoin, Phoenix, UFO...\n\
-			nist5       NIST5 (TalkCoin)\n\
-			penta       Pentablake hash (5x Blake 512)\n\
-			phi         BHCoin\n\
-			polytimos   Politimos\n\
-			quark       Quark\n\
-			qubit       Qubit\n\
-			sha256d     SHA256d (bitcoin)\n\
-			sha256t     SHA256 x3\n\
-			sia         SIA (Blake2B)\n\
-			sib         Sibcoin (X11+Streebog)\n\
-			scrypt      Scrypt\n\
-			scrypt-jane Scrypt-jane Chacha\n\
-			skein       Skein SHA2 (Skeincoin)\n\
-			skein2      Double Skein (Woodcoin)\n\
-			skunk       Skein Cube Fugue Streebog\n\
-			s3          S3 (1Coin)\n\
-			timetravel  Machinecoin permuted x8\n\
-			tribus      Denarius\n\
-			vanilla     Blake256-8 (VNL)\n\
-			veltor      Thorsriddle streebog\n\
-			whirlcoin   Old Whirlcoin (Whirlpool algo)\n\
-			whirlpool   Whirlpool algo\n\
-			x11evo      Permuted x11 (Revolver)\n\
-			x11         X11 (DarkCoin)\n\
-			x13         X13 (MaruCoin)\n\
-			x14         X14\n\
-			x15         X15\n\
-			x17         X17\n\
-			wildkeccak  Boolberry\n\
-			zr5         ZR5 (ZiftrCoin)\n\
+"			hmq1725     Doubloons / Espers\n\
+			jackpot     JHA v8\n\
+			keccak      Deprecated Keccak-256\n\
+			keccakc     Keccak-256 (CreativeCoin)\n\
+			lbry        LBRY Credits (Sha/Ripemd)\n\
+			luffa       Joincoin\n\
+			lyra2       CryptoCoin\n\
+			lyra2v2     VertCoin\n\
+			lyra2z      ZeroCoin (3rd impl)\n\
+			myr-gr
Myriad-Groestl\n\ + neoscrypt FeatherCoin, Phoenix, UFO...\n\ + neoscrypt-xaya XAYA's version...\n\ + nist5 NIST5 (TalkCoin)\n\ + penta Pentablake hash (5x Blake 512)\n\ + phi BHCoin\n\ + polytimos Politimos\n\ + quark Quark\n\ + qubit Qubit\n\ + sha256d SHA256d (bitcoin)\n\ + sha256t SHA256 x3\n\ + sha3d Bsha3, Yilacoin and Kylacoin\n\ + sia SIA (Blake2B)\n\ + sib Sibcoin (X11+Streebog)\n\ + scrypt Scrypt\n\ + scrypt-jane Scrypt-jane Chacha\n\ + skein Skein SHA2 (Skeincoin)\n\ + skein2 Double Skein (Woodcoin)\n\ + skunk Skein Cube Fugue Streebog\n\ + s3 S3 (1Coin)\n\ + timetravel Machinecoin permuted x8\n\ + tribus Denarius\n\ + vanilla Blake256-8 (VNL)\n\ + veltor Thorsriddle streebog\n\ + whirlcoin Old Whirlcoin (Whirlpool algo)\n\ + whirlpool Whirlpool algo\n\ + x11evo Permuted x11 (Revolver)\n\ + x11 X11 (DarkCoin)\n\ + x13 X13 (MaruCoin)\n\ + x14 X14\n\ + x15 X15\n\ + x17 X17\n\ + x16r X16R\n\ + x16rt X16RT\n\ + x16rv2 X16R V2\n\ + x16s X16S\n\ + x21s X21S\n\ + wildkeccak Boolberry\n\ + zr5 ZR5 (ZiftrCoin)\n\ -d, --devices Comma separated list of CUDA devices to use.\n\ Device IDs start counting from 0! Alternatively takes\n\ string names of your cards like gtx780ti or gt640#2\n\ @@ -696,6 +708,7 @@ static bool work_decode(const json_t *val, struct work *work) adata_sz = 180/4; break; case ALGO_NEOSCRYPT: + case ALGO_XAYA: case ALGO_ZR5: data_size = 80; adata_sz = data_size / 4; @@ -950,6 +963,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) case ALGO_BLAKECOIN: case ALGO_BLAKE2S: case ALGO_BMW: + case ALGO_BMW512: case ALGO_SHA256D: case ALGO_SHA256T: case ALGO_VANILLA: @@ -1523,6 +1537,21 @@ static bool submit_work(struct thr_info *thr, const struct work *work_in) return false; } +void sha3d(void *state, const void *input, int len) +{ + uint32_t _ALIGN(64) buffer[16], hash[16]; + sph_keccak_context ctx_keccak; + + sph_sha3d256_init(&ctx_keccak); + sph_sha3d256 (&ctx_keccak, input, len); + sph_sha3d256_close(&ctx_keccak, (void*) buffer); + sph_sha3d256_init(&ctx_keccak); + sph_sha3d256 (&ctx_keccak, buffer, 32); + sph_sha3d256_close(&ctx_keccak, (void*) hash); + + memcpy(state, hash, 32); +} + static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) { uchar merkle_root[64] = { 0 }; @@ -1565,10 +1594,16 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_FUGUE256: case ALGO_GROESTL: case ALGO_KECCAK: + case ALGO_SHA3D: + sha3d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + break; case ALGO_BLAKECOIN: case ALGO_WHIRLCOIN: SHA256((uchar*)sctx->job.coinbase, sctx->job.coinbase_size, (uchar*)merkle_root); break; + case ALGO_GOSTCOIN: + gostd(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); + break; case ALGO_WHIRLPOOL: default: sha256d(merkle_root, sctx->job.coinbase, (int)sctx->job.coinbase_size); @@ -1576,6 +1611,11 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) for (i = 0; i < sctx->job.merkle_count; i++) { memcpy(merkle_root + 32, sctx->job.merkle[i], 32); + if (opt_algo == ALGO_GOSTCOIN) + { + memcpy(merkle_root + 32, merkle_root, 32); + gostd(merkle_root, merkle_root, 64); + } #ifdef WITH_HEAVY_ALGO if (opt_algo == ALGO_HEAVY || opt_algo == ALGO_MJOLLNIR) heavycoin_hash(merkle_root, merkle_root, 64); @@ -1642,6 +1682,14 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) memcpy(&work->data[12], sctx->job.coinbase, 32); // merkle_root work->data[20] = 0x80000000; if (opt_debug) applog_hex(work->data, 80); + } else if 
(opt_algo == ALGO_XAYA) { + for (i = 0; i < 8; i++) + work->data[9 + i] = swab32(be32dec((uint32_t *)merkle_root + i)); + + work->data[17] = le32dec(sctx->job.ntime); + work->data[18] = le32dec(sctx->job.nbits); + work->data[20] = 0x80000000; + work->data[31] = (opt_algo == ALGO_MJOLLNIR) ? 0x000002A0 : 0x00000280; } else { for (i = 0; i < 8; i++) work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); @@ -1693,6 +1741,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_JACKPOT: case ALGO_JHA: case ALGO_NEOSCRYPT: + case ALGO_XAYA: case ALGO_SCRYPT: case ALGO_SCRYPT_JANE: work_set_target(work, sctx->job.diff / (65536.0 * opt_difficulty)); @@ -1708,6 +1757,12 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_ALLIUM: case ALGO_TIMETRAVEL: case ALGO_BITCORE: + case ALGO_BMW512: + case ALGO_X16R: + case ALGO_X16RT: + case ALGO_X16RV2: + case ALGO_X16S: + case ALGO_X21S: work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty)); break; case ALGO_KECCAK: @@ -1717,6 +1772,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_EQUIHASH: equi_work_set_target(work, sctx->job.diff / opt_difficulty); break; + case ALGO_SHA3D: default: work_set_target(work, sctx->job.diff / opt_difficulty); } @@ -2216,6 +2272,7 @@ static void *miner_thread(void *userdata) break; case ALGO_BLAKE: case ALGO_BMW: + case ALGO_BMW512: case ALGO_DECRED: case ALGO_SHA256D: case ALGO_SHA256T: @@ -2251,6 +2308,7 @@ static void *miner_thread(void *userdata) case ALGO_X13: case ALGO_WHIRLCOIN: case ALGO_WHIRLPOOL: + case ALGO_GOSTCOIN: minmax = 0x400000; break; case ALGO_X14: @@ -2261,6 +2319,7 @@ static void *miner_thread(void *userdata) case ALGO_LYRA2Z: case ALGO_ALLIUM: case ALGO_NEOSCRYPT: + case ALGO_XAYA: case ALGO_SIB: case ALGO_SCRYPT: case ALGO_VELTOR: @@ -2337,6 +2396,9 @@ static void *miner_thread(void *userdata) case ALGO_BMW: rc = scanhash_bmw(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_BMW512: + rc = scanhash_bmw512(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_C11: rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done); break; @@ -2358,6 +2420,9 @@ static void *miner_thread(void *userdata) case ALGO_FUGUE256: rc = scanhash_fugue256(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_GOSTCOIN: + rc = scanhash_gostd(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_GROESTL: case ALGO_DMD_GR: @@ -2417,12 +2482,18 @@ static void *miner_thread(void *userdata) case ALGO_ALLIUM: rc = scanhash_allium(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_ANIME: + rc = scanhash_anime(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_HEAVYHASH: rc = scanhash_heavyhash(thr_id, &work, max_nonce, &hashes_done); break; case ALGO_NEOSCRYPT: rc = scanhash_neoscrypt(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_XAYA: + rc = scanhash_neoscrypt(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_NIST5: rc = scanhash_nist5(thr_id, &work, max_nonce, &hashes_done); break; @@ -2450,6 +2521,9 @@ static void *miner_thread(void *userdata) case ALGO_SHA256T: rc = scanhash_sha256t(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_SHA3D: + rc = scanhash_sha3d(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_SIA: rc = scanhash_sia(thr_id, &work, max_nonce, &hashes_done); break; @@ -2502,9 +2576,24 @@ static void *miner_thread(void *userdata) case ALGO_X15: rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done); break; 
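/* Each scanhash_* entry added below follows the contract the existing cases already use: scan nonces upward from pdata[19], CPU-verify any GPU candidate with fulltest(), fill work->nonces[] and work->valid_nonces on success, and leave pdata[19] at the next nonce to try. A minimal sketch of that contract, with hypothetical names and assuming the usual miner.h declarations:

int scanhash_example(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t *pdata = work->data;
	const uint32_t first_nonce = pdata[19];
	uint32_t throughput = cuda_default_throughput(thr_id, 1 << 20); // tune per algo
	do {
		// launch the GPU kernels over [pdata[19], pdata[19] + throughput);
		// on a hit: be32enc the nonce, re-hash on the CPU, check fulltest(vhash, ptarget),
		// then set work->nonces[0] / work->valid_nonces and return work->valid_nonces
		pdata[19] += throughput;
	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
	*hashes_done = pdata[19] - first_nonce;
	return 0;
}
*/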
+ case ALGO_X16R: + rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_X16RT: + rc = scanhash_x16rt(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_X16RV2: + rc = scanhash_x16rv2(thr_id, &work, max_nonce, &hashes_done); + break; + case ALGO_X16S: + rc = scanhash_x16s(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_X17: rc = scanhash_x17(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_X21S: + rc = scanhash_x21s(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_ZR5: rc = scanhash_zr5(thr_id, &work, max_nonce, &hashes_done); break; @@ -2560,6 +2649,7 @@ static void *miner_thread(void *userdata) /* hashrate factors for some algos */ double rate_factor = 1.0; switch (opt_algo) { + case ALGO_ANIME: case ALGO_JACKPOT: case ALGO_QUARK: // to stay comparable to other ccminer forks or pools @@ -3293,8 +3383,10 @@ void parse_arg(int key, char *arg) } p = strstr(arg, "://"); if (p) { - if (strncasecmp(arg, "http://", 7) && strncasecmp(arg, "https://", 8) && - strncasecmp(arg, "stratum+tcp://", 14)) + if (strncasecmp(arg, "http://", 7) + && strncasecmp(arg, "https://", 8) + && strncasecmp(arg, "stratum+tcp://", 14) + && strncasecmp(arg, "stratum+tcps://", 15) ) show_usage_and_exit(1); free(rpc_url); rpc_url = strdup(arg); @@ -3981,9 +4073,12 @@ int main(int argc, char *argv[]) GetScratchpad(); } - flags = !opt_benchmark && strncmp(rpc_url, "https:", 6) - ? (CURL_GLOBAL_ALL & ~CURL_GLOBAL_SSL) - : CURL_GLOBAL_ALL; + flags = CURL_GLOBAL_ALL; + if ( !opt_benchmark ) + if ( strncasecmp( rpc_url, "https:", 6 ) + && strncasecmp( rpc_url, "stratum+tcps://", 15 ) ) + flags &= ~CURL_GLOBAL_SSL; + if (curl_global_init(flags)) { applog(LOG_ERR, "CURL initialization failed"); return EXIT_CODE_SW_INIT_ERROR; diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 45e738bdee..c57913be1e 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -1,5 +1,5 @@  - + Release @@ -21,6 +21,7 @@ {36DC07F9-A4A6-4877-A146-1B960083CF6F} ccminer + 10.0.17763.0 @@ -37,9 +38,21 @@ true + + false + + + false + + + false + + + false + - + @@ -155,7 +168,7 @@ 80 true true - compute_75,sm_75;compute_61,sm_61;compute_52,sm_52 + 75,sm_75;compute_61,sm_61;compute_52,sm_52;compute_50,sm_50 --ptxas-options="-O2" --Wno-deprecated-gpu-targets %(AdditionalOptions) O2 @@ -198,7 +211,7 @@ 128 true true - compute_86,sm_86;compute_75,sm_75;compute_61,sm_61 + compute_86,sm_86;compute_75,sm_75;compute_70,sm_70;compute_61,sm_61;compute_52,sm_52;compute_50,sm_50 $(NVTOOLSEXT_PATH)\include O3 64 @@ -269,7 +282,9 @@ + + 76 @@ -313,6 +328,7 @@ + @@ -323,6 +339,7 @@ + @@ -354,6 +371,7 @@ + @@ -378,6 +396,7 @@ + @@ -385,6 +404,7 @@ + @@ -412,7 +432,11 @@ + + + + @@ -448,6 +472,7 @@ + @@ -457,6 +482,9 @@ 92 + + 128 + @@ -490,6 +518,7 @@ + @@ -525,7 +554,9 @@ + + @@ -533,7 +564,7 @@ 76 - 128 + 80 @@ -549,8 +580,8 @@ - + 72 @@ -563,16 +594,30 @@ - - + + + + + + + + + + + compute_50,sm_50;compute_52,sm_52 + + 80 + + + @@ -585,7 +630,7 @@ - + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 7c69203a66..a3c9ff14d5 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -58,9 +58,15 @@ {a2403c22-6777-46ab-a55a-3fcc7386c974} + + {55dfae6a-66ba-43e2-8ceb-98ee70cbdf16} + {85dfae6a-66ca-4332-8cec-98ee70cbdf2f} + + {64ac22a6-f49f-11e8-a49f-e3788c43d09c} + {17b56151-79ec-4a32-bac3-9d94ae7f68fe} @@ -225,6 +231,9 @@ Source Files\sph + + Source Files\sph + Source Files\sph @@ -270,18 +279,6 @@ Source Files\sph - - Source Files\heavyhash - - - Source Files\heavyhash - 
- - Source Files\CUDA\scrypt - - - Source Files\CUDA\scrypt - Source Files\neoscrypt @@ -345,15 +342,15 @@ Source Files\equi - - Source Files\equi - Source Files\equi Source Files\equi + + + @@ -449,6 +446,9 @@ Header Files\CUDA + + Header Files\CUDA + Header Files\CUDA @@ -461,6 +461,9 @@ Header Files\sph + + Header Files\sph + Header Files\sph @@ -512,12 +515,6 @@ Header Files\lyra2 - - Source Files\heavyhash - - - Source Files\heavyhash - Header Files\lyra2 @@ -557,6 +554,9 @@ Source Files\CUDA\quark + + Source Files\CUDA\quark + Source Files\CUDA\quark @@ -605,9 +605,12 @@ Source Files\equi - - Source Files\equi + + + + Header Files\CUDA + @@ -700,13 +703,22 @@ Source Files\CUDA\x11 + + Source Files\CUDA\x11 + Source Files\CUDA\x11 + + Source Files\CUDA\x11 + Source Files\CUDA\x11 - + + Source Files\CUDA\x11 + + Source Files\CUDA\x11 @@ -715,6 +727,9 @@ Source Files\CUDA\x11 + + Source Files\CUDA\x11 + Source Files\CUDA\x11 @@ -727,10 +742,10 @@ Source Files\CUDA\x11 - + Source Files\CUDA\x11 - + Source Files\CUDA\x11 @@ -877,39 +892,24 @@ Source Files\CUDA\scrypt - - Source Files\CUDA\scrypt - Source Files\CUDA\scrypt - - Source Files\CUDA\scrypt - - - Source Files\CUDA\scrypt - - - Source Files\CUDA\scrypt - Source Files\CUDA\scrypt Source Files\CUDA\scrypt - - Source Files\CUDA\scrypt - - - Source Files\CUDA\scrypt - Source Files\neoscrypt Source Files\CUDA\Algo256 + + Source Files\CUDA\Algo256 + Source Files\CUDA\Algo256 @@ -979,18 +979,49 @@ Source Files\CUDA - - Source Files\equi - Source Files\CUDA - - Source Files\CUDA\heavyhash + + Source Files\CUDA\x15 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x16 + + + Source Files\CUDA\x21 + + + Source Files\CUDA\x21 - - Source Files\CUDA\heavyhash + + Source Files\CUDA\x21 + + + + diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index b48c5e68a6..dd21f14069 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -158,13 +158,13 @@ #define PACKAGE_BUGREPORT "" /* Define to the full name of this package. */ -#define PACKAGE_NAME "ccminer" +#define PACKAGE_NAME "ccminer-kudaraidee" /* Define to the home page for this package. */ -#define PACKAGE_URL "http://github.com/tpruvot/ccminer" +#define PACKAGE_URL "https://github.com/Kudaraidee/ccminer" /* Define to the version of this package. 
*/ -#define PACKAGE_VERSION "0.5.0" +#define PACKAGE_VERSION "1.2.0" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/configure.ac b/configure.ac index 4bc90d4117..451f32a814 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer-fancyIX], [0.5.0], [], [ccminer-fancyIX], [https://github.com/fancyIX/ccminer]) +AC_INIT([ccminer-fancyIX], [0.5.1], [], [ccminer-fancyIX], [https://github.com/fancyIX/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cuda_helper.h b/cuda_helper.h index c51a325332..aefc90dc3e 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -102,7 +102,7 @@ __device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uin return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); } -// Endian Drehung für 32 Bit Typen +// Endianness swap for 32-bit types #ifdef __CUDA_ARCH__ __device__ __forceinline__ uint32_t cuda_swab32(uint32_t x) { @@ -178,7 +178,7 @@ do { \ cudaError_t err = call; \ if (cudaSuccess != err) { \ fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \ - __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ + __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ exit(EXIT_FAILURE); \ } \ } while (0) @@ -226,7 +226,7 @@ uint64_t xor3(uint64_t a, uint64_t b, uint64_t c) { uint64_t result; asm("xor.b64 %0, %2, %3; // xor3\n\t" - "xor.b64 %0, %0, %1;\n\t" + "xor.b64 %0, %0, %1;\n\t" /* output : input registers */ : "=l"(result) : "l"(a), "l"(b), "l"(c)); return result; @@ -480,11 +480,11 @@ static __host__ __device__ __forceinline__ uint64_t devectorize(uint2 v) { /** * uint2 direct ops by c++ operator definitions */ -static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } -static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); } -static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } -static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); } -static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; } +static __device__ __host__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } +static __device__ __host__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); } +static __device__ __host__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } +static __device__ __host__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); } +static __device__ __host__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; } static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) { return vectorize(devectorize(a) + devectorize(b)); @@ -601,8 +601,34 @@ __device__ __inline__ uint2 ROR24(const uint2 a) result.y = __byte_perm(a.y, a.x, 0x6543); return result; } +__device__ __forceinline__ +uint2 ROL16(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x1076); + + return result; +} + +__device__ __forceinline__ +uint2 ROL24(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x0765); + return result; +} +__device__ __forceinline__ +uint2 ROR8(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x4321); + result.y =
__byte_perm(a.y, a.x, 0x4321); + return result; +} #else +#define ROR8(u) ROR2(u, 8) #define ROL8(u) ROL2(u, 8) +#define ROL16(u) ROL2(u,16) +#define ROL24(u) ROL2(u,24) #define ROR16(u) ROR2(u,16) #define ROR24(u) ROR2(u,24) #endif @@ -669,6 +695,29 @@ static uint2 SHR2(uint2 a, int offset) #endif } +__device__ __forceinline__ +uint32_t xor3x(uint32_t a,uint32_t b,uint32_t c){ + uint32_t result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); + #else + result = a^b^c; + #endif + return result; +} + +__device__ __forceinline__ +uint2 xor3x(const uint2 a,const uint2 b,const uint2 c) { + uint2 result; +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA +#else + result = a^b^c; +#endif + return result; +} + // CUDA 9+ deprecated functions warnings (new mask param) #if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 #undef __shfl diff --git a/cuda_helper_alexis.h b/cuda_helper_alexis.h new file mode 100644 index 0000000000..ab40814bed --- /dev/null +++ b/cuda_helper_alexis.h @@ -0,0 +1,732 @@ +#ifndef CUDA_HELPER_H +#define CUDA_HELPER_H + +#include +#include + +#ifdef __INTELLISENSE__ +/* reduce vstudio warnings (__byteperm, blockIdx...) */ +#include +#include +#define __launch_bounds__(max_tpb, min_blocks) +#endif + +#include +#include + +#ifndef UINT32_MAX +/* slackware need that */ +#define UINT32_MAX UINT_MAX +#endif + +#ifndef MAX_GPUS +#define MAX_GPUS 16 +#endif + +extern "C" short device_map[MAX_GPUS]; +extern "C" long device_sm[MAX_GPUS]; + +extern int cuda_arch[MAX_GPUS]; + +// common functions +extern int cuda_get_arch(int thr_id); +extern void cuda_check_cpu_init(int thr_id, uint32_t threads); +extern void cuda_check_cpu_free(int thr_id); +extern void cuda_check_cpu_setTarget(const void *ptarget); +extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash); +extern uint32_t cuda_check_hash_suppl(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce); +extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); +extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); +extern __device__ __device_builtin__ void __syncthreads(void); +extern __device__ __device_builtin__ void __threadfence(void); + + +#define AS_U32(addr) *((uint32_t*)(addr)) +#define AS_U64(addr) *((uint64_t*)(addr)) +#define AS_UINT2(addr) *((uint2*)(addr)) +#define AS_UINT4(addr) *((uint4*)(addr)) +#define AS_UL2(addr) *((ulonglong2*)(addr)) + + +#ifndef __CUDA_ARCH__ +// define blockDim and threadIdx for host +extern const dim3 blockDim; +extern const uint3 threadIdx; +#endif + +#ifndef SPH_C32 +#define SPH_C32(x) (x) +// #define SPH_C32(x) ((uint32_t)(x ## U)) +#endif + +#ifndef SPH_C64 +#define SPH_C64(x) (x) +// #define SPH_C64(x) ((uint64_t)(x ## ULL)) +#endif + +#ifndef SPH_T32 +#define SPH_T32(x) (x) +// #define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) +#endif + +#ifndef SPH_T64 +#define SPH_T64(x) (x) +// #define SPH_T64(x) ((x) & SPH_C64(0xFFFFFFFFFFFFFFFF)) +#endif + +/*********************************************************************/ +// Macros to catch CUDA errors in CUDA runtime calls + +#define CUDA_SAFE_CALL(call) \ +do { \ + cudaError_t err = call; 
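/* note: the do { ... } while (0) wrapper lets CUDA_SAFE_CALL(x); expand to a single statement, so the macro stays safe inside un-braced if/else bodies */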
\ + if (cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in func '%s' at line %i : %s.\n", \ + __FUNCTION__, __LINE__, cudaGetErrorString(err) ); \ + exit(EXIT_FAILURE); \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET(call) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return; \ + } \ +} while (0) + +#define CUDA_CALL_OR_RET_X(call, ret) do { \ + cudaError_t err = call; \ + if (cudaSuccess != err) { \ + cudaReportHardwareFailure(thr_id, err, __FUNCTION__); \ + return ret; \ + } \ +} while (0) + +/*********************************************************************/ + +__device__ __forceinline__ uint64_t MAKE_ULONGLONG(uint32_t LO, uint32_t HI){ + return __double_as_longlong(__hiloint2double(HI, LO)); +// return (uint64_t)LO | (((uint64_t)HI) << 32); +} + +// replace the hi word of a 64-bit value +__device__ __forceinline__ uint64_t REPLACE_HIDWORD(const uint64_t &x, const uint32_t &y) { + return (x & 0xFFFFFFFFULL) | (((uint64_t)y) << 32U); +} + +// replace the lo word of a 64-bit value +__device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uint32_t &y) { + return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); +} + +// endianness swap for 32-bit types +#if defined(__CUDA_ARCH__) +__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x) +{ + /* device */ + return __byte_perm(x, x, 0x0123); +} +#else + /* host */ + #define cuda_swab32(x) \ + ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | \ + (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu)) +#endif + +// extract the lo word from a 64-bit value +__device__ __forceinline__ uint32_t _LODWORD(const uint64_t &x) { + return (uint32_t)__double2loint(__longlong_as_double(x)); +// return (uint32_t)(x & 0xFFFFFFFFULL); +} + +// extract the hi word from a 64-bit value +__device__ __forceinline__ uint32_t _HIDWORD(const uint64_t &x) { + return (uint32_t)__double2hiint(__longlong_as_double(x)); +// return (uint32_t)(x >> 32); +} + + +__device__ __forceinline__ uint2 cuda_swab64_U2(uint2 a) +{ + // Input: 77665544 33221100 + // Output: 00112233 44556677 + uint2 result; + result.y = __byte_perm(a.x, 0, 0x0123); + result.x = __byte_perm(a.y, 0, 0x0123); + return result; +} + +#if defined(__CUDA_ARCH__) +__device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) +{ + // Input: 77665544 33221100 + // Output: 00112233 44556677 + uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123); + return (result << 32) | __byte_perm(_HIDWORD(x), 0, 0x0123); +} +#else +/* host */ +#define cuda_swab64(x) \ + ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \ + (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \ + (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \ + (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ + (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ + (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \ + (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \ + (((uint64_t)(x) & 0x00000000000000ffULL) << 56))) +#endif + +// swap two uint32_t without extra registers +__device__ __host__ __forceinline__ void xchg(uint32_t &x, uint32_t &y) { + x ^= y; y = x ^ y; x ^= y; +} +// for other types...
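/* __byte_perm(a, b, s) assembles a word by picking four bytes from the eight-byte pool {a, b}: each selector nibble of s, lowest first, names the source byte of the corresponding result byte (0-3 from a, 4-7 from b). A one-line self-check of the 0x0123 swab selector used above, kept as an illustration only (not part of the patched header):

__global__ void swab32_check(uint32_t *out) {
	out[0] = __byte_perm(0x11223344u, 0, 0x0123); // = 0x44332211, same result as cuda_swab32()
}
*/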
+#define XCHG(x, y) { x ^= y; y = x ^ y; x ^= y; } + +static __host__ __device__ __forceinline__ uint2 vectorize(uint64_t v) { + uint2 result; +#if defined(__CUDA_ARCH__) + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(result.x), "=r"(result.y) : "l"(v)); +#else + result.x = (uint32_t)(v); + result.y = (uint32_t)(v >> 32); +#endif + return result; +} + +static __host__ __device__ __forceinline__ uint64_t devectorize(uint2 v) { +#if defined(__CUDA_ARCH__) + return MAKE_ULONGLONG(v.x, v.y); +#else + return (((uint64_t)v.y) << 32) + v.x; +#endif +} + +#if defined(__CUDA_ARCH__) + // Compute 3.2+ + #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) + #define ROTR32(x, n) __funnelshift_r( (x), (x), (n) ) +#else + // Host and Compute 3.0 + #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) + #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) + #define __ldg(x) (*(x)) +#endif + +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a){ + return __byte_perm(a, 0, 0x1032); +} +__device__ __forceinline__ +uint32_t ROL8(const uint32_t a){ + return __byte_perm(a, 0, 0x2103); +} +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a){ + return __byte_perm(a, 0, 0x0321); +} + +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor1(uint64_t a, uint64_t b) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b)); + return result; +} + +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor3(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t result; + asm("xor.b64 %0, %2, %3;\n\t" + "xor.b64 %0, %0, %1;\n\t" + /* output : input registers */ + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} + +// device asm for whirpool +__device__ __forceinline__ +uint64_t xor5(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(d) ,"l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} + +__device__ __forceinline__ +uint64_t xor9(const uint64_t a, const uint64_t b, const uint64_t c, const uint64_t d, const uint64_t e, const uint64_t f, const uint64_t g, const uint64_t h,const uint64_t i) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(h) ,"l"(i)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(g)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} + +__device__ __forceinline__ +uint64_t xor8(uint64_t a, uint64_t b, uint64_t c, uint64_t d,uint64_t e,uint64_t f,uint64_t g, uint64_t h) +{ + uint64_t result; + asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(g) ,"l"(h)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(f)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(e)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(d)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(c)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(b)); + asm("xor.b64 %0, %0, %1;" : "+l"(result) : "l"(a)); + return result; +} + +static __device__ __forceinline__ uint2 xorswap32(uint2 u, uint2 v) +{ + uint2 result; + result.y = u.x ^ v.x; + result.x = u.y ^ v.y; + return result; +} + +// device asm for x17 +__device__ 
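/* andor(a,b,c) below computes ((a | b) & c) | (a & b), i.e. the bitwise majority function: each result bit is set where at least two of a, b, c are set (the MAJ primitive of SHA-2-style rounds) */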
__forceinline__ +uint64_t andor(const uint64_t a,const uint64_t b,const uint64_t c) +{ + uint64_t result; + asm("{\n\t" + ".reg .u64 m,n;\n\t" + "and.b64 m, %1, %2;\n\t" + " or.b64 n, %1, %2;\n\t" + "and.b64 %0, n, %3;\n\t" + " or.b64 %0, %0, m ;\n\t" + "}\n" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +// return ((a | b) & c) | (a & b); +} + +// device asm for x17 +__device__ __forceinline__ +uint64_t shr_u64(const uint64_t x, uint32_t n){ + uint64_t result; + asm ("shr.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); + return result; +// return x >> n; +} + +__device__ __forceinline__ +uint64_t shl_u64(const uint64_t x, uint32_t n){ + uint64_t result; + asm("shl.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); + return result; +// return x << n; +} + +__device__ __forceinline__ +uint32_t shr_u32(const uint32_t x,uint32_t n) { + uint32_t result; + asm("shr.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +// return x >> n; +} + +__device__ __forceinline__ +uint32_t shl_u32(const uint32_t x,uint32_t n) { + uint32_t result; + asm("shl.b32 %0,%1,%2;" : "=r"(result) : "r"(x), "r"(n)); + return result; +// return x << n; +} + +// 64-bit ROTATE RIGHT +#if defined(__CUDA_ARCH__) +/* complicated sm >= 3.5 one (with Funnel Shifter beschleunigt), to bench */ +__device__ __forceinline__ +uint64_t ROTR64(const uint64_t value, const int offset) { + uint2 result; + const uint2 tmp = vectorize(value); + + if(offset == 8) { + result.x = __byte_perm(tmp.x, tmp.y, 0x4321); + result.y = __byte_perm(tmp.y, tmp.x, 0x4321); + } + else if(offset == 16) { + result.x = __byte_perm(tmp.x, tmp.y, 0x5432); + result.y = __byte_perm(tmp.y, tmp.x, 0x5432); + } + else if(offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.x), "r"(tmp.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.y), "r"(tmp.x), "r"(offset)); + } else { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.y), "r"(tmp.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.x), "r"(tmp.y), "r"(offset)); + } + return devectorize(result); +} +#else +/* host */ +#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) +#endif + +// 64-bit ROTATE LEFT +#if defined(__CUDA_ARCH__) +__device__ __forceinline__ +uint64_t ROTL64(const uint64_t value, const int offset) { + uint2 result; + const uint2 tmp = vectorize(value); + if(offset == 8){ + result.x = __byte_perm(tmp.x, tmp.y, 0x2107); + result.y = __byte_perm(tmp.y, tmp.x, 0x2107); + } + else if(offset == 16) { + result.x = __byte_perm(tmp.x, tmp.y, 0x1076); + result.y = __byte_perm(tmp.y, tmp.x, 0x1076); + } + else if(offset == 24) { + result.x = __byte_perm(tmp.x, tmp.y, 0x0765); + result.y = __byte_perm(tmp.y, tmp.x, 0x0765); + } + else if(offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.x), "r"(tmp.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.y), "r"(tmp.x), "r"(offset)); + } else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(tmp.y), "r"(tmp.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(tmp.x), "r"(tmp.y), "r"(offset)); + } + return devectorize(result); +} +#else +/* host */ +#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#endif + +__device__ __forceinline__ +uint64_t SWAPDWORDS(uint64_t value){ + uint2 temp; + asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value)); + asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : 
"r"(temp.y), "r"(temp.x)); + return value; +} + +__device__ __forceinline__ +uint2 SWAPDWORDS2(uint2 value){ + return make_uint2(value.y, value.x); +} + +/* lyra2/bmw - uint2 vector's operators */ + +__device__ __forceinline__ +uint2 SHL8(const uint2 a){ + uint2 result; + result.y = __byte_perm(a.y, a.x, 0x2107); + result.x = __byte_perm(a.x, 0, 0x2107); + + return result; +} + +__device__ __forceinline__ +void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) { +#if defined(__CUDA_ARCH__) + asm("mov.b64 {%0,%1},%2; \n\t" + : "=r"(lo), "=r"(hi) : "l"(x)); +#else + lo = (uint32_t)(x); + hi = (uint32_t)(x >> 32); +#endif +} + +/** + * uint2 direct ops by c++ operator definitions + */ +static __device__ __forceinline__ uint2 operator^ (const uint2 a,const uint32_t b) { return make_uint2(a.x^ b, a.y); } +static __device__ __forceinline__ uint2 operator^ (const uint2 a,const uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); } +static __device__ __forceinline__ uint2 operator& (const uint2 a,const uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); } +static __device__ __forceinline__ uint2 operator| (const uint2 a,const uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); } +static __device__ __forceinline__ uint2 operator~ (const uint2 a) { return make_uint2(~a.x, ~a.y); } +static __device__ __forceinline__ void operator^= (uint2 &a,const uint2 b) { a = a ^ b; } + +static __device__ __forceinline__ uint2 operator+ (const uint2 a,const uint2 b) { +#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000 + uint2 result; + asm("{\n\t" + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +#else + return vectorize(devectorize(a) + devectorize(b)); +#endif +} + +static __device__ __forceinline__ uint2 operator+ (const uint2 a,const uint64_t b) { + return vectorize(devectorize(a) + b); +} + +static __device__ __forceinline__ void operator+= (uint2 &a,const uint2 b) { a = a + b; } + +static __device__ __forceinline__ uint2 operator- (const uint2 a,const uint2 b) { +#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000 + uint2 result; + asm("{\n\t" + "sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +#else + return vectorize(devectorize(a) - devectorize(b)); +#endif +} +static __device__ __forceinline__ void operator-= (uint2 &a,const uint2 b) { a = a - b; } + +static __device__ __forceinline__ uint2 operator+ (const uint2 a,const uint32_t b) +{ +#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000 + uint2 result; + asm("add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +#else + return vectorize(devectorize(a) + b); +#endif +} + +static __device__ __forceinline__ uint2 operator- (const uint2 a,const uint64_t b) { + return vectorize(devectorize(a) - b); +} +static __device__ __forceinline__ uint2 operator- (const uint2 a,const uint32_t b) +{ +#if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000 + uint2 result; + asm("sub.cc.u32 %0,%2,%4; \n\t" + "subc.u32 %1,%3,%5; \n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +#else + return vectorize(devectorize(a) - b); +#endif +} + +/** + * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b)) + * (what does uint64 "*" operator) + */ +static __device__ __forceinline__ uint2 operator* (const uint2 
a,const uint2 b){ + uint2 result; + asm("{\n\t" + "mul.lo.u32 %0,%2,%4; \n\t" + "mul.hi.u32 %1,%2,%4; \n\t" + "mad.lo.cc.u32 %1,%3,%4,%1; \n\t" + "madc.lo.u32 %1,%3,%5,%1; \n\t" + "}\n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + return result; +} + +// uint2 ROR/ROL methods +__device__ __forceinline__ +uint2 ROR2(const uint2 a, const uint32_t offset){ + uint2 result; +#if __CUDA_ARCH__ > 300 + if (offset < 32) { + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } else /* if (offset < 64) */ { + /* offset SHOULD BE < 64 ! */ + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } +#else + if (!offset) + result = a; + else if (offset < 32) { + result.y = ((a.y >> offset) | (a.x << (32 - offset))); + result.x = ((a.x >> offset) | (a.y << (32 - offset))); + } else if (offset == 32) { + result.y = a.x; + result.x = a.y; + } else { + result.y = ((a.x >> (offset - 32)) | (a.y << (64 - offset))); + result.x = ((a.y >> (offset - 32)) | (a.x << (64 - offset))); + } +#endif + return result; +} + +__device__ __forceinline__ +uint2 ROL2(const uint2 a, const uint32_t offset) +{ + uint2 result; +#if __CUDA_ARCH__ > 300 + if (offset >= 32) { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + } + else { + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + } +#else + if (!offset) + result = a; + else + result = ROR2(a, 64 - offset); +#endif + return result; +} + +__device__ __forceinline__ +uint2 SWAPUINT2(uint2 value) +{ + return make_uint2(value.y, value.x); +} + +/* Byte aligned Rotations (lyra2) */ +__device__ __forceinline__ +uint2 ROL8(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x2107); + result.y = __byte_perm(a.y, a.x, 0x2107); + return result; +} +__device__ __forceinline__ +uint2 ROR8(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x4321); + return result; +} +__device__ __forceinline__ +uint2 ROR16(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x5432); + result.y = __byte_perm(a.y, a.x, 0x5432); + return result; +} +__device__ __forceinline__ +uint2 ROL16(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x1076); + result.y = __byte_perm(a.y, a.x, 0x1076); + + return result; +} + +__device__ __forceinline__ +uint2 ROR24(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x6543); + result.y = __byte_perm(a.y, a.x, 0x6543); + return result; +} +__device__ __forceinline__ +uint2 ROL24(const uint2 a){ + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x0765); + return result; +} +/* uint2 for bmw512 - to double check later */ + +__device__ __forceinline__ +static uint2 SHL2(const uint2 a,const uint32_t n) { + uint64_t result; + const uint64_t x = devectorize(a); + asm ("shl.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); + return vectorize(result); +} + +__device__ __forceinline__ +static uint2 SHR2(const uint2 a,const uint32_t n){ + + uint64_t 
result; + const uint64_t x = devectorize(a); + asm ("shr.b64 %0,%1,%2;\n\t" : "=l"(result) : "l"(x), "r"(n)); + return vectorize(result); +} + +__device__ __forceinline__ +uint32_t xor3x(uint32_t a,uint32_t b,uint32_t c){ + uint32_t result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA + #else + result = a^b^c; + #endif + return result; +} + +__device__ __forceinline__ +uint2 xor3x(const uint2 a,const uint2 b,const uint2 c){ + uint2 result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA + #else + result = a^b^c; + #endif + return result; +} + +__device__ __forceinline__ +uint2 chi(const uint2 a,const uint2 b,const uint2 c){ //keccak - chi + uint2 result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result.x) : "r"(a.x), "r"(b.x),"r"(c.x)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) + asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result.y) : "r"(a.y), "r"(b.y),"r"(c.y)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) + #else + result = a ^ (~b) & c; + #endif + return result; +} +__device__ __forceinline__ +uint32_t chi(const uint32_t a,const uint32_t b,const uint32_t c){ //keccak - chi + uint32_t result; + #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0xD2;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); //0xD2 = 0xF0 ^ ((~0xCC) & 0xAA) + #else + result = a ^ (~b) & c; + #endif + return result; +} +__device__ __forceinline__ +uint32_t bfe(uint32_t x, uint32_t bit, uint32_t numBits) { + uint32_t ret; + asm("bfe.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(x), "r"(bit), "r"(numBits)); + return ret; + +} + +__device__ __forceinline__ +uint32_t bfi(uint32_t x, uint32_t a, uint32_t bit, uint32_t numBits) { + uint32_t ret; + asm("bfi.b32 %0, %1, %2, %3,%4;" : "=r"(ret) : "r"(x), "r"(a), "r"(bit), "r"(numBits)); + return ret; +} + +// CUDA 9+ deprecated functions warnings (new mask param) +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#undef __shfl +#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width) +#undef __shfl_up +#define __shfl_up(var, delta, width) __shfl_up_sync(0xFFFFFFFF, var, delta, width) +#undef __any +#define __any(p) __any_sync(0xFFFFFFFFu, p) +#endif + + +#endif // #ifndef CUDA_HELPER_H + diff --git a/cuda_vectors.h b/cuda_vectors.h index 8399fcfd5d..65c79981f3 100644 --- a/cuda_vectors.h +++ b/cuda_vectors.h @@ -2,21 +2,6 @@ /* Macros for uint2 operations (used by skein) */ -__device__ __forceinline__ -uint2 ROR8(const uint2 a) { - uint2 result; - result.x = __byte_perm(a.x, a.y, 0x4321); - result.y = __byte_perm(a.y, a.x, 0x4321); - return result; -} - -__device__ __forceinline__ -uint2 ROL24(const uint2 a) { - uint2 result; - result.x = __byte_perm(a.x, a.y, 0x0765); - result.y = __byte_perm(a.y, a.x, 0x0765); - return result; -} static __device__ __forceinline__ uint2 operator+ (const uint2 a, const uint32_t b) { @@ -31,17 +16,3 @@ static __device__ __forceinline__ uint2 operator+ (const uint2 a, const uint32_t return vectorize(devectorize(a) + b); #endif } - -/* whirlpool ones */ -#ifdef __CUDA_ARCH__ -__device__ __forceinline__ -uint2 ROL16(const uint2 a) { - uint2 result; - result.x = __byte_perm(a.x, a.y, 0x1076); - result.y = __byte_perm(a.y, 
a.x, 0x1076); - return result; -} -#else -#define ROL16(a) make_uint2(a.x, a.y) /* bad, just to define it */ -#endif - diff --git a/cuda_vectors_alexis.h b/cuda_vectors_alexis.h new file mode 100644 index 0000000000..6ab0970c8e --- /dev/null +++ b/cuda_vectors_alexis.h @@ -0,0 +1,655 @@ +/* DJM CRAP to strip (again) made for SM 3.2+ */ + +#ifndef CUDA_LYRA_VECTOR_H +#define CUDA_LYRA_VECTOR_H + +/////////////////////////////////////////////////////////////////////////////////// +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +#include "cuda_helper.h" + +#if __CUDA_ARCH__ < 320 && !defined(__ldg4) +#define __ldg4(x) (*(x)) +#endif + +typedef struct __align__(32) uint8 { + unsigned int s0, s1, s2, s3, s4, s5, s6, s7; +} uint8; + +typedef struct __align__(64) uint2_8 { + uint2 s0, s1, s2, s3, s4, s5, s6, s7; +} uint2_8; + +typedef struct __align__(16) ulong2x2 { + ulonglong2 l0,l1; +} ulonglong2x2; + +typedef struct __align__(64) ulonglong2to8 { + ulonglong2 l0,l1,l2,l3; +} ulonglong2to8; + +typedef struct __align__(128) ulonglong8to16 { + ulonglong2to8 lo, hi; +} ulonglong8to16; + +typedef struct __align__(256) ulonglong16to32 { + ulonglong8to16 lo, hi; +} ulonglong16to32; + +typedef struct __align__(512) ulonglong32to64 { + ulonglong16to32 lo, hi; +} ulonglong32to64; + +typedef struct __align__(128) ulonglonglong { + ulonglong2 s0,s1,s2,s3,s4,s5,s6,s7; +} ulonglonglong; + +typedef struct __align__(64) uint16 { + union { + struct {unsigned int s0, s1, s2, s3, s4, s5, s6, s7;}; + uint8 lo; + }; + union { + struct {unsigned int s8, s9, sa, sb, sc, sd, se, sf;}; + uint8 hi; + }; +} uint16; + +typedef struct __align__(128) uint2_16 { + union { + struct { uint2 s0, s1, s2, s3, s4, s5, s6, s7; }; + uint2_8 lo; + }; + union { + struct { uint2 s8, s9, sa, sb, sc, sd, se, sf; }; + uint2_8 hi; + }; +} uint2_16; + +typedef struct __align__(128) uint32 { + uint16 lo,hi; +} uint32; + +struct __align__(128) ulong8 { + ulonglong4 s0, s1, s2, s3; +}; +typedef __device_builtin__ struct ulong8 ulong8; + +typedef struct __align__(256) ulonglong16 { + ulonglong4 s0, s1, s2, s3, s4, s5, s6, s7; +} ulonglong16; + +typedef struct __align__(16) uint28 { + uint2 x, y, z, w; +} uint2x4; +typedef uint2x4 uint28; /* name deprecated */ + +typedef struct __builtin_align__(32) uint48 { + uint4 s0,s1; +} uint48; + +typedef struct __align__(256) uint4x16 { + uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; +} uint4x16; + +static __inline__ __device__ ulonglong2to8 make_ulonglong2to8(ulonglong2 s0, ulonglong2 s1, ulonglong2 s2, ulonglong2 s3) +{ + ulonglong2to8 t; t.l0=s0; t.l1=s1; t.l2=s2; t.l3=s3; + return t; +} + +static __inline__ __device__ ulonglong8to16 make_ulonglong8to16(const ulonglong2to8 &s0, const ulonglong2to8 &s1) +{ + ulonglong8to16 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong16to32 make_ulonglong16to32(const ulonglong8to16 &s0, const ulonglong8to16 &s1) +{ + ulonglong16to32 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __device__ ulonglong32to64 make_ulonglong32to64(const ulonglong16to32 &s0, const ulonglong16to32 &s1) +{ + ulonglong32to64 t; t.lo = s0; t.hi = s1; + return t; +} + +static __inline__ __host__ __device__ ulonglonglong make_ulonglonglong( + const ulonglong2 &s0, const ulonglong2 &s1, const ulonglong2 &s2, const ulonglong2 &s3, + const ulonglong2 &s4, const ulonglong2 &s5) +{ + ulonglonglong t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; 
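/* the __align__ attributes on the structs above are what permit the compiler (and the ld.global.nc.v4 inline asm further down) to move these aggregates with 128-bit vector loads and stores; an under-aligned struct would make those accesses fault */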
t.s4 = s4; t.s5 = s5; + return t; +} + +static __inline__ __device__ uint48 make_uint48(uint4 s0, uint4 s1) +{ + uint48 t; t.s0 = s0; t.s1 = s1; + return t; +} + +static __inline__ __device__ uint28 make_uint28(uint2 s0, uint2 s1, uint2 s2, uint2 s3) +{ + uint28 t; t.x = s0; t.y = s1; t.z = s2; t.w = s3; + return t; +} + +static __inline__ __host__ __device__ uint4x16 make_uint4x16( + uint4 s0, uint4 s1, uint4 s2, uint4 s3, uint4 s4, uint4 s5, uint4 s6, uint4 s7, + uint4 s8, uint4 s9, uint4 sa, uint4 sb, uint4 sc, uint4 sd, uint4 se, uint4 sf) +{ + uint4x16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.s10 = sa; t.s11 = sb; t.s12 = sc; t.s13 = sd; t.s14 = se; t.s15 = sf; + return t; +} + +static __inline__ __device__ uint2_16 make_uint2_16( + uint2 s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7, + uint2 s8, uint2 s9, uint2 sa, uint2 sb, uint2 sc, uint2 sd, uint2 se, uint2 sf) +{ + uint2_16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7, + unsigned int s8, unsigned int s9, unsigned int sa, unsigned int sb, unsigned int sc, unsigned int sd, unsigned int se, unsigned int sf) +{ + uint16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + t.s8 = s8; t.s9 = s9; t.sa = sa; t.sb = sb; t.sc = sc; t.sd = sd; t.se = se; t.sf = sf; + return t; +} + +static __inline__ __host__ __device__ uint16 make_uint16(const uint8 &a, const uint8 &b) +{ + uint16 t; t.lo=a; t.hi=b; return t; +} + +static __inline__ __host__ __device__ uint32 make_uint32(const uint16 &a, const uint16 &b) +{ + uint32 t; t.lo = a; t.hi = b; return t; +} + + +static __inline__ __host__ __device__ uint8 make_uint8( + unsigned int s0, unsigned int s1, unsigned int s2, unsigned int s3, unsigned int s4, unsigned int s5, unsigned int s6, unsigned int s7) +{ + uint8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ uint2_8 make_uint2_8( + uint2 s0, uint2 s1, uint2 s2, uint2 s3, uint2 s4, uint2 s5, uint2 s6, uint2 s7) +{ + uint2_8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ ulonglong16 make_ulonglong16(const ulonglong4 &s0, const ulonglong4 &s1, + const ulonglong4 &s2, const ulonglong4 &s3, const ulonglong4 &s4, const ulonglong4 &s5, const ulonglong4 &s6, const ulonglong4 &s7) +{ + ulonglong16 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3; t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + +static __inline__ __host__ __device__ ulong8 make_ulong8( + ulonglong4 s0, ulonglong4 s1, ulonglong4 s2, ulonglong4 s3) +{ + ulong8 t; t.s0 = s0; t.s1 = s1; t.s2 = s2; t.s3 = s3;// t.s4 = s4; t.s5 = s5; t.s6 = s6; t.s7 = s7; + return t; +} + + +static __forceinline__ __device__ uchar4 operator^ (uchar4 a, uchar4 b) { return make_uchar4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ uchar4 operator+ (uchar4 a, uchar4 b) { return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } + +static __forceinline__ __device__ uint4 operator+ 
(uint4 a, uint4 b) { return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __forceinline__ __device__ uint4 operator^ (uint4 a, uint4 b) { return make_uint4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ uint4 operator& (uint4 a, uint4 b) { return make_uint4(a.x & b.x, a.y & b.y, a.z & b.z, a.w & b.w); } +static __forceinline__ __device__ uint4 operator>>(uint4 a, int b) { return make_uint4(a.x >> b, a.y >> b, a.z >> b, a.w >> b); } +static __forceinline__ __device__ uint4 operator<<(uint4 a, int b) { return make_uint4(a.x << b, a.y << b, a.z << b, a.w << b); } +static __forceinline__ __device__ uint4 operator* (uint4 a, int b) { return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); } +static __forceinline__ __device__ void operator^=(uint4 &a,uint4 b) { a = a ^ b; } +static __forceinline__ __device__ void operator+=(uint4 &a, uint4 b){ a = a + b; } + +static __forceinline__ __device__ ulonglong4 operator^ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); } +static __forceinline__ __device__ ulonglong4 operator+ (ulonglong4 a, ulonglong4 b) { return make_ulonglong4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); } +static __forceinline__ __device__ ulonglong2 operator^ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x ^ b.x, a.y ^ b.y); } +static __forceinline__ __device__ ulonglong2 operator+ (ulonglong2 a, ulonglong2 b) { return make_ulonglong2(a.x + b.x, a.y + b.y); } + +static __forceinline__ __device__ ulong8 operator^ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3); +} + +static __forceinline__ __device__ ulong8 operator+ (const ulong8 &a, const ulong8 &b) { + return make_ulong8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3); +} + +static __forceinline__ __device__ __host__ uint8 operator^ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ __host__ uint8 operator+ (const uint8 &a, const uint8 &b) { return make_uint8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + +static __forceinline__ __device__ uint2_8 operator^ (const uint2_8 &a, const uint2_8 &b) { return make_uint2_8(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); } + +static __forceinline__ __device__ uint2_8 operator+ (const uint2_8 &a, const uint2_8 &b) { return make_uint2_8(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); } + + +////////////// mess++ ////// + +static __forceinline__ __device__ uint28 operator^ (const uint28 &a, const uint28 &b) { + return make_uint28(a.x ^ b.x, a.y ^ b.y, a.z ^ b.z, a.w ^ b.w); +} + +static __forceinline__ __device__ uint28 operator+ (const uint28 &a, const uint28 &b) { + return make_uint28(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} + +static __forceinline__ __device__ uint48 operator+ (const uint48 &a, const uint48 &b) { + return make_uint48(a.s0 + b.s0, a.s1 + b.s1); +} + +///////////////////////// + +static __forceinline__ __device__ __host__ uint16 operator^ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, 
a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator+ (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ __host__ uint16 operator- (const uint16 &a, const uint16 &b) { + return make_uint16(a.s0 - b.s0, a.s1 - b.s1, a.s2 - b.s2, a.s3 - b.s3, a.s4 - b.s4, a.s5 - b.s5, a.s6 - b.s6, a.s7 - b.s7, + a.s8 - b.s8, a.s9 - b.s9, a.sa - b.sa, a.sb - b.sb, a.sc - b.sc, a.sd - b.sd, a.se - b.se, a.sf - b.sf); +} + +static __forceinline__ __device__ uint2_16 operator^ (const uint2_16 &a, const uint2_16 &b) { + return make_uint2_16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7, + a.s8 ^ b.s8, a.s9 ^ b.s9, a.sa ^ b.sa, a.sb ^ b.sb, a.sc ^ b.sc, a.sd ^ b.sd, a.se ^ b.se, a.sf ^ b.sf); +} + +static __forceinline__ __device__ uint2_16 operator+ (const uint2_16 &a, const uint2_16 &b) { + return make_uint2_16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7, + a.s8 + b.s8, a.s9 + b.s9, a.sa + b.sa, a.sb + b.sb, a.sc + b.sc, a.sd + b.sd, a.se + b.se, a.sf + b.sf); +} + +static __forceinline__ __device__ uint32 operator^ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ uint32 operator+ (const uint32 &a, const uint32 &b) { + return make_uint32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ ulonglong16 operator^ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5, a.s6 ^ b.s6, a.s7 ^ b.s7); +} + +static __forceinline__ __device__ ulonglong16 operator+ (const ulonglong16 &a, const ulonglong16 &b) { + return make_ulonglong16(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5, a.s6 + b.s6, a.s7 + b.s7); +} + +static __forceinline__ __device__ void operator^= (ulong8 &a, const ulong8 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (uint28 &a, const uint28 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (uint28 &a, const uint28 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (uint2_8 &a, const uint2_8 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (uint2_8 &a, const uint2_8 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (uint32 &a, const uint32 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (uint32 &a, const uint32 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (uchar4 &a, uchar4 b) { a = a ^ b; } + +static __forceinline__ __device__ __host__ void operator^= (uint8 &a, const uint8 &b) { a = a ^ b; } +static __forceinline__ __device__ __host__ void operator^= (uint16 &a, const uint16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator^= (ulonglong16 &a, const ulonglong16 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator^= (ulonglong4 &a, const ulonglong4 &b) { a = a ^ b; } +static __forceinline__ __device__ void operator+= (ulonglong4 &a, const ulonglong4 &b) { a = a + b; } + +static __forceinline__ __device__ void operator^= (ulonglong2 &a, const ulonglong2 &b) { a = a ^ b; } +static __forceinline__ 
__device__ void operator+= (ulonglong2 &a, const ulonglong2 &b) { a = a + b; } + +static __forceinline__ __device__ +ulonglong2to8 operator^ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 ^ b.l0, a.l1 ^ b.l1, a.l2 ^ b.l2, a.l3 ^ b.l3); +} +static __forceinline__ __device__ +ulonglong2to8 operator+ (const ulonglong2to8 &a, const ulonglong2to8 &b) +{ + return make_ulonglong2to8(a.l0 + b.l0, a.l1 + b.l1, a.l2 + b.l2, a.l3 + b.l3); +} + +static __forceinline__ __device__ +ulonglong8to16 operator^ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong8to16 operator+ (const ulonglong8to16 &a, const ulonglong8to16 &b) +{ + return make_ulonglong8to16(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator^ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong16to32 operator+ (const ulonglong16to32 &a, const ulonglong16to32 &b) +{ + return make_ulonglong16to32(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator^ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo ^ b.lo, a.hi ^ b.hi); +} + +static __forceinline__ __device__ +ulonglong32to64 operator+ (const ulonglong32to64 &a, const ulonglong32to64 &b) +{ + return make_ulonglong32to64(a.lo + b.lo, a.hi + b.hi); +} + +static __forceinline__ __device__ ulonglonglong operator^ (const ulonglonglong &a, const ulonglonglong &b) { + return make_ulonglonglong(a.s0 ^ b.s0, a.s1 ^ b.s1, a.s2 ^ b.s2, a.s3 ^ b.s3, a.s4 ^ b.s4, a.s5 ^ b.s5); +} + +static __forceinline__ __device__ ulonglonglong operator+ (const ulonglonglong &a, const ulonglonglong &b) { + return make_ulonglonglong(a.s0 + b.s0, a.s1 + b.s1, a.s2 + b.s2, a.s3 + b.s3, a.s4 + b.s4, a.s5 + b.s5); +} + +static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulonglong2to8 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator-= (uint16 &a, const uint16 &b) { a = a - b; } +static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong8to16 &a, const ulonglong8to16 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong16to32 &a, const ulonglong16to32 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglong32to64 &a, const ulonglong32to64 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglong32to64 &a, const
ulonglong32to64 &b) { a = a ^ b; } + +static __forceinline__ __device__ void operator+= (ulonglonglong &a, const ulonglonglong &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (ulonglonglong &a, const ulonglonglong &b) { a = a ^ b; } + + +static __forceinline__ __device__ uint4 rotate4(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +static __forceinline__ __device__ uint2x4 rotate2x4(const uint2x4 &vec4, uint32_t shift) +{ + uint2x4 ret; + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x.x) : "r"(vec4.x.x), "r"(vec4.x.x), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x.y) : "r"(vec4.x.y), "r"(vec4.x.y), "r"(shift)); + + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y.x) : "r"(vec4.y.x), "r"(vec4.y.x), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y.y) : "r"(vec4.y.y), "r"(vec4.y.y), "r"(shift)); + + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z.x) : "r"(vec4.z.x), "r"(vec4.z.x), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z.y) : "r"(vec4.z.y), "r"(vec4.z.y), "r"(shift)); + + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w.x) : "r"(vec4.w.x), "r"(vec4.w.x), "r"(shift)); + asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w.y) : "r"(vec4.w.y), "r"(vec4.w.y), "r"(shift)); + return ret; +} + +__device__ __forceinline__ +static uint4 rotate4R(uint4 vec4, uint32_t shift) +{ + uint4 ret; + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.x) : "r"(vec4.x), "r"(vec4.x), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.y) : "r"(vec4.y), "r"(vec4.y), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.z) : "r"(vec4.z), "r"(vec4.z), "r"(shift)); + asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret.w) : "r"(vec4.w), "r"(vec4.w), "r"(shift)); + return ret; +} + +#ifdef __CUDA_ARCH__ + +static __device__ __inline__ ulonglong4 __ldg4(const ulonglong4 *ptr) +{ + ulonglong4 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.z), "=l"(ret.w) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ ulonglong2to8 __ldg4(const ulonglong2to8 *ptr) +{ + ulonglong2to8 ret; + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.l0.x), "=l"(ret.l0.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret.l1.x), "=l"(ret.l1.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret.l2.x), "=l"(ret.l2.y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret.l3.x), "=l"(ret.l3.y) : __LDG_PTR(ptr)); + return ret; +} + +static __device__ __inline__ void ldg4(const ulonglong4 *ptr,ulonglong4 *ret) +{ + asm("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret[0].x), "=l"(ret[0].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+16];" : "=l"(ret[0].z), "=l"(ret[0].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+32];" : "=l"(ret[1].x), "=l"(ret[1].y) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+48];" : "=l"(ret[1].z), "=l"(ret[1].w) : __LDG_PTR(ptr)); + asm("ld.global.nc.v2.u64 {%0,%1}, [%2+64];" : "=l"(ret[2].x), "=l"(ret[2].y) : 
__LDG_PTR(ptr));
+	asm("ld.global.nc.v2.u64 {%0,%1}, [%2+80];" : "=l"(ret[2].z), "=l"(ret[2].w) : __LDG_PTR(ptr));
+}
+
+static __device__ __forceinline__ uint2x4 __ldg4(const uint2x4 *ptr)
+{
+	uint2x4 ret;
+	asm ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x.x), "=r"(ret.x.y), "=r"(ret.y.x), "=r"(ret.y.y) : __LDG_PTR(ptr));
+	asm ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.z.x), "=r"(ret.z.y), "=r"(ret.w.x), "=r"(ret.w.y) : __LDG_PTR(ptr));
+	return ret;
+}
+
+static __device__ __inline__ uint48 __ldg4(const uint48 *ptr)
+{
+	uint48 ret;
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.x), "=r"(ret.s0.y), "=r"(ret.s0.z), "=r"(ret.s0.w) : __LDG_PTR(ptr));
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret.s1.x), "=r"(ret.s1.y), "=r"(ret.s1.z), "=r"(ret.s1.w) : __LDG_PTR(ptr));
+	return ret;
+}
+
+static __device__ __inline__ void ldg4(const uint28 *ptr, uint28 *ret)
+{
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret[0].x.x), "=r"(ret[0].x.y), "=r"(ret[0].y.x), "=r"(ret[0].y.y) : __LDG_PTR(ptr));
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+16];" : "=r"(ret[0].z.x), "=r"(ret[0].z.y), "=r"(ret[0].w.x), "=r"(ret[0].w.y) : __LDG_PTR(ptr));
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+32];" : "=r"(ret[1].x.x), "=r"(ret[1].x.y), "=r"(ret[1].y.x), "=r"(ret[1].y.y) : __LDG_PTR(ptr));
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+48];" : "=r"(ret[1].z.x), "=r"(ret[1].z.y), "=r"(ret[1].w.x), "=r"(ret[1].w.y) : __LDG_PTR(ptr));
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+64];" : "=r"(ret[2].x.x), "=r"(ret[2].x.y), "=r"(ret[2].y.x), "=r"(ret[2].y.y) : __LDG_PTR(ptr));
+	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+80];" : "=r"(ret[2].z.x), "=r"(ret[2].z.y), "=r"(ret[2].w.x), "=r"(ret[2].w.y) : __LDG_PTR(ptr));
+}
+
+#endif /* __CUDA_ARCH__ */
+
+
+static __forceinline__ __device__ uint8 swapvec(const uint8 &buf)
+{
+	uint8 vec;
+	vec.s0 = cuda_swab32(buf.s0);
+	vec.s1 = cuda_swab32(buf.s1);
+	vec.s2 = cuda_swab32(buf.s2);
+	vec.s3 = cuda_swab32(buf.s3);
+	vec.s4 = cuda_swab32(buf.s4);
+	vec.s5 = cuda_swab32(buf.s5);
+	vec.s6 = cuda_swab32(buf.s6);
+	vec.s7 = cuda_swab32(buf.s7);
+	return vec;
+}
+
+static __forceinline__ __device__ uint8 swapvec(const uint8 *buf)
+{
+	uint8 vec;
+	vec.s0 = cuda_swab32(buf[0].s0);
+	vec.s1 = cuda_swab32(buf[0].s1);
+	vec.s2 = cuda_swab32(buf[0].s2);
+	vec.s3 = cuda_swab32(buf[0].s3);
+	vec.s4 = cuda_swab32(buf[0].s4);
+	vec.s5 = cuda_swab32(buf[0].s5);
+	vec.s6 = cuda_swab32(buf[0].s6);
+	vec.s7 = cuda_swab32(buf[0].s7);
+	return vec;
+}
+
+static __forceinline__ __device__ uint16 swapvec(const uint16 *buf)
+{
+	uint16 vec;
+	vec.s0 = cuda_swab32(buf[0].s0);
+	vec.s1 = cuda_swab32(buf[0].s1);
+	vec.s2 = cuda_swab32(buf[0].s2);
+	vec.s3 = cuda_swab32(buf[0].s3);
+	vec.s4 = cuda_swab32(buf[0].s4);
+	vec.s5 = cuda_swab32(buf[0].s5);
+	vec.s6 = cuda_swab32(buf[0].s6);
+	vec.s7 = cuda_swab32(buf[0].s7);
+	vec.s8 = cuda_swab32(buf[0].s8);
+	vec.s9 = cuda_swab32(buf[0].s9);
+	vec.sa = cuda_swab32(buf[0].sa);
+	vec.sb = cuda_swab32(buf[0].sb);
+	vec.sc = cuda_swab32(buf[0].sc);
+	vec.sd = cuda_swab32(buf[0].sd);
+	vec.se = cuda_swab32(buf[0].se);
+	vec.sf = cuda_swab32(buf[0].sf);
+	return vec;
+}
+
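+/*
+ * The swapvec() overloads here byte-swap every 32-bit lane with
+ * cuda_swab32(), converting a whole vector between the device's
+ * little-endian words and big-endian hash input in one call.
+ * Minimal usage sketch (illustrative; `in` is a hypothetical name):
+ *
+ *   uint16 be = swapvec(&in);   // bswap32 applied to all 16 lanes
+ */
+static __forceinline__ __device__ uint28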
swapvec(const uint28 &buf)
+{
+	uint28 vec;
+	vec.x.x = cuda_swab32(buf.x.x);
+	vec.x.y = cuda_swab32(buf.x.y);
+	vec.y.x = cuda_swab32(buf.y.x);
+	vec.y.y = cuda_swab32(buf.y.y);
+	vec.z.x = cuda_swab32(buf.z.x);
+	vec.z.y = cuda_swab32(buf.z.y);
+	vec.w.x = cuda_swab32(buf.w.x);
+	vec.w.y = cuda_swab32(buf.w.y);
+	return vec;
+}
+
+static __forceinline__ __device__ uint16 swapvec(const uint16 &buf)
+{
+	uint16 vec;
+	vec.s0 = cuda_swab32(buf.s0);
+	vec.s1 = cuda_swab32(buf.s1);
+	vec.s2 = cuda_swab32(buf.s2);
+	vec.s3 = cuda_swab32(buf.s3);
+	vec.s4 = cuda_swab32(buf.s4);
+	vec.s5 = cuda_swab32(buf.s5);
+	vec.s6 = cuda_swab32(buf.s6);
+	vec.s7 = cuda_swab32(buf.s7);
+	vec.s8 = cuda_swab32(buf.s8);
+	vec.s9 = cuda_swab32(buf.s9);
+	vec.sa = cuda_swab32(buf.sa);
+	vec.sb = cuda_swab32(buf.sb);
+	vec.sc = cuda_swab32(buf.sc);
+	vec.sd = cuda_swab32(buf.sd);
+	vec.se = cuda_swab32(buf.se);
+	vec.sf = cuda_swab32(buf.sf);
+	return vec;
+}
+
+static __device__ __forceinline__ uint28 shuffle4(const uint28 &var, int lane)
+{
+#if __CUDA_ARCH__ >= 300
+	uint28 res;
+	res.x.x = __shfl(var.x.x, lane, 8);
+	res.x.y = __shfl(var.x.y, lane, 8);
+	res.y.x = __shfl(var.y.x, lane, 8);
+	res.y.y = __shfl(var.y.y, lane, 8);
+	res.z.x = __shfl(var.z.x, lane, 8);
+	res.z.y = __shfl(var.z.y, lane, 8);
+	res.w.x = __shfl(var.w.x, lane, 8);
+	res.w.y = __shfl(var.w.y, lane, 8);
+	return res;
+#else
+	return var;
+#endif
+}
+
+static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane)
+{
+#if __CUDA_ARCH__ >= 300
+	ulonglong4 res;
+	uint2 temp;
+	temp = vectorize(var.x);
+	temp.x = __shfl(temp.x, lane, 8);
+	temp.y = __shfl(temp.y, lane, 8);
+	res.x = devectorize(temp);
+	temp = vectorize(var.y);
+	temp.x = __shfl(temp.x, lane, 8);
+	temp.y = __shfl(temp.y, lane, 8);
+	res.y = devectorize(temp);
+	temp = vectorize(var.z);
+	temp.x = __shfl(temp.x, lane, 8);
+	temp.y = __shfl(temp.y, lane, 8);
+	res.z = devectorize(temp);
+	temp = vectorize(var.w);
+	temp.x = __shfl(temp.x, lane, 8);
+	temp.y = __shfl(temp.y, lane, 8);
+	res.w = devectorize(temp);
+	return res;
+#else
+	return var;
+#endif
+}
+/*
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__
+uint32_t ROL8(const uint32_t a){
+	return __byte_perm(a, 0, 0x2103);
+}
+
+__device__ __forceinline__
+uint32_t ROR8(const uint32_t a){
+	return __byte_perm(a, 0, 0x0321);
+}
+
+__device__ __forceinline__
+uint32_t ROL16(const uint32_t a){
+	return __byte_perm(a, 0, 0x1032);
+}
+#else
+	#define ROL8(u)  ROTL32(u, 8)
+	#define ROR8(u)  ROTR32(u, 8)
+	#define ROL16(u) ROTL32(u,16)
+#endif
+*/
+
+#endif // #ifndef CUDA_LYRA_VECTOR_H
diff --git a/gost/cuda_gosthash.cu b/gost/cuda_gosthash.cu
new file mode 100644
index 0000000000..70a286f85c
--- /dev/null
+++ b/gost/cuda_gosthash.cu
@@ -0,0 +1,976 @@
+/*
+ * Streebog GOST R 34.11-2012 CUDA implementation.
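+ *
+ * Streebog compresses a 512-bit state with twelve rounds of an LPS
+ * transform (S-box substitution, byte transposition, linear mixing over
+ * GF(2)); the T0..T7 tables below fold all three steps into eight
+ * byte-indexed 64-bit lookups, so one LPS pass is 64 table reads,
+ * XOR-combined eight at a time per output word (see GOST_FS() below).
+ * On sm_50+ some lookups read the global tables through __ldg() while
+ * the rest hit the __shared__ copies T0S..T7S, spreading the load
+ * across memory paths.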
+ *
+ * https://tools.ietf.org/html/rfc6986
+ * https://en.wikipedia.org/wiki/Streebog
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * @author Tanguy Pruvot 2015, original 2017
+ */
+#include
+
+#include "cuda_helper.h"
+
+typedef unsigned char uchar;
+
+static uint32_t* d_resNonces[MAX_GPUS] = { 0 };
+__constant__ static uint64_t __align__(8) c_header1[10] =
+	{
+		0, 0, 0, 0, 0, 0, 0, 0x0100000000000000, 0, 0
+	},
+	// c_header1 + 2 is leading zero, 1 and first 16 bytes of header for the first hash
+	// c_header1 is leading zero and 1 for the second hash
+	c_header2[8]; // second block of 64 bytes
+
+__device__ uint64_t d_target[1];
+
+// Tables for function F
+__device__ static uint64_t T0[256] = {
+	0xE6F87E5C5B711FD0, 0x258377800924FA16, 0xC849E07E852EA4A8, 0x5B4686A18F06C16A,
+	0x0B32E9A2D77B416E, 0xABDA37A467815C66, 0xF61796A81A686676, 0xF5DC0B706391954B,
+	0x4862F38DB7E64BF1, 0xFF5C629A68BD85C5, 0xCB827DA6FCD75795, 0x66D36DAF69B9F089,
+	0x356C9F74483D83B0, 0x7CBCECB1238C99A1, 0x36A702AC31C4708D, 0x9EB6A8D02FBCDFD6,
+	0x8B19FA51E5B3AE37, 0x9CCFB5408A127D0B, 0xBC0C78B508208F5A, 0xE533E3842288ECED,
+	0xCEC2C7D377C15FD2, 0xEC7817B6505D0F5E, 0xB94CC2C08336871D, 0x8C205DB4CB0B04AD,
+	0x763C855B28A0892F, 0x588D1B79F6FF3257, 0x3FECF69E4311933E, 0x0FC0D39F803A18C9,
+	0xEE010A26F5F3AD83, 0x10EFE8F4411979A6, 0x5DCDA10C7DE93A10, 0x4A1BEE1D1248E92C,
+	0x53BFF2DB21847339, 0xB4F50CCFA6A23D09, 0x5FB4BC9CD84798CD, 0xE88A2D8B071C56F9,
+	0x7F7771695A756A9C, 0xC5F02E71A0BA1EBC, 0xA663F9AB4215E672, 0x2EB19E22DE5FBB78,
+	0x0DB9CE0F2594BA14, 0x82520E6397664D84, 0x2F031E6A0208EA98, 0x5C7F2144A1BE6BF0,
+	0x7A37CB1CD16362DB, 0x83E08E2B4B311C64, 0xCF70479BAB960E32, 0x856BA986B9DEE71E,
+	0xB5478C877AF56CE9, 0xB8FE42885F61D6FD, 0x1BDD0156966238C8, 0x622157923EF8A92E,
+	0xFC97FF42114476F8, 0x9D7D350856452CEB, 0x4C90C9B0E0A71256, 0x2308502DFBCB016C,
+	0x2D7A03FAA7A64845, 0xF46E8B38BFC6C4AB, 0xBDBEF8FDD477DEBA, 0x3AAC4CEBC8079B79,
+	0xF09CB105E8879D0C, 0x27FA6A10AC8A58CB, 0x8960E7C1401D0CEA, 0x1A6F811E4A356928,
+	0x90C4FB0773D196FF, 0x43501A2F609D0A9F, 0xF7A516E0C63F3796, 0x1CE4A6B3B8DA9252,
+	0x1324752C38E08A9B, 0xA5A864733BEC154F, 0x2BF124575549B33F, 0xD766DB15440DC5C7,
+	0xA7D179E39E42B792, 0xDADF151A61997FD3, 0x86A0345EC0271423, 0x38D5517B6DA939A4,
+	0x6518F077104003B4, 0x02791D90A5AEA2DD, 0x88D267899C4A5D0A, 0x930F66DF0A2865C2,
+	0x4EE9D4204509B08B, 0x325538916685292A, 0x412907BFC533A842, 0xB27E2B62544DC673,
+	0x6C5304456295E007, 0x5AF406E95351908A, 0x1F2F3B6BC123616F, 0xC37B09DC5255E5C6,
+	0x3967D133B1FE6844, 0x298839C7F0E711E2, 0x409B87F71964F9A2, 0xE938ADC3DB4B0719,
+	0x0C0B4E47F9C3EBF4, 0x5534D576D36B8843, 0x4610A05AEB8B02D8, 0x20C3CDF58232F251,
+	0x6DE1840DBEC2B1E7, 0xA0E8DE06B0FA1D08, 0x7B854B540D34333B, 0x42E29A67BCCA5B7F,
+	0xD8A6088AC437DD0E, 0xC63BB3A9D943ED81, 0x21714DBD5E65A3B1, 0x6761EDE7B5EEA169,
+	0x2431F7C8D573ABF6, 0xD51FC685E1A3671A, 0x5E063CD40410C92D, 0x283AB98F2CB04002,
+	0x8FEBC06CB2F2F790, 0x17D64F116FA1D33C, 0xE07359F1A99EE4AA, 0x784ED68C74CDC006,
+	0x6E2A19D5C73B42DA, 0x8712B4161C7045C3, 0x371582E4ED93216D, 0xACE390414939F6FC,
+	0x7EC5F12186223B7C, 0xC0B094042BAC16FB, 0xF9D745379A527EBF, 0x737C3F2EA3B68168,
+	0x33E7B8D9BAD278CA, 0xA9A32A34C22FFEBB, 0xE48163CCFEDFBD0D, 0x8E5940246EA5A670,
+	0x51C6EF4B842AD1E4, 0x22BAD065279C508C, 0xD91488C218608CEE, 0x319EA5491F7CDA17,
+	0xD394E128134C9C60, 0x094BF43272D5E3B3, 0x9BF612A5A4AAD791, 0xCCBBDA43D26FFD0F,
+	0x34DE1F3C946AD250, 0x4F5B5468995EE16B, 0xDF9FAF6FEA8F7794, 0x2648EA5870DD092B,
+	0xBFC7E56D71D97C67,
0xDDE6B2FF4F21D549, 0x3C276B463AE86003, 0x91767B4FAF86C71F, + 0x68A13E7835D4B9A0, 0xB68C115F030C9FD4, 0x141DD2C916582001, 0x983D8F7DDD5324AC, + 0x64AA703FCC175254, 0xC2C989948E02B426, 0x3E5E76D69F46C2DE, 0x50746F03587D8004, + 0x45DB3D829272F1E5, 0x60584A029B560BF3, 0xFBAE58A73FFCDC62, 0xA15A5E4E6CAD4CE8, + 0x4BA96E55CE1FB8CC, 0x08F9747AAE82B253, 0xC102144CF7FB471B, 0x9F042898F3EB8E36, + 0x068B27ADF2EFFB7A, 0xEDCA97FE8C0A5EBE, 0x778E0513F4F7D8CF, 0x302C2501C32B8BF7, + 0x8D92DDFC175C554D, 0xF865C57F46052F5F, 0xEAF3301BA2B2F424, 0xAA68B7ECBBD60D86, + 0x998F0F350104754C, 0x0000000000000000, 0xF12E314D34D0CCEC, 0x710522BE061823B5, + 0xAF280D9930C005C1, 0x97FD5CE25D693C65, 0x19A41CC633CC9A15, 0x95844172F8C79EB8, + 0xDC5432B7937684A9, 0x9436C13A2490CF58, 0x802B13F332C8EF59, 0xC442AE397CED4F5C, + 0xFA1CD8EFE3AB8D82, 0xF2E5AC954D293FD1, 0x6AD823E8907A1B7D, 0x4D2249F83CF043B6, + 0x03CB9DD879F9F33D, 0xDE2D2F2736D82674, 0x2A43A41F891EE2DF, 0x6F98999D1B6C133A, + 0xD4AD46CD3DF436FA, 0xBB35DF50269825C0, 0x964FDCAA813E6D85, 0xEB41B0537EE5A5C4, + 0x0540BA758B160847, 0xA41AE43BE7BB44AF, 0xE3B8C429D0671797, 0x819993BBEE9FBEB9, + 0xAE9A8DD1EC975421, 0xF3572CDD917E6E31, 0x6393D7DAE2AFF8CE, 0x47A2201237DC5338, + 0xA32343DEC903EE35, 0x79FC56C4A89A91E6, 0x01B28048DC5751E0, 0x1296F564E4B7DB7B, + 0x75F7188351597A12, 0xDB6D9552BDCE2E33, 0x1E9DBB231D74308F, 0x520D7293FDD322D9, + 0xE20A44610C304677, 0xFEEEE2D2B4EAD425, 0xCA30FDEE20800675, 0x61EACA4A47015A13, + 0xE74AFE1487264E30, 0x2CC883B27BF119A5, 0x1664CF59B3F682DC, 0xA811AA7C1E78AF5B, + 0x1D5626FB648DC3B2, 0xB73E9117DF5BCE34, 0xD05F7CF06AB56F5D, 0xFD257F0ACD132718, + 0x574DC8E676C52A9E, 0x0739A7E52EB8AA9A, 0x5486553E0F3CD9A3, 0x56FF48AEAA927B7E, + 0xBE756525AD8E2D87, 0x7D0E6CF9FFDBC841, 0x3B1ECCA31450CA99, 0x6913BE30E983E840, + 0xAD511009956EA71C, 0xB1B5B6BA2DB4354E, 0x4469BDCA4E25A005, 0x15AF5281CA0F71E1, + 0x744598CB8D0E2BF2, 0x593F9B312AA863B7, 0xEFB38A6E29A4FC63, 0x6B6AA3A04C2D4A9D, + 0x3D95EB0EE6BF31E3, 0xA291C3961554BFD5, 0x18169C8EEF9BCBF5, 0x115D68BC9D4E2846, + 0xBA875F18FACF7420, 0xD1EDFCB8B6E23EBD, 0xB00736F2F1E364AE, 0x84D929CE6589B6FE, + 0x70B7A2F6DA4F7255, 0x0E7253D75C6D4929, 0x04F23A3D574159A7, 0x0A8069EA0B2C108E, + 0x49D073C56BB11A11, 0x8AAB7A1939E4FFD7, 0xCD095A0B0E38ACEF, 0xC9FB60365979F548, + 0x92BDE697D67F3422, 0xC78933E10514BC61, 0xE1C1D9B975C9B54A, 0xD2266160CF1BCD80, + 0x9A4492ED78FD8671, 0xB3CCAB2A881A9793, 0x72CEBF667FE1D088, 0xD6D45B5D985A9427 +}; +__device__ static uint64_t T1[256] = { + 0xC811A8058C3F55DE, 0x65F5B43196B50619, 0xF74F96B1D6706E43, 0x859D1E8BCB43D336, + 0x5AAB8A85CCFA3D84, 0xF9C7BF99C295FCFD, 0xA21FD5A1DE4B630F, 0xCDB3EF763B8B456D, + 0x803F59F87CF7C385, 0xB27C73BE5F31913C, 0x98E3AC6633B04821, 0xBF61674C26B8F818, + 0x0FFBC995C4C130C8, 0xAAA0862010761A98, 0x6057F342210116AA, 0xF63C760C0654CC35, + 0x2DDB45CC667D9042, 0xBCF45A964BD40382, 0x68E8A0C3EF3C6F3D, 0xA7BD92D269FF73BC, + 0x290AE20201ED2287, 0xB7DE34CDE885818F, 0xD901EEA7DD61059B, 0xD6FA273219A03553, + 0xD56F1AE874CCCEC9, 0xEA31245C2E83F554, 0x7034555DA07BE499, 0xCE26D2AC56E7BEF7, + 0xFD161857A5054E38, 0x6A0E7DA4527436D1, 0x5BD86A381CDE9FF2, 0xCAF7756231770C32, + 0xB09AAED9E279C8D0, 0x5DEF1091C60674DB, 0x111046A2515E5045, 0x23536CE4729802FC, + 0xC50CBCF7F5B63CFA, 0x73A16887CD171F03, 0x7D2941AFD9F28DBD, 0x3F5E3EB45A4F3B9D, + 0x84EEFE361B677140, 0x3DB8E3D3E7076271, 0x1A3A28F9F20FD248, 0x7EBC7C75B49E7627, + 0x74E5F293C7EB565C, 0x18DCF59E4F478BA4, 0x0C6EF44FA9ADCB52, 0xC699812D98DAC760, + 0x788B06DC6E469D0E, 0xFC65F8EA7521EC4E, 0x30A5F7219E8E0B55, 0x2BEC3F65BCA57B6B, + 
0xDDD04969BAF1B75E, 0x99904CDBE394EA57, 0x14B201D1E6EA40F6, 0xBBB0C08241284ADD, + 0x50F20463BF8F1DFF, 0xE8D7F93B93CBACB8, 0x4D8CB68E477C86E8, 0xC1DD1B3992268E3F, + 0x7C5AA11209D62FCB, 0x2F3D98ABDB35C9AE, 0x671369562BFD5FF5, 0x15C1E16C36CEE280, + 0x1D7EB2EDF8F39B17, 0xDA94D37DB00DFE01, 0x877BC3EC760B8ADA, 0xCB8495DFE153AE44, + 0x05A24773B7B410B3, 0x12857B783C32ABDF, 0x8EB770D06812513B, 0x536739B9D2E3E665, + 0x584D57E271B26468, 0xD789C78FC9849725, 0xA935BBFA7D1AE102, 0x8B1537A3DFA64188, + 0xD0CD5D9BC378DE7A, 0x4AC82C9A4D80CFB7, 0x42777F1B83BDB620, 0x72D2883A1D33BD75, + 0x5E7A2D4BAB6A8F41, 0xF4DAAB6BBB1C95D9, 0x905CFFE7FD8D31B6, 0x83AA6422119B381F, + 0xC0AEFB8442022C49, 0xA0F908C663033AE3, 0xA428AF0804938826, 0xADE41C341A8A53C7, + 0xAE7121EE77E6A85D, 0xC47F5C4A25929E8C, 0xB538E9AA55CDD863, 0x06377AA9DAD8EB29, + 0xA18AE87BB3279895, 0x6EDFDA6A35E48414, 0x6B7D9D19825094A7, 0xD41CFA55A4E86CBF, + 0xE5CAEDC9EA42C59C, 0xA36C351C0E6FC179, 0x5181E4DE6FABBF89, 0xFFF0C530184D17D4, + 0x9D41EB1584045892, 0x1C0D525028D73961, 0xF178EC180CA8856A, 0x9A0571018EF811CD, + 0x4091A27C3EF5EFCC, 0x19AF15239F6329D2, 0x347450EFF91EB990, 0xE11B4A078DD27759, + 0xB9561DE5FC601331, 0x912F1F5A2DA993C0, 0x1654DCB65BA2191A, 0x3E2DDE098A6B99EB, + 0x8A66D71E0F82E3FE, 0x8C51ADB7D55A08D7, 0x4533E50F8941FF7F, 0x02E6DD67BD4859EC, + 0xE068AABA5DF6D52F, 0xC24826E3FF4A75A5, 0x6C39070D88ACDDF8, 0x6486548C4691A46F, + 0xD1BEBD26135C7C0C, 0xB30F93038F15334A, 0x82D9849FC1BF9A69, 0x9C320BA85420FAE4, + 0xFA528243AFF90767, 0x9ED4D6CFE968A308, 0xB825FD582C44B147, 0x9B7691BC5EDCB3BB, + 0xC7EA619048FE6516, 0x1063A61F817AF233, 0x47D538683409A693, 0x63C2CE984C6DED30, + 0x2A9FDFD86C81D91D, 0x7B1E3B06032A6694, 0x666089EBFBD9FD83, 0x0A598EE67375207B, + 0x07449A140AFC495F, 0x2CA8A571B6593234, 0x1F986F8A45BBC2FB, 0x381AA4A050B372C2, + 0x5423A3ADD81FAF3A, 0x17273C0B8B86BB6C, 0xFE83258DC869B5A2, 0x287902BFD1C980F1, + 0xF5A94BD66B3837AF, 0x88800A79B2CABA12, 0x55504310083B0D4C, 0xDF36940E07B9EEB2, + 0x04D1A7CE6790B2C5, 0x612413FFF125B4DC, 0x26F12B97C52C124F, 0x86082351A62F28AC, + 0xEF93632F9937E5E7, 0x3507B052293A1BE6, 0xE72C30AE570A9C70, 0xD3586041AE1425E0, + 0xDE4574B3D79D4CC4, 0x92BA228040C5685A, 0xF00B0CA5DC8C271C, 0xBE1287F1F69C5A6E, + 0xF39E317FB1E0DC86, 0x495D114020EC342D, 0x699B407E3F18CD4B, 0xDCA3A9D46AD51528, + 0x0D1D14F279896924, 0x0000000000000000, 0x593EB75FA196C61E, 0x2E4E78160B116BD8, + 0x6D4AE7B058887F8E, 0xE65FD013872E3E06, 0x7A6DDBBBD30EC4E2, 0xAC97FC89CAAEF1B1, + 0x09CCB33C1E19DBE1, 0x89F3EAC462EE1864, 0x7770CF49AA87ADC6, 0x56C57ECA6557F6D6, + 0x03953DDA6D6CFB9A, 0x36928D884456E07C, 0x1EEB8F37959F608D, 0x31D6179C4EAAA923, + 0x6FAC3AD7E5C02662, 0x43049FA653991456, 0xABD3669DC052B8EE, 0xAF02C153A7C20A2B, + 0x3CCB036E3723C007, 0x93C9C23D90E1CA2C, 0xC33BC65E2F6ED7D3, 0x4CFF56339758249E, + 0xB1E94E64325D6AA6, 0x37E16D359472420A, 0x79F8E661BE623F78, 0x5214D90402C74413, + 0x482EF1FDF0C8965B, 0x13F69BC5EC1609A9, 0x0E88292814E592BE, 0x4E198B542A107D72, + 0xCCC00FCBEBAFE71B, 0x1B49C844222B703E, 0x2564164DA840E9D5, 0x20C6513E1FF4F966, + 0xBAC3203F910CE8AB, 0xF2EDD1C261C47EF0, 0x814CB945ACD361F3, 0x95FEB8944A392105, + 0x5C9CF02C1622D6AD, 0x971865F3F77178E9, 0xBD87BA2B9BF0A1F4, 0x444005B259655D09, + 0xED75BE48247FBC0B, 0x7596122E17CFF42A, 0xB44B091785E97A15, 0x966B854E2755DA9F, + 0xEEE0839249134791, 0x32432A4623C652B9, 0xA8465B47AD3E4374, 0xF8B45F2412B15E8B, + 0x2417F6F078644BA3, 0xFB2162FE7FDDA511, 0x4BBBCC279DA46DC1, 0x0173E0BDD024A276, + 0x22208C59A2BCA08A, 0x8FC4906DB836F34D, 0xE4B90D743A6667EA, 0x7147B5E0705F46EF, + 0x2782CB2A1508B039, 
0xEC065EF5F45B1E7D, 0x21B5B183CFD05B10, 0xDBE733C060295C77, + 0x9FA73672394C017E, 0xCF55321186C31C81, 0xD8720E1A0D45A7ED, 0x3B8F997A3DDF8958, + 0x3AFC79C7EDFB2B2E, 0xE9A4198643EF0ECE, 0x5F09CDF67B4E2D37, 0x4F6A6BE9FA34DF04, + 0xB6ADD47038A123F9, 0x8D224D0A057EAAA1, 0xC96248B85C1BF7A8, 0xE3FD9760309A2EB5, + 0x0B2A6E5BA351820D, 0xEB42C4E1FEA75722, 0x948D58299A1D8373, 0x7FCF9CC864BAD451, + 0xA55B4FB5D4B72A50, 0x08BF5381CE3D7997, 0x46A6D8D5E42D04E5, 0xD22B80FC7E308796, + 0x57B69E77B57354A0, 0x3969441D8097D0B4, 0x3330CAFBF3E2F0CF, 0xE28E77DDE0BE8CC3, + 0x62B12E259C494F46, 0xA6CE726FB9DBD1CA, 0x41E242C1EED14DBA, 0x76032FF47AA30FB0 +}; +__device__ static uint64_t T2[256] = { + 0x45B268A93ACDE4CC, 0xAF7F0BE884549D08, 0x048354B3C1468263, 0x925435C2C80EFED2, + 0xEE4E37F27FDFFBA7, 0x167A33920C60F14D, 0xFB123B52EA03E584, 0x4A0CAB53FDBB9007, + 0x9DEAF6380F788A19, 0xCB48EC558F0CB32A, 0xB59DC4B2D6FEF7E0, 0xDCDBCA22F4F3ECB6, + 0x11DF5813549A9C40, 0xE33FDEDF568ACED3, 0xA0C1C8124322E9C3, 0x07A56B8158FA6D0D, + 0x77279579B1E1F3DD, 0xD9B18B74422AC004, 0xB8EC2D9FFFABC294, 0xF4ACF8A82D75914F, + 0x7BBF69B1EF2B6878, 0xC4F62FAF487AC7E1, 0x76CE809CC67E5D0C, 0x6711D88F92E4C14C, + 0x627B99D9243DEDFE, 0x234AA5C3DFB68B51, 0x909B1F15262DBF6D, 0x4F66EA054B62BCB5, + 0x1AE2CF5A52AA6AE8, 0xBEA053FBD0CE0148, 0xED6808C0E66314C9, 0x43FE16CD15A82710, + 0xCD049231A06970F6, 0xE7BC8A6C97CC4CB0, 0x337CE835FCB3B9C0, 0x65DEF2587CC780F3, + 0x52214EDE4132BB50, 0x95F15E4390F493DF, 0x870839625DD2E0F1, 0x41313C1AFB8B66AF, + 0x91720AF051B211BC, 0x477D427ED4EEA573, 0x2E3B4CEEF6E3BE25, 0x82627834EB0BCC43, + 0x9C03E3DD78E724C8, 0x2877328AD9867DF9, 0x14B51945E243B0F2, 0x574B0F88F7EB97E2, + 0x88B6FA989AA4943A, 0x19C4F068CB168586, 0x50EE6409AF11FAEF, 0x7DF317D5C04EABA4, + 0x7A567C5498B4C6A9, 0xB6BBFB804F42188E, 0x3CC22BCF3BC5CD0B, 0xD04336EAAA397713, + 0xF02FAC1BEC33132C, 0x2506DBA7F0D3488D, 0xD7E65D6BF2C31A1E, 0x5EB9B2161FF820F5, + 0x842E0650C46E0F9F, 0x716BEB1D9E843001, 0xA933758CAB315ED4, 0x3FE414FDA2792265, + 0x27C9F1701EF00932, 0x73A4C1CA70A771BE, 0x94184BA6E76B3D0E, 0x40D829FF8C14C87E, + 0x0FBEC3FAC77674CB, 0x3616A9634A6A9572, 0x8F139119C25EF937, 0xF545ED4D5AEA3F9E, + 0xE802499650BA387B, 0x6437E7BD0B582E22, 0xE6559F89E053E261, 0x80AD52E305288DFC, + 0x6DC55A23E34B9935, 0xDE14E0F51AD0AD09, 0xC6390578A659865E, 0x96D7617109487CB1, + 0xE2D6CB3A21156002, 0x01E915E5779FAED1, 0xADB0213F6A77DCB7, 0x9880B76EB9A1A6AB, + 0x5D9F8D248644CF9B, 0xFD5E4536C5662658, 0xF1C6B9FE9BACBDFD, 0xEACD6341BE9979C4, + 0xEFA7221708405576, 0x510771ECD88E543E, 0xC2BA51CB671F043D, 0x0AD482AC71AF5879, + 0xFE787A045CDAC936, 0xB238AF338E049AED, 0xBD866CC94972EE26, 0x615DA6EBBD810290, + 0x3295FDD08B2C1711, 0xF834046073BF0AEA, 0xF3099329758FFC42, 0x1CAEB13E7DCFA934, + 0xBA2307481188832B, 0x24EFCE42874CE65C, 0x0E57D61FB0E9DA1A, 0xB3D1BAD6F99B343C, + 0xC0757B1C893C4582, 0x2B510DB8403A9297, 0x5C7698C1F1DB614A, 0x3E0D0118D5E68CB4, + 0xD60F488E855CB4CF, 0xAE961E0DF3CB33D9, 0x3A8E55AB14A00ED7, 0x42170328623789C1, + 0x838B6DD19C946292, 0x895FEF7DED3B3AEB, 0xCFCBB8E64E4A3149, 0x064C7E642F65C3DC, + 0x3D2B3E2A4C5A63DA, 0x5BD3F340A9210C47, 0xB474D157A1615931, 0xAC5934DA1DE87266, + 0x6EE365117AF7765B, 0xC86ED36716B05C44, 0x9BA6885C201D49C5, 0xB905387A88346C45, + 0x131072C4BAB9DDFF, 0xBF49461EA751AF99, 0xD52977BC1CE05BA1, 0xB0F785E46027DB52, + 0x546D30BA6E57788C, 0x305AD707650F56AE, 0xC987C682612FF295, 0xA5AB8944F5FBC571, + 0x7ED528E759F244CA, 0x8DDCBBCE2C7DB888, 0xAA154ABE328DB1BA, 0x1E619BE993ECE88B, + 0x09F2BD9EE813B717, 0x7401AA4B285D1CB3, 0x21858F143195CAEE, 0x48C381841398D1B8, + 
0xFCB750D3B2F98889, 0x39A86A998D1CE1B9, 0x1F888E0CE473465A, 0x7899568376978716, + 0x02CF2AD7EE2341BF, 0x85C713B5B3F1A14E, 0xFF916FE12B4567E7, 0x7C1A0230B7D10575, + 0x0C98FCC85ECA9BA5, 0xA3E7F720DA9E06AD, 0x6A6031A2BBB1F438, 0x973E74947ED7D260, + 0x2CF4663918C0FF9A, 0x5F50A7F368678E24, 0x34D983B4A449D4CD, 0x68AF1B755592B587, + 0x7F3C3D022E6DEA1B, 0xABFC5F5B45121F6B, 0x0D71E92D29553574, 0xDFFDF5106D4F03D8, + 0x081BA87B9F8C19C6, 0xDB7EA1A3AC0981BB, 0xBBCA12AD66172DFA, 0x79704366010829C7, + 0x179326777BFF5F9C, 0x0000000000000000, 0xEB2476A4C906D715, 0x724DD42F0738DF6F, + 0xB752EE6538DDB65F, 0x37FFBC863DF53BA3, 0x8EFA84FCB5C157E6, 0xE9EB5C73272596AA, + 0x1B0BDABF2535C439, 0x86E12C872A4D4E20, 0x9969A28BCE3E087A, 0xFAFB2EB79D9C4B55, + 0x056A4156B6D92CB2, 0x5A3AE6A5DEBEA296, 0x22A3B026A8292580, 0x53C85B3B36AD1581, + 0xB11E900117B87583, 0xC51F3A4A3FE56930, 0xE019E1EDCF3621BD, 0xEC811D2591FCBA18, + 0x445B7D4C4D524A1D, 0xA8DA6069DCAEF005, 0x58F5CC72309DE329, 0xD4C062596B7FF570, + 0xCE22AD0339D59F98, 0x591CD99747024DF8, 0x8B90C5AA03187B54, 0xF663D27FC356D0F0, + 0xD8589E9135B56ED5, 0x35309651D3D67A1C, 0x12F96721CD26732E, 0xD28C1C3D441A36AC, + 0x492A946164077F69, 0x2D1D73DC6F5F514B, 0x6F0A70F40D68D88A, 0x60B4B30ECA1EAC41, + 0xD36509D83385987D, 0x0B3D97490630F6A8, 0x9ECCC90A96C46577, 0xA20EE2C5AD01A87C, + 0xE49AB55E0E70A3DE, 0xA4429CA182646BA0, 0xDA97B446DB962F6A, 0xCCED87D4D7F6DE27, + 0x2AB8185D37A53C46, 0x9F25DCEFE15BCBA6, 0xC19C6EF9FEA3EB53, 0xA764A3931BD884CE, + 0x2FD2590B817C10F4, 0x56A21A6D80743933, 0xE573A0BB79EF0D0F, 0x155C0CA095DC1E23, + 0x6C2C4FC694D437E4, 0x10364DF623053291, 0xDD32DFC7836C4267, 0x03263F3299BCEF6E, + 0x66F8CD6AE57B6F9D, 0x8C35AE2B5BE21659, 0x31B3C2E21290F87F, 0x93BD2027BF915003, + 0x69460E90220D1B56, 0x299E276FAE19D328, 0x63928C3C53A2432F, 0x7082FEF8E91B9ED0, + 0xBC6F792C3EED40F7, 0x4C40D537D2DE53DB, 0x75E8BFAE5FC2B262, 0x4DA9C0D2A541FD0A, + 0x4E8FFFE03CFD1264, 0x2620E495696FA7E3, 0xE1F0F408B8A98F6C, 0xD1AA230FDDA6D9C2, + 0xC7D0109DD1C6288F, 0x8A79D04F7487D585, 0x4694579BA3710BA2, 0x38417F7CFA834F68, + 0x1D47A4DB0A5007E5, 0x206C9AF1460A643F, 0xA128DDF734BD4712, 0x8144470672B7232D, + 0xF2E086CC02105293, 0x182DE58DBC892B57, 0xCAA1F9B0F8931DFB, 0x6B892447CC2E5AE9, + 0xF9DD11850420A43B, 0x4BE5BEB68A243ED6, 0x5584255F19C8D65D, 0x3B67404E633FA006, + 0xA68DB6766C472A1F, 0xF78AC79AB4C97E21, 0xC353442E1080AAEC, 0x9A4F9DB95782E714 +}; +__device__ static uint64_t T3[256] = { + 0x05BA7BC82C9B3220, 0x31A54665F8B65E4F, 0xB1B651F77547F4D4, 0x8BFA0D857BA46682, + 0x85A96C5AA16A98BB, 0x990FAEF908EB79C9, 0xA15E37A247F4A62D, 0x76857DCD5D27741E, + 0xF8C50B800A1820BC, 0xBE65DCB201F7A2B4, 0x666D1B986F9426E7, 0x4CC921BF53C4E648, + 0x95410A0F93D9CA42, 0x20CDCCAA647BA4EF, 0x429A4060890A1871, 0x0C4EA4F69B32B38B, + 0xCCDA362DDE354CD3, 0x96DC23BC7C5B2FA9, 0xC309BB68AA851AB3, 0xD26131A73648E013, + 0x021DC52941FC4DB2, 0xCD5ADAB7704BE48A, 0xA77965D984ED71E6, 0x32386FD61734BBA4, + 0xE82D6DD538AB7245, 0x5C2147EA6177B4B1, 0x5DA1AB70CF091CE8, 0xAC907FCE72B8BDFF, + 0x57C85DFD972278A8, 0xA4E44C6A6B6F940D, 0x3851995B4F1FDFE4, 0x62578CCAED71BC9E, + 0xD9882BB0C01D2C0A, 0x917B9D5D113C503B, 0xA2C31E11A87643C6, 0xE463C923A399C1CE, + 0xF71686C57EA876DC, 0x87B4A973E096D509, 0xAF0D567D9D3A5814, 0xB40C2A3F59DCC6F4, + 0x3602F88495D121DD, 0xD3E1DD3D9836484A, 0xF945E71AA46688E5, 0x7518547EB2A591F5, + 0x9366587450C01D89, 0x9EA81018658C065B, 0x4F54080CBC4603A3, 0x2D0384C65137BF3D, + 0xDC325078EC861E2A, 0xEA30A8FC79573FF7, 0x214D2030CA050CB6, 0x65F0322B8016C30C, + 0x69BE96DD1B247087, 0xDB95EE9981E161B8, 0xD1FC1814D9CA05F8, 
0x820ED2BBCC0DE729, + 0x63D76050430F14C7, 0x3BCCB0E8A09D3A0F, 0x8E40764D573F54A2, 0x39D175C1E16177BD, + 0x12F5A37C734F1F4B, 0xAB37C12F1FDFC26D, 0x5648B167395CD0F1, 0x6C04ED1537BF42A7, + 0xED97161D14304065, 0x7D6C67DAAB72B807, 0xEC17FA87BA4EE83C, 0xDFAF79CB0304FBC1, + 0x733F060571BC463E, 0x78D61C1287E98A27, 0xD07CF48E77B4ADA1, 0xB9C262536C90DD26, + 0xE2449B5860801605, 0x8FC09AD7F941FCFB, 0xFAD8CEA94BE46D0E, 0xA343F28B0608EB9F, + 0x9B126BD04917347B, 0x9A92874AE7699C22, 0x1B017C42C4E69EE0, 0x3A4C5C720EE39256, + 0x4B6E9F5E3EA399DA, 0x6BA353F45AD83D35, 0xE7FEE0904C1B2425, 0x22D009832587E95D, + 0x842980C00F1430E2, 0xC6B3C0A0861E2893, 0x087433A419D729F2, 0x341F3DADD42D6C6F, + 0xEE0A3FAEFBB2A58E, 0x4AEE73C490DD3183, 0xAAB72DB5B1A16A34, 0xA92A04065E238FDF, + 0x7B4B35A1686B6FCC, 0x6A23BF6EF4A6956C, 0x191CB96B851AD352, 0x55D598D4D6DE351A, + 0xC9604DE5F2AE7EF3, 0x1CA6C2A3A981E172, 0xDE2F9551AD7A5398, 0x3025AAFF56C8F616, + 0x15521D9D1E2860D9, 0x506FE31CFA45073A, 0x189C55F12B647B0B, 0x0180EC9AAE7EA859, + 0x7CEC8B40050C105E, 0x2350E5198BF94104, 0xEF8AD33455CC0DD7, 0x07A7BEE16D677F92, + 0xE5E325B90DE76997, 0x5A061591A26E637A, 0xB611EF1618208B46, 0x09F4DF3EB7A981AB, + 0x1EBB078AE87DACC0, 0xB791038CB65E231F, 0x0FD38D4574B05660, 0x67EDF702C1EA8EBE, + 0xBA5F4BE0831238CD, 0xE3C477C2CEFEBE5C, 0x0DCE486C354C1BD2, 0x8C5DB36416C31910, + 0x26EA9ED1A7627324, 0x039D29B3EF82E5EB, 0x9F28FC82CBF2AE02, 0xA8AAE89CF05D2786, + 0x431AACFA2774B028, 0xCF471F9E31B7A938, 0x581BD0B8E3922EC8, 0xBC78199B400BEF06, + 0x90FB71C7BF42F862, 0x1F3BEB1046030499, 0x683E7A47B55AD8DE, 0x988F4263A695D190, + 0xD808C72A6E638453, 0x0627527BC319D7CB, 0xEBB04466D72997AE, 0xE67E0C0AE2658C7C, + 0x14D2F107B056C880, 0x7122C32C30400B8C, 0x8A7AE11FD5DACEDB, 0xA0DEDB38E98A0E74, + 0xAD109354DCC615A6, 0x0BE91A17F655CC19, 0x8DDD5FFEB8BDB149, 0xBFE53028AF890AED, + 0xD65BA6F5B4AD7A6A, 0x7956F0882997227E, 0x10E8665532B352F9, 0x0E5361DFDACEFE39, + 0xCEC7F3049FC90161, 0xFF62B561677F5F2E, 0x975CCF26D22587F0, 0x51EF0F86543BAF63, + 0x2F1E41EF10CBF28F, 0x52722635BBB94A88, 0xAE8DBAE73344F04D, 0x410769D36688FD9A, + 0xB3AB94DE34BBB966, 0x801317928DF1AA9B, 0xA564A0F0C5113C54, 0xF131D4BEBDB1A117, + 0x7F71A2F3EA8EF5B5, 0x40878549C8F655C3, 0x7EF14E6944F05DEC, 0xD44663DCF55137D8, + 0xF2ACFD0D523344FC, 0x0000000000000000, 0x5FBC6E598EF5515A, 0x16CF342EF1AA8532, + 0xB036BD6DDB395C8D, 0x13754FE6DD31B712, 0xBBDFA77A2D6C9094, 0x89E7C8AC3A582B30, + 0x3C6B0E09CDFA459D, 0xC4AE0589C7E26521, 0x49735A777F5FD468, 0xCAFD64561D2C9B18, + 0xDA1502032F9FC9E1, 0x8867243694268369, 0x3782141E3BAF8984, 0x9CB5D53124704BE9, + 0xD7DB4A6F1AD3D233, 0xA6F989432A93D9BF, 0x9D3539AB8A0EE3B0, 0x53F2CAAF15C7E2D1, + 0x6E19283C76430F15, 0x3DEBE2936384EDC4, 0x5E3C82C3208BF903, 0x33B8834CB94A13FD, + 0x6470DEB12E686B55, 0x359FD1377A53C436, 0x61CAA57902F35975, 0x043A975282E59A79, + 0xFD7F70482683129C, 0xC52EE913699CCD78, 0x28B9FF0E7DAC8D1D, 0x5455744E78A09D43, + 0xCB7D88CCB3523341, 0x44BD121B4A13CFBA, 0x4D49CD25FDBA4E11, 0x3E76CB208C06082F, + 0x3FF627BA2278A076, 0xC28957F204FBB2EA, 0x453DFE81E46D67E3, 0x94C1E6953DA7621B, + 0x2C83685CFF491764, 0xF32C1197FC4DECA5, 0x2B24D6BD922E68F6, 0xB22B78449AC5113F, + 0x48F3B6EDD1217C31, 0x2E9EAD75BEB55AD6, 0x174FD8B45FD42D6B, 0x4ED4E4961238ABFA, + 0x92E6B4EEFEBEB5D0, 0x46A0D7320BEF8208, 0x47203BA8A5912A51, 0x24F75BF8E69E3E96, + 0xF0B1382413CF094E, 0xFEE259FBC901F777, 0x276A724B091CDB7D, 0xBDF8F501EE75475F, + 0x599B3C224DEC8691, 0x6D84018F99C1EAFE, 0x7498B8E41CDB39AC, 0xE0595E71217C5BB7, + 0x2AA43A273C50C0AF, 0xF50B43EC3F543B6E, 0x838E3E2162734F70, 0xC09492DB4507FF58, + 
0x72BFEA9FDFC2EE67, 0x11688ACF9CCDFAA0, 0x1A8190D86A9836B9, 0x7ACBD93BC615C795, + 0xC7332C3A286080CA, 0x863445E94EE87D50, 0xF6966A5FD0D6DE85, 0xE9AD814F96D5DA1C, + 0x70A22FB69E3EA3D5, 0x0A69F68D582B6440, 0xB8428EC9C2EE757F, 0x604A49E3AC8DF12C, + 0x5B86F90B0C10CB23, 0xE1D9B2EB8F02F3EE, 0x29391394D3D22544, 0xC8E0A17F5CD0D6AA, + 0xB58CC6A5F7A26EAD, 0x8193FB08238F02C2, 0xD5C68F465B2F9F81, 0xFCFF9CD288FDBAC5, + 0x77059157F359DC47, 0x1D262E3907FF492B, 0xFB582233E59AC557, 0xDDB2BCE242F8B673, + 0x2577B76248E096CF, 0x6F99C4A6D83DA74C, 0xC1147E41EB795701, 0xF48BAF76912A9337 +}; +__device__ static uint64_t T4[256] = { + 0x3EF29D249B2C0A19, 0xE9E16322B6F8622F, 0x5536994047757F7A, 0x9F4D56D5A47B0B33, + 0x822567466AA1174C, 0xB8F5057DEB082FB2, 0xCC48C10BF4475F53, 0x373088D4275DEC3A, + 0x968F4325180AED10, 0x173D232CF7016151, 0xAE4ED09F946FCC13, 0xFD4B4741C4539873, + 0x1B5B3F0DD9933765, 0x2FFCB0967B644052, 0xE02376D20A89840C, 0xA3AE3A70329B18D7, + 0x419CBD2335DE8526, 0xFAFEBF115B7C3199, 0x0397074F85AA9B0D, 0xC58AD4FB4836B970, + 0xBEC60BE3FC4104A8, 0x1EFF36DC4B708772, 0x131FDC33ED8453B6, 0x0844E33E341764D3, + 0x0FF11B6EAB38CD39, 0x64351F0A7761B85A, 0x3B5694F509CFBA0E, 0x30857084B87245D0, + 0x47AFB3BD2297AE3C, 0xF2BA5C2F6F6B554A, 0x74BDC4761F4F70E1, 0xCFDFC64471EDC45E, + 0xE610784C1DC0AF16, 0x7ACA29D63C113F28, 0x2DED411776A859AF, 0xAC5F211E99A3D5EE, + 0xD484F949A87EF33B, 0x3CE36CA596E013E4, 0xD120F0983A9D432C, 0x6BC40464DC597563, + 0x69D5F5E5D1956C9E, 0x9AE95F043698BB24, 0xC9ECC8DA66A4EF44, 0xD69508C8A5B2EAC6, + 0xC40C2235C0503B80, 0x38C193BA8C652103, 0x1CEEC75D46BC9E8F, 0xD331011937515AD1, + 0xD8E2E56886ECA50F, 0xB137108D5779C991, 0x709F3B6905CA4206, 0x4FEB50831680CAEF, + 0xEC456AF3241BD238, 0x58D673AFE181ABBE, 0x242F54E7CAD9BF8C, 0x0211F1810DCC19FD, + 0x90BC4DBB0F43C60A, 0x9518446A9DA0761D, 0xA1BFCBF13F57012A, 0x2BDE4F8961E172B5, + 0x27B853A84F732481, 0xB0B1E643DF1F4B61, 0x18CC38425C39AC68, 0xD2B7F7D7BF37D821, + 0x3103864A3014C720, 0x14AA246372ABFA5C, 0x6E600DB54EBAC574, 0x394765740403A3F3, + 0x09C215F0BC71E623, 0x2A58B947E987F045, 0x7B4CDF18B477BDD8, 0x9709B5EB906C6FE0, + 0x73083C268060D90B, 0xFEDC400E41F9037E, 0x284948C6E44BE9B8, 0x728ECAE808065BFB, + 0x06330E9E17492B1A, 0x5950856169E7294E, 0xBAE4F4FCE6C4364F, 0xCA7BCF95E30E7449, + 0x7D7FD186A33E96C2, 0x52836110D85AD690, 0x4DFAA1021B4CD312, 0x913ABB75872544FA, + 0xDD46ECB9140F1518, 0x3D659A6B1E869114, 0xC23F2CABD719109A, 0xD713FE062DD46836, + 0xD0A60656B2FBC1DC, 0x221C5A79DD909496, 0xEFD26DBCA1B14935, 0x0E77EDA0235E4FC9, + 0xCBFD395B6B68F6B9, 0x0DE0EAEFA6F4D4C4, 0x0422FF1F1A8532E7, 0xF969B85EDED6AA94, + 0x7F6E2007AEF28F3F, 0x3AD0623B81A938FE, 0x6624EE8B7AADA1A7, 0xB682E8DDC856607B, + 0xA78CC56F281E2A30, 0xC79B257A45FAA08D, 0x5B4174E0642B30B3, 0x5F638BFF7EAE0254, + 0x4BC9AF9C0C05F808, 0xCE59308AF98B46AE, 0x8FC58DA9CC55C388, 0x803496C7676D0EB1, + 0xF33CAAE1E70DD7BA, 0xBB6202326EA2B4BF, 0xD5020F87201871CB, 0x9D5CA754A9B712CE, + 0x841669D87DE83C56, 0x8A6184785EB6739F, 0x420BBA6CB0741E2B, 0xF12D5B60EAC1CE47, + 0x76AC35F71283691C, 0x2C6BB7D9FECEDB5F, 0xFCCDB18F4C351A83, 0x1F79C012C3160582, + 0xF0ABADAE62A74CB7, 0xE1A5801C82EF06FC, 0x67A21845F2CB2357, 0x5114665F5DF04D9D, + 0xBF40FD2D74278658, 0xA0393D3FB73183DA, 0x05A409D192E3B017, 0xA9FB28CF0B4065F9, + 0x25A9A22942BF3D7C, 0xDB75E22703463E02, 0xB326E10C5AB5D06C, 0xE7968E8295A62DE6, + 0xB973F3B3636EAD42, 0xDF571D3819C30CE5, 0xEE549B7229D7CBC5, 0x12992AFD65E2D146, + 0xF8EF4E9056B02864, 0xB7041E134030E28B, 0xC02EDD2ADAD50967, 0x932B4AF48AE95D07, + 0x6FE6FB7BC6DC4784, 0x239AACB755F61666, 0x401A4BEDBDB807D6, 
0x485EA8D389AF6305, + 0xA41BC220ADB4B13D, 0x753B32B89729F211, 0x997E584BB3322029, 0x1D683193CEDA1C7F, + 0xFF5AB6C0C99F818E, 0x16BBD5E27F67E3A1, 0xA59D34EE25D233CD, 0x98F8AE853B54A2D9, + 0x6DF70AFACB105E79, 0x795D2E99B9BBA425, 0x8E437B6744334178, 0x0186F6CE886682F0, + 0xEBF092A3BB347BD2, 0xBCD7FA62F18D1D55, 0xADD9D7D011C5571E, 0x0BD3E471B1BDFFDE, + 0xAA6C2F808EEAFEF4, 0x5EE57D31F6C880A4, 0xF50FA47FF044FCA0, 0x1ADDC9C351F5B595, + 0xEA76646D3352F922, 0x0000000000000000, 0x85909F16F58EBEA6, 0x46294573AAF12CCC, + 0x0A5512BF39DB7D2E, 0x78DBD85731DD26D5, 0x29CFBE086C2D6B48, 0x218B5D36583A0F9B, + 0x152CD2ADFACD78AC, 0x83A39188E2C795BC, 0xC3B9DA655F7F926A, 0x9ECBA01B2C1D89C3, + 0x07B5F8509F2FA9EA, 0x7EE8D6C926940DCF, 0x36B67E1AAF3B6ECA, 0x86079859702425AB, + 0xFB7849DFD31AB369, 0x4C7C57CC932A51E2, 0xD96413A60E8A27FF, 0x263EA566C715A671, + 0x6C71FC344376DC89, 0x4A4F595284637AF8, 0xDAF314E98B20BCF2, 0x572768C14AB96687, + 0x1088DB7C682EC8BB, 0x887075F9537A6A62, 0x2E7A4658F302C2A2, 0x619116DBE582084D, + 0xA87DDE018326E709, 0xDCC01A779C6997E8, 0xEDC39C3DAC7D50C8, 0xA60A33A1A078A8C0, + 0xC1A82BE452B38B97, 0x3F746BEA134A88E9, 0xA228CCBEBAFD9A27, 0xABEAD94E068C7C04, + 0xF48952B178227E50, 0x5CF48CB0FB049959, 0x6017E0156DE48ABD, 0x4438B4F2A73D3531, + 0x8C528AE649FF5885, 0xB515EF924DFCFB76, 0x0C661C212E925634, 0xB493195CC59A7986, + 0x9CDA519A21D1903E, 0x32948105B5BE5C2D, 0x194ACE8CD45F2E98, 0x438D4CA238129CDB, + 0x9B6FA9CABEFE39D4, 0x81B26009EF0B8C41, 0xDED1EBF691A58E15, 0x4E6DA64D9EE6481F, + 0x54B06F8ECF13FD8A, 0x49D85E1D01C9E1F5, 0xAFC826511C094EE3, 0xF698A33075EE67AD, + 0x5AC7822EEC4DB243, 0x8DD47C28C199DA75, 0x89F68337DB1CE892, 0xCDCE37C57C21DDA3, + 0x530597DE503C5460, 0x6A42F2AA543FF793, 0x5D727A7E73621BA9, 0xE232875307459DF1, + 0x56A19E0FC2DFE477, 0xC61DD3B4CD9C227D, 0xE5877F03986A341B, 0x949EB2A415C6F4ED, + 0x6206119460289340, 0x6380E75AE84E11B0, 0x8BE772B6D6D0F16F, 0x50929091D596CF6D, + 0xE86795EC3E9EE0DF, 0x7CF927482B581432, 0xC86A3E14EEC26DB4, 0x7119CDA78DACC0F6, + 0xE40189CD100CB6EB, 0x92ADBC3A028FDFF7, 0xB2A017C2D2D3529C, 0x200DABF8D05C8D6B, + 0x34A78F9BA2F77737, 0xE3B4719D8F231F01, 0x45BE423C2F5BB7C1, 0xF71E55FEFD88E55D, + 0x6853032B59F3EE6E, 0x65B3E9C4FF073AAA, 0x772AC3399AE5EBEC, 0x87816E97F842A75B, + 0x110E2DB2E0484A4B, 0x331277CB3DD8DEDD, 0xBD510CAC79EB9FA5, 0x352179552A91F5C7 +}; +__device__ static uint64_t T5[256] = { + 0x8AB0A96846E06A6D, 0x43C7E80B4BF0B33A, 0x08C9B3546B161EE5, 0x39F1C235EBA990BE, + 0xC1BEF2376606C7B2, 0x2C209233614569AA, 0xEB01523B6FC3289A, 0x946953AB935ACEDD, + 0x272838F63E13340E, 0x8B0455ECA12BA052, 0x77A1B2C4978FF8A2, 0xA55122CA13E54086, + 0x2276135862D3F1CD, 0xDB8DDFDE08B76CFE, 0x5D1E12C89E4A178A, 0x0E56816B03969867, + 0xEE5F79953303ED59, 0xAFED748BAB78D71D, 0x6D929F2DF93E53EE, 0xF5D8A8F8BA798C2A, + 0xF619B1698E39CF6B, 0x95DDAF2F749104E2, 0xEC2A9C80E0886427, 0xCE5C8FD8825B95EA, + 0xC4E0D9993AC60271, 0x4699C3A5173076F9, 0x3D1B151F50A29F42, 0x9ED505EA2BC75946, + 0x34665ACFDC7F4B98, 0x61B1FB53292342F7, 0xC721C0080E864130, 0x8693CD1696FD7B74, + 0x872731927136B14B, 0xD3446C8A63A1721B, 0x669A35E8A6680E4A, 0xCAB658F239509A16, + 0xA4E5DE4EF42E8AB9, 0x37A7435EE83F08D9, 0x134E6239E26C7F96, 0x82791A3C2DF67488, + 0x3F6EF00A8329163C, 0x8E5A7E42FDEB6591, 0x5CAAEE4C7981DDB5, 0x19F234785AF1E80D, + 0x255DDDE3ED98BD70, 0x50898A32A99CCCAC, 0x28CA4519DA4E6656, 0xAE59880F4CB31D22, + 0x0D9798FA37D6DB26, 0x32F968F0B4FFCD1A, 0xA00F09644F258545, 0xFA3AD5175E24DE72, + 0xF46C547C5DB24615, 0x713E80FBFF0F7E20, 0x7843CF2B73D2AAFA, 0xBD17EA36AEDF62B4, + 0xFD111BACD16F92CF, 0x4ABAA7DBC72D67E0, 
0xB3416B5DAD49FAD3, 0xBCA316B24914A88B, + 0x15D150068AECF914, 0xE27C1DEBE31EFC40, 0x4FE48C759BEDA223, 0x7EDCFD141B522C78, + 0x4E5070F17C26681C, 0xE696CAC15815F3BC, 0x35D2A64B3BB481A7, 0x800CFF29FE7DFDF6, + 0x1ED9FAC3D5BAA4B0, 0x6C2663A91EF599D1, 0x03C1199134404341, 0xF7AD4DED69F20554, + 0xCD9D9649B61BD6AB, 0xC8C3BDE7EADB1368, 0xD131899FB02AFB65, 0x1D18E352E1FAE7F1, + 0xDA39235AEF7CA6C1, 0xA1BBF5E0A8EE4F7A, 0x91377805CF9A0B1E, 0x3138716180BF8E5B, + 0xD9F83ACBDB3CE580, 0x0275E515D38B897E, 0x472D3F21F0FBBCC6, 0x2D946EB7868EA395, + 0xBA3C248D21942E09, 0xE7223645BFDE3983, 0xFF64FEB902E41BB1, 0xC97741630D10D957, + 0xC3CB1722B58D4ECC, 0xA27AEC719CAE0C3B, 0x99FECB51A48C15FB, 0x1465AC826D27332B, + 0xE1BD047AD75EBF01, 0x79F733AF941960C5, 0x672EC96C41A3C475, 0xC27FEBA6524684F3, + 0x64EFD0FD75E38734, 0xED9E60040743AE18, 0xFB8E2993B9EF144D, 0x38453EB10C625A81, + 0x6978480742355C12, 0x48CF42CE14A6EE9E, 0x1CAC1FD606312DCE, 0x7B82D6BA4792E9BB, + 0x9D141C7B1F871A07, 0x5616B80DC11C4A2E, 0xB849C198F21FA777, 0x7CA91801C8D9A506, + 0xB1348E487EC273AD, 0x41B20D1E987B3A44, 0x7460AB55A3CFBBE3, 0x84E628034576F20A, + 0x1B87D16D897A6173, 0x0FE27DEFE45D5258, 0x83CDE6B8CA3DBEB7, 0x0C23647ED01D1119, + 0x7A362A3EA0592384, 0xB61F40F3F1893F10, 0x75D457D1440471DC, 0x4558DA34237035B8, + 0xDCA6116587FC2043, 0x8D9B67D3C9AB26D0, 0x2B0B5C88EE0E2517, 0x6FE77A382AB5DA90, + 0x269CC472D9D8FE31, 0x63C41E46FAA8CB89, 0xB7ABBC771642F52F, 0x7D1DE4852F126F39, + 0xA8C6BA3024339BA0, 0x600507D7CEE888C8, 0x8FEE82C61A20AFAE, 0x57A2448926D78011, + 0xFCA5E72836A458F0, 0x072BCEBB8F4B4CBD, 0x497BBE4AF36D24A1, 0x3CAFE99BB769557D, + 0x12FA9EBD05A7B5A9, 0xE8C04BAA5B836BDB, 0x4273148FAC3B7905, 0x908384812851C121, + 0xE557D3506C55B0FD, 0x72FF996ACB4F3D61, 0x3EDA0C8E64E2DC03, 0xF0868356E6B949E9, + 0x04EAD72ABB0B0FFC, 0x17A4B5135967706A, 0xE3C8E16F04D5367F, 0xF84F30028DAF570C, + 0x1846C8FCBD3A2232, 0x5B8120F7F6CA9108, 0xD46FA231ECEA3EA6, 0x334D947453340725, + 0x58403966C28AD249, 0xBED6F3A79A9F21F5, 0x68CCB483A5FE962D, 0xD085751B57E1315A, + 0xFED0023DE52FD18E, 0x4B0E5B5F20E6ADDF, 0x1A332DE96EB1AB4C, 0xA3CE10F57B65C604, + 0x108F7BA8D62C3CD7, 0xAB07A3A11073D8E1, 0x6B0DAD1291BED56C, 0xF2F366433532C097, + 0x2E557726B2CEE0D4, 0x0000000000000000, 0xCB02A476DE9B5029, 0xE4E32FD48B9E7AC2, + 0x734B65EE2C84F75E, 0x6E5386BCCD7E10AF, 0x01B4FC84E7CBCA3F, 0xCFE8735C65905FD5, + 0x3613BFDA0FF4C2E6, 0x113B872C31E7F6E8, 0x2FE18BA255052AEB, 0xE974B72EBC48A1E4, + 0x0ABC5641B89D979B, 0xB46AA5E62202B66E, 0x44EC26B0C4BBFF87, 0xA6903B5B27A503C7, + 0x7F680190FC99E647, 0x97A84A3AA71A8D9C, 0xDD12EDE16037EA7C, 0xC554251DDD0DC84E, + 0x88C54C7D956BE313, 0x4D91696048662B5D, 0xB08072CC9909B992, 0xB5DE5962C5C97C51, + 0x81B803AD19B637C9, 0xB2F597D94A8230EC, 0x0B08AAC55F565DA4, 0xF1327FD2017283D6, + 0xAD98919E78F35E63, 0x6AB9519676751F53, 0x24E921670A53774F, 0xB9FD3D1C15D46D48, + 0x92F66194FBDA485F, 0x5A35DC7311015B37, 0xDED3F4705477A93D, 0xC00A0EB381CD0D8D, + 0xBB88D809C65FE436, 0x16104997BEACBA55, 0x21B70AC95693B28C, 0x59F4C5E225411876, + 0xD5DB5EB50B21F499, 0x55D7A19CF55C096F, 0xA97246B4C3F8519F, 0x8552D487A2BD3835, + 0x54635D181297C350, 0x23C2EFDC85183BF2, 0x9F61F96ECC0C9379, 0x534893A39DDC8FED, + 0x5EDF0B59AA0A54CB, 0xAC2C6D1A9F38945C, 0xD7AEBBA0D8AA7DE7, 0x2ABFA00C09C5EF28, + 0xD84CC64F3CF72FBF, 0x2003F64DB15878B3, 0xA724C7DFC06EC9F8, 0x069F323F68808682, + 0xCC296ACD51D01C94, 0x055E2BAE5CC0C5C3, 0x6270E2C21D6301B6, 0x3B842720382219C0, + 0xD2F0900E846AB824, 0x52FC6F277A1745D2, 0xC6953C8CE94D8B0F, 0xE009F8FE3095753E, + 0x655B2C7992284D0B, 0x984A37D54347DFC4, 0xEAB5AEBF8808E2A5, 
0x9A3FD2C090CC56BA, + 0x9CA0E0FFF84CD038, 0x4C2595E4AFADE162, 0xDF6708F4B3BC6302, 0xBF620F237D54EBCA, + 0x93429D101C118260, 0x097D4FD08CDDD4DA, 0x8C2F9B572E60ECEF, 0x708A7C7F18C4B41F, + 0x3A30DBA4DFE9D3FF, 0x4006F19A7FB0F07B, 0x5F6BF7DD4DC19EF4, 0x1F6D064732716E8F, + 0xF9FBCC866A649D33, 0x308C8DE567744464, 0x8971B0F972A0292C, 0xD61A47243F61B7D8, + 0xEFEB8511D4C82766, 0x961CB6BE40D147A3, 0xAAB35F25F7B812DE, 0x76154E407044329D, + 0x513D76B64E570693, 0xF3479AC7D2F90AA8, 0x9B8B2E4477079C85, 0x297EB99D3D85AC69 +}; +__device__ static uint64_t T6[256] = { + 0x7E37E62DFC7D40C3, 0x776F25A4EE939E5B, 0xE045C850DD8FB5AD, 0x86ED5BA711FF1952, + 0xE91D0BD9CF616B35, 0x37E0AB256E408FFB, 0x9607F6C031025A7A, 0x0B02F5E116D23C9D, + 0xF3D8486BFB50650C, 0x621CFF27C40875F5, 0x7D40CB71FA5FD34A, 0x6DAA6616DAA29062, + 0x9F5F354923EC84E2, 0xEC847C3DC507C3B3, 0x025A3668043CE205, 0xA8BF9E6C4DAC0B19, + 0xFA808BE2E9BEBB94, 0xB5B99C5277C74FA3, 0x78D9BC95F0397BCC, 0xE332E50CDBAD2624, + 0xC74FCE129332797E, 0x1729ECEB2EA709AB, 0xC2D6B9F69954D1F8, 0x5D898CBFBAB8551A, + 0x859A76FB17DD8ADB, 0x1BE85886362F7FB5, 0xF6413F8FF136CD8A, 0xD3110FA5BBB7E35C, + 0x0A2FEED514CC4D11, 0xE83010EDCD7F1AB9, 0xA1E75DE55F42D581, 0xEEDE4A55C13B21B6, + 0xF2F5535FF94E1480, 0x0CC1B46D1888761E, 0xBCE15FDB6529913B, 0x2D25E8975A7181C2, + 0x71817F1CE2D7A554, 0x2E52C5CB5C53124B, 0xF9F7A6BEEF9C281D, 0x9E722E7D21F2F56E, + 0xCE170D9B81DCA7E6, 0x0E9B82051CB4941B, 0x1E712F623C49D733, 0x21E45CFA42F9F7DC, + 0xCB8E7A7F8BBA0F60, 0x8E98831A010FB646, 0x474CCF0D8E895B23, 0xA99285584FB27A95, + 0x8CC2B57205335443, 0x42D5B8E984EFF3A5, 0x012D1B34021E718C, 0x57A6626AAE74180B, + 0xFF19FC06E3D81312, 0x35BA9D4D6A7C6DFE, 0xC9D44C178F86ED65, 0x506523E6A02E5288, + 0x03772D5C06229389, 0x8B01F4FE0B691EC0, 0xF8DABD8AED825991, 0x4C4E3AEC985B67BE, + 0xB10DF0827FBF96A9, 0x6A69279AD4F8DAE1, 0xE78689DCD3D5FF2E, 0x812E1A2B1FA553D1, + 0xFBAD90D6EBA0CA18, 0x1AC543B234310E39, 0x1604F7DF2CB97827, 0xA6241C6951189F02, + 0x753513CCEAAF7C5E, 0x64F2A59FC84C4EFA, 0x247D2B1E489F5F5A, 0xDB64D718AB474C48, + 0x79F4A7A1F2270A40, 0x1573DA832A9BEBAE, 0x3497867968621C72, 0x514838D2A2302304, + 0xF0AF6537FD72F685, 0x1D06023E3A6B44BA, 0x678588C3CE6EDD73, 0x66A893F7CC70ACFF, + 0xD4D24E29B5EDA9DF, 0x3856321470EA6A6C, 0x07C3418C0E5A4A83, 0x2BCBB22F5635BACD, + 0x04B46CD00878D90A, 0x06EE5AB80C443B0F, 0x3B211F4876C8F9E5, 0x0958C38912EEDE98, + 0xD14B39CDBF8B0159, 0x397B292072F41BE0, 0x87C0409313E168DE, 0xAD26E98847CAA39F, + 0x4E140C849C6785BB, 0xD5FF551DB7F3D853, 0xA0CA46D15D5CA40D, 0xCD6020C787FE346F, + 0x84B76DCF15C3FB57, 0xDEFDA0FCA121E4CE, 0x4B8D7B6096012D3D, 0x9AC642AD298A2C64, + 0x0875D8BD10F0AF14, 0xB357C6EA7B8374AC, 0x4D6321D89A451632, 0xEDA96709C719B23F, + 0xF76C24BBF328BC06, 0xC662D526912C08F2, 0x3CE25EC47892B366, 0xB978283F6F4F39BD, + 0xC08C8F9E9D6833FD, 0x4F3917B09E79F437, 0x593DE06FB2C08C10, 0xD6887841B1D14BDA, + 0x19B26EEE32139DB0, 0xB494876675D93E2F, 0x825937771987C058, 0x90E9AC783D466175, + 0xF1827E03FF6C8709, 0x945DC0A8353EB87F, 0x4516F9658AB5B926, 0x3F9573987EB020EF, + 0xB855330B6D514831, 0x2AE6A91B542BCB41, 0x6331E413C6160479, 0x408F8E8180D311A0, + 0xEFF35161C325503A, 0xD06622F9BD9570D5, 0x8876D9A20D4B8D49, 0xA5533135573A0C8B, + 0xE168D364DF91C421, 0xF41B09E7F50A2F8F, 0x12B09B0F24C1A12D, 0xDA49CC2CA9593DC4, + 0x1F5C34563E57A6BF, 0x54D14F36A8568B82, 0xAF7CDFE043F6419A, 0xEA6A2685C943F8BC, + 0xE5DCBFB4D7E91D2B, 0xB27ADDDE799D0520, 0x6B443CAED6E6AB6D, 0x7BAE91C9F61BE845, + 0x3EB868AC7CAE5163, 0x11C7B65322E332A4, 0xD23C1491B9A992D0, 0x8FB5982E0311C7CA, + 0x70AC6428E0C9D4D8, 0x895BC2960F55FCC5, 
0x76423E90EC8DEFD7, 0x6FF0507EDE9E7267, + 0x3DCF45F07A8CC2EA, 0x4AA06054941F5CB1, 0x5810FB5BB0DEFD9C, 0x5EFEA1E3BC9AC693, + 0x6EDD4B4ADC8003EB, 0x741808F8E8B10DD2, 0x145EC1B728859A22, 0x28BC9F7350172944, + 0x270A06424EBDCCD3, 0x972AEDF4331C2BF6, 0x059977E40A66A886, 0x2550302A4A812ED6, + 0xDD8A8DA0A7037747, 0xC515F87A970E9B7B, 0x3023EAA9601AC578, 0xB7E3AA3A73FBADA6, + 0x0FB699311EAAE597, 0x0000000000000000, 0x310EF19D6204B4F4, 0x229371A644DB6455, + 0x0DECAF591A960792, 0x5CA4978BB8A62496, 0x1C2B190A38753536, 0x41A295B582CD602C, + 0x3279DCC16426277D, 0xC1A194AA9F764271, 0x139D803B26DFD0A1, 0xAE51C4D441E83016, + 0xD813FA44AD65DFC1, 0xAC0BF2BC45D4D213, 0x23BE6A9246C515D9, 0x49D74D08923DCF38, + 0x9D05032127D066E7, 0x2F7FDEFF5E4D63C7, 0xA47E2A0155247D07, 0x99B16FF12FA8BFED, + 0x4661D4398C972AAF, 0xDFD0BBC8A33F9542, 0xDCA79694A51D06CB, 0xB020EBB67DA1E725, + 0xBA0F0563696DAA34, 0xE4F1A480D5F76CA7, 0xC438E34E9510EAF7, 0x939E81243B64F2FC, + 0x8DEFAE46072D25CF, 0x2C08F3A3586FF04E, 0xD7A56375B3CF3A56, 0x20C947CE40E78650, + 0x43F8A3DD86F18229, 0x568B795EAC6A6987, 0x8003011F1DBB225D, 0xF53612D3F7145E03, + 0x189F75DA300DEC3C, 0x9570DB9C3720C9F3, 0xBB221E576B73DBB8, 0x72F65240E4F536DD, + 0x443BE25188ABC8AA, 0xE21FFE38D9B357A8, 0xFD43CA6EE7E4F117, 0xCAA3614B89A47EEC, + 0xFE34E732E1C6629E, 0x83742C431B99B1D4, 0xCF3A16AF83C2D66A, 0xAAE5A8044990E91C, + 0x26271D764CA3BD5F, 0x91C4B74C3F5810F9, 0x7C6DD045F841A2C6, 0x7F1AFD19FE63314F, + 0xC8F957238D989CE9, 0xA709075D5306EE8E, 0x55FC5402AA48FA0E, 0x48FA563C9023BEB4, + 0x65DFBEABCA523F76, 0x6C877D22D8BCE1EE, 0xCC4D3BF385E045E3, 0xBEBB69B36115733E, + 0x10EAAD6720FD4328, 0xB6CEB10E71E5DC2A, 0xBDCC44EF6737E0B7, 0x523F158EA412B08D, + 0x989C74C52DB6CE61, 0x9BEB59992B945DE8, 0x8A2CEFCA09776F4C, 0xA3BD6B8D5B7E3784, + 0xEB473DB1CB5D8930, 0xC3FBA2C29B4AA074, 0x9C28181525CE176B, 0x683311F2D0C438E4, + 0x5FD3BAD7BE84B71F, 0xFC6ED15AE5FA809B, 0x36CDB0116C5EFE77, 0x29918447520958C8, + 0xA29070B959604608, 0x53120EBAA60CC101, 0x3A0C047C74D68869, 0x691E0AC6D2DA4968, + 0x73DB4974E6EB4751, 0x7A838AFDF40599C9, 0x5A4ACD33B4E21F99, 0x6046C94FC03497F0, + 0xE6AB92E8D1CB8EA2, 0x3354C7F5663856F1, 0xD93EE170AF7BAE4D, 0x616BD27BC22AE67C, + 0x92B39A10397A8370, 0xABC8B3304B8E9890, 0xBF967287630B02B2, 0x5B67D607B6FC6E15 +}; +__device__ static uint64_t T7[256] = { + 0xD031C397CE553FE6, 0x16BA5B01B006B525, 0xA89BADE6296E70C8, 0x6A1F525D77D3435B, + 0x6E103570573DFA0B, 0x660EFB2A17FC95AB, 0x76327A9E97634BF6, 0x4BAD9D6462458BF5, + 0xF1830CAEDBC3F748, 0xC5C8F542669131FF, 0x95044A1CDC48B0CB, 0x892962DF3CF8B866, + 0xB0B9E208E930C135, 0xA14FB3F0611A767C, 0x8D2605F21C160136, 0xD6B71922FECC549E, + 0x37089438A5907D8B, 0x0B5DA38E5803D49C, 0x5A5BCC9CEA6F3CBC, 0xEDAE246D3B73FFE5, + 0xD2B87E0FDE22EDCE, 0x5E54ABB1CA8185EC, 0x1DE7F88FE80561B9, 0xAD5E1A870135A08C, + 0x2F2ADBD665CECC76, 0x5780B5A782F58358, 0x3EDC8A2EEDE47B3F, 0xC9D95C3506BEE70F, + 0x83BE111D6C4E05EE, 0xA603B90959367410, 0x103C81B4809FDE5D, 0x2C69B6027D0C774A, + 0x399080D7D5C87953, 0x09D41E16487406B4, 0xCDD63B1826505E5F, 0xF99DC2F49B0298E8, + 0x9CD0540A943CB67F, 0xBCA84B7F891F17C5, 0x723D1DB3B78DF2A6, 0x78AA6E71E73B4F2E, + 0x1433E699A071670D, 0x84F21BE454620782, 0x98DF3327B4D20F2F, 0xF049DCE2D3769E5C, + 0xDB6C60199656EB7A, 0x648746B2078B4783, 0x32CD23598DCBADCF, 0x1EA4955BF0C7DA85, + 0xE9A143401B9D46B5, 0xFD92A5D9BBEC21B8, 0xC8138C790E0B8E1B, 0x2EE00B9A6D7BA562, + 0xF85712B893B7F1FC, 0xEB28FED80BEA949D, 0x564A65EB8A40EA4C, 0x6C9988E8474A2823, + 0x4535898B121D8F2D, 0xABD8C03231ACCBF4, 0xBA2E91CAB9867CBD, 0x7960BE3DEF8E263A, + 0x0C11A977602FD6F0, 
0xCB50E1AD16C93527, 0xEAE22E94035FFD89, 0x2866D12F5DE2CE1A, + 0xFF1B1841AB9BF390, 0x9F9339DE8CFE0D43, 0x964727C8C48A0BF7, 0x524502C6AAAE531C, + 0x9B9C5EF3AC10B413, 0x4FA2FA4942AB32A5, 0x3F165A62E551122B, 0xC74148DA76E6E3D7, + 0x924840E5E464B2A7, 0xD372AE43D69784DA, 0x233B72A105E11A86, 0xA48A04914941A638, + 0xB4B68525C9DE7865, 0xDDEABAACA6CF8002, 0x0A9773C250B6BD88, 0xC284FFBB5EBD3393, + 0x8BA0DF472C8F6A4E, 0x2AEF6CB74D951C32, 0x427983722A318D41, 0x73F7CDFFBF389BB2, + 0x074C0AF9382C026C, 0x8A6A0F0B243A035A, 0x6FDAE53C5F88931F, 0xC68B98967E538AC3, + 0x44FF59C71AA8E639, 0xE2FCE0CE439E9229, 0xA20CDE2479D8CD40, 0x19E89FA2C8EBD8E9, + 0xF446BBCFF398270C, 0x43B3533E2284E455, 0xD82F0DCD8E945046, 0x51066F12B26CE820, + 0xE73957AF6BC5426D, 0x081ECE5A40C16FA0, 0x3B193D4FC5BFAB7B, 0x7FE66488DF174D42, + 0x0E9814EF705804D8, 0x8137AC857C39D7C6, 0xB1733244E185A821, 0x695C3F896F11F867, + 0xF6CF0657E3EFF524, 0x1AABF276D02963D5, 0x2DA3664E75B91E5E, 0x0289BD981077D228, + 0x90C1FD7DF413608F, 0x3C5537B6FD93A917, 0xAA12107E3919A2E0, 0x0686DAB530996B78, + 0xDAA6B0559EE3826E, 0xC34E2FF756085A87, 0x6D5358A44FFF4137, 0xFC587595B35948AC, + 0x7CA5095CC7D5F67E, 0xFB147F6C8B754AC0, 0xBFEB26AB91DDACF9, 0x6896EFC567A49173, + 0xCA9A31E11E7C5C33, 0xBBE44186B13315A9, 0x0DDB793B689ABFE4, 0x70B4A02BA7FA208E, + 0xE47A3A7B7307F951, 0x8CECD5BE14A36822, 0xEEED49B923B144D9, 0x17708B4DB8B3DC31, + 0x6088219F2765FED3, 0xB3FA8FDCF1F27A09, 0x910B2D31FCA6099B, 0x0F52C4A378ED6DCC, + 0x50CCBF5EBAD98134, 0x6BD582117F662A4F, 0x94CE9A50D4FDD9DF, 0x2B25BCFB45207526, + 0x67C42B661F49FCBF, 0x492420FC723259DD, 0x03436DD418C2BB3C, 0x1F6E4517F872B391, + 0xA08563BC69AF1F68, 0xD43EA4BAEEBB86B6, 0x01CAD04C08B56914, 0xAC94CACB0980C998, + 0x54C3D8739A373864, 0x26FEC5C02DBACAC2, 0xDEA9D778BE0D3B3E, 0x040F672D20EEB950, + 0xE5B0EA377BB29045, 0xF30AB136CBB42560, 0x62019C0737122CFB, 0xE86B930C13282FA1, + 0xCC1CEB542EE5374B, 0x538FD28AA21B3A08, 0x1B61223AD89C0AC1, 0x36C24474AD25149F, + 0x7A23D3E9F74C9D06, 0xBE21F6E79968C5ED, 0xCF5F868036278C77, 0xF705D61BEB5A9C30, + 0x4D2B47D152DCE08D, 0x5F9E7BFDC234ECF8, 0x247778583DCD18EA, 0x867BA67C4415D5AA, + 0x4CE1979D5A698999, 0x0000000000000000, 0xEC64F42133C696F1, 0xB57C5569C16B1171, + 0xC1C7926F467F88AF, 0x654D96FE0F3E2E97, 0x15F936D5A8C40E19, 0xB8A72C52A9F1AE95, + 0xA9517DAA21DB19DC, 0x58D27104FA18EE94, 0x5918A148F2AD8780, 0x5CDD1629DAF657C4, + 0x8274C15164FB6CFA, 0xD1FB13DBC6E056F2, 0x7D6FD910CF609F6A, 0xB63F38BDD9A9AA4D, + 0x3D9FE7FAF526C003, 0x74BBC706871499DE, 0xDF630734B6B8522A, 0x3AD3ED03CD0AC26F, + 0xFADEAF2083C023D4, 0xC00D42234ECAE1BB, 0x8538CBA85CD76E96, 0xC402250E6E2458EB, + 0x47BC3413026A5D05, 0xAFD7A71F114272A4, 0x978DF784CC3F62E3, 0xB96DFC1EA144C781, + 0x21B2CF391596C8AE, 0x318E4E8D950916F3, 0xCE9556CC3E92E563, 0x385A509BDD7D1047, + 0x358129A0B5E7AFA3, 0xE6F387E363702B79, 0xE0755D5653E94001, 0x7BE903A5FFF9F412, + 0x12B53C2C90E80C75, 0x3307F315857EC4DB, 0x8FAFB86A0C61D31E, 0xD9E5DD8186213952, + 0x77F8AAD29FD622E2, 0x25BDA814357871FE, 0x7571174A8FA1F0CA, 0x137FEC60985D6561, + 0x30449EC19DBC7FE7, 0xA540D4DD41F4CF2C, 0xDC206AE0AE7AE916, 0x5B911CD0E2DA55A8, + 0xB2305F90F947131D, 0x344BF9ECBD52C6B7, 0x5D17C665D2433ED0, 0x18224FEEC05EB1FD, + 0x9E59E992844B6457, 0x9A568EBFA4A5DD07, 0xA3C60E68716DA454, 0x7E2CB4C4D7A22456, + 0x87B176304CA0BCBE, 0x413AEEA632F3367D, 0x9915E36BBC67663B, 0x40F03EEA3A465F69, + 0x1C2D28C3E0B008AD, 0x4E682A054A1E5BB1, 0x05C5B761285BD044, 0xE1BF8D1A5B5C2915, + 0xF2C0617AC3014C74, 0xB7F5E8F1D11CC359, 0x63CB4C4B3FA745EF, 0x9D1A84469C89DF6B, + 0xE33630824B2BFB3D, 0xD5F474F6E60EEFA2, 
0xF58C6B83FB2D4E18, 0x4676E45F0ADF3411, + 0x20781F751D23A1BA, 0xBD629B3381AA7ED1, 0xAE1D775319F71BB0, 0xFED1C80DA32E9A84, + 0x5509083F92825170, 0x29AC01635557A70E, 0xA7C9694551831D04, 0x8E65682604D4BA0A, + 0x11F651F8882AB749, 0xD77DC96EF6793D8A, 0xEF2799F52B042DCD, 0x48EEF0B07A8730C9, + 0x22F1A2ED0D547392, 0x6142F1D32FD097C7, 0x4A674D286AF0E2E1, 0x80FD7CC9748CBED2, + 0x717E7067AF4F499A, 0x938290A9ECD1DBB3, 0x88E3B293344DD172, 0x2734158C250FA3D6 +}; + +// local copy of T0..T7 for each block +__shared__ static uint64_t T0S[256]; +__shared__ static uint64_t T1S[256]; +__shared__ static uint64_t T2S[256]; +__shared__ static uint64_t T3S[256]; +__shared__ static uint64_t T4S[256]; +__shared__ static uint64_t T5S[256]; +__shared__ static uint64_t T6S[256]; +__shared__ static uint64_t T7S[256]; + +// KeySchedule +__constant__ static uint64_t CC[12][8] = {{ + 0xe9daca1eda5b08b1, 0x1f7c65c0812fcbeb, 0x16d0452e43766a2f, 0xfcc485758db84e71, + 0x0169679291e07c4b, 0x15d360a4082a42a2, 0x234d74cc36747605, 0x0745a6f2596580dd +}, { + 0x1a2f9da98ab5a36f, 0xd7b5700f469de34f, 0x982b230a72eafef3, 0x3101b5160f5ed561, + 0x5899d6126b17b59a, 0xcaa70adbc261b55c, 0x56cdcbd71ba2dd55, 0xb79bb121700479e6 +}, { + 0xc72fce2bacdc74f5, 0x35843d6a28fc390a, 0x8b1f9c525f5ef106, 0x7b7b29b11475eaf2, + 0xb19e3590e40fe2d3, 0x09db6260373ac9c1, 0x31db7a8643f4b6c2, 0xb20aba0af5961e99 +}, { + 0xd26615e8b3df1fef, 0xdde4715da0e148f9, 0x7d3c5c337e858e48, 0x3f355e68ad1c729d, + 0x75d603ed822cd7a9, 0xbe0352933313b7d8, 0xf137e893a1ea5334, 0x2ed1e384bcbe0c22 +}, { + 0x994747adac6bea4b, 0x6323a96c0c413f9a, 0x4a1086161f1c157f, 0xbdff0f80d7359e35, + 0xa3f53a254717cdbf, 0x161a2723b700ffdf, 0xf563eaa97ea2567a, 0x57fe6c7cfd581760 +}, { + 0xd9d33a1daeae4fae, 0xc039307a3bc3a46f, 0x6ca44251f9c4662d, 0xc68ef09ab49a7f18, + 0xb4b79a1cb7a6facf, 0xb6c6bec2661ff20a, 0x354f903672c571bf, 0x6e7d64467a4068fa +}, { + 0xecc5aaee160ec7f4, 0x540924bffe86ac51, 0xc987bfe6c7c69e39, 0xc9937a19333e47d3, + 0x372c822dc5ab9209, 0x04054a2883694706, 0xf34a3ca24c451735, 0x93d4143a4d568688 +}, { + 0xa7c9934d425b1f9b, 0x41416e0c02aae703, 0x1ede369c71f8b74e, 0x9ac4db4d3b44b489, + 0x90069b92cb2b89f4, 0x2fc4a5d12b8dd169, 0xd9a8515935c2ac36, 0x1ee702bfd40d7fa4 +}, { + 0x9b223116545a8f37, 0xde5f16ecd89a4c94, 0x244289251b3a7d3a, 0x84090de0b755d93c, + 0xb1ceb2db0b440a80, 0x549c07a69a8a2b7b, 0x602a1fcb92dc380e, 0xdb5a238351446172 +}, { + 0x526f0580a6debeab, 0xf3f3e4b248e52a38, 0xdb788aff1ce74189, 0x0361331b8ae1ff1f, + 0x4b3369af0267e79f, 0xf452763b306c1e7a, 0xc3b63b15d1fa9836, 0xed9c4598fbc7b474 +}, { + 0xfb89c8efd09ecd7b, 0x94fe5a63cdc60230, 0x6107abebbb6bfad8, 0x7966841421800120, + 0xcab948eaef711d8a, 0x986e477d1dcdbaef, 0x5dd86fc04a59a2de, 0x1b2df381cda4ca6b +}, { + 0xba3116f167e78e37, 0x7ab14904b08013d2, 0x771ddfbc323ca4cd, 0x9b9f2130d41220f8, + 0x86cc91189def805d, 0x5228e188aaa41de7, 0x991bb2d9d517f4fa, 0x20d71bf14a92bc48 +}}; + + +__constant__ static uint64_t F0[8] = // GOST_F(0) +{ + 0x74a5d4ce2efc83b3, 0x74a5d4ce2efc83b3, 0x74a5d4ce2efc83b3, 0x74a5d4ce2efc83b3, + 0x74a5d4ce2efc83b3, 0x74a5d4ce2efc83b3, 0x74a5d4ce2efc83b3, 0x74a5d4ce2efc83b3 +}; + +__constant__ static uint64_t CC_F0[12][8] = +{ + { 0x8FD72F640708B0D0, 0x0DE874C7EBC3F213, 0xE92EEF3AD202E9E0, 0xC1E9DA0708013DA7, 0x9727DAB2F014BE88, 0x103051A02BCD6935, 0x33EC7E1DBD28F736, 0x1ECF460CF78AD1F4 }, + { 0x0B2D9F89C775449D, 0x6B6EEFC6DAB7E8B0, 0xF1A0D31667F6EC44, 0x2A71132D5E108166, 0x0E9357C2EC87931A, 0xC99F5C1B4A01612D, 0x7E60B16E637D4EE2, 0xA9FCB827F9BA6D81 }, + { 0x231FECA5AB3D285C, 0x70C6E1483C838C3B, 
0x9C21C3C40CE4E2DA, 0x2FA796BD5688E573, 0x04C0E3FF55809FDF, 0x5FF978BFB8E3CDC8, 0xC54A19D6A3D07033, 0x0FCA83FDDE872478 }, + { 0xBDF9312726339F10, 0x51A5BA1793BC9C56, 0xC4428DA14F96D2D4, 0xEC925222374EAB1F, 0x79477893747DD92F, 0xC495E19A46886304, 0x9C23F893BA7CFA36, 0x0C47268881FC5FEB }, + { 0xCF117966029B2CB3, 0x07179ABE77088A8F, 0x671EF4CC2650E257, 0x7474B8B170DAB5C6, 0x4224FEBECF35113E, 0x993D156C675C5537, 0x2DEE3A5782C39B45, 0xE7C586F2990DD385 }, + { 0x8608FD95B1C1138A, 0x8BB0847D9E9849AC, 0x5E76623F4F0EB0C7, 0x34C2BDBAFC5060CE, 0xE9E814475907826C, 0x22C9ED94D6AAC7C9, 0xE6B75E28171EB0D6, 0xF1329E5534E60215 }, + { 0x86BB4814B1C3CE52, 0xE8F226C9FBDDD017, 0xCEDED67991CB3087, 0x76C33E32FDBFACA5, 0xDBB13BE1A9F7474C, 0x3D0273470342C356, 0x8E7246C51CF07F61, 0xAC8C125DDEF8DF71 }, + { 0x6D73E747795B8CF3, 0x4E4AA65EA0072050, 0xA14A1582CB43C2B9, 0x748EF2B7BB63B938, 0x126789534410D7D4, 0xD4D48FF40301D791, 0xC67DFBE315C41FC0, 0x35E7A1A1AF88601C }, + { 0x9BD33EA0FAB34007, 0xF51B7CDBE3D67D25, 0xD3ABDA0CE4186E6B, 0x8E61DDADCBCE1706, 0x58994565B41BE6A5, 0x7A87ABC1240CD31D, 0xFAFE6C28487968D0, 0x15B368609FF9EEA7 }, + { 0xAE33263CCF115818, 0x93B2DBE9CADFCFC8, 0x0A91952BF91B0147, 0x458E67CA5F1ED73A, 0x94C2E5F288F074E3, 0x377895E85C69E996, 0xF11A4456AAB37B10, 0x163131934816821A }, + { 0xD07E4A2366BF469D, 0x5EF1A3D220213B6C, 0x3C5BB78971D8ED0F, 0x0DE05E6B9006F2D2, 0xC58CFB00B8EAA1C9, 0xEFCDB54D1F250B76, 0xFD135634FA527042, 0x4CEE791290516407 }, + { 0xD800B9264010790F, 0x974C4823E2B668D7, 0xA605A4B385C5E361, 0x3F6C92DA5A56D8D2, 0x82B9D67C12EF8277, 0x0AB6B4582561BF90, 0x46954FD98FC2CBA3, 0x70BE45CB21B6760D } +}; + + +__constant__ static uint64_t F1[8] = // GOST_F(1) +{ + 0x155f7bb040eec523, 0x155f7bb040eec523, 0x155f7bb040eec523, 0x155f7bb040eec523, + 0x155f7bb040eec523, 0x155f7bb040eec523, 0x155f7bb040eec523, 0x155f7bb040eec523 +}; + +__constant__ static uint64_t CC_F1[12][8] = +{ + { 0xeaebb276318fee18, 0xea4c693382cbd63b, 0xbf26be88df699734, 0x49a504a9b6fa1c45, 0xb1666aa693de22da, 0x113563ea5e6b7e9c, 0xcdbf01848cd611e6, 0xb95e4a9dc30c7d0c }, + { 0x919565a231cfa4aa, 0x46fde791cec8ae57, 0xe3c56411e2de27bf, 0x1f9d9e511aba0b94, 0x57773e25f11309ce, 0x2ce14b67cd005091, 0x00fb26ba738ef6c7, 0x2d5f800141af74fd }, + { 0xf57a17cc650afe61, 0x26d3deadafe23502, 0xf87b7436229a32a5, 0x85459ccaae2842a5, 0x0d3a74dda91e80cd, 0x330e2b60f01ed098, 0x56c16add5dfb6720, 0x8692832019310082 }, + { 0x6f63d34f5f688399, 0xa826bf5fb7abd51f, 0x3ecb2eaa144393e2, 0x4e7d6cc0863c69e4, 0x61e175af40d59b16, 0xba60d963cd6a540a, 0x69bf99c14c3995d5, 0x5a3de79f30d5a599 }, + { 0x25f0e72cae7257f0, 0xfdb8c6bc7f9a6c15, 0x326e9413d635e7f1, 0xeaff2028e5942992, 0x1a55b07e905d6162, 0x882060860a9970d1, 0xe2b0cd223cc898af, 0x56a1f7c0137c29be }, + { 0x4e6e5462c344d15a, 0xb7fb298868e7b346, 0x33741921c3e95374, 0xacb5e26b0e8d2b0b, 0x59f16751b3b69ec8, 0xa659593ea405b0b7, 0x98408efc8cb1a951, 0x8dbbcf819b3df0fc }, + { 0x8d0aa21b9aec6c6a, 0x2b3534b940a84fb6, 0x2a1230d58e638c51, 0xc9daefb8e02f3383, 0xc709f5a9e5878201, 0x6f42d5dc6a746c8d, 0x3fb7df9057ada0b0, 0xaa6d0139a591f1c1 }, + { 0xb3a97a7336702199, 0x51bd05f743668d8a, 0xc50f8f941f5351f3, 0xbdd89dee5fa35fe3, 0x9c4e220a589d4cbb, 0xed49fc69200e2ed8, 0x38354437945f7d36, 0x0904ddf5a8b68f2b }, + { 0x1afa89fcc0636790, 0xda9d9eecd88892e6, 0xfec3d6bfe830769a, 0xafae622e5dc303d7, 0x7f7a31a7805db3f0, 0x916752f22230f876, 0x7b33cb8f67df8fca, 0xd205cb3c39e54fd7 }, + { 0x648e61636c99ce88, 0x8533e43ee0c8a504, 0xbb9189e6eee32a4e, 0x6edbda389dc2f3bf, 0xdf6ddca6e9daa1d6, 0xd3962f27af34ce52, 0xe1e63f4c628c9c15, 0xd5ad89fc0b5c693d }, 
+ { 0x0646bda91e280a3e, 0x3a6f57000155ec3e, 0x579182cf68a16a50, 0x382fa3cafc78b976, 0x45ca8299c7305fb5, 0x778479d865838e62, 0x2a119981c6495ae7, 0xdbf255760f5a7b1d }, + { 0xeb1ab39e4073b2f0, 0x22216718aefb32e4, 0xf9926a2b4248c862, 0x838bd14eb5ba6c3f, 0xa33f1ec5ff1cb214, 0xdb6aef763e43ff19, 0xa17f903ce0f5f90e, 0x03bf0065a0ecf9fc } +}; + + +__device__ __forceinline__ +void GOST_Add128(void *x, void * const a, void * const b) +{ + uint16_t t = 0; + #pragma unroll + for(int i = 15; i >= 0; i--) + { + t = ((uint8_t *)a)[i] + ((uint8_t *)b)[i] + (t >> 8); + ((uint8_t *)x)[i] = t & 0xFF; + } +} + +__device__ __forceinline__ +void GOST_Copy512(uint64_t* dst, uint64_t* const __restrict__ src) +{ + #pragma unroll + for (int i=0; i<8; i++) + dst[i] = src[i]; +} + +__device__ __forceinline__ +void GOST_Copy256(uint64_t* dst, uint64_t* const __restrict__ src) +{ + #pragma unroll + for (int i=0; i<4; i++) + dst[i] = src[i]; +} + +__device__ __forceinline__ +void GOST_Xor512(uint64_t* C, uint64_t* const A, const uint64_t* B) +{ + C[0] = A[0] ^ B[0]; + C[1] = A[1] ^ B[1]; + C[2] = A[2] ^ B[2]; + C[3] = A[3] ^ B[3]; + C[4] = A[4] ^ B[4]; + C[5] = A[5] ^ B[5]; + C[6] = A[6] ^ B[6]; + C[7] = A[7] ^ B[7]; +} + +__device__ __forceinline__ +void GOST_Xor512_3(uint64_t* C, uint64_t* const A, uint64_t* const B) +{ + #pragma unroll 8 + for(int i=0; i<8; i++) { + C[i] ^= A[i] ^ B[i]; + } +} + +__device__ __forceinline__ +void GOST_Xor512_c(uint64_t* C, uint64_t* const A, const uint64_t* B, uint64_t c) +{ + #pragma unroll 8 + for(int i=0; i<8; i++) { + C[i] = A[i] ^ B[i] ^ c; + } +} + + +#define EXTRACT_BYTE(x,i) __byte_perm(x,0,0x4440 + i) + +__device__ __forceinline__ +void GOST_FS(uint64_t* const state64, uint64_t* return_state) +{ + uint32_t * state32 = (uint32_t *)state64; + + #pragma unroll 4 + for (int b=0; b<4; b++) + { +#if (__CUDA_ARCH__ >= 500) + return_state[b] = __ldg (&T0[EXTRACT_BYTE(state32[14], b)]) + ^ __ldg (&T1[EXTRACT_BYTE(state32[12], b)]) + ^ T2S[EXTRACT_BYTE(state32[10], b)] + ^ __ldg (&T3[EXTRACT_BYTE(state32[8], b)]) + ^ T4S[EXTRACT_BYTE(state32[6], b)] + ^ T5S[EXTRACT_BYTE(state32[4], b)] + ^ T6S[EXTRACT_BYTE(state32[2], b)] + ^ T7S[EXTRACT_BYTE(state32[0], b)]; + + return_state[b+4] = T0S[EXTRACT_BYTE(state32[15], b)] + ^ T1S[EXTRACT_BYTE(state32[13], b)] + ^ T2S[EXTRACT_BYTE(state32[11], b)] + ^ T3S[EXTRACT_BYTE(state32[9], b)] + ^ __ldg (&T4[EXTRACT_BYTE(state32[7], b)]) + ^ T5S[EXTRACT_BYTE(state32[5], b)] + ^ __ldg (&T6[EXTRACT_BYTE(state32[3], b)]) + ^ __ldg (&T7[EXTRACT_BYTE(state32[1], b)]); +#else + return_state[b] = T0S[EXTRACT_BYTE(state32[14], b)] + ^ T1S[EXTRACT_BYTE(state32[12], b)] + ^ T2S[EXTRACT_BYTE(state32[10], b)] + ^ T3S[EXTRACT_BYTE(state32[8], b)] + ^ T4S[EXTRACT_BYTE(state32[6], b)] + ^ T5S[EXTRACT_BYTE(state32[4], b)] + ^ T6S[EXTRACT_BYTE(state32[2], b)] + ^ T7S[EXTRACT_BYTE(state32[0], b)]; + + return_state[b+4] = T0S[EXTRACT_BYTE(state32[15], b)] + ^ T1S[EXTRACT_BYTE(state32[13], b)] + ^ T2S[EXTRACT_BYTE(state32[11], b)] + ^ T3S[EXTRACT_BYTE(state32[9], b)] + ^ T4S[EXTRACT_BYTE(state32[7], b)] + ^ T5S[EXTRACT_BYTE(state32[5], b)] + ^ T6S[EXTRACT_BYTE(state32[3], b)] + ^ T7S[EXTRACT_BYTE(state32[1], b)]; +#endif + } +} + +__device__ __forceinline__ +static void GOST_F(uint64_t* state) +{ + uint64_t t[8]; + GOST_FS(state, t); + //memcpy(state, t, 64); + GOST_Copy512(state, t); +} + + +__device__ +static void GOST_E12(uint64_t* const K, uint64_t *state) +{ + uint64_t state1[8], K1[8]; + + GOST_Xor512(state1, K, CC[0]); + GOST_FS(state1, K1); + GOST_FS(state, state1); 
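+ // Each round of GOST_E12 advances the key schedule and the state together:
+ // K1 = LPS(K1 ^ CC[i]) yields the next round key while state1 = LPS(state),
+ // and the XOR below folds them into state = LPS(state) ^ K_next.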
+ GOST_Xor512(state, state1, K1); + +#if (__CUDA_ARCH__ >= 500) + #pragma unroll 11 +#else + #pragma unroll 5 +#endif + for(int i=1; i<12; i++) + { + GOST_Xor512(state1, K1, CC[i]); + GOST_FS(state1, K1); + GOST_FS(state, state1); + GOST_Xor512(state, state1, K1); + } +} + +__device__ +void GOST_E(uint64_t* const K, uint64_t* const m, uint64_t *state /* out only */) +{ + GOST_Xor512(state, m, K); // state = m ^ K + GOST_E12(K, state); +} + +__device__ +void GOST_E_F0(uint64_t* const m, uint64_t *state /* out only */) +{ + GOST_Xor512(state, m, F0); // state = m ^ F0 + + uint64_t state1[8]; + #pragma unroll 12 + for(int i=0; i<12; i++) + { + GOST_FS(state, state1); + GOST_Xor512(state, state1, CC_F0[i]); + } +} + +__device__ +void GOST_E_F1(uint64_t* const m, uint64_t *state /* out only */) +{ + GOST_Xor512(state, m, F1); // state = m ^ F1 + + uint64_t state1[8]; + #pragma unroll 12 + for(int i=0; i<12; i++) + { + GOST_FS(state, state1); + GOST_Xor512(state, state1, CC_F1[i]); + } +} + +__device__ +void GOST_g_N(uint64_t* h, uint64_t* const M, uint64_t* const N) +{ + uint64_t K[8]; + + GOST_Xor512(K, N, h); // K = N ^ h + GOST_F(K); + + uint64_t t[8]; + GOST_E(K, M, t); + +// GOST_Xor512(t, t, h); +// GOST_Xor512(h, t, M); + GOST_Xor512_3(h, t, M); // h = h ^ t ^ M +} + + +__device__ +void GOST_g_0(uint64_t* h, uint64_t* const M) +{ + uint64_t K[8]; + GOST_FS(h, K); + + uint64_t t[8]; + GOST_E(K, M, t); + + GOST_Xor512_3(h, t, M); // h = h ^ t ^ M +} + +__device__ +void GOST_g_0_0(uint64_t* h, uint64_t* const M) // input h assumed zero, for iv 512 +{ + GOST_E_F0 (M, h); + GOST_Xor512 (h, h, M); // h = h ^ M +} + +__device__ +void GOST_g_0_1(uint64_t* h, uint64_t* const M) // input h assumed all bytes 0x01, for iv 256 +{ + GOST_E_F1 (M, h); + GOST_Xor512_c(h, h, M, 0x0101010101010101); // h = h ^ M ^ 0x01..01 +} + +__global__ +/*__launch_bounds__(256,3)*/ +void gostd_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + // copy table to shared memory, we assume 256 threads per block + T0S[threadIdx.x] = __ldg (&T0[threadIdx.x]); + T1S[threadIdx.x] = __ldg (&T1[threadIdx.x]); + T2S[threadIdx.x] = __ldg (&T2[threadIdx.x]); + T3S[threadIdx.x] = __ldg (&T3[threadIdx.x]); + T4S[threadIdx.x] = __ldg (&T4[threadIdx.x]); + T5S[threadIdx.x] = __ldg (&T5[threadIdx.x]); + T6S[threadIdx.x] = __ldg (&T6[threadIdx.x]); + T7S[threadIdx.x] = __ldg (&T7[threadIdx.x]); + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + uint64_t __align__(8) N[8] = {0}; + // first hash (GOST 34.11-512 over 80 bytes) + uint64_t __align__(8) block2[8]; + // copy second block of header + #pragma unroll + for (int i = 0; i < 8; i++) block2[i] = c_header2[i]; + ((uint32_t *)block2)[15] = cuda_swab32 (nonce); // change nonce + uint64_t __align__(8) hash1[8]; + // second block + GOST_g_0_0 (hash1, block2); // zero iv for 512 assumed + N[7] = 0x0002000000000000; // 512 + // first block + GOST_g_N(hash1, c_header1 + 2, N); + N[7] |= 0x8000000000000000; // +128 (640 bits = 80 bytes total) + GOST_g_0(hash1, N); + GOST_Add128(block2 + 6, block2 + 6, c_header1 + 8); + block2[5] += 0x0100000000000000; + GOST_g_0(hash1, block2); + + // second hash (GOST 34.11-256 over 64 bytes) + uint64_t __align__(8) hash[8]; + // second block + GOST_g_0_1(hash, hash1); // iv for 256 assumed (all bytes 0x01) + N[7] = 0x0002000000000000; // 512 + // first block + GOST_g_N(hash, c_header1, N); + GOST_g_0(hash, N); + hash1[7] += 0x0100000000000000; + GOST_g_0(hash, hash1); +
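+ // Only the 64 most significant bits of the big-endian GOST 34.11-256 digest
+ // are tested on the GPU (swab + compare below); candidate nonces are
+ // re-hashed and re-checked against the target on the CPU in scanhash_gostd().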
// result is first 32 bytes of hash + + uint64_t high = MAKE_ULONGLONG(cuda_swab32(_HIDWORD(hash[0])), cuda_swab32(_LODWORD(hash[0]))); // swab uint64_t and invert + // check nonce + if (high <= d_target[0]) + { + //printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]); + resNonces[1] = atomicExch(resNonces, nonce); + //d_target[0] = high; + } + } +} + +__host__ +void gostd_init(int thr_id) +{ + cuda_get_arch(thr_id); + CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t))); +} + +__host__ +void gostd_free(int thr_id) +{ + if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]); + d_resNonces[thr_id] = NULL; +} + +__host__ +void gostd_setBlock_80(uint32_t *pdata, uint32_t *ptarget) +{ + // copy first 16 bytes to the end of c_header1 + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_header1, pdata, 16, 64, cudaMemcpyHostToDevice)); + // remaining 64 bytes of the header, except the nonce which is set on the GPU + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_header2, pdata + 4, 60, 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice)); +} + +__host__ +void gostd_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces) +{ + const uint32_t threadsperblock = 256; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t))); + cudaThreadSynchronize(); + gostd_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_resNonces[thr_id]); + cudaThreadSynchronize(); + + CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + if (resNonces[0] == resNonces[1]) { + resNonces[1] = UINT32_MAX; + } +} \ No newline at end of file diff --git a/gost/gost.cu b/gost/gost.cu new file mode 100644 index 0000000000..1a43dd9c4c --- /dev/null +++ b/gost/gost.cu @@ -0,0 +1,150 @@ +extern "C" { +#include "sph/sph_streebog.h" +} + +#include "miner.h" +#include "cuda_helper.h" + +#include +#include + +#define NBN 2 + +// GOST CPU Hash +extern "C" void gostd_hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[64]; + + sph_gost512(hash, (const void*)input, 80); + sph_gost256(hash, (const void*)hash, 64); + + memcpy(output, hash, 32); +} + +extern "C" void gostd(void *output, const void *input, size_t len) +{ + unsigned char _ALIGN(64) hash[64]; + + sph_gost512(hash, (const void*)input, len); + sph_gost256(hash, (const void*)hash, 64); + + memcpy(output, hash, 32); +} + +//#define _DEBUG +#define _DEBUG_PREFIX "gost" +#include "cuda_debug.cuh" + +static bool init[MAX_GPUS] = { 0 }; +extern void gostd_init(int thr_id); +extern void gostd_free(int thr_id); +extern void gostd_setBlock_80(uint32_t *pdata, uint32_t *ptarget); +extern void gostd_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces); + +extern "C" int scanhash_gostd(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(64) endiandata[20]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + //ptarget[7] = 0x000000FF; + const uint32_t first_nonce = pdata[19]; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 25); + if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce)); + + if (opt_benchmark) + ((uint32_t*)ptarget)[7] = 0x03; + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu
usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + gostd_init(thr_id); + + init[thr_id] = true; + } + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + gostd_setBlock_80(endiandata, ptarget); + + do { + // Hash with CUDA + *hashes_done = pdata[19] - first_nonce + throughput; + + gostd_hash_80(thr_id, throughput, pdata[19], work->nonces); + if (work->nonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; + uint32_t _ALIGN(64) vhash_le[8]; + + endiandata[19] = swab32 (work->nonces[0]); + gostd_hash(vhash, endiandata); + if (swab32(vhash[0]) <= ptarget[7] /*&& fulltest(vhash, ptarget)*/) + { + work->valid_nonces = 1; + + for (int i = 0; i < 8; i++) + vhash_le[i] = swab32(vhash[7-i]); + + work_set_target_ratio(work, vhash_le); + if (work->nonces[1] != UINT32_MAX) + { + endiandata[19] = swab32 (work->nonces[1]); + gostd_hash(vhash, endiandata); + if (swab32(vhash[0]) <= ptarget[7] /*&& fulltest(vhash, ptarget)*/) + { + work->valid_nonces++; + bn_set_target_ratio(work, vhash, 1); + } + pdata[19] = max(work->nonces[0], work->nonces[1]); + } + else + pdata[19] = work->nonces[0]; + return work->valid_nonces; + } + else if (swab32(vhash[0]) > ptarget[7]) + { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + + return 0; +} + +// cleanup +extern "C" void free_gostd(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + gostd_free(thr_id); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} \ No newline at end of file diff --git a/lyra2/Lyra2.c b/lyra2/Lyra2.c index 1f0a953e4a..23eed4e163 100644 --- a/lyra2/Lyra2.c +++ b/lyra2/Lyra2.c @@ -212,3 +212,176 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa return 0; } + +int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols) +{ + //============================= Basic variables ============================// + int64_t row = 2; //index of row to be processed + int64_t prev = 1; //index of prev (last row ever computed/modified) + int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering) + int64_t tau; //Time Loop iterator + int64_t step = 1; //Visitation step (used during Setup and Wandering phases) + int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup) + int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1 + int64_t i; //auxiliary iteration counter + int64_t v64; // 64bit var for memcpy + uint64_t instance = 0; + //==========================================================================/ + + //========== Initializing the Memory Matrix and pointers to it =============// + //Tries to allocate enough space for the whole memory matrix + + const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols; + const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8; + // for Lyra2REv2, nCols = 4, v1 was using 8 + const int64_t BLOCK_LEN = (nCols == 4) ? 
BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES; + + size_t sz = (size_t)ROW_LEN_BYTES * nRows; + uint64_t *wholeMatrix = malloc(sz); + if (wholeMatrix == NULL) { + return -1; + } + memset(wholeMatrix, 0, sz); + + //Allocates pointers to each row of the matrix + uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows); + if (memMatrix == NULL) { + free(wholeMatrix); + return -1; + } + //Places the pointers in the correct positions + uint64_t *ptrWord = wholeMatrix; + for (i = 0; i < nRows; i++) { + memMatrix[i] = ptrWord; + ptrWord += ROW_LEN_INT64; + } + //==========================================================================/ + + //============= Getting the password + salt + basil padded with 10*1 ===============// + //OBS.:The memory matrix will temporarily hold the password: not for saving memory, + //but this ensures that the password copied locally will be overwritten as soon as possible + + //First, we clean enough blocks for the password, salt, basil and padding + int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1; + + byte *ptrByte = (byte*) wholeMatrix; + + //Prepends the password + memcpy(ptrByte, pwd, pwdlen); + ptrByte += pwdlen; + + //Concatenates the salt + memcpy(ptrByte, salt, saltlen); + ptrByte += saltlen; + + memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen))); + + //Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface + memcpy(ptrByte, &kLen, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = pwdlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = saltlen; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = timeCost; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nRows; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + v64 = nCols; + memcpy(ptrByte, &v64, sizeof(int64_t)); + ptrByte += sizeof(uint64_t); + + //Now comes the padding + *ptrByte = 0x80; //first byte of padding: right after the password + ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix + ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block + *ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block + //==========================================================================/ + + //======================= Initializing the Sponge State ====================// + //Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c) + uint64_t state[16]; + initState(state); + //==========================================================================/ + + //================================ Setup Phase =============================// + //Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits + ptrWord = wholeMatrix; + for (i = 0; i < nBlocksInput; i++) { + absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil) + ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil) + } + + //Initializes M[0] and M[1] + reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here + + reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols); + + do { + //M[row] = rand; //M[row*] = M[row*] XOR rotW(rand) + +
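+ //rowa revisits every row of the current power-of-two window; once it wraps
+ //back to 0 the window doubles, step becomes window + gap and gap flips sign,
+ //reproducing the Lyra2 setup-phase visitation schedule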
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //updates the value of row* (deterministically picked during Setup) + rowa = (rowa + step) & (window - 1); + //update prev: it now points to the last row ever computed + prev = row; + //updates row: goes to the next row to be computed + row++; + + //Checks if all rows in the window were visited. + if (rowa == 0) { + step = window + gap; //changes the step: approximately doubles its value + window *= 2; //doubles the size of the re-visitation window + gap = -gap; //inverts the modifier to the step + } + + } while (row < nRows); + //==========================================================================/ + + //============================ Wandering Phase =============================// + row = 0; //Resets the visitation to the first row of the memory matrix + for (tau = 1; tau <= timeCost; tau++) { + //Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1 + step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1; + do { + //Selects a pseudorandom index row* (the only change in REv3) + //------------------------------------------------------------------------------------------ + instance = state[instance & 0xF]; + rowa = state[instance & 0xF] & (unsigned int)(nRows-1); + + //rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + //Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row] + reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols); + + //update prev: it now points to the last row ever computed + prev = row; + + //updates row: goes to the next row to be computed + //------------------------------------------------------------------------------------------ + row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2) + //row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE) + //------------------------------------------------------------------------------------------ + + } while (row != 0); + } + + //============================ Wrap-up Phase ===============================// + //Absorbs the last block of the memory matrix + absorbBlock(state, memMatrix[rowa]); + + //Squeezes the key + squeeze(state, K, (unsigned int) kLen); + + //========================= Freeing the memory =============================// + free(memMatrix); + free(wholeMatrix); + + return 0; +} \ No newline at end of file diff --git a/lyra2/Lyra2.h b/lyra2/Lyra2.h index edf917927b..f866462a9a 100644 --- a/lyra2/Lyra2.h +++ b/lyra2/Lyra2.h @@ -38,5 +38,6 @@ typedef unsigned char byte; #endif int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); +int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols); #endif /* LYRA2_H_ */ diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index 9cb8e56ad2..5cdb6ee3a3 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -1,1357 +1,575 @@ /** - * fancyIX * Lyra2 (v1) cuda implementation based on djm34 work * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2) + * tpruvot@github 2018 for phi2 double lyra2-32 support */ - #include -
#include - - #define TPB52 32 - - #include "cuda_lyra2_sm2.cuh" - #include "cuda_lyra2_sm5.cuh" - - #ifdef __INTELLISENSE__ - /* just for vstudio code colors */ - #define __CUDA_ARCH__ 520 - #endif - - #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ > 500 - - #include "cuda_lyra2_vectors.h" - - #ifdef __INTELLISENSE__ - /* just for vstudio code colors */ - __device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c); - #endif - - #define Nrow 8 - #define Ncol 8 - #define memshift 3 - - #define BUF_COUNT 0 - - __device__ uint2 *DMatrix; - - __device__ __forceinline__ void LD4SSB(uint2 res[3], const int row, const int col, const int thread, const int threads) - { - extern __shared__ uint2 shared_mem[]; - const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; - - res[0] = shared_mem[((s0 + 0) * 8 + threadIdx.y) * 4 + threadIdx.x]; - res[1] = shared_mem[((s0 + 1) * 8 + threadIdx.y) * 4 + threadIdx.x]; - res[2] = shared_mem[((s0 + 2) * 8 + threadIdx.y) * 4 + threadIdx.x]; - } - - __device__ __forceinline__ void ST4SSB(const int row, const int col, const uint2 data[3], const int thread, const int threads) - { - extern __shared__ uint2 shared_mem[]; - const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; - - shared_mem[((s0 + 0) * 8 + threadIdx.y) * 4 + threadIdx.x] = data[0]; - shared_mem[((s0 + 1) * 8 + threadIdx.y) * 4 + threadIdx.x] = data[1]; - shared_mem[((s0 + 2) * 8 + threadIdx.y) * 4 + threadIdx.x] = data[2]; - } - - __device__ __forceinline__ void LD4SS(uint2 res[3], const int row, const int col, const int thread, const int threads) - { - extern __shared__ uint2 shared_mem[]; - const int s0 = (Ncol / 2 * (row - BUF_COUNT) + col / 2) * memshift; - - res[0] = shared_mem[((s0 + 0) * 16 + threadIdx.y) * 4 + threadIdx.x]; - res[1] = shared_mem[((s0 + 1) * 16 + threadIdx.y) * 4 + threadIdx.x]; - res[2] = shared_mem[((s0 + 2) * 16 + threadIdx.y) * 4 + threadIdx.x]; - } - - __device__ __forceinline__ void ST4SS(const int row, const int col, const uint2 data[3], const int thread, const int threads) - { - extern __shared__ uint2 shared_mem[]; - const int s0 = (Ncol / 2 * (row - BUF_COUNT) + col / 2) * memshift; - - shared_mem[((s0 + 0) * 16 + threadIdx.y) * 4 + threadIdx.x] = data[0]; - shared_mem[((s0 + 1) * 16 + threadIdx.y) * 4 + threadIdx.x] = data[1]; - shared_mem[((s0 + 2) * 16 + threadIdx.y) * 4 + threadIdx.x] = data[2]; - } - - __device__ __forceinline__ void LD4SL(uint2 res[3], const int row, const int col, uint2 pad[Ncol / 2][Nrow][3]) - { - res[0] = pad[col / 2][row][0]; - res[1] = pad[col / 2][row][1]; - res[2] = pad[col / 2][row][2]; - } - - __device__ __forceinline__ void ST4SL(const int row, const int col, const uint2 data[3], uint2 pad[Ncol / 2][Nrow][3]) - { - pad[col / 2][row][0] = data[0]; - pad[col / 2][row][1] = data[1]; - pad[col / 2][row][2] = data[2]; - } - - __device__ __forceinline__ void LD4SG(uint2 res[3], const int row, const int col, const int thread, const int threads, const int offset, uint64_t *GPad, uint64_t *GPadO) - { - uint64_t *p = (uint64_t *) ((GPad + offset + 2 * (threadIdx.x + 4 * threadIdx.y + (((col /2) * 64 + row * 64 * (Ncol / 2)))))); - uint64_t *q = (uint64_t *) (GPadO + offset + (threadIdx.x + 4 * threadIdx.y + ((col /2) * 64 + row * 64 * (Ncol / 2)))); - /*uint4 tmp = __ldg((uint4 *)p); - res[0] = make_uint2(tmp.x, tmp.y); - res[1] = make_uint2(tmp.z, tmp.w); - res[2] = __ldg((uint2 *)q);*/ - uint64_t * r = (uint64_t *)(&res[0]); - asm ("ld.global.ca.v2.u64 {%0, %1}, [%3];\n\tld.global.ca.u64 %2, [%4];" - : "=l"(r[0]), "=l"(r[1]), 
"=l"(r[2]) : "l"(p), "l"(q) : "memory"); - } - - __device__ __forceinline__ void ST4SG(const int row, const int col, const uint2 data[3], const int thread, const int threads, const int offset, uint64_t *GPad, uint64_t *GPadO) - { - uint64_t a = MAKE_ULONGLONG(data[0].x, data[0].y); - uint64_t b = MAKE_ULONGLONG(data[1].x, data[1].y); - uint64_t c = MAKE_ULONGLONG(data[2].x, data[2].y); - uint64_t *p = (uint64_t *) ((GPad + offset + 2 * (threadIdx.x + 4 * threadIdx.y + (((col /2) * 64 + row * 64 * (Ncol / 2)))))); - uint64_t *q = (uint64_t *) (GPadO + offset + (threadIdx.x + 4 * threadIdx.y + ((col /2) * 64 + row * 64 * (Ncol / 2)))); - asm ("st.global.v2.u64 [%0], {%2, %3};\n\t st.global.u64 [%1], %4;\n\t" :: "l"(p), "l"(q), "l"(a), "l"(b), "l"(c): "memory"); - //((uint4*)GPad)[offset + ((threadIdx.x + 4 * threadIdx.y + (col /2) * 64 + row * 64 * (Ncol / 2)))] = tmp; - //GPad[offset + ((threadIdx.x + 4 * threadIdx.y + (col /2) * 64 + row * 64 * (Ncol / 2))) + 64 * Nrow * (Ncol / 2) * 2] = data[2]; - } - - #if __CUDA_ARCH__ >= 300 - __device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) - { - return __shfl(a, b, c); - } - - __device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) - { - return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); - } - - __device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) - { - a1 = WarpShuffle(a1, b1, c); - a2 = WarpShuffle(a2, b2, c); - a3 = WarpShuffle(a3, b3, c); - } - - #else - __device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) - { - extern __shared__ uint2 shared_mem[]; - - const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; - uint32_t *_ptr = (uint32_t*)shared_mem; - - __threadfence_block(); - uint32_t buf = _ptr[thread]; - - _ptr[thread] = a; - __threadfence_block(); - uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; - - __threadfence_block(); - _ptr[thread] = buf; - - __threadfence_block(); - return result; - } - - __device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) - { - extern __shared__ uint2 shared_mem[]; - - const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; - - __threadfence_block(); - uint2 buf = shared_mem[thread]; - - shared_mem[thread] = a; - __threadfence_block(); - uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; - - __threadfence_block(); - shared_mem[thread] = buf; - - __threadfence_block(); - return result; - } - - __device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) - { - extern __shared__ uint2 shared_mem[]; - - const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; - - __threadfence_block(); - uint2 buf = shared_mem[thread]; - - shared_mem[thread] = a1; - __threadfence_block(); - a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; - __threadfence_block(); - shared_mem[thread] = a2; - __threadfence_block(); - a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; - __threadfence_block(); - shared_mem[thread] = a3; - __threadfence_block(); - a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; - - __threadfence_block(); - shared_mem[thread] = buf; - __threadfence_block(); - } - - #endif - - #if __CUDA_ARCH__ > 500 || !defined(__CUDA_ARCH) - static __device__ __forceinline__ - void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d) - { - a += b; uint2 tmp = d; d.y = a.x ^ tmp.x; d.x = a.y ^ tmp.y; - c += d; b ^= c; b = ROR24(b); - a += b; d 
^= a; d = ROR16(d); - c += d; b ^= c; b = ROR2(b, 63); - } - #endif - - __device__ __forceinline__ void round_lyra(uint2 s[4]) - { - Gfunc(s[0], s[1], s[2], s[3]); - WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); - Gfunc(s[0], s[1], s[2], s[3]); - WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); - } - - static __device__ __forceinline__ - void round_lyra(uint2x4* s) - { - Gfunc(s[0].x, s[1].x, s[2].x, s[3].x); - Gfunc(s[0].y, s[1].y, s[2].y, s[3].y); - Gfunc(s[0].z, s[1].z, s[2].z, s[3].z); - Gfunc(s[0].w, s[1].w, s[2].w, s[3].w); - Gfunc(s[0].x, s[1].y, s[2].z, s[3].w); - Gfunc(s[0].y, s[1].z, s[2].w, s[3].x); - Gfunc(s[0].z, s[1].w, s[2].x, s[3].y); - Gfunc(s[0].w, s[1].x, s[2].y, s[3].z); - } - - static __device__ __forceinline__ - void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads, uint2 pad[Ncol / 2][Nrow][3]) - { - uint2 state1[3]; - uint2 state2[3]; - - - for (int i = 0; i < Nrow; i++) - { - if ((i & 1) == 1) - ST4SS(0, Ncol - i - 1, state, thread, threads); - else - ST4SL(0, Ncol - i - 1, state, pad); - - round_lyra(state); - } - - for (int i = 0; i < Nrow; i+=2) - { - LD4SS(state1, 0, i, thread, threads); - LD4SL(state2, 0, i + 1, pad); - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] ^= state[j]; - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state2[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; - ST4SL(1, Ncol - i - 1, state1, pad); - ST4SS(1, Ncol - (i + 1) - 1, state2, thread, threads); - } - } - - static __device__ __forceinline__ - void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads, uint2 pad[Ncol / 2][Nrow][3]) - { - uint2 state1[3], state2[3], state3[3], state4[3]; - - for (int i = 0; i < Nrow; i+=2) - { - LD4SS(state1, rowIn, i, thread, threads); - LD4SS(state2, rowInOut, i, thread, threads); - LD4SL(state3, rowIn, i + 1, pad); - LD4SL(state4, rowInOut, i + 1, pad); - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] ^= state[j]; - - ST4SL(rowOut, Ncol - i - 1, state1, pad); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data0 = state[0]; - uint2 Data1 = state[1]; - uint2 Data2 = state[2]; - WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state2[0] ^= Data2; - state2[1] ^= Data0; - state2[2] ^= Data1; - } else { - state2[0] ^= Data0; - state2[1] ^= Data1; - state2[2] ^= Data2; - } - - ST4SS(rowInOut, i, state2, thread, threads); - - //===================================== - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state3[j] + state4[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state3[j] ^= state[j]; - - ST4SS(rowOut, Ncol - (i + 1) - 1, state3, thread, threads); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data01 = state[0]; - uint2 Data11 = state[1]; - uint2 Data21 = state[2]; - WarpShuffle3(Data01, Data11, Data21, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state4[0] ^= Data21; - state4[1] ^= Data01; - state4[2] ^= Data11; - } else { 
- state4[0] ^= Data01; - state4[1] ^= Data11; - state4[2] ^= Data21; - } - - ST4SL(rowInOut, (i + 1), state4, pad); - } - } - - static __device__ __forceinline__ - void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads, uint2 pad[Ncol / 2][Nrow][3]) - { - for (int i = 0; i < Nrow; i+=2) - { - uint2 state1[3], state2[3], state3[3], state4[3]; - - LD4SS(state1, rowIn, i, thread, threads); - LD4SS(state2, rowInOut, i, thread, threads); - LD4SL(state3, rowIn, i + 1, pad); - LD4SL(state4, rowInOut, i + 1, pad); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - LD4SS(state1, rowOut, i, thread, threads); - - round_lyra(state); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data0 = state[0]; - uint2 Data1 = state[1]; - uint2 Data2 = state[2]; - WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state2[0] ^= Data2; - state2[1] ^= Data0; - state2[2] ^= Data1; - } - else - { - state2[0] ^= Data0; - state2[1] ^= Data1; - state2[2] ^= Data2; - } - - if (rowInOut != rowOut) { - ST4SS(rowInOut, i, state2, thread, threads); - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] = state1[j]; - } - -#pragma unroll - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; - - ST4SS(rowOut, i, state2, thread, threads); - - //====================================== - - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state3[j] + state4[j]; - - LD4SL(state3, rowOut, i + 1, pad); - - round_lyra(state); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data01 = state[0]; - uint2 Data11 = state[1]; - uint2 Data21 = state[2]; - WarpShuffle3(Data01, Data11, Data21, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state4[0] ^= Data21; - state4[1] ^= Data01; - state4[2] ^= Data11; - } - else - { - state4[0] ^= Data01; - state4[1] ^= Data11; - state4[2] ^= Data21; - } - - if (rowInOut != rowOut) { - ST4SL(rowInOut, i + 1, state4, pad); - #pragma unroll - for (int j = 0; j < 3; j++) - state4[j] = state3[j]; - } - -#pragma unroll - for (int j = 0; j < 3; j++) - state4[j] ^= state[j]; - - ST4SL(rowOut, i + 1, state4, pad); - } - } - - static __device__ __forceinline__ - void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads, uint2 pad[Ncol / 2][Nrow][3]) - { - uint2 state1[3], state2[3], state3[3], state4[3], last[3]; - - LD4SS(state1, 2, 0, thread, threads); - LD4SS(last, rowInOut, 0, thread, threads); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + last[j]; - - round_lyra(state); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data0 = state[0]; - uint2 Data1 = state[1]; - uint2 Data2 = state[2]; - WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - last[0] ^= Data2; - last[1] ^= Data0; - last[2] ^= Data1; - } else { - last[0] ^= Data0; - last[1] ^= Data1; - last[2] ^= Data2; - } - - if (rowInOut == 5) - { - #pragma unroll - for (int j = 0; j < 3; j++) - last[j] ^= state[j]; - } - - LD4SL(state1, 2, 1, pad); - LD4SL(state2, rowInOut, 1, pad); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - round_lyra(state); - - for (int i = 2; i < Nrow; i+=2) - { - 
LD4SS(state1, 2, i, thread, threads); - LD4SS(state2, rowInOut, i, thread, threads); - LD4SL(state3, 2, i + 1, pad); - LD4SL(state4, rowInOut, i + 1, pad); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - round_lyra(state); - - //============================ - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state3[j] + state4[j]; - - round_lyra(state); - } - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= last[j]; - } - - // ================================= big local mem ========================== - static __device__ __forceinline__ - void reduceDuplex_biglocal(uint2 state[4], uint32_t thread, const uint32_t threads) - { - uint2 state1[3]; - uint2 state2[3]; - - - for (int i = 0; i < Nrow; i++) - { - ST4SSB(0, Ncol - i - 1, state, thread, threads); - - round_lyra(state); - } - - for (int i = 0; i < Nrow; i+=2) - { - LD4SSB(state1, 0, i, thread, threads); - LD4SSB(state2, 0, i + 1, thread, threads); - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] ^= state[j]; - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state2[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; - ST4SSB(1, Ncol - i - 1, state1, thread, threads); - ST4SSB(1, Ncol - (i + 1) - 1, state2, thread, threads); - } - } - - static __device__ __forceinline__ - void reduceDuplexRowSetup_biglocal(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads) - { - uint2 state1[3], state2[3], state3[3], state4[3]; - - for (int i = 0; i < Nrow; i+=2) - { - LD4SSB(state1, rowIn, i, thread, threads); - LD4SSB(state2, rowInOut, i, thread, threads); - LD4SSB(state3, rowIn, i + 1, thread, threads); - LD4SSB(state4, rowInOut, i + 1, thread, threads); - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] ^= state[j]; - - ST4SSB(rowOut, Ncol - i - 1, state1, thread, threads); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data0 = state[0]; - uint2 Data1 = state[1]; - uint2 Data2 = state[2]; - WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state2[0] ^= Data2; - state2[1] ^= Data0; - state2[2] ^= Data1; - } else { - state2[0] ^= Data0; - state2[1] ^= Data1; - state2[2] ^= Data2; - } - - ST4SSB(rowInOut, i, state2, thread, threads); - - //===================================== - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state3[j] + state4[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state3[j] ^= state[j]; - - ST4SSB(rowOut, Ncol - (i + 1) - 1, state3, thread, threads); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data01 = state[0]; - uint2 Data11 = state[1]; - uint2 Data21 = state[2]; - WarpShuffle3(Data01, Data11, Data21, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state4[0] ^= Data21; - state4[1] ^= Data01; - state4[2] ^= Data11; - } else { - state4[0] ^= Data01; - state4[1] ^= Data11; - state4[2] ^= Data21; - } - - ST4SSB(rowInOut, (i + 1), state4, thread, threads); - } - } - - static __device__ __forceinline__ - void reduceDuplexRowt_biglocal(const int rowIn, const int rowInOut, 
const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) - { - for (int i = 0; i < Nrow; i+=2) - { - uint2 state1[3], state2[3], state3[3], state4[3]; - - LD4SSB(state1, rowIn, i, thread, threads); - LD4SSB(state2, rowInOut, i, thread, threads); - LD4SSB(state3, rowIn, i + 1, thread, threads); - LD4SSB(state4, rowInOut, i + 1, thread, threads); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - LD4SSB(state1, rowOut, i, thread, threads); - - round_lyra(state); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data0 = state[0]; - uint2 Data1 = state[1]; - uint2 Data2 = state[2]; - WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state2[0] ^= Data2; - state2[1] ^= Data0; - state2[2] ^= Data1; - } - else - { - state2[0] ^= Data0; - state2[1] ^= Data1; - state2[2] ^= Data2; - } - - if (rowInOut != rowOut) { - ST4SSB(rowInOut, i, state2, thread, threads); - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] = state1[j]; - } - -#pragma unroll - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; - - ST4SSB(rowOut, i, state2, thread, threads); - - //====================================== - - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state3[j] + state4[j]; - - LD4SSB(state3, rowOut, i + 1, thread, threads); - - round_lyra(state); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data01 = state[0]; - uint2 Data11 = state[1]; - uint2 Data21 = state[2]; - WarpShuffle3(Data01, Data11, Data21, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state4[0] ^= Data21; - state4[1] ^= Data01; - state4[2] ^= Data11; - } - else - { - state4[0] ^= Data01; - state4[1] ^= Data11; - state4[2] ^= Data21; - } - - if (rowInOut != rowOut) { - ST4SSB(rowInOut, i + 1, state4, thread, threads); - #pragma unroll - for (int j = 0; j < 3; j++) - state4[j] = state3[j]; - } - -#pragma unroll - for (int j = 0; j < 3; j++) - state4[j] ^= state[j]; - - ST4SSB(rowOut, i + 1, state4, thread, threads); - } - } - - static __device__ __forceinline__ - void reduceDuplexRowt_8_biglocal(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads) - { - uint2 state1[3], state2[3], state3[3], state4[3], last[3]; - - LD4SSB(state1, 2, 0, thread, threads); - LD4SSB(last, rowInOut, 0, thread, threads); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + last[j]; - - round_lyra(state); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data0 = state[0]; - uint2 Data1 = state[1]; - uint2 Data2 = state[2]; - WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - last[0] ^= Data2; - last[1] ^= Data0; - last[2] ^= Data1; - } else { - last[0] ^= Data0; - last[1] ^= Data1; - last[2] ^= Data2; - } - - if (rowInOut == 5) - { - #pragma unroll - for (int j = 0; j < 3; j++) - last[j] ^= state[j]; - } - - LD4SSB(state1, 2, 1, thread, threads); - LD4SSB(state2, rowInOut, 1, thread, threads); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - round_lyra(state); - - for (int i = 2; i < Nrow; i+=2) - { - LD4SSB(state1, 2, i, thread, threads); - LD4SSB(state2, rowInOut, i, thread, threads); - LD4SSB(state3, 2, i + 1, thread, threads); - LD4SSB(state4, rowInOut, i + 1, thread, 
threads); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - round_lyra(state); - - //============================ - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state3[j] + state4[j]; - - round_lyra(state); - } - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= last[j]; - } - // ================================= local mem big ========================== - - // ================================= high end ========================== - static __device__ __forceinline__ - void reduceDuplex_high_end(uint2 state[4], uint32_t thread, const uint32_t threads, uint64_t *g_pad) - { - - const int offset = blockIdx.x * 64 * Nrow * (Ncol / 2) * 3; - uint64_t *g_pado = g_pad + 64 * Nrow * (Ncol / 2) * 2; - - uint2 state1[3]; - uint2 state2[3]; - - - for (int i = 0; i < Nrow; i++) - { - if ((i & 1) == 1) - ST4SS(0, Ncol - i - 1, state, thread, threads); - else - ST4SG(0, Ncol - i - 1, state, thread, threads, offset, g_pad, g_pado); - - round_lyra(state); - } - - for (int i = 0; i < Nrow; i+=2) - { - LD4SS(state1, 0, i, thread, threads); - LD4SG(state2, 0, i + 1, thread, threads, offset, g_pad, g_pado); - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] ^= state[j]; - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state2[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; - ST4SG(1, Ncol - i - 1, state1, thread, threads, offset, g_pad, g_pado); - ST4SS(1, Ncol - (i + 1) - 1, state2, thread, threads); - } - } - - static __device__ __forceinline__ - void reduceDuplexRowSetup_high_end(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads, uint64_t *g_pad) - { - const int offset = blockIdx.x * 64 * Nrow * (Ncol / 2) * 3; - uint64_t *g_pado = g_pad + 64 * Nrow * (Ncol / 2) * 2; - - uint2 state1[3], state2[3], state3[3], state4[3]; - - for (int i = 0; i < Nrow; i+=2) - { - LD4SS(state1, rowIn, i, thread, threads); - LD4SS(state2, rowInOut, i, thread, threads); - LD4SG(state3, rowIn, i + 1, thread, threads, offset, g_pad, g_pado); - LD4SG(state4, rowInOut, i + 1, thread, threads, offset, g_pad, g_pado); - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] ^= state[j]; - - ST4SG(rowOut, Ncol - i - 1, state1, thread, threads, offset, g_pad, g_pado); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data0 = state[0]; - uint2 Data1 = state[1]; - uint2 Data2 = state[2]; - WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state2[0] ^= Data2; - state2[1] ^= Data0; - state2[2] ^= Data1; - } else { - state2[0] ^= Data0; - state2[1] ^= Data1; - state2[2] ^= Data2; - } - - ST4SS(rowInOut, i, state2, thread, threads); - - //===================================== - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state3[j] + state4[j]; - - round_lyra(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state3[j] ^= state[j]; - - ST4SS(rowOut, Ncol - (i + 1) - 1, state3, thread, threads); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data01 = state[0]; - uint2 Data11 = state[1]; - uint2 Data21 = state[2]; - WarpShuffle3(Data01, Data11, Data21, 
threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state4[0] ^= Data21; - state4[1] ^= Data01; - state4[2] ^= Data11; - } else { - state4[0] ^= Data01; - state4[1] ^= Data11; - state4[2] ^= Data21; - } - - ST4SG(rowInOut, (i + 1), state4, thread, threads, offset, g_pad, g_pado); - } - } - - static __device__ __forceinline__ - void reduceDuplexRowt_high_end(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads, uint64_t *g_pad) - { - const int offset = blockIdx.x * 64 * Nrow * (Ncol / 2) * 3; - uint64_t *g_pado = g_pad + 64 * Nrow * (Ncol / 2) * 2; - - uint2 state3[3], state4[3], state5[3]; - - LD4SG(state3, rowIn, 0 + 1, thread, threads, offset, g_pad, g_pado); - LD4SG(state4, rowInOut, 0 + 1, thread, threads, offset, g_pad, g_pado); - LD4SG(state5, rowOut, 0 + 1, thread, threads, offset, g_pad, g_pado); - - for (int i = 0; i < Nrow; i+=2) - { - uint2 state1[3], state2[3], state7[3], state8[3], state9[3]; - - LD4SS(state1, rowIn, i, thread, threads); - LD4SS(state2, rowInOut, i, thread, threads); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - LD4SS(state1, rowOut, i, thread, threads); - - round_lyra(state); - LD4SG(state7, rowIn, i + 3, thread, threads, offset, g_pad, g_pado); - LD4SG(state8, rowInOut, i + 3, thread, threads, offset, g_pad, g_pado); - LD4SG(state9, rowOut, i + 3, thread, threads, offset, g_pad, g_pado); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data0 = state[0]; - uint2 Data1 = state[1]; - uint2 Data2 = state[2]; - WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state2[0] ^= Data2; - state2[1] ^= Data0; - state2[2] ^= Data1; - } - else - { - state2[0] ^= Data0; - state2[1] ^= Data1; - state2[2] ^= Data2; - } - - if (rowInOut != rowOut) { - ST4SS(rowInOut, i, state2, thread, threads); - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] = state1[j]; - } - -#pragma unroll - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; - - ST4SS(rowOut, i, state2, thread, threads); - - //====================================== - - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state3[j] + state4[j]; - - round_lyra(state); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data01 = state[0]; - uint2 Data11 = state[1]; - uint2 Data21 = state[2]; - WarpShuffle3(Data01, Data11, Data21, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - state4[0] ^= Data21; - state4[1] ^= Data01; - state4[2] ^= Data11; - } - else - { - state4[0] ^= Data01; - state4[1] ^= Data11; - state4[2] ^= Data21; - } - - if (rowInOut != rowOut) { - ST4SG(rowInOut, i + 1, state4, thread, threads, offset, g_pad, g_pado); - #pragma unroll - for (int j = 0; j < 3; j++) - state4[j] = state5[j]; - } - -#pragma unroll - for (int j = 0; j < 3; j++) - state4[j] ^= state[j]; - - ST4SG(rowOut, i + 1, state4, thread, threads, offset, g_pad, g_pado); - - #pragma unroll - for (int j = 0; j < 3; j++) - state3[j] = state7[j]; - #pragma unroll - for (int j = 0; j < 3; j++) - state4[j] = state8[j]; - #pragma unroll - for (int j = 0; j < 3; j++) - state5[j] = state9[j]; - } - } - - static __device__ __forceinline__ - void reduceDuplexRowt_8_high_end(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads, uint64_t *g_pad) - { - const int offset 
= blockIdx.x * 64 * Nrow * (Ncol / 2) * 3; - uint64_t *g_pado = g_pad + 64 * Nrow * (Ncol / 2) * 2; - - uint2 state1[3], state2[3], state3[3], state4[3], last[3]; - - LD4SS(state1, 2, 0, thread, threads); - LD4SS(last, rowInOut, 0, thread, threads); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + last[j]; - - round_lyra(state); - - // simultaneously receive data from preceding thread and send data to following thread - uint2 Data0 = state[0]; - uint2 Data1 = state[1]; - uint2 Data2 = state[2]; - WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - - if (threadIdx.x == 0) - { - last[0] ^= Data2; - last[1] ^= Data0; - last[2] ^= Data1; - } else { - last[0] ^= Data0; - last[1] ^= Data1; - last[2] ^= Data2; - } - - if (rowInOut == 5) - { - #pragma unroll - for (int j = 0; j < 3; j++) - last[j] ^= state[j]; - } - - LD4SG(state1, 2, 1, thread, threads, offset, g_pad, g_pado); - LD4SG(state2, rowInOut, 1, thread, threads, offset, g_pad, g_pado); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - round_lyra(state); - - for (int i = 2; i < Nrow; i+=2) - { - LD4SS(state1, 2, i, thread, threads); - LD4SS(state2, rowInOut, i, thread, threads); - LD4SG(state3, 2, i + 1, thread, threads, offset, g_pad, g_pado); - LD4SG(state4, rowInOut, i + 1, thread, threads, offset, g_pad, g_pado); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; - - round_lyra(state); - - //============================ - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state3[j] + state4[j]; - - round_lyra(state); - } - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= last[j]; - } - // ================================= end high ========================== - - __constant__ uint2x4 blake2b_IV[2] = { - 0xf3bcc908lu, 0x6a09e667lu, - 0x84caa73blu, 0xbb67ae85lu, - 0xfe94f82blu, 0x3c6ef372lu, - 0x5f1d36f1lu, 0xa54ff53alu, - 0xade682d1lu, 0x510e527flu, - 0x2b3e6c1flu, 0x9b05688clu, - 0xfb41bd6blu, 0x1f83d9ablu, - 0x137e2179lu, 0x5be0cd19lu - }; - - __global__ __launch_bounds__(64, 1) - void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) - { - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - - if (thread < threads) - { - uint2x4 state[4]; - - state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]); - state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]); - state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]); - state[0].w = state[1].w = __ldg(&g_hash[thread + threads * 3]); - state[2] = blake2b_IV[0]; - state[3] = blake2b_IV[1]; - - for (int i = 0; i<24; i++) - round_lyra(state); //because 12 is not enough - - ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; - ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; - ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; - ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; - } - } - - __global__ - __launch_bounds__(64, 1) - void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) - { - const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; - - if (thread < threads) - { - uint2 pad[Ncol / 2][Nrow][3]; - - uint2 state[4]; - state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]); - state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]); - state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]); - state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + 
threadIdx.x]); - - reduceDuplex(state, thread, threads, pad); - reduceDuplexRowSetup(1, 0, 2, state, thread, threads, pad); - reduceDuplexRowSetup(2, 1, 3, state, thread, threads, pad); - reduceDuplexRowSetup(3, 0, 4, state, thread, threads, pad); - reduceDuplexRowSetup(4, 3, 5, state, thread, threads, pad); - reduceDuplexRowSetup(5, 2, 6, state, thread, threads, pad); - reduceDuplexRowSetup(6, 1, 7, state, thread, threads, pad); - - uint32_t rowa; - uint32_t row = 0; - uint32_t pre = 7; - for (int i = 0; i < 7; i++) { - rowa = WarpShuffle(state[0].x, 0, 4) & 7; - reduceDuplexRowt(pre, rowa, row, state, thread, threads, pad); - pre = row; - row = (row + 3) % 8; - } - rowa = WarpShuffle(state[0].x, 0, 4) & 7; - reduceDuplexRowt_8(rowa, state, thread, threads, pad); - - DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0]; - DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1]; - DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2]; - DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3]; - } - } - - __global__ - __launch_bounds__(64, 1) - void lyra2_gpu_hash_32_2_biglocal(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) - { - const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; - - if (thread < threads) - { - uint2 state[4]; - state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]); - state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]); - state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]); - state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]); - - reduceDuplex_biglocal(state, thread, threads); - reduceDuplexRowSetup_biglocal(1, 0, 2, state, thread, threads); - reduceDuplexRowSetup_biglocal(2, 1, 3, state, thread, threads); - reduceDuplexRowSetup_biglocal(3, 0, 4, state, thread, threads); - reduceDuplexRowSetup_biglocal(4, 3, 5, state, thread, threads); - reduceDuplexRowSetup_biglocal(5, 2, 6, state, thread, threads); - reduceDuplexRowSetup_biglocal(6, 1, 7, state, thread, threads); - - uint32_t rowa; - uint32_t row = 0; - uint32_t pre = 7; - for (int i = 0; i < 7; i++) { - rowa = WarpShuffle(state[0].x, 0, 4) & 7; - reduceDuplexRowt_biglocal(pre, rowa, row, state, thread, threads); - pre = row; - row = (row + 3) % 8; - } - rowa = WarpShuffle(state[0].x, 0, 4) & 7; - reduceDuplexRowt_8_biglocal(rowa, state, thread, threads); - - DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0]; - DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1]; - DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2]; - DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3]; - } - } +#include +#include + +#define TPB52 32 + +#include "cuda_lyra2_sm2.cuh" +#include "cuda_lyra2_sm5.cuh" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 520 +#endif + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ > 500 + +#include "cuda_lyra2_vectors.h" + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c); +#endif + +#define Nrow 8 +#define Ncol 8 +#define memshift 3 + +#define BUF_COUNT 0 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT 
!= 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); +#elif BUF_COUNT == 0 + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +#else + if (row < BUF_COUNT) + { + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); + } + else + { + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; + } +#endif +} + +__device__ __forceinline__ void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + +#elif BUF_COUNT == 0 + #pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; + +#else + if (row < BUF_COUNT) + { + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + } + else + { + #pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; + } +#endif +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 
1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif + +#if __CUDA_ARCH__ > 500 || !defined(__CUDA_ARCH__) +// Blake2b G function on uint2 (lo, hi) word pairs; the first line's half-word swap is (d ^ a) rotated by 32 bits +static __device__ __forceinline__ +void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; uint2 tmp = d; d.y = a.x ^ tmp.x; d.x = a.y ^ tmp.y; + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); + c += d; b ^= c; b = ROR2(b, 63); +} +#endif + +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + +static __device__ __forceinline__ +void round_lyra(uint2x4* s) +{ + Gfunc(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc(s[0].w, s[1].w, s[2].w, s[3].w); + Gfunc(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads) +{ + uint2 state1[3]; + + #pragma unroll + for (int i = 0; i < Nrow; i++) + { + ST4S(0, Ncol - i - 1, state, thread, threads); + + round_lyra(state); + } + + #pragma unroll 4 + for (int i = 0; i < Nrow; i++) + { + LD4S(state1, 0, i, thread, threads); + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + ST4S(1, Ncol - i - 1, state1, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3]; + + #pragma unroll 1 + for (int i = 0; i < Nrow; i++) + { + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + ST4S(rowOut, Ncol - i - 1, state1, thread, threads); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + ST4S(rowInOut, i, state2, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + for (int i = 0; i < Nrow; i++) + { + uint2 state1[3], state2[3]; + + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following 
thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + ST4S(rowInOut, i, state2, thread, threads); + + LD4S(state1, rowOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + ST4S(rowOut, i, state1, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3], last[3]; + + LD4S(state1, 2, 0, thread, threads); + LD4S(last, rowInOut, 0, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + last[j]; + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == 5) + { + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < Nrow; i++) + { + LD4S(state1, 2, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +__constant__ uint2x4 blake2b_IV[2] = { + 0xf3bcc908lu, 0x6a09e667lu, + 0x84caa73blu, 0xbb67ae85lu, + 0xfe94f82blu, 0x3c6ef372lu, + 0x5f1d36f1lu, 0xa54ff53alu, + 0xade682d1lu, 0x510e527flu, + 0x2b3e6c1flu, 0x9b05688clu, + 0xfb41bd6blu, 0x1f83d9ablu, + 0x137e2179lu, 0x5be0cd19lu +}; + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&g_hash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<24; i++) + round_lyra(state); //because 12 is not enough + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} __global__ - __launch_bounds__(64, 1) - void lyra2_gpu_hash_32_2_high_end(uint32_t threads, uint32_t startNounce, uint64_t *g_pad) - { - const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; - - if (thread < threads) - { - uint2 state[4]; - state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]); - state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]); - state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]); - state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]); - - reduceDuplex_high_end(state, thread, threads, 
g_pad); - reduceDuplexRowSetup_high_end(1, 0, 2, state, thread, threads, g_pad); - reduceDuplexRowSetup_high_end(2, 1, 3, state, thread, threads, g_pad); - reduceDuplexRowSetup_high_end(3, 0, 4, state, thread, threads, g_pad); - reduceDuplexRowSetup_high_end(4, 3, 5, state, thread, threads, g_pad); - reduceDuplexRowSetup_high_end(5, 2, 6, state, thread, threads, g_pad); - reduceDuplexRowSetup_high_end(6, 1, 7, state, thread, threads, g_pad); - - uint32_t rowa; - uint32_t row = 0; - uint32_t pre = 7; - for (int i = 0; i < 7; i++) { - rowa = WarpShuffle(state[0].x, 0, 4) & 7; - reduceDuplexRowt_high_end(pre, rowa, row, state, thread, threads, g_pad); - pre = row; - row = (row + 3) % 8; - } - rowa = WarpShuffle(state[0].x, 0, 4) & 7; - reduceDuplexRowt_8_high_end(rowa, state, thread, threads, g_pad); - - DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0]; - DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1]; - DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2]; - DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3]; - } - } - - __global__ __launch_bounds__(64, 1) - void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) - { - const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; - - uint28 state[4]; - - if (thread < threads) - { - state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); - state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); - state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); - state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); - - for (int i = 0; i < 12; i++) - round_lyra(state); - - g_hash[thread + threads * 0] = state[0].x; - g_hash[thread + threads * 1] = state[0].y; - g_hash[thread + threads * 2] = state[0].z; - g_hash[thread + threads * 3] = state[0].w; - - } //thread - } - #else - #if __CUDA_ARCH__ < 500 - - /* for unsupported SM arch */ - __device__ void* DMatrix; - #endif - __global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} - __global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} - __global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} - #endif -__host__ -void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) +__launch_bounds__(TPB52, 1) +void lyra2_gpu_hash_32_2(const uint32_t threads, uint64_t *g_hash) { - // just assign the device pointer allocated in main loop - cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + if (thread < threads) + { + uint2 state[4]; + state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]); + + reduceDuplex(state, thread, threads); + reduceDuplexRowSetup(1, 0, 2, state, thread, threads); + reduceDuplexRowSetup(2, 1, 3, state, thread, threads); + reduceDuplexRowSetup(3, 0, 4, state, thread, threads); + reduceDuplexRowSetup(4, 3, 5, state, thread, threads); + reduceDuplexRowSetup(5, 2, 6, state, thread, threads); + reduceDuplexRowSetup(6, 1, 7, state, thread, threads); + + uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(7, rowa, 0, state, thread, threads); + rowa = 
WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(0, rowa, 3, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(3, rowa, 6, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(6, rowa, 1, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(1, rowa, 4, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(4, rowa, 7, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(7, rowa, 2, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt_8(rowa, state, thread, threads); + + DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3]; + } } -__host__ -void lyra2_cpu_init_high_end(int thr_id, uint32_t threads, uint64_t *g_pad) + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) { + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + g_hash[thread + threads * 0] = state[0].x; + g_hash[thread + threads * 1] = state[0].y; + g_hash[thread + threads * 2] = state[0].z; + g_hash[thread + threads * 3] = state[0].w; + } } +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *psrc = (uint2*)(&d_hash_512[offset]); + state[0].x = state[1].x = __ldg(&psrc[0]); + state[0].y = state[1].y = __ldg(&psrc[1]); + state[0].z = state[1].z = __ldg(&psrc[2]); + state[0].w = state[1].w = __ldg(&psrc[3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<24; i++) + round_lyra(state); + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) +{ + // This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 rounds + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *pdst = (uint2*)(&d_hash_512[offset]); + pdst[0] = state[0].x; + pdst[1] = state[0].y; + pdst[2] = state[0].z; + pdst[3] = state[0].w; + } +} +#else +#if __CUDA_ARCH__ < 500 + +/* for 
unsupported SM arch */ +__device__ void* DMatrix; +#endif +__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {} +__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {} +__global__ void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {} +#endif + __host__ -void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti, uint32_t high_end) +void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) { + // just assign the device pointer allocated in main loop + cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); } __host__ -void lyra2_cpu_hash_32_fancyIX(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, uint64_t *g_pad, bool gtx750ti, uint32_t high_end) +void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx750ti) { int dev_id = device_map[thr_id % MAX_GPUS]; @@ -1361,11 +579,8 @@ void lyra2_cpu_hash_32_fancyIX(int thr_id, uint32_t threads, uint32_t startNounc else if (cuda_arch[dev_id] >= 500) tpb = TPB50; else if (cuda_arch[dev_id] >= 200) tpb = TPB20; - dim3 grid0((threads * 4 + 32 - 1) / 32); - dim3 block0(4, 32 >> 2); - - dim3 grid1((threads * 4 + 64 - 1) / 64); - dim3 block1(4, 64 >> 2); + dim3 grid1((threads * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); dim3 grid2((threads + 64 - 1) / 64); dim3 block2(64); @@ -1375,16 +590,9 @@ void lyra2_cpu_hash_32_fancyIX(int thr_id, uint32_t threads, uint32_t startNounc if (cuda_arch[dev_id] >= 520) { - lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); - - if (high_end == 1) - lyra2_gpu_hash_32_2_high_end <<< grid1, block1, 12 * (8 - 0) * sizeof(uint2) * 64 >>> (threads, startNounce, g_pad); - else if (high_end == 2) - lyra2_gpu_hash_32_2_biglocal <<< grid0, block0, 24 * (8 - 0) * sizeof(uint2) * 32 >>> (threads, startNounce, g_pad); - else - lyra2_gpu_hash_32_2 <<< grid1, block1, 12 * (8 - 0) * sizeof(uint2) * 64 >>> (threads, startNounce, d_hash); - - lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash); + lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash); + lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash); } else if (cuda_arch[dev_id] >= 500) { @@ -1397,12 +605,58 @@ void lyra2_cpu_hash_32_fancyIX(int thr_id, uint32_t threads, uint32_t startNounc // suitable amount to adjust for 10warp shared_mem = 6144; - lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash); + lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash); + } + else + lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash); +} + +__host__ +void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t tpb = TPB52; + if (cuda_arch[dev_id] >= 520) tpb = TPB52; + else if (cuda_arch[dev_id] >= 500) tpb = TPB50; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; - lyra2_gpu_hash_32_2_sm5 <<< 
grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash); + dim3 grid1((size_t(threads) * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); - lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + if (cuda_arch[dev_id] >= 520) + { + const size_t shared_mem = sizeof(uint2) * tpb * 192; // 49152; + lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256); + lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + + lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256); + lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + } + else if (cuda_arch[dev_id] >= 500) + { + size_t shared_mem = gtx750ti ? 8192 : 6144; // 8 or 10 warps + lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256); + lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0); + + lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256); + lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1); + } + else { + // alternative method for SM 3.x + hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0); + lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti); + hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0); + hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1); + lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti); + hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1); } - else - lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash); } diff --git a/lyra2/cuda_lyra2Z_sm5.cuh b/lyra2/cuda_lyra2Z_sm5.cuh index d4773c2d01..1d8ae68930 100644 --- a/lyra2/cuda_lyra2Z_sm5.cuh +++ b/lyra2/cuda_lyra2Z_sm5.cuh @@ -53,7 +53,7 @@ __device__ __forceinline__ void ST4S(const int index, const uint2 data) shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; } -#if __CUDA_ARCH__ >= 300 +#if __CUDA_ARCH__ == 300 __device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) { return __shfl(a, b, c); diff --git a/lyra2/cuda_lyra2_sm2.cuh b/lyra2/cuda_lyra2_sm2.cuh index 18263b2828..cc0bd82d76 100644 --- a/lyra2/cuda_lyra2_sm2.cuh +++ b/lyra2/cuda_lyra2_sm2.cuh @@ -3,7 +3,7 @@ #ifdef __INTELLISENSE__ /* just for vstudio code colors, only uncomment that temporary, dont commit it */ //#undef __CUDA_ARCH__ -//#define __CUDA_ARCH__ 500 +//#define __CUDA_ARCH__ 300 #endif #include "cuda_helper.h" @@ -131,7 +131,7 @@ void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, } __global__ __launch_bounds__(TPB30, 1) -void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -224,5 +224,68 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h #else /* if __CUDA_ARCH__ < 200 .. 
host */ -__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} +__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {} #endif + +// ------------------------------------------------------------------------------------------------------------------------- + +// lyra2 can't be used as-is in 512-bits hash chains, tx to djm for these weird offsets since first lyra2 algo... + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350 + +__global__ __launch_bounds__(128, 8) +void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const size_t offset = (size_t) 16 * thread + (round * 8U); + uint2 *psrc = (uint2*) (&d_hash64[offset]); + uint2 *pdst = (uint2*) (&d_hash_lyra[thread]); + pdst[threads*0] = __ldg(&psrc[0]); + pdst[threads*1] = __ldg(&psrc[1]); + pdst[threads*2] = __ldg(&psrc[2]); + pdst[threads*3] = __ldg(&psrc[3]); + } +} + +__global__ __launch_bounds__(128, 8) +void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const size_t offset = (size_t) 16 * thread + (round * 8U); + uint2 *psrc = (uint2*) (&d_hash_lyra[thread]); + uint2 *pdst = (uint2*) (&d_hash64[offset]); + pdst[0] = psrc[0]; + pdst[1] = psrc[threads*1]; + pdst[2] = psrc[threads*2]; + pdst[3] = psrc[threads*3]; + } +} +#else +/* if __CUDA_ARCH__ < 200 .. host */ +__global__ void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {} +__global__ void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {} +#endif + +__host__ +void hash64_to_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + hash64_to_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round); +} + +__host__ +void hash64_from_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round) +{ + const uint32_t threadsperblock = 128; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + hash64_from_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round); +} diff --git a/lyra2/cuda_lyra2_sm5.cuh b/lyra2/cuda_lyra2_sm5.cuh index d90ff05f29..85adfd91fc 100644 --- a/lyra2/cuda_lyra2_sm5.cuh +++ b/lyra2/cuda_lyra2_sm5.cuh @@ -33,7 +33,7 @@ __device__ __forceinline__ void ST4S(const int index, const uint2 data) shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; } -#if __CUDA_ARCH__ >= 300 +#if __CUDA_ARCH__ == 300 __device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) { return __shfl(a, b, c); @@ -589,15 +589,14 @@ void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thr } __global__ __launch_bounds__(64, 1) -void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) { - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - const uint2x4 blake2b_IV[2] = { { { 0xf3bcc908, 0x6a09e667 }, { 
0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } }; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint2x4 state[4]; @@ -622,14 +621,13 @@ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha } __global__ __launch_bounds__(TPB50, 1) -void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); if (thread < threads) { uint2 state[4]; - state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]); state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]); state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]); @@ -662,14 +660,13 @@ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha } __global__ __launch_bounds__(64, 1) -void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { uint2x4 state[4]; - state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]); state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]); state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]); @@ -685,9 +682,68 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha } } +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) +{ + const uint2x4 blake2b_IV[2] = { + { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, + { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } + }; + // This kernel loads 2x 256-bits hashes from 512-bits chain offsets in 2 steps + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint2x4 state[4]; + const size_t offset = (size_t)8 * thread + (round * 4U); + uint2 *psrc = (uint2*)(&d_hash_512[offset]); + state[0].x = state[1].x = __ldg(&psrc[0]); + state[0].y = state[1].y = __ldg(&psrc[1]); + state[0].z = state[1].z = __ldg(&psrc[2]); + state[0].w = state[1].w = __ldg(&psrc[3]); + + state[1] = state[0]; + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<24; i++) + round_lyra(state); + + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) +{ + // This kernel outputs 2x 256-bits hashes in 512-bits chain offsets in 2 steps + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + if (thread < threads) + { + uint2x4 state[4]; + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + const size_t 
offset = (size_t)8 * thread + (round * 4U); + uint2 *pdst = (uint2*)(&d_hash_512[offset]); + pdst[0] = state[0].x; + pdst[1] = state[0].y; + pdst[2] = state[0].z; + pdst[3] = state[0].w; + } +} #else /* if __CUDA_ARCH__ != 500 .. host */ -__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} -__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} -__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {} +__global__ void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {} #endif diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu index df3291c1fc..a95fb800c2 100644 --- a/lyra2/cuda_lyra2v2.cu +++ b/lyra2/cuda_lyra2v2.cu @@ -1,59 +1,155 @@ -/** - * Lyra2 (v2) CUDA Implementation - * - * Based on djm34/VTC sources and incredible 2x boost by Nanashi Meiyo-Meijin (May 2016) - */ #include #include #include -#include "cuda_lyra2v2_sm3.cuh" +#define TPB52 32 +#define TPB50 32 +#define TPB30 32 +#define TPB20 32 #ifdef __INTELLISENSE__ /* just for vstudio code colors */ -#define __CUDA_ARCH__ 500 +#define __CUDA_ARCH__ 520 #endif -#define TPB 32 - -#if __CUDA_ARCH__ >= 500 - #include "cuda_lyra2_vectors.h" +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +__device__ void __threadfence_block(); +#if __CUDA_ARCH__ >= 300 +__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c); +#endif +#endif + #define Nrow 4 #define Ncol 4 #define memshift 3 -__device__ uint2x4 *DMatrix; +__device__ uint2x4 *DState; __device__ __forceinline__ uint2 LD4S(const int index) { extern __shared__ uint2 shared_mem[]; + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; } __device__ __forceinline__ void ST4S(const int index, const uint2 data) { extern __shared__ uint2 shared_mem[]; + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; } -__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c) +__device__ __forceinline__ +void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; uint2 tmp = d; d.y = a.x ^ tmp.x; d.x = a.y ^ tmp.y; + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); + c += d; b ^= c; b = ROR2(b, 63); +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) { return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); } -__device__ __forceinline__ -void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) { - a += b; d ^= a; d = SWAPUINT2(d); - c += d; b ^= c; b = ROR2(b, 24); - a += b; d ^= a; d = ROR2(d, 16); - c += d; b ^= c; b = ROR2(b, 63); + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = 
blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif + + +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc_v5(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc_v5(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); } __device__ __forceinline__ -void round_lyra_v5(uint2x4 s[4]) +void round_lyra(uint2x4* s) { Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x); Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y); @@ -66,374 +162,473 @@ void round_lyra_v5(uint2x4 s[4]) Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z); } -__device__ __forceinline__ -void round_lyra_v5(uint2 s[4]) -{ - Gfunc_v5(s[0], s[1], s[2], s[3]); - s[1] = shuffle2(s[1], threadIdx.x + 1, 4); - s[2] = shuffle2(s[2], threadIdx.x + 2, 4); - s[3] = shuffle2(s[3], threadIdx.x + 3, 4); - Gfunc_v5(s[0], s[1], s[2], s[3]); - s[1] = shuffle2(s[1], threadIdx.x + 3, 4); - s[2] = shuffle2(s[2], threadIdx.x + 2, 4); - s[3] = shuffle2(s[3], threadIdx.x + 1, 4); -} -__device__ __forceinline__ -void reduceDuplexRowSetup2(uint2 state[4]) +__device__ __forceinline__ void reduceDuplexRowSetupV2(uint2 state[4]) { - uint2 state1[Ncol][3], state0[Ncol][3], state2[3]; int i, j; + uint2 state1[Ncol][3], state0[Ncol][3], state2[3]; - #pragma unroll +#if __CUDA_ARCH__ > 500 +#pragma unroll +#endif for (int i = 0; i < Ncol; i++) { - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) state0[Ncol - i - 1][j] = state[j]; - round_lyra_v5(state); + round_lyra(state); } //#pragma unroll 4 for (i = 0; i < Ncol; i++) { - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) state[j] ^= state0[i][j]; - round_lyra_v5(state); + round_lyra(state); - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) state1[Ncol - i - 1][j] = state0[i][j]; - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) state1[Ncol - i - 1][j] ^= state[j]; } + uint32_t s0 = 0; + uint32_t s2 = 33; for (i = 0; i < Ncol; i++) { - const uint32_t s0 = memshift * Ncol * 0 + 
i * memshift; - const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift; - - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) state[j] ^= state1[i][j] + state0[i][j]; - round_lyra_v5(state); + round_lyra(state); - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) state2[j] = state1[i][j]; - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) state2[j] ^= state[j]; - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) ST4S(s2 + j, state2[j]); - uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); - uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); - uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { state0[i][0] ^= Data2; state0[i][1] ^= Data0; state0[i][2] ^= Data1; - } else { + } + else + { state0[i][0] ^= Data0; state0[i][1] ^= Data1; state0[i][2] ^= Data2; } - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) ST4S(s0 + j, state0[i][j]); - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) state0[i][j] = state2[j]; + s0 += memshift; + s2 -= memshift; } + s2 += 24; for (i = 0; i < Ncol; i++) { - const uint32_t s1 = memshift * Ncol * 1 + i*memshift; - const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift; - - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) state[j] ^= state1[i][j] + state0[Ncol - i - 1][j]; - round_lyra_v5(state); + round_lyra(state); - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) state0[Ncol - i - 1][j] ^= state[j]; - - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) - ST4S(s3 + j, state0[Ncol - i - 1][j]); + ST4S(s2 + j, state0[Ncol - i - 1][j]); - uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); - uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); - uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { state1[i][0] ^= Data2; state1[i][1] ^= Data0; state1[i][2] ^= Data1; - } else { + } + else + { state1[i][0] ^= Data0; state1[i][1] ^= Data1; state1[i][2] ^= Data2; } - #pragma unroll +#pragma unroll for (j = 0; j < 3; j++) - ST4S(s1 + j, state1[i][j]); + ST4S(s0 + j, state1[i][j]); + + s0 += memshift; + s2 -= memshift; } } -__device__ -void reduceDuplexRowt2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4]) +__device__ void reduceDuplexRowtV2(uint2 state[4]) { - uint2 state1[3], state2[3]; - const uint32_t ps1 = memshift * Ncol * rowIn; - const uint32_t ps2 = memshift * Ncol * rowInOut; - const uint32_t ps3 = memshift * Ncol * rowOut; + uint32_t rowInOut = WarpShuffle(state[0].x, 0, 4) & 3; + + uint2 state2[3], state1[3], last[3]; + // flattened shared-memory offsets: row r starts at r * Ncol * memshift = 12 * r + uint32_t s1 = 36; + uint32_t s2 = 12 * rowInOut; + uint32_t s3 = 0; for (int i = 0; i < Ncol; i++) { +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s2 + j); + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= LD4S(s1 + j) + state2[j]; + + round_lyra(state); +
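+ // note: WarpShuffle width is 4, so lane 0 reads from lane 3 and its neighbour's words arrive rotated by one; the swapped XOR order in the threadIdx.x == 0 branch compensates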
+ // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) - state1[j] = LD4S(s1 + j); + { + ST4S(s2 + j, state2[j]); + ST4S(s3 + j, LD4S(s3 + j) ^ state[j]); + } + + s1 += memshift; + s2 += memshift; + s3 += memshift; + } + // second pass: s1 rewinds to row 0 while s3 keeps advancing into row 1 + s1 = 0; + rowInOut = WarpShuffle(state[0].x, 0, 4) & 3; + s2 = 12 * rowInOut; - #pragma unroll + for (int i = 0; i < Ncol; i++) + { +#pragma unroll for (int j = 0; j < 3; j++) state2[j] = LD4S(s2 + j); - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) - state[j] ^= state1[j] + state2[j]; + state[j] ^= LD4S(s1 + j) + state2[j]; - round_lyra_v5(state); + round_lyra(state); - uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); - uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); - uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { state2[0] ^= Data2; state2[1] ^= Data0; state2[2] ^= Data1; - } else { + } + else + { state2[0] ^= Data0; state2[1] ^= Data1; state2[2] ^= Data2; } - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) + { ST4S(s2 + j, state2[j]); + ST4S(s3 + j, LD4S(s3 + j) ^ state[j]); + } + + s1 += memshift; + s2 += memshift; + s3 += memshift; + } + + rowInOut = WarpShuffle(state[0].x, 0, 4) & 3; + s2 = 12 * rowInOut; + + for (int i = 0; i < Ncol; i++) + { +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s2 + j); + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= LD4S(s1 + j) + state2[j]; + + round_lyra(state); + + // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) + { + ST4S(s2 + j, state2[j]); ST4S(s3 + j, LD4S(s3 + j) ^ state[j]); + } + + s1 += memshift; + s2 += memshift; + s3 += memshift; } -__device__ -void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4]) -{ - const int rowIn = 2; - const int rowOut = 3; + rowInOut = WarpShuffle(state[0].x, 0, 4) & 3; + s2 = 12 * rowInOut; - int i, j; - uint2 last[3]; - const uint32_t ps1 = memshift * Ncol * rowIn; - const uint32_t ps2 = memshift * Ncol * rowInOut; - - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) - last[j] = LD4S(ps2 + j); + last[j] = LD4S(s2 + j); - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) - state[j] ^= LD4S(ps1 + j) + last[j]; + state[j] ^= LD4S(s1 + j) + last[j]; - round_lyra_v5(state); + round_lyra(state); - uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); - uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); - uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
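+ // final row pass: the first column of rowInOut is kept in `last` and only folded back into the state once the remaining columns have been absorbed below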
+ // simultaneously receive data from preceding thread and send data to following thread + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - if (threadIdx.x == 0) { + if (threadIdx.x == 0) + { last[0] ^= Data2; last[1] ^= Data0; last[2] ^= Data1; - } else { + } + else + { last[0] ^= Data0; last[1] ^= Data1; last[2] ^= Data2; } - if (rowInOut == rowOut) + if (rowInOut == 3) { - #pragma unroll - for (j = 0; j < 3; j++) +#pragma unroll + for (int j = 0; j < 3; j++) last[j] ^= state[j]; } + s1 += memshift; + s2 += memshift; - for (i = 1; i < Ncol; i++) + for (int i = 1; i < Ncol; i++) { - const uint32_t s1 = ps1 + i*memshift; - const uint32_t s2 = ps2 + i*memshift; - - #pragma unroll - for (j = 0; j < 3; j++) +#pragma unroll + for (int j = 0; j < 3; j++) state[j] ^= LD4S(s1 + j) + LD4S(s2 + j); - round_lyra_v5(state); + round_lyra(state); + + s1 += memshift; + s2 += memshift; } - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) state[j] ^= last[j]; } -__global__ -__launch_bounds__(TPB, 1) -void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) +__constant__ uint28 blake2b_IV[2] = { + 0xf3bcc908lu, 0x6a09e667lu, + 0x84caa73blu, 0xbb67ae85lu, + 0xfe94f82blu, 0x3c6ef372lu, + 0x5f1d36f1lu, 0xa54ff53alu, + 0xade682d1lu, 0x510e527flu, + 0x2b3e6c1flu, 0x9b05688clu, + 0xfb41bd6blu, 0x1f83d9ablu, + 0x137e2179lu, 0x5be0cd19lu +}; + +__constant__ uint28 Mask[2] = { + 0x00000020lu, 0x00000000lu, + 0x00000020lu, 0x00000000lu, + 0x00000020lu, 0x00000000lu, + 0x00000001lu, 0x00000000lu, + 0x00000004lu, 0x00000000lu, + 0x00000004lu, 0x00000000lu, + 0x00000080lu, 0x00000000lu, + 0x00000000lu, 0x01000000lu +}; + +__global__ __launch_bounds__(64, 1) +void lyra2v2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *outputHash) { const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; - const uint2x4 blake2b_IV[2] = { - 0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL, - 0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL, - 0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL, - 0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL - }; - - const uint2x4 Mask[2] = { - 0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL, - 0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL, - 0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL, - 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL - }; - - uint2x4 state[4]; + uint28 state[4]; if (thread < threads) { - state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]); - state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]); - state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]); - state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]); + /* + state[0].x = state[1].x = __ldg(&outputHash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&outputHash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&outputHash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&outputHash[thread + threads * 3]); + */ + state[0].x = state[1].x = __ldg(&outputHash[thread * 8 + 0]); + state[0].y = state[1].y = __ldg(&outputHash[thread * 8 + 1]); + state[0].z = state[1].z = __ldg(&outputHash[thread * 8 + 2]); + state[0].w = state[1].w = __ldg(&outputHash[thread * 8 + 3]); state[2] = blake2b_IV[0]; state[3] = blake2b_IV[1]; +#pragma unroll 2 for (int i = 0; i<12; i++) - round_lyra_v5(state); + round_lyra(state); state[0] ^= Mask[0]; state[1] ^= Mask[1]; +#pragma unroll 2 for (int i = 0; 
i<12; i++) - round_lyra_v5(state); + round_lyra(state); - DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0]; - DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1]; - DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2]; - DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3]; - } + DState[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x] = state[0]; + DState[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x] = state[1]; + DState[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x] = state[2]; + DState[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x] = state[3]; + + } //thread } -__global__ -__launch_bounds__(TPB, 1) -void lyra2v2_gpu_hash_32_2(uint32_t threads) +#if __CUDA_ARCH__ < 300 +__global__ __launch_bounds__(TPB20, 1) +#elif __CUDA_ARCH__ < 500 +__global__ __launch_bounds__(TPB30, 1) +#elif __CUDA_ARCH__ == 500 +__global__ __launch_bounds__(TPB50, 1) +#else +__global__ __launch_bounds__(TPB52, 1) +#endif +void lyra2v2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) { const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; if (thread < threads) { uint2 state[4]; - state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; - state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; - state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; - state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; - - reduceDuplexRowSetup2(state); - - uint32_t rowa; - int prev = 3; + state[0] = ((uint2*)DState)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[1] = ((uint2*)DState)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[2] = ((uint2*)DState)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[3] = ((uint2*)DState)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; - for (int i = 0; i < 3; i++) - { - rowa = __shfl(state[0].x, 0, 4) & 3; - reduceDuplexRowt2(prev, rowa, i, state); - prev = i; - } + reduceDuplexRowSetupV2(state); - rowa = __shfl(state[0].x, 0, 4) & 3; - reduceDuplexRowt2x4(rowa, state); + reduceDuplexRowtV2(state); - ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0]; - ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1]; - ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2]; - ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3]; - } + ((uint2*)DState)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0]; + ((uint2*)DState)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1]; + ((uint2*)DState)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2]; + ((uint2*)DState)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3]; + } //thread } -__global__ -__launch_bounds__(TPB, 1) -void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) +__global__ __launch_bounds__(64, 1) +void lyra2v2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) { const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; - uint2x4 state[4]; + uint28 state[4]; if (thread < threads) { - state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]); - 
state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]); - state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]); - state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]); + state[0] = __ldg4(&DState[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x]); + state[1] = __ldg4(&DState[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x]); + state[2] = __ldg4(&DState[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x]); + state[3] = __ldg4(&DState[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x]); +#pragma unroll 2 for (int i = 0; i < 12; i++) - round_lyra_v5(state); + round_lyra(state); + /* outputHash[thread + threads * 0] = state[0].x; outputHash[thread + threads * 1] = state[0].y; outputHash[thread + threads * 2] = state[0].z; outputHash[thread + threads * 3] = state[0].w; - } -} - -#else -#include "cuda_helper.h" -#if __CUDA_ARCH__ < 200 -__device__ void* DMatrix; -#endif -__global__ void lyra2v2_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) {} -__global__ void lyra2v2_gpu_hash_32_2(uint32_t threads) {} -__global__ void lyra2v2_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) {} -#endif + */ + outputHash[thread * 8 + 0] = state[0].x; + outputHash[thread * 8 + 1] = state[0].y; + outputHash[thread * 8 + 2] = state[0].z; + outputHash[thread * 8 + 3] = state[0].w; + } //thread +} __host__ void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) { - cuda_get_arch(thr_id); + int dev_id = device_map[thr_id % MAX_GPUS]; // just assign the device pointer allocated in main loop - cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(DState, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); } __host__ @@ -441,29 +636,26 @@ void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uin { int dev_id = device_map[thr_id % MAX_GPUS]; - if (device_sm[dev_id] >= 500) { + uint32_t tpb = TPB52; - const uint32_t tpb = TPB; + if (cuda_arch[dev_id] > 500) tpb = TPB52; + else if (cuda_arch[dev_id] == 500) tpb = TPB50; + else if (cuda_arch[dev_id] >= 300) tpb = TPB30; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; - dim3 grid2((threads + tpb - 1) / tpb); - dim3 block2(tpb); - dim3 grid4((threads * 4 + tpb - 1) / tpb); - dim3 block4(4, tpb / 4); + dim3 grid1((threads * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); - lyra2v2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)g_hash); - lyra2v2_gpu_hash_32_2 <<< grid4, block4, 48 * sizeof(uint2) * tpb >>> (threads); - lyra2v2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)g_hash); + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); - } else { + if (cuda_arch[dev_id] < 500) + cudaFuncSetCacheConfig(lyra2v2_gpu_hash_32_2, cudaFuncCachePreferShared); - uint32_t tpb = 16; - if (cuda_arch[dev_id] >= 350) tpb = TPB35; - else if (cuda_arch[dev_id] >= 300) tpb = TPB30; - else if (cuda_arch[dev_id] >= 200) tpb = TPB20; + lyra2v2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)g_hash); - dim3 grid((threads + tpb - 1) / tpb); - dim3 block(tpb); - lyra2v2_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash); + lyra2v2_gpu_hash_32_2 <<< grid1, block1, 48 * sizeof(uint2) * tpb >>> (threads, startNounce, g_hash); - } + lyra2v2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)g_hash); + //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/lyra2/cuda_lyra2v3.cu b/lyra2/cuda_lyra2v3.cu new file mode 100644 index 0000000000..0278cabc78 --- /dev/null +++ b/lyra2/cuda_lyra2v3.cu 
@@ -0,0 +1,481 @@ +/** + * Lyra2 (v3) CUDA Implementation + * + * Based on VTC sources + */ +#include +#include +#include +#include "cuda_helper.h" + +#include "cuda_lyra2v3_sm3.cuh" + + + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#endif + +#define TPB 32 + +#if __CUDA_ARCH__ >= 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 +#define memshift 3 + + +__device__ uint2x4 *DMatrix; + +__device__ __forceinline__ uint2 LD4S(const int index) +{ + extern __shared__ uint2 shared_mem[]; + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(const int index, const uint2 data) +{ + extern __shared__ uint2 shared_mem[]; + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} + +__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ +void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; d ^= a; d = SWAPUINT2(d); + c += d; b ^= c; b = ROR2(b, 24); + a += b; d ^= a; d = ROR2(d, 16); + c += d; b ^= c; b = ROR2(b, 63); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2x4 s[4]) +{ + Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z); +} + +__device__ __forceinline__ +void round_lyra_v5(uint2 s[4]) +{ + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 1, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 3, 4); + Gfunc_v5(s[0], s[1], s[2], s[3]); + s[1] = shuffle2(s[1], threadIdx.x + 3, 4); + s[2] = shuffle2(s[2], threadIdx.x + 2, 4); + s[3] = shuffle2(s[3], threadIdx.x + 1, 4); +} + +__device__ __forceinline__ +void reduceDuplexRowSetup2(uint2 state[4]) +{ + uint2 state1[Ncol][3], state0[Ncol][3], state2[3]; + int i, j; + + #pragma unroll + for (int i = 0; i < Ncol; i++) + { + #pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] = state[j]; + round_lyra_v5(state); + } + + //#pragma unroll 4 + for (i = 0; i < Ncol; i++) + { + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state0[i][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] = state0[i][j]; + + #pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] ^= state[j]; + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[i][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state2[j] = state1[i][j]; + + #pragma unroll + for (j = 0; j < 3; j++) + state2[j] ^= state[j]; + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state0[i][0] ^= Data2; + state0[i][1] ^= Data0; + state0[i][2] ^= Data1; + } else { + state0[i][0] ^= Data0; + state0[i][1] ^= Data1; + state0[i][2] ^= Data2; + 
} + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s0 + j, state0[i][j]); + + #pragma unroll + for (j = 0; j < 3; j++) + state0[i][j] = state2[j]; + + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s1 = memshift * Ncol * 1 + i*memshift; + const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[Ncol - i - 1][j]; + + round_lyra_v5(state); + + #pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] ^= state[j]; + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s3 + j, state0[Ncol - i - 1][j]); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state1[i][0] ^= Data2; + state1[i][1] ^= Data0; + state1[i][2] ^= Data1; + } else { + state1[i][0] ^= Data0; + state1[i][1] ^= Data1; + state1[i][2] ^= Data2; + } + + #pragma unroll + for (j = 0; j < 3; j++) + ST4S(s1 + j, state1[i][j]); + } +} + +__device__ +void reduceDuplexRowt2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4]) +{ + uint2 state1[3], state2[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + const uint32_t ps3 = memshift * Ncol * rowOut; + + for (int i = 0; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + const uint32_t s3 = ps3 + i*memshift; + + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = LD4S(s1 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s2 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s3 + j, LD4S(s3 + j) ^ state[j]); + } +} + +__device__ +void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4]) +{ + const int rowIn = 2; + const int rowOut = 3; + + int i, j; + uint2 last[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] = LD4S(ps2 + j); + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= LD4S(ps1 + j) + last[j]; + + round_lyra_v5(state); + + uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4); + uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4); + uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == rowOut) + { + #pragma unroll + for (j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (i = 1; i < Ncol; i++) + { + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + + #pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= LD4S(s1 + j) + LD4S(s2 + j); + + round_lyra_v5(state); + } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} + +__global__ +__launch_bounds__(TPB, 1) +void 
lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + const uint2x4 blake2b_IV[2] = { + 0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL, + 0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL, + 0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL, + 0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL + }; + + const uint2x4 Mask[2] = { + 0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL, + 0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL, + 0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL, + 0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL + }; + + uint2x4 state[4]; + + if (thread < threads) + { + state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i<12; i++) + round_lyra_v5(state); + + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; + + for (int i = 0; i<12; i++) + round_lyra_v5(state); + + DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0]; + DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1]; + DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2]; + DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v3_gpu_hash_32_2(uint32_t threads) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + + if (thread < threads) + { + uint2 state[4]; + state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + + reduceDuplexRowSetup2(state); + + uint32_t rowa; + int prev = 3; + unsigned int instance = 0; + for (int i = 0; i < 3; i++) + { + instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4); + rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3; + + //rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2(prev, rowa, i, state); + prev = i; + } + + instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4); + rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3; + + //rowa = __shfl(state[0].x, 0, 4) & 3; + reduceDuplexRowt2x4(rowa, state); + + ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0]; + ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1]; + ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2]; + ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ +__launch_bounds__(TPB, 1) +void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint2x4 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]); + state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]); + state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]); + state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]); + 
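+		// final wrap-up: the matrix state written back by lyra2v3_gpu_hash_32_2
+		// is reloaded here, mixed by twelve more full rounds, and only state[0]
+		// (the first 256 bits) is squeezed out below as the Lyra2 result.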
+ for (int i = 0; i < 12; i++) + round_lyra_v5(state); + + outputHash[thread + threads * 0] = state[0].x; + outputHash[thread + threads * 1] = state[0].y; + outputHash[thread + threads * 2] = state[0].z; + outputHash[thread + threads * 3] = state[0].w; + } +} + +#else +#include "cuda_helper.h" +#if __CUDA_ARCH__ < 200 +__device__ void* DMatrix; +#endif +__global__ void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) {} +__global__ void lyra2v3_gpu_hash_32_2(uint32_t threads) {} +__global__ void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) {} +#endif + + +__host__ +void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) +{ + cuda_get_arch(thr_id); + // just assign the device pointer allocated in main loop + cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); +} + +__host__ +void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order) +{ + int dev_id = device_map[thr_id % MAX_GPUS]; + + if (device_sm[dev_id] >= 500) { + + const uint32_t tpb = TPB; + + dim3 grid2((threads + tpb - 1) / tpb); + dim3 block2(tpb); + dim3 grid4((threads * 4 + tpb - 1) / tpb); + dim3 block4(4, tpb / 4); + + lyra2v3_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)g_hash); + lyra2v3_gpu_hash_32_2 <<< grid4, block4, 48 * sizeof(uint2) * tpb >>> (threads); + lyra2v3_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)g_hash); + + } else { + + uint32_t tpb = 16; + if (cuda_arch[dev_id] >= 350) tpb = TPB35; + else if (cuda_arch[dev_id] >= 300) tpb = TPB30; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); + lyra2v3_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash); + + } +} + + diff --git a/lyra2/cuda_lyra2v3_sm3.cuh b/lyra2/cuda_lyra2v3_sm3.cuh new file mode 100644 index 0000000000..f84521c869 --- /dev/null +++ b/lyra2/cuda_lyra2v3_sm3.cuh @@ -0,0 +1,348 @@ +/* SM 2/3/3.5 Variant for lyra2REv2 */ + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors, only uncomment that temporary, dont commit it */ +//#undef __CUDA_ARCH__ +//#define __CUDA_ARCH__ 500 +#endif + +#define TPB20 64 +#define TPB30 64 +#define TPB35 64 + +#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500 + +#include "cuda_lyra2_vectors.h" + +#define Nrow 4 +#define Ncol 4 + +#define vectype ulonglong4 +#define memshift 4 + +__device__ vectype *DMatrix; + +static __device__ __forceinline__ +void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) +{ + a += b; d ^= a; d = ROTR64(d, 32); + c += d; b ^= c; b = ROTR64(b, 24); + a += b; d ^= a; d = ROTR64(d, 16); + c += d; b ^= c; b = ROTR64(b, 63); +} + +static __device__ __forceinline__ +void round_lyra_v35(vectype* s) +{ + Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV3(vectype state[4], uint32_t thread) +{ + vectype state1[3]; + uint32_t ps1 = (Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); + + #pragma unroll 4 + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i *memshift; + uint32_t 
s2 = ps2 - Nrow * i *memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3], state1[3]; + + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread); + + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow*i*memshift; + uint32_t s2 = ps2 + Nrow*i*memshift; + uint32_t s3 = ps3 - Nrow*i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1 )[j]); + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2 )[j]); + for (int j = 0; j < 3; j++) { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; + } + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ + vectype state1[3], state2[3]; + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); + + #pragma nounroll + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i*memshift; + uint32_t s2 = ps2 + Nrow * i*memshift; + uint32_t s3 = ps3 + Nrow * i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + for (int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if (rowInOut != rowOut) { + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } else { + + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } + } +} + +#if __CUDA_ARCH__ >= 300 +__global__ __launch_bounds__(TPB35, 1) +void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + vectype blake2b_IV[2]; + vectype padding[2]; + + if (threadIdx.x == 0) { + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0, + 0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 + ); + } + + if (thread < threads) + { + 
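+		// one hash per thread in this legacy variant: the whole 4x4 sponge
+		// matrix of a hash is kept by a single thread (vectype = ulonglong4),
+		// unlike the sm_50+ path where four threads share one state via shuffles.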
+		((uint2*)state)[0] = __ldg(&outputHash[thread]);
+		((uint2*)state)[1] = __ldg(&outputHash[thread + threads]);
+		((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]);
+		((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]);
+
+		state[1] = state[0];
+		state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0);
+		state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0);
+
+		for (int i = 0; i<12; i++)
+			round_lyra_v35(state);
+
+		state[0] ^= shuffle4(((vectype*)padding)[0], 0);
+		state[1] ^= shuffle4(((vectype*)padding)[1], 0);
+
+		for (int i = 0; i<12; i++)
+			round_lyra_v35(state);
+
+		uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
+
+		//#pragma unroll 4
+		for (int i = 0; i < 4; i++)
+		{
+			uint32_t s1 = ps1 - 4 * memshift * i;
+			for (int j = 0; j < 3; j++)
+				(DMatrix + s1)[j] = (state)[j];
+
+			round_lyra_v35(state);
+		}
+
+		reduceDuplexV3(state, thread);
+		reduceDuplexRowSetupV3(1, 0, 2, state, thread);
+		reduceDuplexRowSetupV3(2, 1, 3, state, thread);
+
+		unsigned int instance = 0;
+		uint32_t rowa;
+		int prev = 3;
+		for (int i = 0; i < 4; i++)
+		{
+			//rowa = ((uint2*)state)[0].x & 3;
+
+			instance = ((uint2*)state)[instance & 0xf].x;
+			rowa = ((uint2*)state)[instance & 0xf].x & 0x3;
+			reduceDuplexRowtV3(prev, rowa, i, state, thread);
+			prev = i;
+		}
+
+		uint32_t shift = (memshift * rowa + 16 * memshift * thread);
+
+		for (int j = 0; j < 3; j++)
+			state[j] ^= __ldg4(&(DMatrix + shift)[j]);
+
+		for (int i = 0; i < 12; i++)
+			round_lyra_v35(state);
+
+		outputHash[thread] = ((uint2*)state)[0];
+		outputHash[thread + threads] = ((uint2*)state)[1];
+		outputHash[thread + 2 * threads] = ((uint2*)state)[2];
+		outputHash[thread + 3 * threads] = ((uint2*)state)[3];
+
+	} //thread
+}
+#elif __CUDA_ARCH__ >= 200
+__global__ __launch_bounds__(TPB20, 1)
+void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	vectype state[4];
+	vectype blake2b_IV[2];
+	vectype padding[2];
+
+	((uint16*)blake2b_IV)[0] = make_uint16(
+		0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
+		0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
+		0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
+		0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
+	);
+	((uint16*)padding)[0] = make_uint16(
+		0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0,
+		0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000
+	);
+
+	if (thread < threads)
+	{
+
+		((uint2*)state)[0] = outputHash[thread];
+		((uint2*)state)[1] = outputHash[thread + threads];
+		((uint2*)state)[2] = outputHash[thread + 2 * threads];
+		((uint2*)state)[3] = outputHash[thread + 3 * threads];
+
+		state[1] = state[0];
+		state[2] = ((vectype*)blake2b_IV)[0];
+		state[3] = ((vectype*)blake2b_IV)[1];
+
+		for (int i = 0; i<12; i++)
+			round_lyra_v35(state);
+
+		state[0] ^= ((vectype*)padding)[0];
+		state[1] ^= ((vectype*)padding)[1];
+
+		for (int i = 0; i<12; i++)
+			round_lyra_v35(state);
+
+		uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
+
+		//#pragma unroll 4
+		for (int i = 0; i < 4; i++)
+		{
+			uint32_t s1 = ps1 - 4 * memshift * i;
+			for (int j = 0; j < 3; j++)
+				(DMatrix + s1)[j] = (state)[j];
+
+			round_lyra_v35(state);
+		}
+
+		reduceDuplexV3(state, thread);
+		reduceDuplexRowSetupV3(1, 0, 2, state, thread);
+		reduceDuplexRowSetupV3(2, 1, 3, state, thread);
+
+		uint instance = 0;
+		uint32_t rowa;
+		int prev = 3;
+		for (int i = 0; i < 4; i++)
+		{
+			// rowa = ((uint2*)state)[0].x & 3;
+
+			instance = ((uint2*)state)[instance & 0xf].x;
+			rowa = ((uint2*)state)[instance & 0xf].x & 0x3;
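+			// rowa is derived from the evolving state itself (the algorithm's
+			// data-dependent "wandering" phase), so this row selection cannot
+			// be precomputed or reordered.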
+			reduceDuplexRowtV3(prev, rowa, i, state, thread);
+			prev = i;
+		}
+
+		uint32_t shift = (memshift * rowa + 16 * memshift * thread);
+
+		for (int j = 0; j < 3; j++)
+			state[j] ^= __ldg4(&(DMatrix + shift)[j]);
+
+		for (int i = 0; i < 12; i++)
+			round_lyra_v35(state);
+
+		outputHash[thread] = ((uint2*)state)[0];
+		outputHash[thread + threads] = ((uint2*)state)[1];
+		outputHash[thread + 2 * threads] = ((uint2*)state)[2];
+		outputHash[thread + 3 * threads] = ((uint2*)state)[3];
+
+	} //thread
+}
+#endif
+
+#else
+/* host & sm5+ */
+__global__ void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
+#endif
diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
index 88a2133e95..b43537122b 100644
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@@ -26,7 +26,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon
 extern void skein256_cpu_init(int thr_id, uint32_t threads);
 
 extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
-extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti, uint32_t high_end);
+extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti);
 
 extern void groestl256_cpu_init(int thr_id, uint32_t threads);
 extern void groestl256_cpu_free(int thr_id);
@@ -80,7 +80,6 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 		ptarget[7] = 0x00ff;
 
 	static __thread bool gtx750ti;
-	static __thread uint32_t high_end;
 	if (!init[thr_id])
 	{
 		int dev_id = device_map[thr_id];
@@ -98,9 +97,6 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 		if (strstr(props.name, "750 Ti")) gtx750ti = true;
 		else gtx750ti = false;
 
-		if (strstr(props.name, "1080")) high_end = true;
-		else high_end = false;
-
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		blake256_cpu_init(thr_id, throughput);
@@ -134,7 +130,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 		//blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		//keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti, high_end);
+		lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti);
 
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 
 		*hashes_done = pdata[19] - first_nonce + throughput;
diff --git a/lyra2/lyra2REv3.cu b/lyra2/lyra2REv3.cu
new file mode 100644
index 0000000000..7e1b4a7046
--- /dev/null
+++ b/lyra2/lyra2REv3.cu
@@ -0,0 +1,183 @@
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_cubehash.h"
+#include "lyra2/Lyra2.h"
+}
+
+#include <miner.h>
+#include <cuda_helper.h>
+
+static uint64_t *d_hash[MAX_GPUS];
+static uint64_t* d_matrix[MAX_GPUS];
+
+extern void blake256_cpu_init(int thr_id, uint32_t threads);
+extern void blake256_cpu_setBlock_80(uint32_t *pdata);
+extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
+
+extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
+
+extern void lyra2v3_setTarget(const void *pTargetIn);
+extern void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix);
+extern void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
+
+extern void lyra2v3_cpu_hash_32_targ(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);
+
+extern void bmw256_setTarget(const void *ptarget);
+extern void bmw256_cpu_init(int thr_id, uint32_t threads);
+extern void bmw256_cpu_free(int thr_id);
+extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);
+
+extern "C" void lyra2v3_hash(void *state, const void *input)
+{
+	uint32_t hashA[8], hashB[8];
+
+	sph_blake256_context ctx_blake;
+	sph_cubehash256_context ctx_cube;
+	sph_bmw256_context ctx_bmw;
+
+	sph_blake256_set_rounds(14);
+
+	sph_blake256_init(&ctx_blake);
+	sph_blake256(&ctx_blake, input, 80);
+	sph_blake256_close(&ctx_blake, hashA);
+
+	LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
+
+	sph_cubehash256_init(&ctx_cube);
+	sph_cubehash256(&ctx_cube, hashB, 32);
+	sph_cubehash256_close(&ctx_cube, hashA);
+
+	LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
+
+	sph_bmw256_init(&ctx_bmw);
+	sph_bmw256(&ctx_bmw, hashB, 32);
+	sph_bmw256_close(&ctx_bmw, hashA);
+
+	memcpy(state, hashA, 32);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+
+extern "C" int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	int dev_id = device_map[thr_id];
+	int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 19 : 20;
+	if (strstr(device_name[dev_id], "GTX 1")) intensity = 20;
+	if (strstr(device_name[dev_id], "RTX 20")) intensity = 20;
+	uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
+	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	if (opt_benchmark)
+		ptarget[7] = 0x000f;
+
+
+	if (!init[thr_id])
+	{
+		size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;
+		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		blake256_cpu_init(thr_id, throughput);
+		bmw256_cpu_init(thr_id, throughput);
+
+		cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256
+
+		// SM 3 implementation requires a bit more memory
+		if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
+			matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
+		lyra2v3_cpu_init(thr_id, throughput, d_matrix[thr_id]);
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
+
+		api_set_throughput(thr_id, throughput);
+		init[thr_id] = true;
+	}
+
+	uint32_t endiandata[20];
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	blake256_cpu_setBlock_80(pdata);
+	bmw256_setTarget(ptarget);
+
+	do {
+		int order = 0;
+
+		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		memset(work->nonces, 0, sizeof(work->nonces));
+		bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces);
+
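+		// the bmw256 kernel also performs the final target test on the GPU
+		// (target set via bmw256_setTarget above) and returns up to two
+		// candidate nonces in work->nonces for CPU re-validation below
+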
*hashes_done = pdata[19] - first_nonce + throughput; + + if (work->nonces[0] != 0) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + lyra2v3_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + lyra2v3_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + gpu_increment_reject(thr_id); + if (!opt_quiet) + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]); + pdata[19] = work->nonces[0] + 1; + continue; + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; + + } while (!work_restart[thr_id].restart && !abort_flag); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_lyra2v3(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + cudaFree(d_matrix[thr_id]); + + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/miner.h b/miner.h index aafb624fbb..1cbd19775c 100644 --- a/miner.h +++ b/miner.h @@ -267,6 +267,7 @@ json_t * json_load_url(char* cfg_url, json_error_t *err); void sha256_init(uint32_t *state); void sha256_transform(uint32_t *state, const uint32_t *block, int swap); void sha256d(unsigned char *hash, const unsigned char *data, int len); +void gostd(void *output, const void *input, size_t len); #define HAVE_SHA256_4WAY 0 #define HAVE_SHA256_8WAY 0 @@ -274,11 +275,13 @@ void sha256d(unsigned char *hash, const unsigned char *data, int len); struct work; extern int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_anime(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_heavyhash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bastion(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds); extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_bmw512(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -287,6 +290,7 @@ extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsi extern int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t 
max_nonce, unsigned long *hashes_done); +extern int scanhash_gostd(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_groestlcoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_hmq17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_heavy(int thr_id,struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen); @@ -308,6 +312,7 @@ extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, uns extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sha256d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sha256t(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_sha3d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -327,7 +332,12 @@ extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsig extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x16rt(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x16rv2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x21s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); /* free device allocated memory per algo */ @@ -340,6 +350,7 @@ extern void free_bitcore(int thr_id); extern void free_blake256(int thr_id); extern void free_blake2s(int thr_id); extern void free_bmw(int thr_id); +extern void free_bmw512(int thr_id); extern void free_c11(int thr_id); extern void free_cryptolight(int thr_id); extern void free_cryptonight(int thr_id); @@ -348,6 +359,7 @@ extern void free_deep(int thr_id); extern void free_keccak256(int thr_id); extern void free_fresh(int thr_id); extern void free_fugue256(int thr_id); +extern void free_gostd(int thr_id); extern void free_groestlcoin(int thr_id); extern void free_heavy(int thr_id); extern void free_hmq17(int thr_id); @@ -369,6 +381,7 @@ extern void free_quark(int thr_id); extern void free_qubit(int thr_id); extern void free_sha256d(int thr_id); extern void free_sha256t(int thr_id); +extern void free_sha3d(int thr_id); extern void free_sia(int thr_id); extern void free_sib(int thr_id); extern void free_skeincoin(int thr_id); @@ -377,7 +390,6 @@ extern void free_skunk(int 
thr_id); extern void free_s3(int thr_id); extern void free_timetravel(int thr_id); extern void free_tribus(int thr_id); -extern void free_bitcore(int thr_id); extern void free_vanilla(int thr_id); extern void free_veltor(int thr_id); extern void free_whirl(int thr_id); @@ -387,7 +399,12 @@ extern void free_x11(int thr_id); extern void free_x13(int thr_id); extern void free_x14(int thr_id); extern void free_x15(int thr_id); +extern void free_x16r(int thr_id); +extern void free_x16rt(int thr_id); +extern void free_x16rv2(int thr_id); +extern void free_x16s(int thr_id); extern void free_x17(int thr_id); +extern void free_x21s(int thr_id); extern void free_zr5(int thr_id); /* api related */ void *api_thread(void *userdata); @@ -882,6 +899,7 @@ void blake256hash(void *output, const void *input, int8_t rounds); void blake2b_hash(void *output, const void *input); void blake2s_hash(void *output, const void *input); void bmw_hash(void *state, const void *input); +void bmw512_hash(void *state, const void *input); void c11hash(void *output, const void *input); void cryptolight_hash(void* output, const void* input, int len); void cryptonight_hash(void* output, const void* input, size_t len); @@ -890,10 +908,12 @@ void deephash(void *state, const void *input); void luffa_hash(void *state, const void *input); void fresh_hash(void *state, const void *input); void fugue256_hash(unsigned char* output, const unsigned char* input, int len); +void gostd_hash(void *output, const void *input); void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); void hmq17hash(void *output, const void *input); void hsr_hash(void *output, const void *input); void keccak256_hash(void *state, const void *input); +void sha3d_hash(void *state, const void *input); void jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); void jha_hash(void *output, const void *input); @@ -930,7 +950,12 @@ void x11hash(void *output, const void *input); void x13hash(void *output, const void *input); void x14hash(void *output, const void *input); void x15hash(void *output, const void *input); +void x16r_hash(void *output, const void *input); +void x16rt_hash(void *output, const void *input); +void x16rv2_hash(void *output, const void *input); +void x16s_hash(void *output, const void *input); void x17hash(void *output, const void *input); +void x21s_hash(void *output, const void *input); void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize); void zr5hash(void *output, const void *input); void zr5hash_pok(void *output, uint32_t *pdata); diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu index eb25ad0ea7..4de386545b 100644 --- a/neoscrypt/cuda_neoscrypt.cu +++ b/neoscrypt/cuda_neoscrypt.cu @@ -179,7 +179,7 @@ static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2) idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ - idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } @@ -392,7 +392,7 @@ void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const ui idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ a += b; d = __byte_perm(d^a, 0, 0x1032); \ c += d; b = rotateR(b^c, 12); \ - idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \ a += b; d = __byte_perm(d^a, 0, 
0x0321); \ c += d; b = rotateR(b^c, 7); \ } @@ -1260,7 +1260,7 @@ uint32_t fastkdf32_v3(uint32_t thread, const uint32_t nonce, uint32_t* const sal idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ a += b; d = ROTR32(d^a,16); \ c += d; b = ROTR32(b^c, 12); \ - idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ + idx = BLAKE2S_SIGMA_host[idx0][idx1 + 1]; a += key[idx]; \ a += b; d = ROTR32(d^a,8); \ c += d; b = ROTR32(b^c, 7); \ } @@ -1319,7 +1319,6 @@ static void Blake2Shost(uint32_t * inout, const uint32_t * inkey) } -#define SHIFT 128U #define TPB 32 #define TPB2 64 @@ -1346,7 +1345,7 @@ __launch_bounds__(TPB, 1) void neoscrypt_gpu_hash_chacha1() { const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); - const uint32_t shift = SHIFT * 8U * (thread & 8191); + const uint32_t threads = (gridDim.x * blockDim.y); const uint32_t shiftTr = 8U * thread; uint4 X[4]; @@ -1361,7 +1360,7 @@ void neoscrypt_gpu_hash_chacha1() #pragma nounroll for (int i = 0; i < 128; i++) { - uint32_t offset = shift + i * 8U; + uint32_t offset = 8U * (thread + threads * i); for (int j = 0; j < 4; j++) ((uint4*)(W + offset))[j * 4 + threadIdx.x] = X[j]; neoscrypt_chacha(X); @@ -1370,7 +1369,7 @@ void neoscrypt_gpu_hash_chacha1() #pragma nounroll for (int t = 0; t < 128; t++) { - uint32_t offset = shift + (WarpShuffle(X[3].x, 0, 4) & 0x7F) * 8U; + uint32_t offset = 8U * (thread + threads * (WarpShuffle(X[3].x, 0, 4) & 0x7F)); for (int j = 0; j < 4; j++) X[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; neoscrypt_chacha(X); @@ -1391,7 +1390,7 @@ __launch_bounds__(TPB, 1) void neoscrypt_gpu_hash_salsa1() { const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); - const uint32_t shift = SHIFT * 8U * (thread & 8191); + const uint32_t threads = (gridDim.x * blockDim.y); const uint32_t shiftTr = 8U * thread; uint4 Z[4]; @@ -1406,7 +1405,7 @@ void neoscrypt_gpu_hash_salsa1() #pragma nounroll for (int i = 0; i < 128; i++) { - uint32_t offset = shift + i * 8U; + uint32_t offset = 8U * (thread + threads * i); for (int j = 0; j < 4; j++) ((uint4*)(W + offset))[j * 4 + threadIdx.x] = Z[j]; neoscrypt_salsa(Z); @@ -1415,7 +1414,7 @@ void neoscrypt_gpu_hash_salsa1() #pragma nounroll for (int t = 0; t < 128; t++) { - uint32_t offset = shift + (WarpShuffle(Z[3].x, 0, 4) & 0x7F) * 8U; + uint32_t offset = 8U * (thread + threads * (WarpShuffle(Z[3].x, 0, 4) & 0x7F)); for (int j = 0; j < 4; j++) Z[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; neoscrypt_salsa(Z); @@ -1474,7 +1473,7 @@ void neoscrypt_init(int thr_id, uint32_t threads) cuda_get_arch(thr_id); CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * min(8192, threads))); + CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * threads)); CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads)); CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads)); CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads)); @@ -1550,4 +1549,3 @@ void neoscrypt_setBlockTarget(uint32_t* const pdata, uint32_t* const target) cudaMemcpyToSymbol(c_data, PaddedMessage, 64 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); CUDA_SAFE_CALL(cudaGetLastError()); } - diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp index 22cfbd6a86..ce701d03f0 100644 --- a/neoscrypt/neoscrypt.cpp +++ b/neoscrypt/neoscrypt.cpp @@ -22,6 +22,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign int dev_id = device_map[thr_id]; 
 	int intensity = is_windows() ? 18 : 19;
 	if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB
+	if (strstr(device_name[dev_id], "TITAN")) intensity = 21;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	throughput = throughput / 32; /* set for max intensity ~= 20 */
@@ -117,4 +118,4 @@ void free_neoscrypt(int thr_id)
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
+}
\ No newline at end of file
diff --git a/quark/animecoin.cu b/quark/animecoin.cu
new file mode 100644
index 0000000000..b28d5b3a31
--- /dev/null
+++ b/quark/animecoin.cu
@@ -0,0 +1,339 @@
+extern "C"
+{
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+}
+
+#include "miner.h"
+
+#include "cuda_helper.h"
+#include "cuda_quark.h"
+
+#include <stdio.h>
+
+extern uint32_t quark_filter_cpu_sm2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2);
+extern void quark_merge_cpu_sm2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2);
+
+static uint32_t *d_hash[MAX_GPUS];
+static uint32_t* d_hash_br2[MAX_GPUS]; // SM 2
+
+// memory to generate the nonce vectors for the conditional hashes
+static uint32_t *d_branch1Nonces[MAX_GPUS];
+static uint32_t *d_branch2Nonces[MAX_GPUS];
+static uint32_t *d_branch3Nonces[MAX_GPUS];
+
+// original animehash function, from a miner source
+extern "C" void animehash(void *state, const void *input)
+{
+	uint32_t _ALIGN(128) hash[64];
+
+	sph_bmw512_context ctx_bmw;
+	sph_blake512_context ctx_blake;
+	sph_groestl512_context ctx_groestl;
+	sph_jh512_context ctx_jh;
+	sph_keccak512_context ctx_keccak;
+	sph_skein512_context ctx_skein;
+
+	sph_bmw512_init(&ctx_bmw);
+	sph_bmw512 (&ctx_bmw, input, 80);
+	sph_bmw512_close(&ctx_bmw, (void*) hash);
+
+	sph_blake512_init(&ctx_blake);
+	sph_blake512 (&ctx_blake, (const void*) hash, 64);
+	sph_blake512_close(&ctx_blake, (void*) hash);
+
+	if (hash[0] & 0x8)
+	{
+		sph_groestl512_init(&ctx_groestl);
+		sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
+		sph_groestl512_close(&ctx_groestl, (void*) hash);
+	}
+	else
+	{
+		sph_skein512_init(&ctx_skein);
+		sph_skein512 (&ctx_skein, (const void*) hash, 64);
+		sph_skein512_close(&ctx_skein, (void*) hash);
+	}
+
+	sph_groestl512_init(&ctx_groestl);
+	sph_groestl512 (&ctx_groestl, (const void*) hash, 64);
+	sph_groestl512_close(&ctx_groestl, (void*) hash);
+
+	sph_jh512_init(&ctx_jh);
+	sph_jh512 (&ctx_jh, (const void*) hash, 64);
+	sph_jh512_close(&ctx_jh, (void*) hash);
+
+	if (hash[0] & 0x8)
+	{
+		sph_blake512_init(&ctx_blake);
+		sph_blake512 (&ctx_blake, (const void*) hash, 64);
+		sph_blake512_close(&ctx_blake, (void*) hash);
+	}
+	else
+	{
+		sph_bmw512_init(&ctx_bmw);
+		sph_bmw512 (&ctx_bmw, (const void*) hash, 64);
+		sph_bmw512_close(&ctx_bmw, (void*) hash);
+	}
+
+	sph_keccak512_init(&ctx_keccak);
+	sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
+	sph_keccak512_close(&ctx_keccak, (void*) hash);
+
+	sph_skein512_init(&ctx_skein);
+	sph_skein512 (&ctx_skein, (const void*) hash, 64);
+	sph_skein512_close(&ctx_skein, (void*) hash);
+
+	if (hash[0] & 0x8)
+	{
+		sph_keccak512_init(&ctx_keccak);
+		sph_keccak512 (&ctx_keccak, (const void*) hash, 64);
+		sph_keccak512_close(&ctx_keccak, (void*) hash);
+	}
+	else
+	{
+		sph_jh512_init(&ctx_jh);
+		sph_jh512 (&ctx_jh, (const void*) hash, 64);
+		sph_jh512_close(&ctx_jh, (void*) hash);
+	}
+
+	memcpy(state, hash, 32);
+}
+
+#ifdef _DEBUG
+#define TRACE(algo) { \
+	if (max_nonce == 1 && pdata[19] <= 1) { \
+		uint32_t* debugbuf = NULL; \
+		cudaMallocHost(&debugbuf, 32); \
+		cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \
+		printf("anime %s %08x %08x %08x %08x...%08x... \n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
+			swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \
+		cudaFreeHost(debugbuf); \
+	} \
+}
+#else
+#define TRACE(algo) {}
+#endif
+
+static bool init[MAX_GPUS] = { 0 };
+
+extern "C" int scanhash_anime(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t _ALIGN(64) endiandata[20];
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+
+	int dev_id = device_map[thr_id];
+	uint32_t def_thr = 1U << 20; // 256*4096
+	uint32_t throughput = cuda_default_throughput(thr_id, def_thr);
+	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	if (opt_benchmark)
+		ptarget[7] = 0x00F;
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		cudaGetLastError();
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
+
+		quark_blake512_cpu_init(thr_id, throughput);
+		quark_groestl512_cpu_init(thr_id, throughput);
+		quark_skein512_cpu_init(thr_id, throughput);
+		quark_bmw512_cpu_init(thr_id, throughput);
+		quark_keccak512_cpu_init(thr_id, throughput);
+		quark_jh512_cpu_init(thr_id, throughput);
+		quark_compactTest_cpu_init(thr_id, throughput);
+
+		if (cuda_arch[dev_id] >= 300) {
+			cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput);
+			cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput);
+			cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput);
+		} else {
+			cudaMalloc(&d_hash_br2[thr_id], (size_t) 64 * throughput);
+		}
+
+		cuda_check_cpu_init(thr_id, throughput);
+		CUDA_SAFE_CALL(cudaGetLastError());
+
+		init[thr_id] = true;
+	}
+
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	quark_bmw512_cpu_setBlock_80(endiandata);
+	cuda_check_cpu_setTarget(ptarget);
+
+	do {
+		int order = 0;
+		uint32_t nrm1=0, nrm2=0, nrm3=0;
+
+		quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		TRACE("bmw :");
+		quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		TRACE("blake :");
+
+		if (cuda_arch[dev_id] >= 300) {
+
+			quark_compactTest_single_false_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], NULL,
+				d_branch3Nonces[thr_id], &nrm3, order++);
+
+			// only the skein branch is followed further
+			quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+
+			// this is the unconditional branch for Groestl512
+			quark_groestl512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+
+			// this is the unconditional branch for JH512
+			quark_jh512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+
+			// split the quark nonces into branch1 and branch2 according to if (hash[0] & 0x8)
+			quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
+				d_branch1Nonces[thr_id], &nrm1,
+				d_branch2Nonces[thr_id], &nrm2,
+				order++);
+
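+			// compacting the surviving nonces on the GPU lets the two
+			// conditional kernels below run dense grids of nrm1/nrm2 threads
+			// instead of predicated launches over the full throughput
+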
+			// this is the conditional branch for Blake512
+			quark_blake512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
+
+			// this is the conditional branch for Bmw512
+			quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
+
+			// this is the unconditional branch for Keccak512
+			quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++;
+
+			// this is the unconditional branch for Skein512
+			quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+
+			// split the quark nonces into branch1 and branch2 according to if (hash[0] & 0x8)
+			quark_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
+				d_branch1Nonces[thr_id], &nrm1,
+				d_branch2Nonces[thr_id], &nrm2,
+				order++);
+
+			quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++;
+			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
+
+			work->nonces[0] = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+			work->nonces[1] = 0;
+		} else {
+			/* algo permutations are made with 2 different buffers */
+
+			quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
+			quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+			quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++);
+			quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
+			TRACE("perm1 :");
+
+			quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+			TRACE("groestl:");
+			quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+			TRACE("jh512 :");
+
+			quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
+			quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+			quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++);
+			quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
+			TRACE("perm2 :");
+
+			quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++;
+			TRACE("keccak :");
+			quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+			TRACE("skein :");
+
+			quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
+			quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++;
+			quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++);
+			quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]);
+			TRACE("perm3 :");
+
+			CUDA_LOG_ERROR();
+			work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+			work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+		}
+
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		if (work->nonces[0] != UINT32_MAX)
+		{
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			animehash(vhash, endiandata);
+
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					animehash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				}
+				else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > ptarget[7]) {
+				gpu_increment_reject(thr_id);
+				if (!opt_quiet)
+					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
+			}
+		}
+
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
+	return 0;
+}
+
+// cleanup
+extern "C" void free_anime(int thr_id)
+{
+	int dev_id = device_map[thr_id];
+	if (!init[thr_id])
+		return;
+
+	cudaDeviceSynchronize();
+
+	cudaFree(d_hash[thr_id]);
+
+	if (cuda_arch[dev_id] >= 300) {
+		cudaFree(d_branch1Nonces[thr_id]);
+		cudaFree(d_branch2Nonces[thr_id]);
+		cudaFree(d_branch3Nonces[thr_id]);
+	} else {
+		cudaFree(d_hash_br2[thr_id]);
+	}
+
+	quark_blake512_cpu_free(thr_id);
+	quark_groestl512_cpu_free(thr_id);
+	quark_compactTest_cpu_free(thr_id);
+
+	cuda_check_cpu_free(thr_id);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
\ No newline at end of file
diff --git a/quark/cuda_bmw512.cu b/quark/cuda_bmw512.cu
index 6011beb513..f6f47b5cd0 100644
--- a/quark/cuda_bmw512.cu
+++ b/quark/cuda_bmw512.cu
@@ -1,19 +1,657 @@
+/*
+Based on SP's BMW kernel
+Provos Alexis - 2016
+Optimized for pascal sp - may 2018
+*/
+
 #include <stdio.h>
 #include <memory.h>
 
 #define WANT_BMW512_80
-#include "cuda_helper.h"
+#include "cuda_helper_alexis.h"
+#include "cuda_vectors_alexis.h"
 
-__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
+#define CONST_EXP3d(i) devectorize(ROL2(q[i+ 1], 5)) + devectorize(ROL2(q[i+ 3],11)) + devectorize(ROL2(q[i+5], 27)) + \
+	devectorize(SWAPDWORDS2(q[i+7])) + devectorize(ROL2(q[i+9], 37)) + devectorize(ROL2(q[i+11],43)) + \
+	devectorize(ROL2(q[i+13],53)) + devectorize(SHR2(q[i+14],1) ^ q[i+14]) + devectorize(SHR2(q[i+15],2) ^ q[i+15])
+
+__device__ __forceinline__
+static void bmw512_round1(uint2* q, uint2* h, const uint64_t* msg){
+	const uint2 hash[16] =
+	{
+		{ 0x84858687, 0x80818283 }, { 0x8C8D8E8F, 0x88898A8B }, { 0x94959697, 0x90919293 }, { 0x9C9D9E9F, 0x98999A9B },
+		{ 0xA4A5A6A7, 0xA0A1A2A3 }, { 0xACADAEAF, 0xA8A9AAAB }, { 0xB4B5B6B7, 0xB0B1B2B3 }, { 0xBCBDBEBF, 0xB8B9BABB },
+		{ 0xC4C5C6C7, 0xC0C1C2C3 }, { 0xCCCDCECF, 0xC8C9CACB }, { 0xD4D5D6D7, 0xD0D1D2D3 }, { 0xDCDDDEDF, 0xD8D9DADB },
+		{ 0xE4E5E6E7, 0xE0E1E2E3 }, { 0xECEDEEEF, 0xE8E9EAEB }, { 0xF4F5F6F7, 0xF0F1F2F3 }, { 0xFCFDFEFF, 0xF8F9FAFB }
+	};
+
+	const uint64_t hash2[16] =
+	{
+		0x8081828384858687, 0x88898A8B8C8D8E8F, 0x9091929394959697, 0x98999A9B9C9D9E9F,
+		0xA0A1A2A3A4A5A6A7, 0xA8A9AAABACADAEAF, 0xB0B1B2B3B4B5B6B7, 0xB8B9BABBBCBDBEBF,
+		0xC0C1C2C3C4C5C6C7 ^ 0x80, 0xC8C9CACBCCCDCECF, 0xD0D1D2D3D4D5D6D7, 0xD8D9DADBDCDDDEDF,
+		0xE0E1E2E3E4E5E6E7, 0xE8E9EAEBECEDEEEF, 0xF0F1F2F3F4F5F6F7, 0xF8F9FAFBFCFDFEFF
+	};
+
+	const uint2 precalcf[9] =
+	{
+		{ 0x55555550, 0x55555555 }, { 0xAAAAAAA5, 0x5AAAAAAA }, { 0xFFFFFFFA, 0x5FFFFFFF }, { 0x5555554F, 0x65555555 },
+		{ 0xAAAAAAA4, 0x6AAAAAAA }, { 0xFE00FFF9, 0x6FFFFFFF }, { 0xAAAAAAA1, 0x9AAAAAAA }, { 0xFFFEFFF6, 0x9FFFFFFF }, { 0x5755554B, 0xA5555555 }
+	};
+
+	uint2 tmp;
+	uint64_t mxh[8];
+
+	mxh[0] = msg[0] ^ hash2[0];
+	mxh[1] = msg[1] ^ hash2[1];
+	mxh[2] = msg[2] ^ hash2[2];
+	mxh[3] = msg[3] ^ hash2[3];
+	mxh[4] = msg[4] ^ hash2[4];
+	mxh[5] = msg[5] ^ hash2[5];
+	mxh[6] = msg[6] ^ hash2[6];
+	mxh[7] = msg[7] ^ hash2[7];
+
+	tmp = vectorize(mxh[5] - mxh[7]) + hash[10] + hash[13] + hash[14];
+	q[0] = hash[1] +
(SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ ROL2(tmp, 4) ^ ROL2(tmp, 37)); + + tmp = vectorize(mxh[6]) + hash[11] + hash[14] - (hash[15] ^ 512) - (hash[8] ^ 0x80); + q[1] = hash[2] + (SHR2(tmp, 1) ^ SHL2(tmp, 2) ^ ROL2(tmp, 13) ^ ROL2(tmp, 43)); + + tmp = vectorize(mxh[0] + mxh[7]) + hash[9] - hash[12] + (hash[15] ^ 0x200); + q[2] = hash[3] + (SHR2(tmp, 2) ^ SHL2(tmp, 1) ^ ROL2(tmp, 19) ^ ROL2(tmp, 53)); + + q[16] = (SHR2(q[0], 1) ^ SHL2(q[0], 2) ^ ROL2(q[0], 13) ^ ROL2(q[0], 43)) + (SHR2(q[1], 2) ^ SHL2(q[1], 1) ^ ROL2(q[1], 19) ^ ROL2(q[1], 53)); + q[17] = (SHR2(q[1], 1) ^ SHL2(q[1], 2) ^ ROL2(q[1], 13) ^ ROL2(q[1], 43)) + (SHR2(q[2], 2) ^ SHL2(q[2], 1) ^ ROL2(q[2], 19) ^ ROL2(q[2], 53)); + + tmp = vectorize((mxh[0] - mxh[1]) + hash2[8] - hash2[10] + hash2[13]); + q[3] = hash[4] + (SHR2(tmp, 2) ^ SHL2(tmp, 2) ^ ROL2(tmp, 28) ^ ROL2(tmp, 59)); + + tmp = vectorize((mxh[1] + mxh[2]) + hash2[9] - hash2[11] - hash2[14]); + q[4] = hash[5] + (SHR2(tmp, 1) ^ tmp); + + q[16] += (SHR2(q[2], 2) ^ SHL2(q[2], 2) ^ ROL2(q[2], 28) ^ ROL2(q[2], 59)) + (SHR2(q[3], 1) ^ SHL2(q[3], 3) ^ ROL2(q[3], 4) ^ ROL2(q[3], 37)); + q[17] += (SHR2(q[3], 2) ^ SHL2(q[3], 2) ^ ROL2(q[3], 28) ^ ROL2(q[3], 59)) + (SHR2(q[4], 1) ^ SHL2(q[4], 3) ^ ROL2(q[4], 4) ^ ROL2(q[4], 37)); + + tmp = vectorize((mxh[3] - mxh[2] + hash2[10] - hash2[12] + (512 ^ hash2[15]))); + q[5] = hash[6] + (SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ ROL2(tmp, 4) ^ ROL2(tmp, 37)); + + tmp = vectorize((mxh[4]) - (mxh[0]) - (mxh[3]) + hash2[13] - hash2[11]); + q[6] = hash[7] + (SHR2(tmp, 1) ^ SHL2(tmp, 2) ^ ROL2(tmp, 13) ^ ROL2(tmp, 43)); + + q[16] += (SHR2(q[4], 1) ^ SHL2(q[4], 2) ^ ROL2(q[4], 13) ^ ROL2(q[4], 43)) + (SHR2(q[5], 2) ^ SHL2(q[5], 1) ^ ROL2(q[5], 19) ^ ROL2(q[5], 53)); + q[17] += (SHR2(q[5], 1) ^ SHL2(q[5], 2) ^ ROL2(q[5], 13) ^ ROL2(q[5], 43)) + (SHR2(q[6], 2) ^ SHL2(q[6], 1) ^ ROL2(q[6], 19) ^ ROL2(q[6], 53)); + + tmp = vectorize((mxh[1]) - (mxh[4]) - (mxh[5]) - hash2[12] - hash2[14]); + q[7] = hash[8] + (SHR2(tmp, 2) ^ SHL2(tmp, 1) ^ ROL2(tmp, 19) ^ ROL2(tmp, 53)); + + tmp = vectorize((mxh[2]) - (mxh[5]) - (mxh[6]) + hash2[13] - (512 ^ hash2[15])); + q[8] = hash[9] + (SHR2(tmp, 2) ^ SHL2(tmp, 2) ^ ROL2(tmp, 28) ^ ROL2(tmp, 59)); + + q[16] += (SHR2(q[6], 2) ^ SHL2(q[6], 2) ^ ROL2(q[6], 28) ^ ROL2(q[6], 59)) + (SHR2(q[7], 1) ^ SHL2(q[7], 3) ^ ROL2(q[7], 4) ^ ROL2(q[7], 37)); + q[17] += (SHR2(q[7], 2) ^ SHL2(q[7], 2) ^ ROL2(q[7], 28) ^ ROL2(q[7], 59)) + (SHR2(q[8], 1) ^ SHL2(q[8], 3) ^ ROL2(q[8], 4) ^ ROL2(q[8], 37)); + + tmp = vectorize((mxh[0]) - (mxh[3]) + (mxh[6]) - (mxh[7]) + (hash2[14])); + q[9] = hash[10] + (SHR2(tmp, 1) ^ tmp); + + tmp = vectorize((512 ^ hash2[15]) + hash2[8] - (mxh[1]) - (mxh[4]) - (mxh[7])); + q[10] = hash[11] + (SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ ROL2(tmp, 4) ^ ROL2(tmp, 37)); + + q[16] += (SHR2(q[8], 1) ^ SHL2(q[8], 2) ^ ROL2(q[8], 13) ^ ROL2(q[8], 43)) + (SHR2(q[9], 2) ^ SHL2(q[9], 1) ^ ROL2(q[9], 19) ^ ROL2(q[9], 53)); + q[17] += (SHR2(q[9], 1) ^ SHL2(q[9], 2) ^ ROL2(q[9], 13) ^ ROL2(q[9], 43)) + (SHR2(q[10], 2) ^ SHL2(q[10], 1) ^ ROL2(q[10], 19) ^ ROL2(q[10], 53)); + + tmp = vectorize(hash2[9] + hash2[8] - (mxh[0]) - (mxh[2]) - (mxh[5])); + q[11] = hash[12] + (SHR2(tmp, 1) ^ SHL2(tmp, 2) ^ ROL2(tmp, 13) ^ ROL2(tmp, 43)); + + tmp = vectorize((mxh[1]) + (mxh[3]) - (mxh[6]) + hash2[10] - hash2[9]); + q[12] = hash[13] + (SHR2(tmp, 2) ^ SHL2(tmp, 1) ^ ROL2(tmp, 19) ^ ROL2(tmp, 53)); + + q[16] += (SHR2(q[10], 2) ^ SHL2(q[10], 2) ^ ROL2(q[10], 28) ^ ROL2(q[10], 59)) + (SHR2(q[11], 1) ^ SHL2(q[11], 3) ^ ROL2(q[11], 4) ^ ROL2(q[11], 37)); + q[17] 
+= (SHR2(q[11], 2) ^ SHL2(q[11], 2) ^ ROL2(q[11], 28) ^ ROL2(q[11], 59)) + (SHR2(q[12], 1) ^ SHL2(q[12], 3) ^ ROL2(q[12], 4) ^ ROL2(q[12], 37)); + + tmp = vectorize((mxh[2]) + (mxh[4]) + (mxh[7]) + hash2[10] + hash2[11]); + q[13] = hash[14] + (SHR2(tmp, 2) ^ SHL2(tmp, 2) ^ ROL2(tmp, 28) ^ ROL2(tmp, 59)); + + tmp = vectorize((mxh[3]) - (mxh[5]) + hash2[8] - hash2[11] - hash2[12]); + q[14] = hash[15] + (SHR2(tmp, 1) ^ tmp); + + q[16] += (SHR2(q[12], 1) ^ SHL2(q[12], 2) ^ ROL2(q[12], 13) ^ ROL2(q[12], 43)) + (SHR2(q[13], 2) ^ SHL2(q[13], 1) ^ ROL2(q[13], 19) ^ ROL2(q[13], 53)); + q[17] += (SHR2(q[13], 1) ^ SHL2(q[13], 2) ^ ROL2(q[13], 13) ^ ROL2(q[13], 43)) + (SHR2(q[14], 2) ^ SHL2(q[14], 1) ^ ROL2(q[14], 19) ^ ROL2(q[14], 53)); + + tmp = vectorize(hash2[12] - hash2[9] + hash2[13] - (mxh[4]) - (mxh[6])); + q[15] = hash[0] + (SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ ROL2(tmp, 4) ^ ROL2(tmp, 37)); + + q[16] += (SHR2(q[14], 2) ^ SHL2(q[14], 2) ^ ROL2(q[14], 28) ^ ROL2(q[14], 59)) + (SHR2(q[15], 1) ^ SHL2(q[15], 3) ^ ROL2(q[15], 4) ^ ROL2(q[15], 37)) + + ((precalcf[0] + ROTL64(msg[0], 1) + ROTL64(msg[3], 4)) ^ hash[7]); + + q[17] += + (SHR2(q[15], 2) ^ SHL2(q[15], 2) ^ ROL2(q[15], 28) ^ ROL2(q[15], 59)) + (SHR2(q[16], 1) ^ SHL2(q[16], 3) ^ ROL2(q[16], 4) ^ ROL2(q[16], 37)) + + ((precalcf[1] + ROTL64(msg[1], 2) + ROTL64(msg[4], 5)) ^ hash[8]); + + uint64_t add1 = devectorize(q[2] + q[4] + q[6] + q[8] + q[10] + q[12] + q[14]); + uint64_t add2 = devectorize(q[3] + q[5] + q[7] + q[9] + q[11] + q[13] + q[15]); + + uint2 XL64 = q[16] ^ q[17]; + + q[18] = vectorize(CONST_EXP3d(2) + add1 + devectorize((precalcf[2] + ROTL64(msg[2], 3) + ROTL64(msg[5], 6)) ^ hash[9])); + q[19] = vectorize(CONST_EXP3d(3) + add2 + devectorize((precalcf[3] + ROTL64(msg[3], 4) + ROTL64(msg[6], 7)) ^ hash[10])); + + add1 += devectorize(q[16] - q[2]); + add2 += devectorize(q[17] - q[3]); + + XL64 = xor3x(XL64, q[18], q[19]); + + q[20] = vectorize(CONST_EXP3d(4) + add1 + devectorize((precalcf[4] + ROTL64(msg[4], 5) + ROTL64(msg[7], 8)) ^ hash[11])); + q[21] = vectorize(CONST_EXP3d(5) + add2 + devectorize((precalcf[5] + ROTL64(msg[5], 6)) ^ hash[5 + 7])); + + add1 += devectorize(q[18] - q[4]); + add2 += devectorize(q[19] - q[5]); + + XL64 = xor3x(XL64, q[20], q[21]); + + q[22] = vectorize(CONST_EXP3d(6) + add1 + devectorize((vectorize((22)*(0x0555555555555555ull)) + ROTL64(msg[6], 7) - ROTL64(msg[0], 1)) ^ hash[13])); + q[23] = vectorize(CONST_EXP3d(7) + add2 + devectorize((vectorize((23)*(0x0555555555555555ull)) + ROTL64(msg[7], 8) - ROTL64(msg[1], 2)) ^ hash[14])); + + add1 += devectorize(q[20] - q[6]); + add2 += devectorize(q[21] - q[7]); + + XL64 = xor3x(XL64, q[22], q[23]); + + q[24] = vectorize(CONST_EXP3d(8) + add1 + devectorize((vectorize((24)*(0x0555555555555555ull) + 0x10000) - ROTL64(msg[2], 3)) ^ hash[15])); + q[25] = vectorize(CONST_EXP3d(9) + add2 + devectorize((vectorize((25)*(0x0555555555555555ull)) - ROTL64(msg[3], 4)) ^ hash[0])); + + add1 += devectorize(q[22] - q[8]); + add2 += devectorize(q[23] - q[9]); + + uint2 XH64 = xor3x(XL64, q[24], q[25]); + + q[26] = vectorize(CONST_EXP3d(10) + add1 + devectorize((vectorize((26)*(0x0555555555555555ull)) - ROTL64(msg[4], 5)) ^ hash[1])); + q[27] = vectorize(CONST_EXP3d(11) + add2 + devectorize((vectorize((27)*(0x0555555555555555ull)) - ROTL64(msg[5], 6)) ^ hash[2])); + + add1 += devectorize(q[24] - q[10]); + add2 += devectorize(q[25] - q[11]); + + XH64 = xor3x(XH64, q[26], q[27]); + + q[28] = vectorize(CONST_EXP3d(12) + add1 + devectorize((vectorize(0x955555555755554C) - 
ROTL64(msg[6], 7)) ^ hash[3])); + q[29] = vectorize(CONST_EXP3d(13) + add2 + devectorize((precalcf[6] + ROTL64(msg[0], 1) - ROTL64(msg[7], 8)) ^ hash[4])); + + add1 += devectorize(q[26] - q[12]); + add2 += devectorize(q[27] - q[13]); + + XH64 = xor3x(XH64, q[28], q[29]); + + q[30] = vectorize(CONST_EXP3d(14) + add1 + devectorize((precalcf[7] + ROTL64(msg[1], 2)) ^ hash[5])); + q[31] = vectorize(CONST_EXP3d(15) + add2 + devectorize((precalcf[8] + ROTL64(msg[2], 3)) ^ hash[6])); + + XH64 = xor3x(XH64, q[30], q[31]); + + h[0] = (SHL2(XH64, 5) ^ SHR2(q[16], 5) ^ vectorize(msg[0])) + (XL64 ^ q[24] ^ q[0]); + h[1] = (SHR2(XH64, 7) ^ SHL8(q[17]) ^ vectorize(msg[1])) + (XL64 ^ q[25] ^ q[1]); + h[2] = (SHR2(XH64, 5) ^ SHL2(q[18], 5) ^ vectorize(msg[2])) + (XL64 ^ q[26] ^ q[2]); + h[3] = (SHR2(XH64, 1) ^ SHL2(q[19], 5) ^ vectorize(msg[3])) + (XL64 ^ q[27] ^ q[3]); + h[4] = (SHR2(XH64, 3) ^ q[20] ^ vectorize(msg[4])) + (XL64 ^ q[28] ^ q[4]); + h[5] = (SHL2(XH64, 6) ^ SHR2(q[21], 6) ^ vectorize(msg[5])) + (XL64 ^ q[29] ^ q[5]); + h[6] = (SHR2(XH64, 4) ^ SHL2(q[22], 6) ^ vectorize(msg[6])) + (XL64 ^ q[30] ^ q[6]); + h[7] = (SHR2(XH64, 11) ^ SHL2(q[23], 2) ^ vectorize(msg[7])) + (XL64 ^ q[31] ^ q[7]); + + h[8] = (ROL2(h[4], 9)) + (XH64 ^ q[24] ^ 0x80) + (SHL8(XL64) ^ q[23] ^ q[8]); + h[9] = (ROL2(h[5], 10)) + (XH64 ^ q[25]) + (SHR2(XL64, 6) ^ q[16] ^ q[9]); + h[10] = (ROL2(h[6], 11)) + (XH64 ^ q[26]) + (SHL2(XL64, 6) ^ q[17] ^ q[10]); + h[11] = (ROL2(h[7], 12)) + (XH64 ^ q[27]) + (SHL2(XL64, 4) ^ q[18] ^ q[11]); + h[12] = (ROL2(h[0], 13)) + (XH64 ^ q[28]) + (SHR2(XL64, 3) ^ q[19] ^ q[12]); + h[13] = (ROL2(h[1], 14)) + (XH64 ^ q[29]) + (SHR2(XL64, 4) ^ q[20] ^ q[13]); + h[14] = (ROL2(h[2], 15)) + (XH64 ^ q[30]) + (SHR2(XL64, 7) ^ q[21] ^ q[14]); + h[15] = (ROL16(h[3])) + (XH64 ^ q[31] ^ 512) + (SHR2(XL64, 2) ^ q[22] ^ q[15]); +} + +__global__ __launch_bounds__(256, 2) +void quark_bmw512_gpu_hash_64(uint32_t threads, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads){ + + const uint32_t hashPosition = (g_nonceVector == NULL) ? 
thread : g_nonceVector[thread]; + + uint64_t *inpHash = &g_hash[8 * hashPosition]; + + uint64_t __align__(16) msg[16]; + uint2 __align__(16) h[16]; + + uint2x4* phash = (uint2x4*)inpHash; + uint2x4* outpt = (uint2x4*)msg; + outpt[0] = __ldg4(&phash[0]); + outpt[1] = __ldg4(&phash[1]); + + uint2 q[32]; + + bmw512_round1(q, h, msg); + + const uint2 __align__(16) cmsg[16] = { + 0xaaaaaaa0, 0xaaaaaaaa, 0xaaaaaaa1, 0xaaaaaaaa, 0xaaaaaaa2, 0xaaaaaaaa, 0xaaaaaaa3, 0xaaaaaaaa, + 0xaaaaaaa4, 0xaaaaaaaa, 0xaaaaaaa5, 0xaaaaaaaa, 0xaaaaaaa6, 0xaaaaaaaa, 0xaaaaaaa7, 0xaaaaaaaa, + 0xaaaaaaa8, 0xaaaaaaaa, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaab, 0xaaaaaaaa, + 0xaaaaaaac, 0xaaaaaaaa, 0xaaaaaaad, 0xaaaaaaaa, 0xaaaaaaae, 0xaaaaaaaa, 0xaaaaaaaf, 0xaaaaaaaa + }; + +#pragma unroll 16 + for (int i = 0; i < 16; i++) + msg[i] = devectorize(cmsg[i] ^ h[i]); + + const uint2 __align__(16) precalc[16] = { + { 0x55555550, 0x55555555 }, { 0xAAAAAAA5, 0x5AAAAAAA }, { 0xFFFFFFFA, 0x5FFFFFFF }, { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, { 0xFFFFFFF9, 0x6FFFFFFF }, { 0x5555554E, 0x75555555 }, { 0xAAAAAAA3, 0x7AAAAAAA }, + { 0xFFFFFFF8, 0x7FFFFFFF }, { 0x5555554D, 0x85555555 }, { 0xAAAAAAA2, 0x8AAAAAAA }, { 0xFFFFFFF7, 0x8FFFFFFF }, + { 0x5555554C, 0x95555555 }, { 0xAAAAAAA1, 0x9AAAAAAA }, { 0xFFFFFFF6, 0x9FFFFFFF }, { 0x5555554B, 0xA5555555 } + }; + + const uint64_t p2 = msg[15] - msg[12]; + const uint64_t p3 = msg[14] - msg[7]; + const uint64_t p4 = msg[6] + msg[9]; + const uint64_t p5 = msg[8] - msg[5]; + const uint64_t p6 = msg[1] - msg[14]; + const uint64_t p7 = msg[8] - msg[1]; + const uint64_t p8 = msg[3] + msg[10]; + + + uint2 tmp = vectorize((msg[5]) + (msg[10]) + (msg[13]) + p3); + q[0] = (SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ ROL2(tmp, 4) ^ ROL2(tmp, 37)) + cmsg[1]; + + tmp = vectorize((msg[6]) - (msg[8]) + (msg[11]) + (msg[14]) - (msg[15])); + q[1] = (SHR2(tmp, 1) ^ SHL2(tmp, 2) ^ ROL2(tmp, 13) ^ ROL2(tmp, 43)) + cmsg[2]; + + tmp = vectorize((msg[0]) + (msg[7]) + (msg[9]) + p2); + q[2] = (SHR2(tmp, 2) ^ SHL2(tmp, 1) ^ ROL2(tmp, 19) ^ ROL2(tmp, 53)) + cmsg[3]; + + tmp = vectorize((msg[0]) + p7 - (msg[10]) + (msg[13])); + q[3] = (SHR2(tmp, 2) ^ SHL2(tmp, 2) ^ ROL2(tmp, 28) ^ ROL2(tmp, 59)) + cmsg[4]; + + tmp = vectorize((msg[2]) + (msg[9]) - (msg[11]) + p6); + q[4] = (SHR2(tmp, 1) ^ tmp) + cmsg[5]; + + tmp = vectorize(p8 + p2 - (msg[2])); + q[5] = (SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ ROL2(tmp, 4) ^ ROL2(tmp, 37)) + cmsg[6]; + + tmp = vectorize((msg[4]) - (msg[0]) - (msg[3]) - (msg[11]) + (msg[13])); + q[6] = (SHR2(tmp, 1) ^ SHL2(tmp, 2) ^ ROL2(tmp, 13) ^ ROL2(tmp, 43)) + cmsg[7]; + + tmp = vectorize(p6 - (msg[4]) - (msg[5]) - (msg[12])); + q[7] = (SHR2(tmp, 2) ^ SHL2(tmp, 1) ^ ROL2(tmp, 19) ^ ROL2(tmp, 53)) + cmsg[8]; + + tmp = vectorize((msg[2]) - (msg[5]) - (msg[6]) + (msg[13]) - (msg[15])); + q[8] = (SHR2(tmp, 2) ^ SHL2(tmp, 2) ^ ROL2(tmp, 28) ^ ROL2(tmp, 59)) + cmsg[9]; + + tmp = vectorize((msg[0]) - (msg[3]) + (msg[6]) + p3); + q[9] = (SHR2(tmp, 1) ^ tmp) + cmsg[10]; + + tmp = vectorize(p7 - (msg[4]) - (msg[7]) + (msg[15])); + q[10] = (SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ ROL2(tmp, 4) ^ ROL2(tmp, 37)) + cmsg[11]; + + tmp = vectorize(p5 - (msg[0]) - (msg[2]) + (msg[9])); + q[11] = (SHR2(tmp, 1) ^ SHL2(tmp, 2) ^ ROL2(tmp, 13) ^ ROL2(tmp, 43)) + cmsg[12]; + + tmp = vectorize(p8 + msg[1] - p4); + q[12] = (SHR2(tmp, 2) ^ SHL2(tmp, 1) ^ ROL2(tmp, 19) ^ ROL2(tmp, 53)) + cmsg[13]; + + tmp = vectorize((msg[2]) + (msg[4]) + (msg[7]) + (msg[10]) + (msg[11])); + q[13] = (SHR2(tmp, 2) ^ SHL2(tmp, 2) ^ 
ROL2(tmp, 28) ^ ROL2(tmp, 59)) + cmsg[14]; + + tmp = vectorize((msg[3]) + p5 - (msg[11]) - (msg[12])); + q[14] = (SHR2(tmp, 1) ^ tmp) + cmsg[15]; + + tmp = vectorize((msg[12]) - (msg[4]) - p4 + (msg[13])); + q[15] = (SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ ROL2(tmp, 4) ^ ROL2(tmp, 37)) + cmsg[0]; + + q[16] = + vectorize(devectorize(SHR2(q[0], 1) ^ SHL2(q[0], 2) ^ ROL2(q[0], 13) ^ ROL2(q[0], 43)) + devectorize(SHR2(q[1], 2) ^ SHL2(q[1], 1) ^ ROL2(q[1], 19) ^ ROL2(q[1], 53)) + + devectorize(SHR2(q[2], 2) ^ SHL2(q[2], 2) ^ ROL2(q[2], 28) ^ ROL2(q[2], 59)) + devectorize(SHR2(q[3], 1) ^ SHL2(q[3], 3) ^ ROL2(q[3], 4) ^ ROL2(q[3], 37)) + + devectorize(SHR2(q[4], 1) ^ SHL2(q[4], 2) ^ ROL2(q[4], 13) ^ ROL2(q[4], 43)) + devectorize(SHR2(q[5], 2) ^ SHL2(q[5], 1) ^ ROL2(q[5], 19) ^ ROL2(q[5], 53)) + + devectorize(SHR2(q[6], 2) ^ SHL2(q[6], 2) ^ ROL2(q[6], 28) ^ ROL2(q[6], 59)) + devectorize(SHR2(q[7], 1) ^ SHL2(q[7], 3) ^ ROL2(q[7], 4) ^ ROL2(q[7], 37)) + + devectorize(SHR2(q[8], 1) ^ SHL2(q[8], 2) ^ ROL2(q[8], 13) ^ ROL2(q[8], 43)) + devectorize(SHR2(q[9], 2) ^ SHL2(q[9], 1) ^ ROL2(q[9], 19) ^ ROL2(q[9], 53)) + + devectorize(SHR2(q[10], 2) ^ SHL2(q[10], 2) ^ ROL2(q[10], 28) ^ ROL2(q[10], 59)) + devectorize(SHR2(q[11], 1) ^ SHL2(q[11], 3) ^ ROL2(q[11], 4) ^ ROL2(q[11], 37)) + + devectorize(SHR2(q[12], 1) ^ SHL2(q[12], 2) ^ ROL2(q[12], 13) ^ ROL2(q[12], 43)) + devectorize(SHR2(q[13], 2) ^ SHL2(q[13], 1) ^ ROL2(q[13], 19) ^ ROL2(q[13], 53)) + + devectorize(SHR2(q[14], 2) ^ SHL2(q[14], 2) ^ ROL2(q[14], 28) ^ ROL2(q[14], 59)) + devectorize(SHR2(q[15], 1) ^ SHL2(q[15], 3) ^ ROL2(q[15], 4) ^ ROL2(q[15], 37)) + + devectorize((precalc[0] + ROL2(h[0], 1) + ROL2(h[3], 4) - ROL2(h[10], 11)) ^ cmsg[7])); + q[17] = + vectorize(devectorize(SHR2(q[1], 1) ^ SHL2(q[1], 2) ^ ROL2(q[1], 13) ^ ROL2(q[1], 43)) + devectorize(SHR2(q[2], 2) ^ SHL2(q[2], 1) ^ ROL2(q[2], 19) ^ ROL2(q[2], 53)) + + devectorize(SHR2(q[3], 2) ^ SHL2(q[3], 2) ^ ROL2(q[3], 28) ^ ROL2(q[3], 59)) + devectorize(SHR2(q[4], 1) ^ SHL2(q[4], 3) ^ ROL2(q[4], 4) ^ ROL2(q[4], 37)) + + devectorize(SHR2(q[5], 1) ^ SHL2(q[5], 2) ^ ROL2(q[5], 13) ^ ROL2(q[5], 43)) + devectorize(SHR2(q[6], 2) ^ SHL2(q[6], 1) ^ ROL2(q[6], 19) ^ ROL2(q[6], 53)) + + devectorize(SHR2(q[7], 2) ^ SHL2(q[7], 2) ^ ROL2(q[7], 28) ^ ROL2(q[7], 59)) + devectorize(SHR2(q[8], 1) ^ SHL2(q[8], 3) ^ ROL2(q[8], 4) ^ ROL2(q[8], 37)) + + devectorize(SHR2(q[9], 1) ^ SHL2(q[9], 2) ^ ROL2(q[9], 13) ^ ROL2(q[9], 43)) + devectorize(SHR2(q[10], 2) ^ SHL2(q[10], 1) ^ ROL2(q[10], 19) ^ ROL2(q[10], 53)) + + devectorize(SHR2(q[11], 2) ^ SHL2(q[11], 2) ^ ROL2(q[11], 28) ^ ROL2(q[11], 59)) + devectorize(SHR2(q[12], 1) ^ SHL2(q[12], 3) ^ ROL2(q[12], 4) ^ ROL2(q[12], 37)) + + devectorize(SHR2(q[13], 1) ^ SHL2(q[13], 2) ^ ROL2(q[13], 13) ^ ROL2(q[13], 43)) + devectorize(SHR2(q[14], 2) ^ SHL2(q[14], 1) ^ ROL2(q[14], 19) ^ ROL2(q[14], 53)) + + devectorize(SHR2(q[15], 2) ^ SHL2(q[15], 2) ^ ROL2(q[15], 28) ^ ROL2(q[15], 59)) + devectorize(SHR2(q[16], 1) ^ SHL2(q[16], 3) ^ ROL2(q[16], 4) ^ ROL2(q[16], 37)) + + devectorize((precalc[1] + ROL2(h[1], 2) + ROL2(h[4], 5) - ROL2(h[11], 12)) ^ cmsg[8])); + + uint64_t add1 = devectorize(q[2] + q[4] + q[6] + q[8] + q[10] + q[12] + q[14]); + uint64_t add2 = devectorize(q[3] + q[5] + q[7] + q[9] + q[11] + q[13] + q[15]); + + uint2 XL64 = q[16] ^ q[17]; + + q[18] = vectorize(add1 + CONST_EXP3d(2) + devectorize((precalc[2] + ROL2(h[2], 3) + ROL2(h[5], 6) - ROL2(h[12], 13)) ^ cmsg[9])); + q[19] = vectorize(add2 + CONST_EXP3d(3) + devectorize((precalc[3] + ROL2(h[3], 4) + ROL2(h[6], 7) - 
ROL2(h[13], 14)) ^ cmsg[10])); + + add1 = add1 - devectorize(q[2] - q[16]); + add2 = add2 - devectorize(q[3] - q[17]); + + XL64 = xor3x(XL64, q[18], q[19]); + + q[20] = vectorize(add1 + CONST_EXP3d(4) + devectorize((precalc[4] + ROL2(h[4], 5) + ROL8(h[7]) - ROL2(h[14], 15)) ^ cmsg[11])); + q[21] = vectorize(add2 + CONST_EXP3d(5) + devectorize((precalc[5] + ROL2(h[5], 6) + ROL2(h[8], 9) - ROL16(h[15])) ^ cmsg[12])); + + add1 = add1 - devectorize(q[4] - q[18]); + add2 = add2 - devectorize(q[5] - q[19]); + + XL64 = xor3x(XL64, q[20], q[21]); -#include "cuda_bmw512_sm3.cuh" + q[22] = vectorize(add1 + CONST_EXP3d(6) + devectorize((precalc[6] + ROL2(h[6], 7) + ROL2(h[9], 10) - ROL2(h[0], 1)) ^ cmsg[13])); + q[23] = vectorize(add2 + CONST_EXP3d(7) + devectorize((precalc[7] + ROL8(h[7]) + ROL2(h[10], 11) - ROL2(h[1], 2)) ^ cmsg[14])); -#ifdef __INTELLISENSE__ -/* just for vstudio code colors */ -#define __CUDA_ARCH__ 500 + add1 -= devectorize(q[6] - q[20]); + add2 -= devectorize(q[7] - q[21]); + + XL64 = xor3x(XL64, q[22], q[23]); + + q[24] = vectorize(add1 + CONST_EXP3d(8) + devectorize((precalc[8] + ROL2(h[8], 9) + ROL2(h[11], 12) - ROL2(h[2], 3)) ^ cmsg[15])); + q[25] = vectorize(add2 + CONST_EXP3d(9) + devectorize((precalc[9] + ROL2(h[9], 10) + ROL2(h[12], 13) - ROL2(h[3], 4)) ^ cmsg[0])); + + add1 -= devectorize(q[8] - q[22]); + add2 -= devectorize(q[9] - q[23]); + + uint2 XH64 = xor3x(XL64, q[24], q[25]); + + q[26] = vectorize(add1 + CONST_EXP3d(10) + devectorize((precalc[10] + ROL2(h[10], 11) + ROL2(h[13], 14) - ROL2(h[4], 5)) ^ cmsg[1])); + q[27] = vectorize(add2 + CONST_EXP3d(11) + devectorize((precalc[11] + ROL2(h[11], 12) + ROL2(h[14], 15) - ROL2(h[5], 6)) ^ cmsg[2])); + + add1 -= devectorize(q[10] - q[24]); + add2 -= devectorize(q[11] - q[25]); + + XH64 = xor3x(XH64, q[26], q[27]); + + q[28] = vectorize(add1 + CONST_EXP3d(12) + devectorize((precalc[12] + ROL2(h[12], 13) + ROL16(h[15]) - ROL2(h[6], 7)) ^ cmsg[3])); + q[29] = vectorize(add2 + CONST_EXP3d(13) + devectorize((precalc[13] + ROL2(h[13], 14) + ROL2(h[0], 1) - ROL8(h[7])) ^ cmsg[4])); + + add1 -= devectorize(q[12] - q[26]); + add2 -= devectorize(q[13] - q[27]); + + XH64 = xor3x(XH64, q[28], q[29]); + + q[30] = vectorize(add1 + CONST_EXP3d(14) + devectorize((precalc[14] + ROL2(h[14], 15) + ROL2(h[1], 2) - ROL2(h[8], 9)) ^ cmsg[5])); + q[31] = vectorize(add2 + CONST_EXP3d(15) + devectorize((precalc[15] + ROL16(h[15]) + ROL2(h[2], 3) - ROL2(h[9], 10)) ^ cmsg[6])); + + XH64 = xor3x(XH64, q[30], q[31]); + + msg[0] = devectorize((SHL2(XH64, 5) ^ SHR2(q[16], 5) ^ h[0]) + (XL64 ^ q[24] ^ q[0])); + msg[1] = devectorize((SHR2(XH64, 7) ^ SHL8(q[17]) ^ h[1]) + (XL64 ^ q[25] ^ q[1])); + msg[2] = devectorize((SHR2(XH64, 5) ^ SHL2(q[18], 5) ^ h[2]) + (XL64 ^ q[26] ^ q[2])); + msg[3] = devectorize((SHR2(XH64, 1) ^ SHL2(q[19], 5) ^ h[3]) + (XL64 ^ q[27] ^ q[3])); + msg[4] = devectorize((SHR2(XH64, 3) ^ q[20] ^ h[4]) + (XL64 ^ q[28] ^ q[4])); + msg[5] = devectorize((SHL2(XH64, 6) ^ SHR2(q[21], 6) ^ h[5]) + (XL64 ^ q[29] ^ q[5])); + msg[6] = devectorize((SHR2(XH64, 4) ^ SHL2(q[22], 6) ^ h[6]) + (XL64 ^ q[30] ^ q[6])); + msg[7] = devectorize((SHR2(XH64, 11) ^ SHL2(q[23], 2) ^ h[7]) + (XL64 ^ q[31] ^ q[7])); + msg[8] = devectorize((XH64 ^ q[24] ^ h[8]) + (SHL8(XL64) ^ q[23] ^ q[8]) + ROTL64(msg[4], 9)); + + msg[9] = devectorize((XH64 ^ q[25] ^ h[9]) + (SHR2(XL64, 6) ^ q[16] ^ q[9]) + ROTL64(msg[5], 10)); + msg[10] = devectorize((XH64 ^ q[26] ^ h[10]) + (SHL2(XL64, 6) ^ q[17] ^ q[10]) + ROTL64(msg[6], 11)); + msg[11] = devectorize((XH64 ^ q[27] ^ 
h[11]) + (SHL2(XL64, 4) ^ q[18] ^ q[11]) + ROTL64(msg[7], 12)); + +#if __CUDA_ARCH__ > 500 + * (uint2x4*)&inpHash[0] = *(uint2x4*)&msg[8]; #endif + msg[12] = devectorize((XH64 ^ q[28] ^ h[12]) + (SHR2(XL64, 3) ^ q[19] ^ q[12]) + ROTL64(msg[0], 13)); + msg[13] = devectorize((XH64 ^ q[29] ^ h[13]) + (SHR2(XL64, 4) ^ q[20] ^ q[13]) + ROTL64(msg[1], 14)); + msg[14] = devectorize((XH64 ^ q[30] ^ h[14]) + (SHR2(XL64, 7) ^ q[21] ^ q[14]) + ROTL64(msg[2], 15)); + msg[15] = devectorize((XH64 ^ q[31] ^ h[15]) + (SHR2(XL64, 2) ^ q[22] ^ q[15]) + ROTL64(msg[3], 16)); + +#if __CUDA_ARCH__ > 500 + * (uint2x4*)&inpHash[4] = *(uint2x4*)&msg[12]; +#else + *(uint2x4*)&inpHash[0] = *(uint2x4*)&msg[8]; + *(uint2x4*)&inpHash[4] = *(uint2x4*)&msg[12]; +#endif + } +} +__global__ __launch_bounds__(256, 2) +void quark_bmw512_gpu_hash_64_final(uint32_t threads, uint64_t *const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector, uint32_t* resNonce, const uint64_t target) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads){ + + const uint32_t hashPosition = (g_nonceVector == NULL) ? thread : g_nonceVector[thread]; + + uint64_t *inpHash = &g_hash[8 * hashPosition]; + + uint64_t __align__(16) msg[16]; + uint2 __align__(16) h[16]; + + uint2x4* phash = (uint2x4*)inpHash; + uint2x4* outpt = (uint2x4*)msg; + outpt[0] = __ldg4(&phash[0]); + outpt[1] = __ldg4(&phash[1]); + + uint2 q[32]; + + bmw512_round1(q, h, msg); + + const uint2 __align__(16) cmsg[16] = { + 0xaaaaaaa0, 0xaaaaaaaa, 0xaaaaaaa1, 0xaaaaaaaa, 0xaaaaaaa2, 0xaaaaaaaa, 0xaaaaaaa3, 0xaaaaaaaa, + 0xaaaaaaa4, 0xaaaaaaaa, 0xaaaaaaa5, 0xaaaaaaaa, 0xaaaaaaa6, 0xaaaaaaaa, 0xaaaaaaa7, 0xaaaaaaaa, + 0xaaaaaaa8, 0xaaaaaaaa, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaab, 0xaaaaaaaa, + 0xaaaaaaac, 0xaaaaaaaa, 0xaaaaaaad, 0xaaaaaaaa, 0xaaaaaaae, 0xaaaaaaaa, 0xaaaaaaaf, 0xaaaaaaaa + }; + +#pragma unroll 16 + for (int i = 0; i < 16; i++) + msg[i] = devectorize(cmsg[i] ^ h[i]); + + const uint2 __align__(16) precalc[16] = { + { 0x55555550, 0x55555555 }, { 0xAAAAAAA5, 0x5AAAAAAA }, { 0xFFFFFFFA, 0x5FFFFFFF }, { 0x5555554F, 0x65555555 }, + { 0xAAAAAAA4, 0x6AAAAAAA }, { 0xFFFFFFF9, 0x6FFFFFFF }, { 0x5555554E, 0x75555555 }, { 0xAAAAAAA3, 0x7AAAAAAA }, + { 0xFFFFFFF8, 0x7FFFFFFF }, { 0x5555554D, 0x85555555 }, { 0xAAAAAAA2, 0x8AAAAAAA }, { 0xFFFFFFF7, 0x8FFFFFFF }, + { 0x5555554C, 0x95555555 }, { 0xAAAAAAA1, 0x9AAAAAAA }, { 0xFFFFFFF6, 0x9FFFFFFF }, { 0x5555554B, 0xA5555555 } + }; + + const uint64_t p2 = msg[15] - msg[12]; + const uint64_t p3 = msg[14] - msg[7]; + const uint64_t p4 = msg[6] + msg[9]; + const uint64_t p5 = msg[8] - msg[5]; + const uint64_t p6 = msg[1] - msg[14]; + const uint64_t p7 = msg[8] - msg[1]; + const uint64_t p8 = msg[3] + msg[10]; + + + uint2 tmp = vectorize((msg[5]) + (msg[10]) + (msg[13]) + p3); + q[0] = (SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ ROL2(tmp, 4) ^ ROL2(tmp, 37)) + cmsg[1]; + + tmp = vectorize((msg[6]) - (msg[8]) + (msg[11]) + (msg[14]) - (msg[15])); + q[1] = (SHR2(tmp, 1) ^ SHL2(tmp, 2) ^ ROL2(tmp, 13) ^ ROL2(tmp, 43)) + cmsg[2]; + + tmp = vectorize((msg[0]) + (msg[7]) + (msg[9]) + p2); + q[2] = (SHR2(tmp, 2) ^ SHL2(tmp, 1) ^ ROL2(tmp, 19) ^ ROL2(tmp, 53)) + cmsg[3]; + + tmp = vectorize((msg[0]) + p7 - (msg[10]) + (msg[13])); + q[3] = (SHR2(tmp, 2) ^ SHL2(tmp, 2) ^ ROL2(tmp, 28) ^ ROL2(tmp, 59)) + cmsg[4]; + + tmp = vectorize((msg[2]) + (msg[9]) - (msg[11]) + p6); + q[4] = (SHR2(tmp, 1) ^ tmp) + cmsg[5]; + + tmp = vectorize(p8 + p2 - (msg[2])); + q[5] = (SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ 
ROL2(tmp, 4) ^ ROL2(tmp, 37)) + cmsg[6]; + + tmp = vectorize((msg[4]) - (msg[0]) - (msg[3]) - (msg[11]) + (msg[13])); + q[6] = (SHR2(tmp, 1) ^ SHL2(tmp, 2) ^ ROL2(tmp, 13) ^ ROL2(tmp, 43)) + cmsg[7]; + + tmp = vectorize(p6 - (msg[4]) - (msg[5]) - (msg[12])); + q[7] = (SHR2(tmp, 2) ^ SHL2(tmp, 1) ^ ROL2(tmp, 19) ^ ROL2(tmp, 53)) + cmsg[8]; + + tmp = vectorize((msg[2]) - (msg[5]) - (msg[6]) + (msg[13]) - (msg[15])); + q[8] = (SHR2(tmp, 2) ^ SHL2(tmp, 2) ^ ROL2(tmp, 28) ^ ROL2(tmp, 59)) + cmsg[9]; + + tmp = vectorize((msg[0]) - (msg[3]) + (msg[6]) + p3); + q[9] = (SHR2(tmp, 1) ^ tmp) + cmsg[10]; + + tmp = vectorize(p7 - (msg[4]) - (msg[7]) + (msg[15])); + q[10] = (SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ ROL2(tmp, 4) ^ ROL2(tmp, 37)) + cmsg[11]; + + tmp = vectorize(p5 - (msg[0]) - (msg[2]) + (msg[9])); + q[11] = (SHR2(tmp, 1) ^ SHL2(tmp, 2) ^ ROL2(tmp, 13) ^ ROL2(tmp, 43)) + cmsg[12]; + + tmp = vectorize(p8 + msg[1] - p4); + q[12] = (SHR2(tmp, 2) ^ SHL2(tmp, 1) ^ ROL2(tmp, 19) ^ ROL2(tmp, 53)) + cmsg[13]; + + tmp = vectorize((msg[2]) + (msg[4]) + (msg[7]) + (msg[10]) + (msg[11])); + q[13] = (SHR2(tmp, 2) ^ SHL2(tmp, 2) ^ ROL2(tmp, 28) ^ ROL2(tmp, 59)) + cmsg[14]; + + tmp = vectorize((msg[3]) + p5 - (msg[11]) - (msg[12])); + q[14] = (SHR2(tmp, 1) ^ tmp) + cmsg[15]; + + tmp = vectorize((msg[12]) - (msg[4]) - p4 + (msg[13])); + q[15] = (SHR2(tmp, 1) ^ SHL2(tmp, 3) ^ ROL2(tmp, 4) ^ ROL2(tmp, 37)) + cmsg[0]; + + q[16] = + vectorize(devectorize(SHR2(q[0], 1) ^ SHL2(q[0], 2) ^ ROL2(q[0], 13) ^ ROL2(q[0], 43)) + devectorize(SHR2(q[1], 2) ^ SHL2(q[1], 1) ^ ROL2(q[1], 19) ^ ROL2(q[1], 53)) + + devectorize(SHR2(q[2], 2) ^ SHL2(q[2], 2) ^ ROL2(q[2], 28) ^ ROL2(q[2], 59)) + devectorize(SHR2(q[3], 1) ^ SHL2(q[3], 3) ^ ROL2(q[3], 4) ^ ROL2(q[3], 37)) + + devectorize(SHR2(q[4], 1) ^ SHL2(q[4], 2) ^ ROL2(q[4], 13) ^ ROL2(q[4], 43)) + devectorize(SHR2(q[5], 2) ^ SHL2(q[5], 1) ^ ROL2(q[5], 19) ^ ROL2(q[5], 53)) + + devectorize(SHR2(q[6], 2) ^ SHL2(q[6], 2) ^ ROL2(q[6], 28) ^ ROL2(q[6], 59)) + devectorize(SHR2(q[7], 1) ^ SHL2(q[7], 3) ^ ROL2(q[7], 4) ^ ROL2(q[7], 37)) + + devectorize(SHR2(q[8], 1) ^ SHL2(q[8], 2) ^ ROL2(q[8], 13) ^ ROL2(q[8], 43)) + devectorize(SHR2(q[9], 2) ^ SHL2(q[9], 1) ^ ROL2(q[9], 19) ^ ROL2(q[9], 53)) + + devectorize(SHR2(q[10], 2) ^ SHL2(q[10], 2) ^ ROL2(q[10], 28) ^ ROL2(q[10], 59)) + devectorize(SHR2(q[11], 1) ^ SHL2(q[11], 3) ^ ROL2(q[11], 4) ^ ROL2(q[11], 37)) + + devectorize(SHR2(q[12], 1) ^ SHL2(q[12], 2) ^ ROL2(q[12], 13) ^ ROL2(q[12], 43)) + devectorize(SHR2(q[13], 2) ^ SHL2(q[13], 1) ^ ROL2(q[13], 19) ^ ROL2(q[13], 53)) + + devectorize(SHR2(q[14], 2) ^ SHL2(q[14], 2) ^ ROL2(q[14], 28) ^ ROL2(q[14], 59)) + devectorize(SHR2(q[15], 1) ^ SHL2(q[15], 3) ^ ROL2(q[15], 4) ^ ROL2(q[15], 37)) + + devectorize((precalc[0] + ROL2(h[0], 1) + ROL2(h[3], 4) - ROL2(h[10], 11)) ^ cmsg[7])); + q[17] = + vectorize(devectorize(SHR2(q[1], 1) ^ SHL2(q[1], 2) ^ ROL2(q[1], 13) ^ ROL2(q[1], 43)) + devectorize(SHR2(q[2], 2) ^ SHL2(q[2], 1) ^ ROL2(q[2], 19) ^ ROL2(q[2], 53)) + + devectorize(SHR2(q[3], 2) ^ SHL2(q[3], 2) ^ ROL2(q[3], 28) ^ ROL2(q[3], 59)) + devectorize(SHR2(q[4], 1) ^ SHL2(q[4], 3) ^ ROL2(q[4], 4) ^ ROL2(q[4], 37)) + + devectorize(SHR2(q[5], 1) ^ SHL2(q[5], 2) ^ ROL2(q[5], 13) ^ ROL2(q[5], 43)) + devectorize(SHR2(q[6], 2) ^ SHL2(q[6], 1) ^ ROL2(q[6], 19) ^ ROL2(q[6], 53)) + + devectorize(SHR2(q[7], 2) ^ SHL2(q[7], 2) ^ ROL2(q[7], 28) ^ ROL2(q[7], 59)) + devectorize(SHR2(q[8], 1) ^ SHL2(q[8], 3) ^ ROL2(q[8], 4) ^ ROL2(q[8], 37)) + + devectorize(SHR2(q[9], 1) ^ SHL2(q[9], 2) ^ ROL2(q[9], 13) ^ 
ROL2(q[9], 43)) + devectorize(SHR2(q[10], 2) ^ SHL2(q[10], 1) ^ ROL2(q[10], 19) ^ ROL2(q[10], 53)) + + devectorize(SHR2(q[11], 2) ^ SHL2(q[11], 2) ^ ROL2(q[11], 28) ^ ROL2(q[11], 59)) + devectorize(SHR2(q[12], 1) ^ SHL2(q[12], 3) ^ ROL2(q[12], 4) ^ ROL2(q[12], 37)) + + devectorize(SHR2(q[13], 1) ^ SHL2(q[13], 2) ^ ROL2(q[13], 13) ^ ROL2(q[13], 43)) + devectorize(SHR2(q[14], 2) ^ SHL2(q[14], 1) ^ ROL2(q[14], 19) ^ ROL2(q[14], 53)) + + devectorize(SHR2(q[15], 2) ^ SHL2(q[15], 2) ^ ROL2(q[15], 28) ^ ROL2(q[15], 59)) + devectorize(SHR2(q[16], 1) ^ SHL2(q[16], 3) ^ ROL2(q[16], 4) ^ ROL2(q[16], 37)) + + devectorize((precalc[1] + ROL2(h[1], 2) + ROL2(h[4], 5) - ROL2(h[11], 12)) ^ cmsg[8])); + + uint64_t add1 = devectorize(q[2] + q[4] + q[6] + q[8] + q[10] + q[12] + q[14]); + uint64_t add2 = devectorize(q[3] + q[5] + q[7] + q[9] + q[11] + q[13] + q[15]); + + uint2 XL64 = q[16] ^ q[17]; + + q[18] = vectorize(add1 + CONST_EXP3d(2) + devectorize((precalc[2] + ROL2(h[2], 3) + ROL2(h[5], 6) - ROL2(h[12], 13)) ^ cmsg[9])); + q[19] = vectorize(add2 + CONST_EXP3d(3) + devectorize((precalc[3] + ROL2(h[3], 4) + ROL2(h[6], 7) - ROL2(h[13], 14)) ^ cmsg[10])); + + add1 = add1 - devectorize(q[2] - q[16]); + add2 = add2 - devectorize(q[3] - q[17]); + + XL64 = xor3x(XL64, q[18], q[19]); + + q[20] = vectorize(add1 + CONST_EXP3d(4) + devectorize((precalc[4] + ROL2(h[4], 5) + ROL8(h[7]) - ROL2(h[14], 15)) ^ cmsg[11])); + q[21] = vectorize(add2 + CONST_EXP3d(5) + devectorize((precalc[5] + ROL2(h[5], 6) + ROL2(h[8], 9) - ROL16(h[15])) ^ cmsg[12])); + + add1 = add1 - devectorize(q[4] - q[18]); + add2 = add2 - devectorize(q[5] - q[19]); + + XL64 = xor3x(XL64, q[20], q[21]); + + q[22] = vectorize(add1 + CONST_EXP3d(6) + devectorize((precalc[6] + ROL2(h[6], 7) + ROL2(h[9], 10) - ROL2(h[0], 1)) ^ cmsg[13])); + q[23] = vectorize(add2 + CONST_EXP3d(7) + devectorize((precalc[7] + ROL8(h[7]) + ROL2(h[10], 11) - ROL2(h[1], 2)) ^ cmsg[14])); + + add1 -= devectorize(q[6] - q[20]); + add2 -= devectorize(q[7] - q[21]); + + XL64 = xor3x(XL64, q[22], q[23]); + + q[24] = vectorize(add1 + CONST_EXP3d(8) + devectorize((precalc[8] + ROL2(h[8], 9) + ROL2(h[11], 12) - ROL2(h[2], 3)) ^ cmsg[15])); + q[25] = vectorize(add2 + CONST_EXP3d(9) + devectorize((precalc[9] + ROL2(h[9], 10) + ROL2(h[12], 13) - ROL2(h[3], 4)) ^ cmsg[0])); + + add1 -= devectorize(q[8] - q[22]); + add2 -= devectorize(q[9] - q[23]); + + uint2 XH64 = xor3x(XL64, q[24], q[25]); + + q[26] = vectorize(add1 + CONST_EXP3d(10) + devectorize((precalc[10] + ROL2(h[10], 11) + ROL2(h[13], 14) - ROL2(h[4], 5)) ^ cmsg[1])); + q[27] = vectorize(add2 + CONST_EXP3d(11) + devectorize((precalc[11] + ROL2(h[11], 12) + ROL2(h[14], 15) - ROL2(h[5], 6)) ^ cmsg[2])); + + add1 -= devectorize(q[10] - q[24]); + add2 -= devectorize(q[11] - q[25]); + + XH64 = xor3x(XH64, q[26], q[27]); + + q[28] = vectorize(add1 + CONST_EXP3d(12) + devectorize((precalc[12] + ROL2(h[12], 13) + ROL16(h[15]) - ROL2(h[6], 7)) ^ cmsg[3])); + q[29] = vectorize(add2 + CONST_EXP3d(13) + devectorize((precalc[13] + ROL2(h[13], 14) + ROL2(h[0], 1) - ROL8(h[7])) ^ cmsg[4])); + + add1 -= devectorize(q[12] - q[26]); + add2 -= devectorize(q[13] - q[27]); + + XH64 = xor3x(XH64, q[28], q[29]); + + q[30] = vectorize(add1 + CONST_EXP3d(14) + devectorize((precalc[14] + ROL2(h[14], 15) + ROL2(h[1], 2) - ROL2(h[8], 9)) ^ cmsg[5])); + q[31] = vectorize(add2 + CONST_EXP3d(15) + devectorize((precalc[15] + ROL16(h[15]) + ROL2(h[2], 3) - ROL2(h[9], 10)) ^ cmsg[6])); + + XH64 = xor3x(XH64, q[30], q[31]); + + // msg[0] = 
devectorize((SHL2(XH64, 5) ^ SHR2(q[16], 5) ^ h[0]) + (XL64 ^ q[24] ^ q[0])); + // msg[1] = devectorize((SHR2(XH64, 7) ^ SHL8(q[17]) ^ h[1]) + (XL64 ^ q[25] ^ q[1])); + // msg[2] = devectorize((SHR2(XH64, 5) ^ SHL2(q[18], 5) ^ h[2]) + (XL64 ^ q[26] ^ q[2])); + // msg[3] = devectorize((SHR2(XH64, 1) ^ SHL2(q[19], 5) ^ h[3]) + (XL64 ^ q[27] ^ q[3])); + // msg[4] = devectorize((SHR2(XH64, 3) ^ q[20] ^ h[4]) + (XL64 ^ q[28] ^ q[4])); + // msg[5] = devectorize((SHL2(XH64, 6) ^ SHR2(q[21], 6) ^ h[5]) + (XL64 ^ q[29] ^ q[5])); + // msg[6] = devectorize((SHR2(XH64, 4) ^ SHL2(q[22], 6) ^ h[6]) + (XL64 ^ q[30] ^ q[6])); + msg[7] = devectorize((SHR2(XH64, 11) ^ SHL2(q[23], 2) ^ h[7]) + (XL64 ^ q[31] ^ q[7])); + // msg[8] = devectorize((XH64 ^ q[24] ^ h[8]) + (SHL8(XL64) ^ q[23] ^ q[8]) + ROTL64(msg[4], 9)); + + // msg[9] = devectorize((XH64 ^ q[25] ^ h[9]) + (SHR2(XL64, 6) ^ q[16] ^ q[9]) + ROTL64(msg[5], 10)); + // msg[10] = devectorize((XH64 ^ q[26] ^ h[10]) + (SHL2(XL64, 6) ^ q[17] ^ q[10]) + ROTL64(msg[6], 11)); + msg[11] = devectorize((XH64 ^ q[27] ^ h[11]) + (SHL2(XL64, 4) ^ q[18] ^ q[11]) + ROTL64(msg[7], 12)); + + //#if __CUDA_ARCH__ > 500 + // * (uint2x4*)&inpHash[0] = *(uint2x4*)&msg[8]; + //#endif + + // msg[12] = devectorize((XH64 ^ q[28] ^ h[12]) + (SHR2(XL64, 3) ^ q[19] ^ q[12]) + ROTL64(msg[0], 13)); + // msg[13] = devectorize((XH64 ^ q[29] ^ h[13]) + (SHR2(XL64, 4) ^ q[20] ^ q[13]) + ROTL64(msg[1], 14)); + // msg[14] = devectorize((XH64 ^ q[30] ^ h[14]) + (SHR2(XL64, 7) ^ q[21] ^ q[14]) + ROTL64(msg[2], 15)); + // msg[15] = devectorize((XH64 ^ q[31] ^ h[15]) + (SHR2(XL64, 2) ^ q[22] ^ q[15]) + ROTL64(msg[3], 16)); + + if (msg[11] <= target) + { + uint32_t tmp = atomicExch(&resNonce[0], thread); + if (tmp != UINT32_MAX) + resNonce[1] = tmp; + } + /*#if __CUDA_ARCH__ > 500 + * (uint2x4*)&inpHash[4] = *(uint2x4*)&msg[12]; + #else + *(uint2x4*)&inpHash[0] = *(uint2x4*)&msg[8]; + *(uint2x4*)&inpHash[4] = *(uint2x4*)&msg[12]; + #endif + */ + } +} + + + +__host__ +void quark_bmw512_cpu_init(int thr_id, uint32_t threads) +{ + cuda_get_arch(thr_id); +} + + + +__host__ void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +{ + const uint32_t threadsperblock = 256; + + // compute how many thread blocks we need + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + quark_bmw512_gpu_hash_64 <<<grid, block>>>(threads, (uint64_t*)d_hash, d_nonceVector); +} + + +__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) + + #undef SHL #undef SHR #undef CONST_EXP2 @@ -22,6 +660,7 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi #define SHL(x, n) SHL2(x, n) #define ROL(x, n) ROL2(x, n) + #define CONST_EXP2(i) \ q[i+0] + ROL(q[i+1], 5) + q[i+2] + ROL(q[i+3], 11) + \ q[i+4] + ROL(q[i+5], 27) + q[i+6] + SWAPUINT2(q[i+7]) + \ @@ -29,45 +668,43 @@ __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + paddi q[i+12] + ROL(q[i+13], 53) + (SHR(q[i+14],1) ^ q[i+14]) + (SHR(q[i+15],2) ^ q[i+15]) __device__ -void Compression512_64_first(uint2 *msg, uint2 *hash) +void Compression512(uint2 *msg, uint2 *hash) { // Compression ref. 
implementation uint2 q[32]; uint2 tmp; - tmp = (msg[5] ^ hash[5]) - (msg[7] ^ hash[7]) + (hash[10]) + (hash[13]) + (hash[14]); + tmp = (msg[5] ^ hash[5]) - (msg[7] ^ hash[7]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + (msg[14] ^ hash[14]); q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[1]; - - tmp = (msg[6] ^ hash[6]) - (msg[8] ^ hash[8]) + (hash[11]) + (hash[14]) - (msg[15] ^ hash[15]); + tmp = (msg[6] ^ hash[6]) - (msg[8] ^ hash[8]) + (msg[11] ^ hash[11]) + (msg[14] ^ hash[14]) - (msg[15] ^ hash[15]); q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[2]; - tmp = (msg[0] ^ hash[0]) + (msg[7] ^ hash[7]) + (hash[9]) - (hash[12]) + (msg[15] ^ hash[15]); + tmp = (msg[0] ^ hash[0]) + (msg[7] ^ hash[7]) + (msg[9] ^ hash[9]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[3]; - tmp = (msg[0] ^ hash[0]) - (msg[1] ^ hash[1]) + (msg[8] ^ hash[8]) - (hash[10]) + (hash[13]); + tmp = (msg[0] ^ hash[0]) - (msg[1] ^ hash[1]) + (msg[8] ^ hash[8]) - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]); q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[4]; - tmp = (msg[1] ^ hash[1]) + (msg[2] ^ hash[2]) + (hash[9]) - (hash[11]) - (hash[14]); + tmp = (msg[1] ^ hash[1]) + (msg[2] ^ hash[2]) + (msg[9] ^ hash[9]) - (msg[11] ^ hash[11]) - (msg[14] ^ hash[14]); q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; - tmp = (msg[3] ^ hash[3]) - (msg[2] ^ hash[2]) + (hash[10]) - (hash[12]) + (msg[15] ^ hash[15]); + tmp = (msg[3] ^ hash[3]) - (msg[2] ^ hash[2]) + (msg[10] ^ hash[10]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[6]; - tmp = (msg[4] ^ hash[4]) - (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) - (hash[11]) + (hash[13]); + tmp = (msg[4] ^ hash[4]) - (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) - (msg[11] ^ hash[11]) + (msg[13] ^ hash[13]); q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[7]; - tmp = (msg[1] ^ hash[1]) - (msg[4] ^ hash[4]) - (msg[5] ^ hash[5]) - (hash[12]) - (hash[14]); + tmp = (msg[1] ^ hash[1]) - (msg[4] ^ hash[4]) - (msg[5] ^ hash[5]) - (msg[12] ^ hash[12]) - (msg[14] ^ hash[14]); q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[8]; - - tmp = (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) - (msg[6] ^ hash[6]) + (hash[13]) - (msg[15] ^ hash[15]); + tmp = (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) - (msg[6] ^ hash[6]) + (msg[13] ^ hash[13]) - (msg[15] ^ hash[15]); q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[9]; - tmp = (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) + (msg[6] ^ hash[6]) - (msg[7] ^ hash[7]) + (hash[14]); + tmp = (msg[0] ^ hash[0]) - (msg[3] ^ hash[3]) + (msg[6] ^ hash[6]) - (msg[7] ^ hash[7]) + (msg[14] ^ hash[14]); q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; tmp = (msg[8] ^ hash[8]) - (msg[1] ^ hash[1]) - (msg[4] ^ hash[4]) - (msg[7] ^ hash[7]) + (msg[15] ^ hash[15]); q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[11]; - tmp = (msg[8] ^ hash[8]) - (msg[0] ^ hash[0]) - (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) + (hash[9]); + tmp = (msg[8] ^ hash[8]) - (msg[0] ^ hash[0]) - (msg[2] ^ hash[2]) - (msg[5] ^ hash[5]) + (msg[9] ^ hash[9]); q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[12]; - tmp = (msg[1] ^ hash[1]) + (msg[3] ^ hash[3]) - (msg[6] ^ hash[6]) - (hash[9]) + (hash[10]); + tmp = (msg[1] ^ hash[1]) + (msg[3] ^ hash[3]) - (msg[6] ^ hash[6]) - (msg[9] ^ hash[9]) + (msg[10] ^ hash[10]); q[12] = 
(SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[13]; - tmp = (msg[2] ^ hash[2]) + (msg[4] ^ hash[4]) + (msg[7] ^ hash[7]) + (hash[10]) + (hash[11]); + tmp = (msg[2] ^ hash[2]) + (msg[4] ^ hash[4]) + (msg[7] ^ hash[7]) + (msg[10] ^ hash[10]) + (msg[11] ^ hash[11]); q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[14]; - tmp = (msg[3] ^ hash[3]) - (msg[5] ^ hash[5]) + (msg[8] ^ hash[8]) - (hash[11]) - (hash[12]); + tmp = (msg[3] ^ hash[3]) - (msg[5] ^ hash[5]) + (msg[8] ^ hash[8]) - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]); q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; - tmp = (msg[12] ^ hash[12]) - (msg[4] ^ hash[4]) - (msg[6] ^ hash[6]) - (hash[9]) + (hash[13]); + tmp = (msg[12] ^ hash[12]) - (msg[4] ^ hash[4]) - (msg[6] ^ hash[6]) - (msg[9] ^ hash[9]) + (msg[13] ^ hash[13]); q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[0]; q[0 + 16] = @@ -87,148 +724,8 @@ void Compression512_64_first(uint2 *msg, uint2 *hash) (SHR(q[0 + 13], 2) ^ SHL(q[0 + 13], 1) ^ ROL(q[0 + 13], 19) ^ ROL(q[0 + 13], 53)) + (SHR(q[0 + 14], 2) ^ SHL(q[0 + 14], 2) ^ ROL(q[0 + 14], 28) ^ ROL(q[0 + 14], 59)) + (SHR(q[0 + 15], 1) ^ SHL(q[0 + 15], 3) ^ ROL(q[0 + 15], 4) ^ ROL(q[0 + 15], 37)) + - ((make_uint2(0x55555550ul,0x55555555) + ROL(msg[0], 0 + 1) + - ROL(msg[0 + 3], 0 + 4)) ^ hash[0 + 7]); - - q[1 + 16] = - (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROL(q[1], 13) ^ ROL(q[1], 43)) + - (SHR(q[1 + 1], 2) ^ SHL(q[1 + 1], 1) ^ ROL(q[1 + 1], 19) ^ ROL(q[1 + 1], 53)) + - (SHR(q[1 + 2], 2) ^ SHL(q[1 + 2], 2) ^ ROL(q[1 + 2], 28) ^ ROL(q[1 + 2], 59)) + - (SHR(q[1 + 3], 1) ^ SHL(q[1 + 3], 3) ^ ROL(q[1 + 3], 4) ^ ROL(q[1 + 3], 37)) + - (SHR(q[1 + 4], 1) ^ SHL(q[1 + 4], 2) ^ ROL(q[1 + 4], 13) ^ ROL(q[1 + 4], 43)) + - (SHR(q[1 + 5], 2) ^ SHL(q[1 + 5], 1) ^ ROL(q[1 + 5], 19) ^ ROL(q[1 + 5], 53)) + - (SHR(q[1 + 6], 2) ^ SHL(q[1 + 6], 2) ^ ROL(q[1 + 6], 28) ^ ROL(q[1 + 6], 59)) + - (SHR(q[1 + 7], 1) ^ SHL(q[1 + 7], 3) ^ ROL(q[1 + 7], 4) ^ ROL(q[1 + 7], 37)) + - (SHR(q[1 + 8], 1) ^ SHL(q[1 + 8], 2) ^ ROL(q[1 + 8], 13) ^ ROL(q[1 + 8], 43)) + - (SHR(q[1 + 9], 2) ^ SHL(q[1 + 9], 1) ^ ROL(q[1 + 9], 19) ^ ROL(q[1 + 9], 53)) + - (SHR(q[1 + 10], 2) ^ SHL(q[1 + 10], 2) ^ ROL(q[1 + 10], 28) ^ ROL(q[1 + 10], 59)) + - (SHR(q[1 + 11], 1) ^ SHL(q[1 + 11], 3) ^ ROL(q[1 + 11], 4) ^ ROL(q[1 + 11], 37)) + - (SHR(q[1 + 12], 1) ^ SHL(q[1 + 12], 2) ^ ROL(q[1 + 12], 13) ^ ROL(q[1 + 12], 43)) + - (SHR(q[1 + 13], 2) ^ SHL(q[1 + 13], 1) ^ ROL(q[1 + 13], 19) ^ ROL(q[1 + 13], 53)) + - (SHR(q[1 + 14], 2) ^ SHL(q[1 + 14], 2) ^ ROL(q[1 + 14], 28) ^ ROL(q[1 + 14], 59)) + - (SHR(q[1 + 15], 1) ^ SHL(q[1 + 15], 3) ^ ROL(q[1 + 15], 4) ^ ROL(q[1 + 15], 37)) + - ((make_uint2(0xAAAAAAA5, 0x5AAAAAAA) + ROL(msg[1], 1 + 1) + - ROL(msg[1 + 3], 1 + 4)) ^ hash[1 + 7]); - - q[2 + 16] = CONST_EXP2(2) + - ((make_uint2(0xFFFFFFFA, 0x5FFFFFFF) + ROL(msg[2], 2 + 1) + - ROL(msg[2 + 3], 2 + 4) - ROL(msg[2 + 10], 2 + 11)) ^ hash[2 + 7]); - q[3 + 16] = CONST_EXP2(3) + - ((make_uint2(0x5555554F, 0x65555555) + ROL(msg[3], 3 + 1) + - ROL(msg[3 + 3], 3 + 4) - ROL(msg[3 + 10], 3 + 11)) ^ hash[3 + 7]); - q[4 + 16] = CONST_EXP2(4) + - ((make_uint2(0xAAAAAAA4, 0x6AAAAAAA) +ROL(msg[4], 4 + 1) + - ROL(msg[4 + 3], 4 + 4) - ROL(msg[4 + 10], 4 + 11)) ^ hash[4 + 7]); - q[5 + 16] = CONST_EXP2(5) + - ((make_uint2(0xFFFFFFF9, 0x6FFFFFFF) + ROL(msg[5], 5 + 1) + - ROL(msg[5 + 3], 5 + 4) - ROL(msg[5 + 10], 5 + 11)) ^ hash[5 + 7]); - - #pragma unroll 3 - for (int i = 6; i<9; i++) { - q[i + 16] = CONST_EXP2(i) + - ((vectorize((i + 16)*(0x0555555555555555ull)) 
+ ROL(msg[i], i + 1) - - ROL(msg[i - 6], (i - 6) + 1)) ^ hash[i + 7]); - } - - #pragma unroll 4 - for (int i = 9; i<13; i++) { - q[i + 16] = CONST_EXP2(i) + - ((vectorize((i + 16)*(0x0555555555555555ull)) + - ROL(msg[i + 3], i + 4) - ROL(msg[i - 6], (i - 6) + 1)) ^ hash[i - 9]); - } - - q[13 + 16] = CONST_EXP2(13) + - ((make_uint2(0xAAAAAAA1, 0x9AAAAAAA) + ROL(msg[13], 13 + 1) + - ROL(msg[13 - 13], (13 - 13) + 1) - ROL(msg[13 - 6], (13 - 6) + 1)) ^ hash[13 - 9]); - q[14 + 16] = CONST_EXP2(14) + - ((make_uint2(0xFFFFFFF6, 0x9FFFFFFF) + ROL(msg[14], 14 + 1) + - ROL(msg[14 - 13], (14 - 13) + 1) - ROL(msg[14 - 6], (14 - 6) + 1)) ^ hash[14 - 9]); - q[15 + 16] = CONST_EXP2(15) + - ((make_uint2(0x5555554B, 0xA5555555) + ROL(msg[15], 15 + 1) + - ROL(msg[15 - 13], (15 - 13) + 1) - ROL(msg[15 - 6], (15 - 6) + 1)) ^ hash[15 - 9]); - - - uint2 XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; - uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; - - hash[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ msg[0]) + (XL64 ^ q[24] ^ q[0]); - hash[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ msg[1]) + (XL64 ^ q[25] ^ q[1]); - hash[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ msg[2]) + (XL64 ^ q[26] ^ q[2]); - hash[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ msg[3]) + (XL64 ^ q[27] ^ q[3]); - hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[4]) + (XL64 ^ q[28] ^ q[4]); - hash[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ msg[5]) + (XL64 ^ q[29] ^ q[5]); - hash[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ msg[6]) + (XL64 ^ q[30] ^ q[6]); - hash[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ msg[7]) + (XL64 ^ q[31] ^ q[7]); - - hash[8] = ROL(hash[4], 9) + (XH64 ^ q[24] ^ msg[8]) + (SHL(XL64, 8) ^ q[23] ^ q[8]); - hash[9] = ROL(hash[5], 10) + (XH64 ^ q[25]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); - hash[10] = ROL(hash[6], 11) + (XH64 ^ q[26]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); - hash[11] = ROL(hash[7], 12) + (XH64 ^ q[27]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); - hash[12] = ROL(hash[0], 13) + (XH64 ^ q[28]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); - hash[13] = ROL(hash[1], 14) + (XH64 ^ q[29]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); - hash[14] = ROL(hash[2], 15) + (XH64 ^ q[30]) + (SHR(XL64, 7) ^ q[21] ^ q[14]); - hash[15] = ROL(hash[3], 16) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); -} - -__device__ -void Compression512(uint2 *msg, uint2 *hash) -{ - // Compression ref. 
implementation - uint2 q[32]; - uint2 tmp; - - tmp = (msg[ 5] ^ hash[ 5]) - (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]) + (msg[14] ^ hash[14]); - q[0] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[1]; - tmp = (msg[ 6] ^ hash[ 6]) - (msg[ 8] ^ hash[ 8]) + (msg[11] ^ hash[11]) + (msg[14] ^ hash[14]) - (msg[15] ^ hash[15]); - q[1] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[2]; - tmp = (msg[ 0] ^ hash[ 0]) + (msg[ 7] ^ hash[ 7]) + (msg[ 9] ^ hash[ 9]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); - q[2] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[3]; - tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 1] ^ hash[ 1]) + (msg[ 8] ^ hash[ 8]) - (msg[10] ^ hash[10]) + (msg[13] ^ hash[13]); - q[3] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[4]; - tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 2] ^ hash[ 2]) + (msg[ 9] ^ hash[ 9]) - (msg[11] ^ hash[11]) - (msg[14] ^ hash[14]); - q[4] = (SHR(tmp, 1) ^ tmp) + hash[5]; - tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 2] ^ hash[ 2]) + (msg[10] ^ hash[10]) - (msg[12] ^ hash[12]) + (msg[15] ^ hash[15]); - q[5] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[6]; - tmp = (msg[ 4] ^ hash[ 4]) - (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) - (msg[11] ^ hash[11]) + (msg[13] ^ hash[13]); - q[6] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[7]; - tmp = (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 5] ^ hash[ 5]) - (msg[12] ^ hash[12]) - (msg[14] ^ hash[14]); - q[7] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[8]; - tmp = (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) - (msg[ 6] ^ hash[ 6]) + (msg[13] ^ hash[13]) - (msg[15] ^ hash[15]); - q[8] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[9]; - tmp = (msg[ 0] ^ hash[ 0]) - (msg[ 3] ^ hash[ 3]) + (msg[ 6] ^ hash[ 6]) - (msg[ 7] ^ hash[ 7]) + (msg[14] ^ hash[14]); - q[9] = (SHR(tmp, 1) ^ tmp) + hash[10]; - tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 1] ^ hash[ 1]) - (msg[ 4] ^ hash[ 4]) - (msg[ 7] ^ hash[ 7]) + (msg[15] ^ hash[15]); - q[10] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[11]; - tmp = (msg[ 8] ^ hash[ 8]) - (msg[ 0] ^ hash[ 0]) - (msg[ 2] ^ hash[ 2]) - (msg[ 5] ^ hash[ 5]) + (msg[ 9] ^ hash[ 9]); - q[11] = (SHR(tmp, 1) ^ SHL(tmp, 2) ^ ROL(tmp, 13) ^ ROL(tmp, 43)) + hash[12]; - tmp = (msg[ 1] ^ hash[ 1]) + (msg[ 3] ^ hash[ 3]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[10] ^ hash[10]); - q[12] = (SHR(tmp, 2) ^ SHL(tmp, 1) ^ ROL(tmp, 19) ^ ROL(tmp, 53)) + hash[13]; - tmp = (msg[ 2] ^ hash[ 2]) + (msg[ 4] ^ hash[ 4]) + (msg[ 7] ^ hash[ 7]) + (msg[10] ^ hash[10]) + (msg[11] ^ hash[11]); - q[13] = (SHR(tmp, 2) ^ SHL(tmp, 2) ^ ROL(tmp, 28) ^ ROL(tmp, 59)) + hash[14]; - tmp = (msg[ 3] ^ hash[ 3]) - (msg[ 5] ^ hash[ 5]) + (msg[ 8] ^ hash[ 8]) - (msg[11] ^ hash[11]) - (msg[12] ^ hash[12]); - q[14] = (SHR(tmp, 1) ^ tmp) + hash[15]; - tmp = (msg[12] ^ hash[12]) - (msg[ 4] ^ hash[ 4]) - (msg[ 6] ^ hash[ 6]) - (msg[ 9] ^ hash[ 9]) + (msg[13] ^ hash[13]); - q[15] = (SHR(tmp, 1) ^ SHL(tmp, 3) ^ ROL(tmp, 4) ^ ROL(tmp, 37)) + hash[0]; - - q[0+16] = - (SHR(q[0], 1) ^ SHL(q[0], 2) ^ ROL(q[0], 13) ^ ROL(q[0], 43)) + - (SHR(q[0+1], 2) ^ SHL(q[0+1], 1) ^ ROL(q[0+1], 19) ^ ROL(q[0+1], 53)) + - (SHR(q[0+2], 2) ^ SHL(q[0+2], 2) ^ ROL(q[0+2], 28) ^ ROL(q[0+2], 59)) + - (SHR(q[0+3], 1) ^ SHL(q[0+3], 3) ^ ROL(q[0+3], 4) ^ ROL(q[0+3], 37)) + - (SHR(q[0+4], 1) ^ SHL(q[0+4], 2) ^ ROL(q[0+4], 13) ^ ROL(q[0+4], 43)) + - (SHR(q[0+5], 2) ^ SHL(q[0+5], 1) ^ 
ROL(q[0+5], 19) ^ ROL(q[0+5], 53)) + - (SHR(q[0+6], 2) ^ SHL(q[0+6], 2) ^ ROL(q[0+6], 28) ^ ROL(q[0+6], 59)) + - (SHR(q[0+7], 1) ^ SHL(q[0+7], 3) ^ ROL(q[0+7], 4) ^ ROL(q[0+7], 37)) + - (SHR(q[0+8], 1) ^ SHL(q[0+8], 2) ^ ROL(q[0+8], 13) ^ ROL(q[0+8], 43)) + - (SHR(q[0+9], 2) ^ SHL(q[0+9], 1) ^ ROL(q[0+9], 19) ^ ROL(q[0+9], 53)) + - (SHR(q[0+10], 2) ^ SHL(q[0+10], 2) ^ ROL(q[0+10], 28) ^ ROL(q[0+10], 59)) + - (SHR(q[0+11], 1) ^ SHL(q[0+11], 3) ^ ROL(q[0+11], 4) ^ ROL(q[0+11], 37)) + - (SHR(q[0+12], 1) ^ SHL(q[0+12], 2) ^ ROL(q[0+12], 13) ^ ROL(q[0+12], 43)) + - (SHR(q[0+13], 2) ^ SHL(q[0+13], 1) ^ ROL(q[0+13], 19) ^ ROL(q[0+13], 53)) + - (SHR(q[0+14], 2) ^ SHL(q[0+14], 2) ^ ROL(q[0+14], 28) ^ ROL(q[0+14], 59)) + - (SHR(q[0+15], 1) ^ SHL(q[0+15], 3) ^ ROL(q[0+15], 4) ^ ROL(q[0+15], 37)) + ((make_uint2(0x55555550ul, 0x55555555) + ROL(msg[0], 0 + 1) + - ROL(msg[0+3], 0+4) - ROL(msg[0+10], 0+11) ) ^ hash[0+7]); + ROL(msg[0 + 3], 0 + 4) - ROL(msg[0 + 10], 0 + 11)) ^ hash[0 + 7]); q[1 + 16] = (SHR(q[1], 1) ^ SHL(q[1], 2) ^ ROL(q[1], 13) ^ ROL(q[1], 43)) + @@ -252,7 +749,7 @@ void Compression512(uint2 *msg, uint2 *hash) q[2 + 16] = CONST_EXP2(2) + ((make_uint2(0xFFFFFFFA, 0x5FFFFFFF) + ROL(msg[2], 2 + 1) + - ROL(msg[2+3], 2+4) - ROL(msg[2+10], 2+11) ) ^ hash[2+7]); + ROL(msg[2 + 3], 2 + 4) - ROL(msg[2 + 10], 2 + 11)) ^ hash[2 + 7]); q[3 + 16] = CONST_EXP2(3) + ((make_uint2(0x5555554F, 0x65555555) + ROL(msg[3], 3 + 1) + ROL(msg[3 + 3], 3 + 4) - ROL(msg[3 + 10], 3 + 11)) ^ hash[3 + 7]); @@ -263,7 +760,7 @@ void Compression512(uint2 *msg, uint2 *hash) ((make_uint2(0xFFFFFFF9, 0x6FFFFFFF) + ROL(msg[5], 5 + 1) + ROL(msg[5 + 3], 5 + 4) - ROL(msg[5 + 10], 5 + 11)) ^ hash[5 + 7]); q[6 + 16] = CONST_EXP2(6) + - ((make_uint2(0x5555554E, 0x75555555)+ ROL(msg[6], 6 + 1) + + ((make_uint2(0x5555554E, 0x75555555) + ROL(msg[6], 6 + 1) + ROL(msg[6 + 3], 6 + 4) - ROL(msg[6 - 6], (6 - 6) + 1)) ^ hash[6 + 7]); q[7 + 16] = CONST_EXP2(7) + ((make_uint2(0xAAAAAAA3, 0x7AAAAAAA) + ROL(msg[7], 7 + 1) + @@ -293,43 +790,37 @@ void Compression512(uint2 *msg, uint2 *hash) ((make_uint2(0x5555554B, 0xA5555555) + ROL(msg[15], 15 + 1) + ROL(msg[15 - 13], (15 - 13) + 1) - ROL(msg[15 - 6], (15 - 6) + 1)) ^ hash[15 - 9]); - uint2 XL64 = q[16]^q[17]^q[18]^q[19]^q[20]^q[21]^q[22]^q[23]; + uint2 XL64 = q[16] ^ q[17] ^ q[18] ^ q[19] ^ q[20] ^ q[21] ^ q[22] ^ q[23]; uint2 XH64 = XL64^q[24] ^ q[25] ^ q[26] ^ q[27] ^ q[28] ^ q[29] ^ q[30] ^ q[31]; - hash[0] = (SHL(XH64, 5) ^ SHR(q[16],5) ^ msg[ 0]) + (XL64 ^ q[24] ^ q[ 0]); - hash[1] = (SHR(XH64, 7) ^ SHL(q[17],8) ^ msg[ 1]) + (XL64 ^ q[25] ^ q[ 1]); - hash[2] = (SHR(XH64, 5) ^ SHL(q[18],5) ^ msg[ 2]) + (XL64 ^ q[26] ^ q[ 2]); - hash[3] = (SHR(XH64, 1) ^ SHL(q[19],5) ^ msg[ 3]) + (XL64 ^ q[27] ^ q[ 3]); - hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[ 4]) + (XL64 ^ q[28] ^ q[ 4]); - hash[5] = (SHL(XH64, 6) ^ SHR(q[21],6) ^ msg[ 5]) + (XL64 ^ q[29] ^ q[ 5]); - hash[6] = (SHR(XH64, 4) ^ SHL(q[22],6) ^ msg[ 6]) + (XL64 ^ q[30] ^ q[ 6]); - hash[7] = (SHR(XH64,11) ^ SHL(q[23],2) ^ msg[ 7]) + (XL64 ^ q[31] ^ q[ 7]); - - hash[ 8] = ROL(hash[4], 9) + (XH64 ^ q[24] ^ msg[ 8]) + (SHL(XL64,8) ^ q[23] ^ q[ 8]); - hash[ 9] = ROL(hash[5],10) + (XH64 ^ q[25] ^ msg[ 9]) + (SHR(XL64,6) ^ q[16] ^ q[ 9]); - hash[10] = ROL(hash[6],11) + (XH64 ^ q[26] ^ msg[10]) + (SHL(XL64,6) ^ q[17] ^ q[10]); - hash[11] = ROL(hash[7],12) + (XH64 ^ q[27] ^ msg[11]) + (SHL(XL64,4) ^ q[18] ^ q[11]); - hash[12] = ROL(hash[0],13) + (XH64 ^ q[28] ^ msg[12]) + (SHR(XL64,3) ^ q[19] ^ q[12]); - hash[13] = ROL(hash[1],14) + (XH64 ^ q[29] ^ 
msg[13]) + (SHR(XL64,4) ^ q[20] ^ q[13]); - hash[14] = ROL(hash[2],15) + (XH64 ^ q[30] ^ msg[14]) + (SHR(XL64,7) ^ q[21] ^ q[14]); - hash[15] = ROL(hash[3],16) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); + hash[0] = (SHL(XH64, 5) ^ SHR(q[16], 5) ^ msg[0]) + (XL64 ^ q[24] ^ q[0]); + hash[1] = (SHR(XH64, 7) ^ SHL(q[17], 8) ^ msg[1]) + (XL64 ^ q[25] ^ q[1]); + hash[2] = (SHR(XH64, 5) ^ SHL(q[18], 5) ^ msg[2]) + (XL64 ^ q[26] ^ q[2]); + hash[3] = (SHR(XH64, 1) ^ SHL(q[19], 5) ^ msg[3]) + (XL64 ^ q[27] ^ q[3]); + hash[4] = (SHR(XH64, 3) ^ q[20] ^ msg[4]) + (XL64 ^ q[28] ^ q[4]); + hash[5] = (SHL(XH64, 6) ^ SHR(q[21], 6) ^ msg[5]) + (XL64 ^ q[29] ^ q[5]); + hash[6] = (SHR(XH64, 4) ^ SHL(q[22], 6) ^ msg[6]) + (XL64 ^ q[30] ^ q[6]); + hash[7] = (SHR(XH64, 11) ^ SHL(q[23], 2) ^ msg[7]) + (XL64 ^ q[31] ^ q[7]); + + hash[8] = ROL(hash[4], 9) + (XH64 ^ q[24] ^ msg[8]) + (SHL(XL64, 8) ^ q[23] ^ q[8]); + hash[9] = ROL(hash[5], 10) + (XH64 ^ q[25] ^ msg[9]) + (SHR(XL64, 6) ^ q[16] ^ q[9]); + hash[10] = ROL(hash[6], 11) + (XH64 ^ q[26] ^ msg[10]) + (SHL(XL64, 6) ^ q[17] ^ q[10]); + hash[11] = ROL(hash[7], 12) + (XH64 ^ q[27] ^ msg[11]) + (SHL(XL64, 4) ^ q[18] ^ q[11]); + hash[12] = ROL(hash[0], 13) + (XH64 ^ q[28] ^ msg[12]) + (SHR(XL64, 3) ^ q[19] ^ q[12]); + hash[13] = ROL(hash[1], 14) + (XH64 ^ q[29] ^ msg[13]) + (SHR(XL64, 4) ^ q[20] ^ q[13]); + hash[14] = ROL(hash[2], 15) + (XH64 ^ q[30] ^ msg[14]) + (SHR(XL64, 7) ^ q[21] ^ q[14]); + hash[15] = ROL(hash[3], 16) + (XH64 ^ q[31] ^ msg[15]) + (SHR(XL64, 2) ^ q[22] ^ q[15]); } -__global__ -#if __CUDA_ARCH__ > 500 -__launch_bounds__(32, 16) -#else -__launch_bounds__(64, 8) -#endif -void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) + + +__global__ __launch_bounds__(256, 2) +void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - uint32_t hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[hashPosition * 8]; + uint32_t nounce = startNounce + thread; // Init uint2 h[16] = { @@ -350,46 +841,93 @@ void quark_bmw512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * { 0xF4F5F6F7UL, 0xF0F1F2F3UL }, { 0xFCFDFEFFUL, 0xF8F9FAFBUL } }; - // copy the message (careful: the message is 64 bytes, // BMW works on 128-byte blocks!!! uint2 message[16]; - #pragma unroll - for(int i=0;i<8;i++) - message[i] = vectorize(inpHash[i]); - - #pragma unroll 6 - for(int i=9;i<15;i++) - message[i] = make_uint2(0,0); +#pragma unroll 16 + for (int i = 0; i<16; i++) + message[i] = vectorize(c_PaddedMessage80[i]);
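/* Editorial sketch, not part of the original patch: the two BMW-512 padding
   layouts used in this file, spelled out as plain host-side C (the helper names
   are hypothetical). BMW-512 always compresses 128-byte blocks, so the byte
   right after the data is 0x80 and the last quadword holds the message length
   in bits (little-endian word layout assumed, as vectorize()/make_uint2 imply):

       #include <stdint.h>
       #include <string.h>

       // 64-byte input (the old hash_64 path being removed here): 512-bit length
       static void bmw512_pad64(uint64_t block[16], const uint64_t data[8])
       {
           memcpy(block, data, 64);     // words 0..7  = message
           memset(block + 8, 0, 64);    // words 8..15 = zero
           block[8]  = 0x80;            // padding byte directly after the data
           block[15] = 512;             // 64 bytes * 8 = 512 bits
       }

       // 80-byte input (quark_bmw512_cpu_setBlock_80 below): 640-bit length
       static void bmw512_pad80(uint64_t block[16], const void *data80)
       {
           memcpy(block, data80, 80);            // words 0..9 = message
           memset((uint8_t*)block + 80, 0, 48);  // zero the tail
           block[10] = 0x80;
           block[15] = 640;                      // 80 bytes * 8 = 640 bits
       }
*/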
- // insert the padding (byte order?!?) - message[8] = make_uint2(0x80,0); - // length (in bits), i.e. 64 bytes * 8 = 512 bits - message[15] = make_uint2(512,0); + // replace the nonce with the thread-specific one + message[9].y = cuda_swab32(nounce); //REPLACE_HIDWORD(message[9], cuda_swab32(nounce)); // Compression 1 - Compression512_64_first(message, h); + Compression512(message, h); +#pragma unroll 16 + for (int i = 0; i<16; i++) + message[i] = make_uint2(0xaaaaaaa0 + i, 0xaaaaaaaa); + - // Final - #pragma unroll - for(int i=0;i<16;i++) - { - message[i].y = 0xaaaaaaaa; - message[i].x = 0xaaaaaaa0ul + (uint32_t)i; - } Compression512(h, message); // done - uint64_t *outpHash = &g_hash[hashPosition * 8]; + uint64_t *outpHash = &g_hash[thread * 8]; - #pragma unroll - for(int i=0;i<8;i++) - outpHash[i] = devectorize(message[i+8]); +#pragma unroll 8 + for (int i = 0; i<8; i++) + outpHash[i] = devectorize(message[i + 8]); } } -__global__ __launch_bounds__(256, 2) -void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) + +__constant__ uint64_t BMW512_IV[] = { + (0x8081828384858687), (0x88898A8B8C8D8E8F), + (0x9091929394959697), (0x98999A9B9C9D9E9F), + (0xA0A1A2A3A4A5A6A7), (0xA8A9AAABACADAEAF), + (0xB0B1B2B3B4B5B6B7), (0xB8B9BABBBCBDBEBF), + (0xC0C1C2C3C4C5C6C7), (0xC8C9CACBCCCDCECF), + (0xD0D1D2D3D4D5D6D7), (0xD8D9DADBDCDDDEDF), + (0xE0E1E2E3E4E5E6E7), (0xE8E9EAEBECEDEEEF), + (0xF0F1F2F3F4F5F6F7), (0xF8F9FAFBFCFDFEFF) +}; + + + +__constant__ uint64_t BMW512_FINAL[16] = +{ + 0xAAAAAAAAAAAAAAA0UL, 0xAAAAAAAAAAAAAAA1UL, 0xAAAAAAAAAAAAAAA2UL, 0xAAAAAAAAAAAAAAA3UL, + 0xAAAAAAAAAAAAAAA4UL, 0xAAAAAAAAAAAAAAA5UL, 0xAAAAAAAAAAAAAAA6UL, 0xAAAAAAAAAAAAAAA7UL, + 0xAAAAAAAAAAAAAAA8UL, 0xAAAAAAAAAAAAAAA9UL, 0xAAAAAAAAAAAAAAAAUL, 0xAAAAAAAAAAAAAAABUL, + 0xAAAAAAAAAAAAAAACUL, 0xAAAAAAAAAAAAAAADUL, 0xAAAAAAAAAAAAAAAEUL, 0xAAAAAAAAAAAAAAAFUL +}; + + +__host__ +void quark_bmw512_cpu_setBlock_80(void *pdata) +{ + unsigned char PaddedMessage[128]; + memcpy(PaddedMessage, pdata, 80); + memset(PaddedMessage + 80, 0, 48); + uint64_t *message = (uint64_t*)PaddedMessage; + message[10] = SPH_C64(0x80); + message[15] = SPH_C64(640); + cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); +} + + + +__host__ +void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) +{ + const uint32_t threadsperblock = 256; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + quark_bmw512_gpu_hash_80 <<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash); +} + +__host__ void quark_bmw512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_hash, uint32_t *resNonce, const uint64_t target) +{ + const uint32_t threadsperblock = 256; + + // compute how many thread blocks we need + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + quark_bmw512_gpu_hash_64_final <<<grid, block>>>(threads, (uint64_t*)d_hash, d_nonceVector, resNonce, target); +} + +__global__ __launch_bounds__(128, 4) +void quark_bmw512_gpu_hash_80_final(uint32_t threads, uint32_t startNounce, uint32_t *resNonce, const uint64_t target) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -415,77 +953,32 @@ void quark_bmw512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t * { 0xF4F5F6F7UL, 0xF0F1F2F3UL }, { 0xFCFDFEFFUL, 0xF8F9FAFBUL } };
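/* Editorial sketch, not part of the original patch: the nonce-reporting scheme
   shared by the *_final kernels in this file. A thread whose hash word meets
   the target claims slot 0 of the two-slot result buffer atomically and
   demotes any earlier claimant to slot 1, so one launch can surface up to two
   candidates. This assumes the host memsets both slots to 0xff before the
   launch, so UINT32_MAX means "empty"; the helper name is hypothetical:

       __device__ void report_result(uint32_t *resNonce, uint32_t thread)
       {
           uint32_t prev = atomicExch(&resNonce[0], thread); // claim slot 0 atomically
           if (prev != UINT32_MAX)                           // an earlier thread had won,
               resNonce[1] = prev;                           // keep it as the 2nd candidate
       }
*/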
- // copy the message (careful: the message is 64 bytes, - // BMW works on 128-byte blocks!!! uint2 message[16]; -#pragma unroll 16 - for(int i=0;i<16;i++) - message[i] = vectorize(c_PaddedMessage80[i]); + #pragma unroll + for (int i=0;i<16;i++) message[i] = vectorize(c_PaddedMessage80[i]); - // replace the nonce with the thread-specific one message[9].y = cuda_swab32(nounce); //REPLACE_HIDWORD(message[9], cuda_swab32(nounce)); - // Compression 1 Compression512(message, h); -#pragma unroll 16 - for(int i=0;i<16;i++) - message[i] = make_uint2(0xaaaaaaa0+i,0xaaaaaaaa); - + #pragma unroll + for (int i=0;i<16;i++) message[i] = make_uint2(0xaaaaaaa0+i,0xaaaaaaaa); Compression512(h, message); - // done - uint64_t *outpHash = &g_hash[thread * 8]; - -#pragma unroll 8 - for(int i=0;i<8;i++) - outpHash[i] = devectorize(message[i+8]); + if(devectorize(message[3+8]) <= target){ + uint32_t tmp = atomicExch(&resNonce[0], thread); + if (tmp != UINT32_MAX) + resNonce[1] = tmp; + } } } __host__ -void quark_bmw512_cpu_setBlock_80(void *pdata) -{ - unsigned char PaddedMessage[128]; - memcpy(PaddedMessage, pdata, 80); - memset(PaddedMessage+80, 0, 48); - uint64_t *message = (uint64_t*)PaddedMessage; - message[10] = SPH_C64(0x80); - message[15] = SPH_C64(640); - cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); -} - -__host__ -void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) +void quark_bmw512_cpu_hash_80_final(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_resNonce, const uint64_t target) { const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - int dev_id = device_map[thr_id]; - - if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - quark_bmw512_gpu_hash_80<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash); - else - quark_bmw512_gpu_hash_80_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash); -} -__host__ -void quark_bmw512_cpu_init(int thr_id, uint32_t threads) -{ - cuda_get_arch(thr_id); -} -__host__ -void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) -{ - const uint32_t threadsperblock = 32; - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - int dev_id = device_map[thr_id]; - if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - quark_bmw512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - else - quark_bmw512_gpu_hash_64_30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + quark_bmw512_gpu_hash_80_final<<<grid, block>>>(threads, startNounce, d_resNonce, target); } diff --git a/quark/cuda_quark.h b/quark/cuda_quark.h index fbb0c1da43..9497b5565d 100644 --- a/quark/cuda_quark.h +++ b/quark/cuda_quark.h @@ -10,6 +10,8 @@ extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st extern void quark_bmw512_cpu_init(int thr_id, uint32_t threads); extern void quark_bmw512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void quark_bmw512_cpu_setBlock_80(void *pdata); +extern void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order); extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads); extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -20,7 +22,11 @@ extern void 
quark_skein512_cpu_init(int thr_id, uint32_t threads); extern void quark_skein512_cpu_hash_64(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); extern void quark_keccak512_cpu_init(int thr_id, uint32_t threads); -extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +//extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_hash); +extern void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_hash, uint64_t target, uint32_t *d_resNonce); + extern void quark_jh512_cpu_init(int thr_id, uint32_t threads); extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index 68a411a711..e89dd0e989 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -1,319 +1,806 @@ -#include <stdio.h> -#include <memory.h> -#include <sys/types.h> // off_t - +/* +Based upon Tanguy Pruvot's and SP's work +Provos Alexis - 2016 +SP - May 2018 +*/ #include "miner.h" -#include "cuda_helper.h" +#include "cuda_helper_alexis.h" +#include "cuda_vectors_alexis.h" + +#define TPB80 256 -#define ROTR(x,n) ROTR64(x,n) +#define TPB52_64 192 +#define TPB50_64 192 -// use sp kernel on SM 5+ -#define SP_KERNEL +__constant__ uint2 _ALIGN(16) c_m[16]; // padded message (80 bytes + padding) -#define USE_SHUFFLE 0 +__constant__ uint2 _ALIGN(16) c_v[16]; //state -__constant__ -static uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) +__constant__ uint2 _ALIGN(16) c_x[128]; //precomputed xors // ---------------------------- BEGIN CUDA quark_blake512 functions ------------------------------------ -__device__ __constant__ -static const uint8_t c_sigma_big[16][16] = { - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - - {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 }, - - { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } +__constant__ _ALIGN(16) uint2 z[16] = +{ + { 0x85a308d3, 0x243f6a88 }, { 0x03707344, 0x13198a2e }, { 0x299f31d0, 0xa4093822 }, { 0xec4e6c89, 0x082efa98 }, + { 0x38d01377, 0x452821e6 }, { 0x34e90c6c, 0xbe5466cf }, { 0xc97c50dd, 0xc0ac29b7 }, { 0xb5470917, 0x3f84d5b5 }, + { 0x8979fb1b, 0x9216d5d9 }, { 0x98dfb5ac, 0xd1310ba6 }, { 0xd01adfb7, 0x2ffd72db }, { 0x6a267e96, 0xb8e1afed }, + { 0xf12c7f99, 0xba7c9045 }, { 0xb3916cf7, 0x24a19947 }, { 0x858efc16, 0x0801f2e2 }, { 
0x71574e69, 0x636920d8 } }; -__device__ __constant__ -static const uint64_t c_u512[16] = -{ - 0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, - 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL, - 0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, - 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL, - 0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, - 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL, - 0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, - 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL +__constant__ const uint2 h[8] = { + { 0xf3bcc908UL, 0x6a09e667UL }, + { 0x84caa73bUL, 0xbb67ae85UL }, + { 0xfe94f82bUL, 0x3c6ef372UL }, + { 0x5f1d36f1UL, 0xa54ff53aUL }, + { 0xade682d1UL, 0x510e527fUL }, + { 0x2b3e6c1fUL, 0x9b05688cUL }, + { 0xfb41bd6bUL, 0x1f83d9abUL }, + { 0x137e2179UL, 0x5be0cd19UL } }; -#define G(a,b,c,d,x) { \ - uint32_t idx1 = sigma[i][x]; \ - uint32_t idx2 = sigma[i][x+1]; \ - v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \ - v[d] = SWAPDWORDS(v[d] ^ v[a]); \ +#define G4(x, a,b,c,d,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3) { \ + v[a] += (m[c_sigma[i][x]] ^ z[c_sigma[i][x+1]]) + v[b]; \ + v[a1] += (m[c_sigma[i][x+2]] ^ z[c_sigma[i][x+3]]) + v[b1]; \ + v[a2] += (m[c_sigma[i][x+4]] ^ z[c_sigma[i][x+5]]) + v[b2]; \ + v[a3] += (m[c_sigma[i][x+6]] ^ z[c_sigma[i][x+7]]) + v[b3]; \ + v[d] = xorswap32(v[d] , v[a]); \ + v[d1] = xorswap32(v[d1] , v[a1]); \ + v[d2] = xorswap32(v[d2] , v[a2]); \ + v[d3] = xorswap32(v[d3] , v[a3]); \ v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c], 25); \ - v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \ - v[d] = ROTR( v[d] ^ v[a], 16); \ + v[c1] += v[d1]; \ + v[c2] += v[d2]; \ + v[c3] += v[d3]; \ + v[b] = ROR2( v[b] ^ v[c], 25); \ + v[b1] = ROR2( v[b1] ^ v[c1], 25); \ + v[b2] = ROR2( v[b2] ^ v[c2], 25); \ + v[b3] = ROR2( v[b3] ^ v[c3], 25); \ + v[a] += (m[c_sigma[i][x+1]] ^ z[c_sigma[i][x]]) + v[b]; \ + v[a1] += (m[c_sigma[i][x+3]] ^ z[c_sigma[i][x+2]]) + v[b1]; \ + v[a2] += (m[c_sigma[i][x+5]] ^ z[c_sigma[i][x+4]]) + v[b2]; \ + v[a3] += (m[c_sigma[i][x+7]] ^ z[c_sigma[i][x+6]]) + v[b3]; \ + v[d] = ROR16( v[d] ^ v[a]); \ + v[d1] = ROR16( v[d1] ^ v[a1]); \ + v[d2] = ROR16( v[d2] ^ v[a2]); \ + v[d3] = ROR16( v[d3] ^ v[a3]); \ v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c], 11); \ + v[c1] += v[d1]; \ + v[c2] += v[d2]; \ + v[c3] += v[d3]; \ + v[b] = ROR2( v[b] ^ v[c], 11); \ + v[b1] = ROR2( v[b1] ^ v[c1], 11); \ + v[b2] = ROR2( v[b2] ^ v[c2], 11); \ + v[b3] = ROR2( v[b3] ^ v[c3], 11); \ } -__device__ __forceinline__ -void quark_blake512_compress(uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0) -{ - uint64_t v[16]; - uint64_t m[16]; - - #pragma unroll - for(int i=0; i < 16; i++) { - m[i] = cuda_swab64(block[i]); - } - - //#pragma unroll 8 - for(int i=0; i < 8; i++) - v[i] = h[i]; +#define GS4(a,b,c,d,e,f,a1,b1,c1,d1,e1,f1,a2,b2,c2,d2,e2,f2,a3,b3,c3,d3,e3,f3){\ + v[ a]+= (m[ e] ^ z[ f]) + v[ b]; v[a1]+= (m[e1] ^ z[f1]) + v[b1]; v[a2]+= (m[e2] ^ z[f2]) + v[b2]; v[a3]+= (m[e3] ^ z[f3]) + v[b3];\ + v[ d] = SWAPDWORDS2(v[ d] ^ v[ a]); v[d1] = SWAPDWORDS2(v[d1] ^ v[a1]); v[d2] = SWAPDWORDS2(v[d2] ^ v[a2]); v[d3] = SWAPDWORDS2(v[d3] ^ v[a3]);\ + v[ c]+= v[ d]; v[c1]+= v[d1]; v[c2]+= v[d2]; v[c3]+= v[d3];\ + v[ b] = ROR2(v[b] ^ v[c], 25); v[b1] = ROR2(v[b1] ^ v[c1], 25); v[b2] = ROR2(v[b2] ^ v[c2], 25); v[b3] = ROR2(v[b3] ^ v[c3], 25); \ + v[ a]+= (m[ f] ^ z[ e]) + v[ b]; v[a1]+= (m[f1] ^ z[e1]) + v[b1]; v[a2]+= (m[f2] ^ z[e2]) + v[b2]; v[a3]+= (m[f3] ^ z[e3]) + v[b3];\ + v[ d] = ROR16(v[d] ^ v[a]); v[d1] = ROR16(v[d1] ^ v[a1]); v[d2] = ROR16(v[d2] ^ v[a2]); v[d3] = 
ROR16(v[d3] ^ v[a3]);\ + v[ c]+= v[ d]; v[c1]+= v[d1]; v[c2]+= v[d2]; v[c3]+= v[d3];\ + v[ b] = ROR2(v[b] ^ v[c], 11); v[b1] = ROR2(v[b1] ^ v[c1], 11); v[b2] = ROR2(v[b2] ^ v[c2], 11); v[b3] = ROR2(v[b3] ^ v[c3], 11);\ +} - v[ 8] = u512[0]; - v[ 9] = u512[1]; - v[10] = u512[2]; - v[11] = u512[3]; - v[12] = u512[4] ^ T0; - v[13] = u512[5] ^ T0; - v[14] = u512[6]; - v[15] = u512[7]; - - //#pragma unroll 16 - for(int i=0; i < 16; i++) - { - /* column step */ - G( 0, 4, 8, 12, 0 ); - G( 1, 5, 9, 13, 2 ); - G( 2, 6, 10, 14, 4 ); - G( 3, 7, 11, 15, 6 ); - /* diagonal step */ - G( 0, 5, 10, 15, 8 ); - G( 1, 6, 11, 12, 10 ); - G( 2, 7, 8, 13, 12 ); - G( 3, 4, 9, 14, 14 ); - } +#define GSn4(a,b,c,d,e,f,a1,b1,c1,d1,e1,f1,a2,b2,c2,d2,e2,f2,a3,b3,c3,d3,e3,f3){\ + v[ a] = v[ a] + e + v[ b]; v[a1] = v[a1] + e1 + v[b1]; v[a2] = v[a2] + e2 + v[b2]; v[a3] = v[a3] + e3 + v[b3];\ + v[ d] = SWAPDWORDS2(v[ d] ^ v[ a]); v[d1] = SWAPDWORDS2(v[d1] ^ v[a1]); v[d2] = SWAPDWORDS2(v[d2] ^ v[a2]); v[d3] = SWAPDWORDS2(v[d3] ^ v[a3]);\ + v[ c] = v[ c] + v[ d]; v[c1] = v[c1] + v[d1]; v[c2] = v[c2] + v[d2]; v[c3] = v[c3] + v[d3];\ + v[ b] = ROR2(v[b] ^ v[c],25); v[b1] = ROR2(v[b1] ^ v[c1],25); v[b2] = ROR2(v[b2] ^ v[c2],25); v[b3] = ROR2(v[b3] ^ v[c3],25); \ + v[ a] = v[ a] + f + v[ b]; v[a1] = v[a1] + f1 + v[b1]; v[a2] = v[a2] + f2 + v[b2]; v[a3] = v[a3] + f3 + v[b3];\ + v[ d] = ROR16(v[d] ^ v[a]); v[d1] = ROR16(v[d1] ^ v[a1]); v[d2] = ROR16(v[d2] ^ v[a2]); v[d3] = ROR16(v[d3] ^ v[a3]);\ + v[ c] = v[ c] + v[ d]; v[c1] = v[c1] + v[d1]; v[c2] = v[c2] + v[d2]; v[c3] = v[c3] + v[d3];\ + v[ b] = ROR2(v[b] ^ v[c],11); v[b1] = ROR2(v[b1] ^ v[c1],11); v[b2] = ROR2(v[b2] ^ v[c2],11); v[b3] = ROR2(v[b3] ^ v[c3],11);\ +} - h[0] ^= v[0] ^ v[8]; - h[1] ^= v[1] ^ v[9]; - h[2] ^= v[2] ^ v[10]; - h[3] ^= v[3] ^ v[11]; - h[4] ^= v[4] ^ v[12]; - h[5] ^= v[5] ^ v[13]; - h[6] ^= v[6] ^ v[14]; - h[7] ^= v[7] ^ v[15]; +#define GShost(a,b,c,d,e,f) { \ + v[a] += (m[e] ^ z[f]) + v[b]; \ + v[d] = ROTR64(v[d] ^ v[a],32); \ + v[c] += v[d]; \ + v[b] = ROTR64( v[b] ^ v[c], 25); \ + v[a] += (m[f] ^ z[e]) + v[b]; \ + v[d] = ROTR64( v[d] ^ v[a], 16); \ + v[c] += v[d]; \ + v[b] = ROTR64( v[b] ^ v[c], 11); \ } -__global__ __launch_bounds__(256, 4) -void quark_blake512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash) +__global__ +__launch_bounds__(192, 2) +void quark_blake512_gpu_hash_64(uint32_t threads, const uint32_t *const __restrict__ g_nonceVector, uint2* g_hash) { -#if !defined(SP_KERNEL) || __CUDA_ARCH__ < 500 - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - -#if USE_SHUFFLE - const uint32_t warpBlockID = (thread + 15)>>4; // aufrunden auf volle Warp-Blöcke - - if (warpBlockID < ( (threads+15)>>4 )) -#else - if (thread < threads) -#endif - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - off_t hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8 - - // 128 Bytes - uint64_t buf[16]; - - // State - uint64_t h[8] = { - 0x6a09e667f3bcc908ULL, - 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, - 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, - 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, - 0x5be0cd19137e2179ULL + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads){ + const uint32_t hashPosition = (g_nonceVector == NULL) ? 
thread : g_nonceVector[thread]; + + uint2 msg[16]; + + uint2x4 *phash = (uint2x4*)&g_hash[hashPosition << 3]; + uint2x4 *outpt = (uint2x4*)msg; + outpt[0] = __ldg4(&phash[0]); + outpt[1] = __ldg4(&phash[1]); + + uint2 m[16]; + m[0] = cuda_swab64_U2(msg[0]); + m[1] = cuda_swab64_U2(msg[1]); + m[2] = cuda_swab64_U2(msg[2]); + m[3] = cuda_swab64_U2(msg[3]); + m[4] = cuda_swab64_U2(msg[4]); + m[5] = cuda_swab64_U2(msg[5]); + m[6] = cuda_swab64_U2(msg[6]); + m[7] = cuda_swab64_U2(msg[7]); + m[8] = make_uint2(0, 0x80000000); + m[9] = make_uint2(0, 0); + m[10] = make_uint2(0, 0); + m[11] = make_uint2(0, 0); + m[12] = make_uint2(0, 0); + m[13] = make_uint2(1, 0); + m[14] = make_uint2(0, 0); + m[15] = make_uint2(0x200, 0); + + uint2 v[16] = { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7] }; + v[12].x ^= 512U; + v[13].x ^= 512U; - // Message for first round - #pragma unroll 8 - for (int i=0; i < 8; ++i) - buf[i] = inpHash[i]; - - // Hash Pad - buf[8] = 0x0000000000000080ull; - buf[9] = 0; - buf[10] = 0; - buf[11] = 0; - buf[12] = 0; - buf[13] = 0x0100000000000000ull; - buf[14] = 0; - buf[15] = 0x0002000000000000ull; - - // Ending round - quark_blake512_compress(h, buf, c_sigma_big, c_u512, 512); - -#if __CUDA_ARCH__ <= 350 - uint32_t *outHash = (uint32_t*)&g_hash[hashPosition * 8U]; - #pragma unroll 8 - for (int i=0; i < 8; i++) { - outHash[2*i+0] = cuda_swab32( _HIDWORD(h[i]) ); - outHash[2*i+1] = cuda_swab32( _LODWORD(h[i]) ); - } -#else - uint64_t *outHash = &g_hash[hashPosition * 8U]; - for (int i=0; i < 8; i++) { - outHash[i] = cuda_swab64(h[i]); + GS4(0, 4, 8, 12, 0, 1, 1, 5, 9, 13, 2, 3, 2, 6, 10, 14, 4, 5, 3, 7, 11, 15, 6, 7); + GS4(0, 5, 10, 15, 8, 9, 1, 6, 11, 12, 10, 11, 2, 7, 8, 13, 12, 13, 3, 4, 9, 14, 14, 15); + + GS4(0, 4, 8, 12, 14, 10, 1, 5, 9, 13, 4, 8, 2, 6, 10, 14, 9, 15, 3, 7, 11, 15, 13, 6); + GS4(0, 5, 10, 15, 1, 12, 1, 6, 11, 12, 0, 2, 2, 7, 8, 13, 11, 7, 3, 4, 9, 14, 5, 3); + + GS4(0, 4, 8, 12, 11, 8, 1, 5, 9, 13, 12, 0, 2, 6, 10, 14, 5, 2, 3, 7, 11, 15, 15, 13); + GS4(0, 5, 10, 15, 10, 14, 1, 6, 11, 12, 3, 6, 2, 7, 8, 13, 7, 1, 3, 4, 9, 14, 9, 4); + + GS4(0, 4, 8, 12, 7, 9, 1, 5, 9, 13, 3, 1, 2, 6, 10, 14, 13, 12, 3, 7, 11, 15, 11, 14); + GS4(0, 5, 10, 15, 2, 6, 1, 6, 11, 12, 5, 10, 2, 7, 8, 13, 4, 0, 3, 4, 9, 14, 15, 8); + + GS4(0, 4, 8, 12, 9, 0, 1, 5, 9, 13, 5, 7, 2, 6, 10, 14, 2, 4, 3, 7, 11, 15, 10, 15); + GS4(0, 5, 10, 15, 14, 1, 1, 6, 11, 12, 11, 12, 2, 7, 8, 13, 6, 8, 3, 4, 9, 14, 3, 13); + + GS4(0, 4, 8, 12, 2, 12, 1, 5, 9, 13, 6, 10, 2, 6, 10, 14, 0, 11, 3, 7, 11, 15, 8, 3); + GS4(0, 5, 10, 15, 4, 13, 1, 6, 11, 12, 7, 5, 2, 7, 8, 13, 15, 14, 3, 4, 9, 14, 1, 9); + + GS4(0, 4, 8, 12, 12, 5, 1, 5, 9, 13, 1, 15, 2, 6, 10, 14, 14, 13, 3, 7, 11, 15, 4, 10); + GS4(0, 5, 10, 15, 0, 7, 1, 6, 11, 12, 6, 3, 2, 7, 8, 13, 9, 2, 3, 4, 9, 14, 8, 11); + + GS4(0, 4, 8, 12, 13, 11, 1, 5, 9, 13, 7, 14, 2, 6, 10, 14, 12, 1, 3, 7, 11, 15, 3, 9); + GS4(0, 5, 10, 15, 5, 0, 1, 6, 11, 12, 15, 4, 2, 7, 8, 13, 8, 6, 3, 4, 9, 14, 2, 10); + + GS4(0, 4, 8, 12, 6, 15, 1, 5, 9, 13, 14, 9, 2, 6, 10, 14, 11, 3, 3, 7, 11, 15, 0, 8); + GS4(0, 5, 10, 15, 12, 2, 1, 6, 11, 12, 13, 7, 2, 7, 8, 13, 1, 4, 3, 4, 9, 14, 10, 5); + + GS4(0, 4, 8, 12, 10, 2, 1, 5, 9, 13, 8, 4, 2, 6, 10, 14, 7, 6, 3, 7, 11, 15, 1, 5); + GS4(0, 5, 10, 15, 15, 11, 1, 6, 11, 12, 9, 14, 2, 7, 8, 13, 3, 12, 3, 4, 9, 14, 13, 0); + + // #if __CUDA_ARCH__ == 500 + + GS4(0, 4, 8, 12, 0, 1, 1, 5, 9, 13, 2, 3, 2, 6, 10, 14, 4, 5, 3, 7, 11, 15, 6, 7); + GS4(0, 5, 10, 15, 8, 9, 1, 6, 11, 
12, 10, 11, 2, 7, 8, 13, 12, 13, 3, 4, 9, 14, 14, 15); + + GS4(0, 4, 8, 12, 14, 10, 1, 5, 9, 13, 4, 8, 2, 6, 10, 14, 9, 15, 3, 7, 11, 15, 13, 6); + GS4(0, 5, 10, 15, 1, 12, 1, 6, 11, 12, 0, 2, 2, 7, 8, 13, 11, 7, 3, 4, 9, 14, 5, 3); + + GS4(0, 4, 8, 12, 11, 8, 1, 5, 9, 13, 12, 0, 2, 6, 10, 14, 5, 2, 3, 7, 11, 15, 15, 13); + GS4(0, 5, 10, 15, 10, 14, 1, 6, 11, 12, 3, 6, 2, 7, 8, 13, 7, 1, 3, 4, 9, 14, 9, 4); + + GS4(0, 4, 8, 12, 7, 9, 1, 5, 9, 13, 3, 1, 2, 6, 10, 14, 13, 12, 3, 7, 11, 15, 11, 14); + GS4(0, 5, 10, 15, 2, 6, 1, 6, 11, 12, 5, 10, 2, 7, 8, 13, 4, 0, 3, 4, 9, 14, 15, 8); + + GS4(0, 4, 8, 12, 9, 0, 1, 5, 9, 13, 5, 7, 2, 6, 10, 14, 2, 4, 3, 7, 11, 15, 10, 15); + GS4(0, 5, 10, 15, 14, 1, 1, 6, 11, 12, 11, 12, 2, 7, 8, 13, 6, 8, 3, 4, 9, 14, 3, 13); + + GS4(0, 4, 8, 12, 2, 12, 1, 5, 9, 13, 6, 10, 2, 6, 10, 14, 0, 11, 3, 7, 11, 15, 8, 3); + GS4(0, 5, 10, 15, 4, 13, 1, 6, 11, 12, 7, 5, 2, 7, 8, 13, 15, 14, 3, 4, 9, 14, 1, 9); + + // #else*/ + /* + for (int i = 0; i < 6; i++) + { + G4(0, 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15); + G4(8, 0, 5,10,15, 1, 6,11,12, 2, 7, 8,13, 3, 4, 9,14); } -#endif + */ + // #endif + v[0] = cuda_swab64_U2(xor3x(v[0], h[0], v[8])); + v[1] = cuda_swab64_U2(xor3x(v[1], h[1], v[9])); + v[2] = cuda_swab64_U2(xor3x(v[2], h[2], v[10])); + v[3] = cuda_swab64_U2(xor3x(v[3], h[3], v[11])); + v[4] = cuda_swab64_U2(xor3x(v[4], h[4], v[12])); + v[5] = cuda_swab64_U2(xor3x(v[5], h[5], v[13])); + v[6] = cuda_swab64_U2(xor3x(v[6], h[6], v[14])); + v[7] = cuda_swab64_U2(xor3x(v[7], h[7], v[15])); + + /* uint2* outHash = &g_hash[hashPosition<<3]; + #pragma unroll 8 + for(uint32_t i=0;i<8;i++){ + outHash[i] = v[i]; + }*/ + phash[0] = *(uint2x4*)&v[0]; + phash[1] = *(uint2x4*)&v[4]; } -#endif /* SP */ } -__global__ __launch_bounds__(256,4) -void quark_blake512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) +__global__ +__launch_bounds__(192, 2) +void quark_blake512_gpu_hash_64_final(uint32_t threads, const uint32_t *const __restrict__ g_nonceVector, uint2* g_hash, uint32_t* resNonce, const uint64_t target) { -//#if !defined(SP_KERNEL) || __CUDA_ARCH__ < 500 - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint64_t buf[16]; - #pragma unroll - for (int i=0; i < 16; ++i) - buf[i] = c_PaddedMessage80[i]; - - // The test Nonce - const uint32_t nounce = startNounce + thread; - ((uint32_t*)buf)[19] = cuda_swab32(nounce); - - uint64_t h[8] = { - 0x6a09e667f3bcc908ULL, - 0xbb67ae8584caa73bULL, - 0x3c6ef372fe94f82bULL, - 0xa54ff53a5f1d36f1ULL, - 0x510e527fade682d1ULL, - 0x9b05688c2b3e6c1fULL, - 0x1f83d9abfb41bd6bULL, - 0x5be0cd19137e2179ULL + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads){ + const uint32_t hashPosition = (g_nonceVector == NULL) ? 
thread : g_nonceVector[thread]; + + uint2 msg[16]; + + uint2x4 *phash = (uint2x4*)&g_hash[hashPosition << 3]; + uint2x4 *outpt = (uint2x4*)msg; + outpt[0] = __ldg4(&phash[0]); + outpt[1] = __ldg4(&phash[1]); + + uint2 m[16]; + m[0] = cuda_swab64_U2(msg[0]); + m[1] = cuda_swab64_U2(msg[1]); + m[2] = cuda_swab64_U2(msg[2]); + m[3] = cuda_swab64_U2(msg[3]); + m[4] = cuda_swab64_U2(msg[4]); + m[5] = cuda_swab64_U2(msg[5]); + m[6] = cuda_swab64_U2(msg[6]); + m[7] = cuda_swab64_U2(msg[7]); + m[8] = make_uint2(0, 0x80000000); + m[9] = make_uint2(0, 0); + m[10] = make_uint2(0, 0); + m[11] = make_uint2(0, 0); + m[12] = make_uint2(0, 0); + m[13] = make_uint2(1, 0); + m[14] = make_uint2(0, 0); + m[15] = make_uint2(0x200, 0); + + uint2 v[16] = { + h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7], + z[0], z[1], z[2], z[3], z[4], z[5], z[6], z[7] }; + v[12].x ^= 512U; + v[13].x ^= 512U; - quark_blake512_compress(h, buf, c_sigma_big, c_u512, 640); + GS4(0, 4, 8, 12, 0, 1, 1, 5, 9, 13, 2, 3, 2, 6, 10, 14, 4, 5, 3, 7, 11, 15, 6, 7); + GS4(0, 5, 10, 15, 8, 9, 1, 6, 11, 12, 10, 11, 2, 7, 8, 13, 12, 13, 3, 4, 9, 14, 14, 15); -#if __CUDA_ARCH__ <= 350 - uint32_t *outHash = (uint32_t*)outputHash + (thread * 16U); - #pragma unroll 8 - for (uint32_t i=0; i < 8; i++) { - outHash[2*i] = cuda_swab32( _HIDWORD(h[i]) ); - outHash[2*i+1] = cuda_swab32( _LODWORD(h[i]) ); + GS4(0, 4, 8, 12, 14, 10, 1, 5, 9, 13, 4, 8, 2, 6, 10, 14, 9, 15, 3, 7, 11, 15, 13, 6); + GS4(0, 5, 10, 15, 1, 12, 1, 6, 11, 12, 0, 2, 2, 7, 8, 13, 11, 7, 3, 4, 9, 14, 5, 3); + + GS4(0, 4, 8, 12, 11, 8, 1, 5, 9, 13, 12, 0, 2, 6, 10, 14, 5, 2, 3, 7, 11, 15, 15, 13); + GS4(0, 5, 10, 15, 10, 14, 1, 6, 11, 12, 3, 6, 2, 7, 8, 13, 7, 1, 3, 4, 9, 14, 9, 4); + + GS4(0, 4, 8, 12, 7, 9, 1, 5, 9, 13, 3, 1, 2, 6, 10, 14, 13, 12, 3, 7, 11, 15, 11, 14); + GS4(0, 5, 10, 15, 2, 6, 1, 6, 11, 12, 5, 10, 2, 7, 8, 13, 4, 0, 3, 4, 9, 14, 15, 8); + + GS4(0, 4, 8, 12, 9, 0, 1, 5, 9, 13, 5, 7, 2, 6, 10, 14, 2, 4, 3, 7, 11, 15, 10, 15); + GS4(0, 5, 10, 15, 14, 1, 1, 6, 11, 12, 11, 12, 2, 7, 8, 13, 6, 8, 3, 4, 9, 14, 3, 13); + + GS4(0, 4, 8, 12, 2, 12, 1, 5, 9, 13, 6, 10, 2, 6, 10, 14, 0, 11, 3, 7, 11, 15, 8, 3); + GS4(0, 5, 10, 15, 4, 13, 1, 6, 11, 12, 7, 5, 2, 7, 8, 13, 15, 14, 3, 4, 9, 14, 1, 9); + + GS4(0, 4, 8, 12, 12, 5, 1, 5, 9, 13, 1, 15, 2, 6, 10, 14, 14, 13, 3, 7, 11, 15, 4, 10); + GS4(0, 5, 10, 15, 0, 7, 1, 6, 11, 12, 6, 3, 2, 7, 8, 13, 9, 2, 3, 4, 9, 14, 8, 11); + + GS4(0, 4, 8, 12, 13, 11, 1, 5, 9, 13, 7, 14, 2, 6, 10, 14, 12, 1, 3, 7, 11, 15, 3, 9); + GS4(0, 5, 10, 15, 5, 0, 1, 6, 11, 12, 15, 4, 2, 7, 8, 13, 8, 6, 3, 4, 9, 14, 2, 10); + + GS4(0, 4, 8, 12, 6, 15, 1, 5, 9, 13, 14, 9, 2, 6, 10, 14, 11, 3, 3, 7, 11, 15, 0, 8); + GS4(0, 5, 10, 15, 12, 2, 1, 6, 11, 12, 13, 7, 2, 7, 8, 13, 1, 4, 3, 4, 9, 14, 10, 5); + + GS4(0, 4, 8, 12, 10, 2, 1, 5, 9, 13, 8, 4, 2, 6, 10, 14, 7, 6, 3, 7, 11, 15, 1, 5); + GS4(0, 5, 10, 15, 15, 11, 1, 6, 11, 12, 9, 14, 2, 7, 8, 13, 3, 12, 3, 4, 9, 14, 13, 0); + + // #if __CUDA_ARCH__ == 500 + + GS4(0, 4, 8, 12, 0, 1, 1, 5, 9, 13, 2, 3, 2, 6, 10, 14, 4, 5, 3, 7, 11, 15, 6, 7); + GS4(0, 5, 10, 15, 8, 9, 1, 6, 11, 12, 10, 11, 2, 7, 8, 13, 12, 13, 3, 4, 9, 14, 14, 15); + + GS4(0, 4, 8, 12, 14, 10, 1, 5, 9, 13, 4, 8, 2, 6, 10, 14, 9, 15, 3, 7, 11, 15, 13, 6); + GS4(0, 5, 10, 15, 1, 12, 1, 6, 11, 12, 0, 2, 2, 7, 8, 13, 11, 7, 3, 4, 9, 14, 5, 3); + + GS4(0, 4, 8, 12, 11, 8, 1, 5, 9, 13, 12, 0, 2, 6, 10, 14, 5, 2, 3, 7, 11, 15, 15, 13); + GS4(0, 5, 10, 15, 10, 14, 1, 6, 11, 12, 3, 6, 2, 7, 8, 13, 7, 1, 3, 4, 9, 14, 9, 4); + + GS4(0, 4, 8, 12, 7, 9, 
1, 5, 9, 13, 3, 1, 2, 6, 10, 14, 13, 12, 3, 7, 11, 15, 11, 14); + GS4(0, 5, 10, 15, 2, 6, 1, 6, 11, 12, 5, 10, 2, 7, 8, 13, 4, 0, 3, 4, 9, 14, 15, 8); + + GS4(0, 4, 8, 12, 9, 0, 1, 5, 9, 13, 5, 7, 2, 6, 10, 14, 2, 4, 3, 7, 11, 15, 10, 15); + GS4(0, 5, 10, 15, 14, 1, 1, 6, 11, 12, 11, 12, 2, 7, 8, 13, 6, 8, 3, 4, 9, 14, 3, 13); + + GS4(0, 4, 8, 12, 2, 12, 1, 5, 9, 13, 6, 10, 2, 6, 10, 14, 0, 11, 3, 7, 11, 15, 8, 3); + GS4(0, 5, 10, 15, 4, 13, 1, 6, 11, 12, 7, 5, 2, 7, 8, 13, 15, 14, 3, 4, 9, 14, 1, 9); + + // #else*/ + /* + for (int i = 0; i < 6; i++) + { + G4(0, 0, 4, 8,12, 1, 5, 9,13, 2, 6,10,14, 3, 7,11,15); + G4(8, 0, 5,10,15, 1, 6,11,12, 2, 7, 8,13, 3, 4, 9,14); } -#else - uint64_t *outHash = (uint64_t*)outputHash + (thread * 8U); - for (uint32_t i=0; i < 8; i++) { - outHash[i] = cuda_swab64( h[i] ); + */ + // #endif + v[0] = cuda_swab64_U2(xor3x(v[0], h[0], v[8])); + v[1] = cuda_swab64_U2(xor3x(v[1], h[1], v[9])); + v[2] = cuda_swab64_U2(xor3x(v[2], h[2], v[10])); + v[3] = cuda_swab64_U2(xor3x(v[3], h[3], v[11])); + v[4] = cuda_swab64_U2(xor3x(v[4], h[4], v[12])); + v[5] = cuda_swab64_U2(xor3x(v[5], h[5], v[13])); + v[6] = cuda_swab64_U2(xor3x(v[6], h[6], v[14])); + v[7] = cuda_swab64_U2(xor3x(v[7], h[7], v[15])); + + /* uint2* outHash = &g_hash[hashPosition<<3]; + #pragma unroll 8 + for(uint32_t i=0;i<8;i++){ + outHash[i] = v[i]; + }*/ + // phash[0] = *(uint2x4*)&v[0]; + // phash[1] = *(uint2x4*)&v[4]; + + if (devectorize(v[3]) <= target) + { + uint32_t tmp = atomicExch(&resNonce[0], thread); + if (tmp != UINT32_MAX) + resNonce[1] = tmp; } -#endif } -//#endif } -#ifdef SP_KERNEL -#include "cuda_quark_blake512_sp.cuh" -#endif -__host__ -void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order) -{ -#ifdef SP_KERNEL - int dev_id = device_map[thr_id]; - if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) - quark_blake512_cpu_hash_64_sp(threads, startNounce, d_nonceVector, d_outputHash); - else -#endif - { - const uint32_t threadsperblock = 256; - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - quark_blake512_gpu_hash_64<<>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash); +__global__ __launch_bounds__(512, 2)// __launch_bounds__(TPB80, 4) +void quark_blake512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint2x4 *const __restrict__ g_hash){ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + uint2 v[16]; + uint2 m[10]; + uint2 xors[16]; + + const uint2 h[8] = { + { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }, + { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } + }; + const uint2 z[16] = { + { 0x85a308d3, 0x243f6a88 }, { 0x03707344, 0x13198a2e }, { 0x299f31d0, 0xa4093822 }, { 0xec4e6c89, 0x082efa98 }, + { 0x38d01377, 0x452821e6 }, { 0x34e90c6c, 0xbe5466cf }, { 0xc97c50dd, 0xc0ac29b7 }, { 0xb5470917, 0x3f84d5b5 }, + { 0x8979fb1b, 0x9216d5d9 }, { 0x98dfb5ac, 0xd1310ba6 }, { 0xd01adfb7, 0x2ffd72db }, { 0x6a267e96, 0xb8e1afed }, + { 0xf12c7f99, 0xba7c9045 }, { 0xb3916cf7, 0x24a19947 }, { 0x858efc16, 0x0801f2e2 }, { 0x71574e69, 0x636920d8 } + }; + const uint32_t m150 = 0x280 ^ z[9].x;//make_uint2(0x280,0) ^ z[ 9];//2 + const uint32_t m151 = 0x280 ^ z[13].x;//2 + const uint32_t m152 = 0x280 ^ z[8].x;//2 + const uint32_t m153 = 0x280 ^ z[10].x;//2 + const uint32_t m154 = 0x280 ^ 
z[14].x;//3 + const uint32_t m155 = 0x280 ^ z[1].x;//1 + const uint32_t m156 = 0x280 ^ z[4].x;//1 + const uint32_t m157 = 0x280 ^ z[6].x;//1 + const uint32_t m158 = 0x280 ^ z[11].x;//1 + + const uint32_t m130 = 0x01 ^ z[6].x;//2 + const uint32_t m131 = 0x01 ^ z[15].x;//2 + const uint32_t m132 = 0x01 ^ z[12].x;//3 + const uint32_t m133 = 0x01 ^ z[3].x;//2 + const uint32_t m134 = 0x01 ^ z[4].x;//2 + const uint32_t m135 = 0x01 ^ z[14].x;//1 + const uint32_t m136 = 0x01 ^ z[11].x;//1 + const uint32_t m137 = 0x01 ^ z[7].x;//1 + const uint32_t m138 = 0x01 ^ z[0].x;//1 + + const uint32_t m100 = 0x80000000 ^ z[14].y;//4 + const uint32_t m101 = 0x80000000 ^ z[5].y;//3 + const uint32_t m102 = 0x80000000 ^ z[15].y;//2 + const uint32_t m103 = 0x80000000 ^ z[6].y;//2 + const uint32_t m104 = 0x80000000 ^ z[4].y;//1 + const uint32_t m105 = 0x80000000 ^ z[2].y;//2 + const uint32_t m106 = 0x80000000 ^ z[11].y;//2 + + if (thread < threads){ + + int i = 0; + +#pragma unroll 10 + for (int i = 0; i < 10; ++i) + m[i] = c_m[i]; + + + m[9].x = startNounce + thread; + +#pragma unroll 16 + for (int i = 0; i < 16; i++) + v[i] = c_v[i]; + + // GSn( 0, 5,10,15, 8, 9); + v[0] += (m[9] ^ z[8]); + v[15] = ROR16(v[15] ^ v[0]); + v[10] += v[15]; + v[5] = ROR2(v[5] ^ v[10], 11); + + xors[0] = z[10]; xors[1] = c_x[i++]; xors[2] = m[9] ^ z[15]; xors[3] = make_uint2(m130, z[6].y); + xors[4] = make_uint2(z[14].x, m100); xors[5] = c_x[i++]; xors[6] = make_uint2(m150, z[9].y); xors[7] = c_x[i++]; + + xors[8] = c_x[i++]; xors[9] = c_x[i++]; xors[10] = z[7]; xors[11] = c_x[i++]; + xors[12] = z[1]; xors[13] = c_x[i++]; xors[14] = c_x[i++]; xors[15] = c_x[i++]; + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + + //2:{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } + xors[0] = z[8]; xors[1] = z[0]; xors[2] = c_x[i++]; xors[3] = make_uint2(m151, z[13].y); + xors[4] = c_x[i++]; xors[5] = c_x[i++]; xors[6] = c_x[i++]; xors[7] = make_uint2(m131, z[15].y); + + xors[8] = make_uint2(z[14].x, m100); xors[9] = c_x[i++]; xors[10] = c_x[i++]; xors[11] = m[9] ^ z[4]; + xors[12] = z[10]; xors[13] = c_x[i++]; xors[14] = c_x[i++]; xors[15] = c_x[i++]; + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + + //3:{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } + xors[0] = c_x[i++]; xors[1] = c_x[i++]; xors[2] = make_uint2(m132, z[12].y); xors[3] = z[14]; + xors[4] = m[9] ^ z[7]; xors[5] = c_x[i++]; xors[6] = z[13]; xors[7] = c_x[i++]; + + xors[8] = c_x[i++]; xors[9] = c_x[i++]; xors[10] = c_x[i++]; xors[11] = make_uint2(m152, z[8].y); + xors[12] = c_x[i++]; xors[13] = make_uint2(z[5].x, m101); xors[14] = c_x[i++]; xors[15] = c_x[i++]; + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + + //4:{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } + xors[0] = m[9] ^ z[0]; xors[1] = c_x[i++]; xors[2] = c_x[i++]; xors[3] = make_uint2(z[15].x, m102); + xors[4] = c_x[i++]; 
xors[5] = c_x[i++]; xors[6] = c_x[i++]; xors[7] = make_uint2(m153, z[10].y); + + xors[8] = z[1]; xors[9] = z[12]; xors[10] = c_x[i++]; xors[11] = c_x[i++]; + xors[12] = c_x[i++]; xors[13] = z[11]; xors[14] = c_x[i++]; xors[15] = make_uint2(m133, z[3].y); + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + + //5:{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } + xors[0] = c_x[i++]; xors[1] = c_x[i++]; xors[2] = c_x[i++]; xors[3] = c_x[i++]; + xors[4] = z[2]; xors[5] = make_uint2(z[6].x, m103); xors[6] = z[0]; xors[7] = c_x[i++]; + + xors[8] = c_x[i++]; xors[9] = c_x[i++]; xors[10] = make_uint2(m154, z[14].y); xors[11] = c_x[i++]; + xors[12] = make_uint2(m134, z[4].y); xors[13] = c_x[i++]; xors[14] = z[15]; xors[15] = m[9] ^ z[1]; + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + + //6:{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } + xors[0] = z[5]; xors[1] = c_x[i++]; xors[2] = z[13]; xors[3] = c_x[i++]; + xors[4] = c_x[i++]; xors[5] = make_uint2(m155, z[1].y); xors[6] = make_uint2(m135, z[14].y); xors[7] = make_uint2(z[4].x, m104); + + xors[8] = c_x[i++]; xors[9] = c_x[i++]; xors[10] = m[9] ^ z[2]; xors[11] = c_x[i++]; + xors[12] = c_x[i++]; xors[13] = c_x[i++]; xors[14] = c_x[i++]; xors[15] = z[8]; + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + + //7:{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } + xors[0] = make_uint2(m136, z[11].y); xors[1] = c_x[i++]; xors[2] = z[1]; xors[3] = c_x[i++]; + xors[4] = z[13]; xors[5] = z[7]; xors[6] = c_x[i++]; xors[7] = m[9] ^ z[3]; + + xors[8] = c_x[i++]; xors[9] = make_uint2(m156, z[4].y); xors[10] = c_x[i++]; xors[11] = c_x[i++]; + xors[12] = c_x[i++]; xors[13] = c_x[i++]; xors[14] = c_x[i++]; xors[15] = make_uint2(z[2].x, m105); + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + + //8:{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } + xors[0] = c_x[i++]; xors[1] = z[9]; xors[2] = z[3]; xors[3] = c_x[i++]; + xors[4] = make_uint2(m157, z[6].y); xors[5] = m[9] ^ z[14]; xors[6] = c_x[i++]; xors[7] = c_x[i++]; + + xors[8] = z[2]; xors[9] = make_uint2(m137, z[7].y); xors[10] = c_x[i++]; xors[11] = make_uint2(z[5].x, m101); + xors[12] = c_x[i++]; xors[13] = c_x[i++]; xors[14] = c_x[i++]; xors[15] = c_x[i++]; + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + + //9:{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } + xors[0] = make_uint2(z[2].x, m105); xors[1] = c_x[i++]; xors[2] = c_x[i++]; xors[3] = 
c_x[i++]; + xors[4] = c_x[i++]; xors[5] = c_x[i++]; xors[6] = c_x[i++]; xors[7] = c_x[i++]; + + xors[8] = make_uint2(m158, z[11].y); xors[9] = m[9] ^ z[14]; xors[10] = c_x[i++]; xors[11] = make_uint2(m138, z[0].y); + xors[12] = z[15]; xors[13] = z[9]; xors[14] = z[3]; xors[15] = c_x[i++]; + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + //10:{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } + xors[0] = c_x[i++]; xors[1] = c_x[i++]; xors[2] = c_x[i++]; xors[3] = c_x[i++]; + xors[4] = c_x[i++]; xors[5] = c_x[i++]; xors[6] = c_x[i++]; xors[7] = c_x[i++]; + + xors[8] = c_x[i++]; xors[9] = make_uint2(z[11].x, m106); xors[10] = z[13]; xors[11] = z[15]; + xors[12] = m[9] ^ z[8]; xors[13] = z[10]; xors[14] = make_uint2(m132, z[12].y); xors[15] = make_uint2(m154, z[14].y); + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + //------------------ + i = 0; + //11:{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } + xors[0] = z[10]; xors[1] = c_x[i++]; xors[2] = m[9] ^ z[15]; xors[3] = make_uint2(m130, z[6].y); + xors[4] = make_uint2(z[14].x, m100); xors[5] = c_x[i++]; xors[6] = make_uint2(m150, z[9].y); xors[7] = c_x[i++]; + + xors[8] = c_x[i++]; xors[9] = c_x[i++]; xors[10] = z[7]; xors[11] = c_x[i++]; + xors[12] = z[1]; xors[13] = c_x[i++]; xors[14] = c_x[i++]; xors[15] = c_x[i++]; + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + + //12:{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } + xors[0] = z[8]; xors[1] = z[0]; xors[2] = c_x[i++]; xors[3] = make_uint2(m151, z[13].y); + xors[4] = c_x[i++]; xors[5] = c_x[i++]; xors[6] = c_x[i++]; xors[7] = make_uint2(m131, z[15].y); + + xors[8] = make_uint2(z[14].x, m100); xors[9] = c_x[i++]; xors[10] = c_x[i++]; xors[11] = m[9] ^ z[4]; + xors[12] = z[10]; xors[13] = c_x[i++]; xors[14] = c_x[i++]; xors[15] = c_x[i++]; + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + + //13:{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } + xors[0] = c_x[i++]; xors[1] = c_x[i++]; xors[2] = make_uint2(m132, z[12].y); xors[3] = z[14]; + xors[4] = m[9] ^ z[7]; xors[5] = c_x[i++]; xors[6] = z[13]; xors[7] = c_x[i++]; + + xors[8] = c_x[i++]; xors[9] = c_x[i++]; xors[10] = c_x[i++]; xors[11] = make_uint2(m152, z[8].y); + xors[12] = c_x[i++]; xors[13] = make_uint2(z[5].x, m101); xors[14] = c_x[i++]; xors[15] = c_x[i++]; + + GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]); + GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]); + + //14:{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } + 
xors[0] = m[9] ^ z[0]; xors[1] = c_x[i++]; xors[2] = c_x[i++]; xors[3] = make_uint2(z[15].x, m102);
+		xors[4] = c_x[i++]; xors[5] = c_x[i++]; xors[6] = c_x[i++]; xors[7] = make_uint2(m153, z[10].y);
+
+		xors[8] = z[1]; xors[9] = z[12]; xors[10] = c_x[i++]; xors[11] = c_x[i++];
+		xors[12] = c_x[i++]; xors[13] = z[11]; xors[14] = c_x[i++]; xors[15] = make_uint2(m133, z[3].y);
+
+		GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]);
+		GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]);
+		//15:{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
+		xors[0] = c_x[i++]; xors[1] = c_x[i++]; xors[2] = c_x[i++]; xors[3] = c_x[i++];
+		xors[4] = z[2]; xors[5] = make_uint2(z[6].x, m103); xors[6] = z[0]; xors[7] = c_x[i++];
+
+		xors[8] = c_x[i++]; xors[9] = c_x[i++]; xors[10] = make_uint2(m154, z[14].y); xors[11] = c_x[i++];
+		xors[12] = make_uint2(m134, z[4].y); xors[13] = c_x[i++]; xors[14] = z[15]; xors[15] = m[9] ^ z[1];
+
+		GSn4(0, 4, 8, 12, xors[0], xors[4], 1, 5, 9, 13, xors[1], xors[5], 2, 6, 10, 14, xors[2], xors[6], 3, 7, 11, 15, xors[3], xors[7]);
+		GSn4(0, 5, 10, 15, xors[8], xors[12], 1, 6, 11, 12, xors[9], xors[13], 2, 7, 8, 13, xors[10], xors[14], 3, 4, 9, 14, xors[11], xors[15]);
+
+		v[0] = cuda_swab64_U2(xor3x(v[0], h[0], v[8]));
+		v[1] = cuda_swab64_U2(xor3x(v[1], h[1], v[9]));
+		v[2] = cuda_swab64_U2(xor3x(v[2], h[2], v[10]));
+		v[3] = cuda_swab64_U2(xor3x(v[3], h[3], v[11]));
+		v[4] = cuda_swab64_U2(xor3x(v[4], h[4], v[12]));
+		v[5] = cuda_swab64_U2(xor3x(v[5], h[5], v[13]));
+		v[6] = cuda_swab64_U2(xor3x(v[6], h[6], v[14]));
+		v[7] = cuda_swab64_U2(xor3x(v[7], h[7], v[15]));
+
+		uint2x4* outpt = &g_hash[thread << 1];
+		outpt[0] = *(uint2x4*)&v[0];
+		outpt[1] = *(uint2x4*)&v[4];
 	}
-	MyStreamSynchronize(NULL, order, thr_id);
 }
-__host__
-void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
+
+//void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_outputHash){
+__host__ void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-#ifdef SP_KERNEL
+	uint32_t tpb = TPB52_64;
 	int dev_id = device_map[thr_id];
-	if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
-		quark_blake512_cpu_hash_80_sp(threads, startNounce, d_outputHash);
-	else
-#endif
-	{
-		const uint32_t threadsperblock = 256;
-		dim3 grid((threads + threadsperblock-1)/threadsperblock);
-		dim3 block(threadsperblock);
-
-		quark_blake512_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash);
-	}
-}
-// ---------------------------- END CUDA quark_blake512 functions ------------------------------------
+	if (device_sm[dev_id] <= 500) tpb = TPB50_64;
+	const dim3 grid((threads + tpb - 1) / tpb);
+	const dim3 block(tpb);
+	quark_blake512_gpu_hash_64 <<<grid, block>>>(threads, d_nonceVector, (uint2*)d_hash);
+}
-__host__
-void quark_blake512_cpu_init(int thr_id, uint32_t threads)
+extern void quark_blake512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_outputHash, uint32_t *resNonce, const uint64_t target)
 {
-	cuda_get_arch(thr_id);
+	uint32_t tpb = TPB52_64;
+	int dev_id = device_map[thr_id];
+
+	if (device_sm[dev_id] <= 500) tpb = TPB50_64;
+	const dim3 grid((threads + tpb - 1) / tpb);
+	const dim3 block(tpb);
+	quark_blake512_gpu_hash_64_final <<<grid, block>>>(threads, d_nonceVector, (uint2*)d_outputHash, resNonce, target);
 }
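[Editor's note, reference sketch, not part of the patch] The GS4/GSn4 device macros and the GShost host macro in this hunk are 4-way unrolled forms of the standard BLAKE-512 G function: the same add/xor/rotate ladder with rotation counts 32, 25, 16 and 11, where each message word m enters xored against a constant z selected by the sigma permutation. A minimal host-side C version follows; the names G, sigma and ROTR64 are illustrative, and the sigma rows can be cross-checked against the //2..//15 round comments in the kernels above (round N uses row N mod 10).

	#include <stdint.h>

	#define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))

	/* BLAKE message-schedule permutation; row 0 is the identity. */
	static const uint8_t sigma[10][16] = {
		{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
		{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
		{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
		{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
		{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
		{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
		{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
		{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
		{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
		{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
	};

	/* One G call of round r at schedule offset x (0, 2, ..., 14); m = message
	   words, z = the pi-digit constants shown as z[16] in this diff. */
	static void G(uint64_t v[16], const uint64_t m[16], const uint64_t z[16],
	              int r, int x, int a, int b, int c, int d)
	{
		const uint8_t e = sigma[r % 10][x], f = sigma[r % 10][x + 1];
		v[a] += (m[e] ^ z[f]) + v[b];
		v[d]  = ROTR64(v[d] ^ v[a], 32);
		v[c] += v[d];
		v[b]  = ROTR64(v[b] ^ v[c], 25);
		v[a] += (m[f] ^ z[e]) + v[b];
		v[d]  = ROTR64(v[d] ^ v[a], 16);
		v[c] += v[d];
		v[b]  = ROTR64(v[b] ^ v[c], 11);
	}

One BLAKE-512 round is four column calls, G(r,0) over (0,4,8,12) through G(r,6) over (3,7,11,15), followed by four diagonal calls at offsets 8..14; that is exactly what each GS4/GSn4 pair above computes in one shot.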
+
 __host__
-void quark_blake512_cpu_free(int thr_id)
+void quark_blake512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
 {
+	dim3 grid((threads + 512 - 1) / 512);
+	dim3 block(512);
+
+	quark_blake512_gpu_hash_80 <<<grid, block>>>(threads, startNounce, (uint2x4*)d_outputHash);
 }
+// ---------------------------- END CUDA quark_blake512 functions ------------------------------------
+
+// ----------------------------- Host midstate for 80-bytes input ------------------------------------
+__host__
+void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *endiandata){
+	uint64_t m[16], v[16], xors[128];
+	memcpy(m, endiandata, 80);
+	m[10] = 0x8000000000000000ull;
+	m[11] = 0;
+	m[12] = 0;
+	m[13] = 0x01;
+	m[14] = 0;
+	m[15] = 0x280;
+
+	for (int i = 0; i<10; i++){
+		m[i] = cuda_swab64(m[i]);
+	}
+
+	uint64_t h[8] = {
+		0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
+		0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
+	};
-#undef SPH_C32
-#undef SPH_T32
-#undef SPH_C64
-#undef SPH_T64
+	const uint64_t z[16] = {
+		0x243f6a8885a308d3ULL, 0x13198a2e03707344ULL, 0xa4093822299f31d0ULL, 0x082efa98ec4e6c89ULL,
+		0x452821e638d01377ULL, 0xbe5466cf34e90c6cULL, 0xc0ac29b7c97c50ddULL, 0x3f84d5b5b5470917ULL,
+		0x9216d5d98979fb1bULL, 0xd1310ba698dfb5acULL, 0x2ffd72dbd01adfb7ULL, 0xb8e1afed6a267e96ULL,
+		0xba7c9045f12c7f99ULL, 0x24a19947b3916cf7ULL, 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL
+	};
-extern "C" {
-#include "sph/sph_blake.h"
+	for (int i = 0; i<8; i++){
+		v[i] = h[i];
+	}
+	v[8] = z[0];
+	v[9] = z[1];
+	v[10] = z[2];
+	v[11] = z[3];
+	v[12] = z[4] ^ 640;
+	v[13] = z[5] ^ 640;
+	v[14] = z[6];
+	v[15] = z[7];
+
+	/* column step */
+	GShost(0, 4, 8, 12, 0, 1);
+	GShost(1, 5, 9, 13, 2, 3);
+	GShost(2, 6, 10, 14, 4, 5);
+	GShost(3, 7, 11, 15, 6, 7);
+
+	GShost(1, 6, 11, 12, 10, 11);
+	GShost(2, 7, 8, 13, 12, 13);
+	GShost(3, 4, 9, 14, 14, 15);
+	/*
+	v[a] += (m[e] ^ z[f]) + v[b]; \
+	v[d] = ROTR64(v[d] ^ v[a],32); \
+	v[c] += v[d]; \
+	v[b] = ROTR64( v[b] ^ v[c], 25); \
+	v[a] += (m[f] ^ z[e]) + v[b]; \
+	v[d] = ROTR64( v[d] ^ v[a], 16); \
+	v[c] += v[d];
+	v[b] = ROTR64( v[b] ^ v[c], 11);
+	*/
+
+	v[0] += (m[8] ^ z[9]) + v[5];
+	v[15] = ROTR64(v[15] ^ v[0], 32);
+	v[10] += v[15];
+	v[5] = ROTR64(v[5] ^ v[10], 25);
+
+	v[0] += v[5];
+
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_m, m, sizeof(m), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_v, v, sizeof(v), 0, cudaMemcpyHostToDevice));
+
+	int i = 0;
+
+	xors[i++] = m[4] ^ z[8];
+	xors[i++] = m[8] ^ z[4];
+	xors[i++] = m[6] ^ z[13];
+	xors[i++] = m[1] ^ z[12];
+	xors[i++] = m[0] ^ z[2];
+	xors[i++] = m[5] ^ z[3];
+	xors[i++] = m[2] ^ z[0];
+	xors[i++] = m[7] ^ z[11];
+	xors[i++] = m[3] ^ z[5];
+	//2
+	xors[i++] = m[5] ^ z[2];
+	xors[i++] = m[8] ^ z[11];
+	xors[i++] = m[0] ^ z[12];
+	xors[i++] = m[2] ^ z[5];
+	xors[i++] = m[3] ^ z[6];
+	xors[i++] = m[7] ^ z[1];
+	xors[i++] = m[6] ^ z[3];
+	xors[i++] = m[1] ^ z[7];
+	xors[i++] = m[4] ^ z[9];
+	//3
+	xors[i++] = m[7] ^ z[9];
+	xors[i++] = m[3] ^ z[1];
+	xors[i++] = m[1] ^ z[3];
+	xors[i++] = m[14] ^ z[11];
+	xors[i++] = m[2] ^ z[6];
+	xors[i++] = m[5] ^ z[10];
+	xors[i++] = m[4] ^ z[0];
+	xors[i++] = m[6] ^ z[2];
+	xors[i++] = m[0] ^ z[4];
+	xors[i++] = m[8] ^ z[15];
+	//4
+	xors[i++] = m[5] ^ z[7];
+	xors[i++] = m[2] ^ z[4];
+	xors[i++] = m[0] ^ z[9];
+	xors[i++] = m[7] ^ z[5];
+	xors[i++] = m[4] ^ z[2];
+	xors[i++] = m[6] ^ z[8];
+	xors[i++] = m[3] ^ z[13];
+	xors[i++] = m[1] ^ z[14];
+	xors[i++] = m[8] ^ z[6];
+	//5
+	xors[i++] = m[2] ^ z[12];
+	xors[i++] = m[6] ^ z[10];
+	xors[i++] = m[0] ^ z[11];
+	xors[i++] = m[8] ^ z[3];
+	xors[i++] = m[3] ^ z[8];
+	xors[i++] = m[4] ^ z[13];
+	xors[i++] = m[7] ^ z[5];
+	xors[i++] = m[1] ^ z[9];
+	xors[i++] = m[5] ^ z[7];
+	//6
+	xors[i++] = m[1] ^ z[15];
+	xors[i++] = m[4] ^ z[10];
+	xors[i++] = m[5] ^ z[12];
+	xors[i++] = m[0] ^ z[7];
+	xors[i++] = m[6] ^ z[3];
+	xors[i++] = m[8] ^ z[11];
+	xors[i++] = m[7] ^ z[0];
+	xors[i++] = m[3] ^ z[6];
+	xors[i++] = m[2] ^ z[9];
+	//7
+	xors[i++] = m[7] ^ z[14];
+	xors[i++] = m[3] ^ z[9];
+	xors[i++] = m[1] ^ z[12];
+	xors[i++] = m[5] ^ z[0];
+	xors[i++] = m[8] ^ z[6];
+	xors[i++] = m[2] ^ z[10];
+	xors[i++] = m[0] ^ z[5];
+	xors[i++] = m[4] ^ z[15];
+	xors[i++] = m[6] ^ z[8];
+	//8
+	xors[i++] = m[6] ^ z[15];
+	xors[i++] = m[0] ^ z[8];
+	xors[i++] = m[3] ^ z[11];
+	xors[i++] = m[8] ^ z[0];
+	xors[i++] = m[1] ^ z[4];
+	xors[i++] = m[2] ^ z[12];
+	xors[i++] = m[7] ^ z[13];
+	xors[i++] = m[4] ^ z[1];
+	xors[i++] = m[5] ^ z[10];
+	//9
+	xors[i++] = m[8] ^ z[4];
+	xors[i++] = m[7] ^ z[6];
+	xors[i++] = m[1] ^ z[5];
+	xors[i++] = m[2] ^ z[10];
+	xors[i++] = m[4] ^ z[8];
+	xors[i++] = m[6] ^ z[7];
+	xors[i++] = m[5] ^ z[1];
+	xors[i++] = m[3] ^ z[12];
+	xors[i++] = m[0] ^ z[13];
+	//10
+	xors[i++] = m[0] ^ z[1];
+	xors[i++] = m[2] ^ z[3];
+	xors[i++] = m[4] ^ z[5];
+	xors[i++] = m[6] ^ z[7];
+	xors[i++] = m[1] ^ z[0];
+	xors[i++] = m[3] ^ z[2];
+	xors[i++] = m[5] ^ z[4];
+	xors[i++] = m[7] ^ z[6];
+	xors[i++] = m[8] ^ z[9];
+
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_x, xors, i*sizeof(uint2), 0, cudaMemcpyHostToDevice));
 }
-__host__
-void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *endiandata)
+__host__ void quark_blake512_cpu_free(int)
 {
-#ifdef SP_KERNEL
-	int dev_id = device_map[thr_id];
-	if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
-		quark_blake512_cpu_setBlock_80_sp(thr_id, (uint64_t*) endiandata);
-	else
-#endif
-	{
-		uint64_t message[16];
-
-		memcpy(message, endiandata, 80);
-		message[10] = 0x80;
-		message[11] = 0;
-		message[12] = 0;
-		message[13] = 0x0100000000000000ull;
-		message[14] = 0;
-		message[15] = 0x8002000000000000ull; // 0x280
-
-		cudaMemcpyToSymbol(c_PaddedMessage80, message, sizeof(message), 0, cudaMemcpyHostToDevice);
-	}
-	CUDA_LOG_ERROR();
+
 }
+
+__host__ void __cdecl quark_blake512_cpu_init(int, unsigned int)
+{
+
+}
\ No newline at end of file
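[Editor's note on the midstate trick above, not part of the patch] In the 80-byte header only message word m[9] depends on the nonce (the kernel writes m[9].x = startNounce + thread), so quark_blake512_cpu_setBlock_80 can run round 0 on the CPU up to the first instruction that consumes m[9], upload the partial v state through c_v and the nonce-independent m^z pairs through c_x, and let every GPU thread resume from there. A sketch of the host half of the interrupted G(0,5,10,15) call, with an illustrative function name and ROTR64 as in the GShost macro:

	static void blake80_midstate_tail(uint64_t v[16], const uint64_t m[16],
	                                  const uint64_t z[16])
	{
		v[0] += (m[8] ^ z[9]) + v[5];     /* first half-step: m[8], z[9] are fixed */
		v[15] = ROTR64(v[15] ^ v[0], 32);
		v[10] += v[15];
		v[5]  = ROTR64(v[5] ^ v[10], 25);
		/* the second half-step would start v[0] += (m[9] ^ z[8]) + v[5];
		   m[9] is the nonce word, so only the nonce-free part is added here */
		v[0] += v[5];
		/* the kernel finishes with v[0] += (m[9] ^ z[8]) and the
		   ROR16 / ROR2(..., 11) tail at the top of quark_blake512_gpu_hash_80 */
	}

The c_x table then carries the remaining nonce-independent m^z pairs in schedule order; terms that depend on the nonce word m[9] or on the fixed padding lanes are reconstructed inside the kernel from the m1xx constants.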
diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu
index 1a6136ff7b..4890f2c2af 100644
--- a/quark/cuda_quark_keccak512.cu
+++ b/quark/cuda_quark_keccak512.cu
@@ -1,8 +1,20 @@
+/*
+	Based upon Tanguy Pruvot's repo
+
+	Provos Alexis - 2016
+*/
+
 #include
 #include
-#include <sys/types.h> // off_t
-#include "cuda_helper.h"
+#include "cuda_helper_alexis.h"
+#include "cuda_vectors_alexis.h"
+#include "miner.h"
+
+#define TPB52 128
+#define TPB50 256
+
+
 #define U32TO64_LE(p) \
 	(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
@@ -27,34 +39,75 @@ static const uint64_t host_keccak_round_constants[24] = {
 __constant__ uint64_t d_keccak_round_constants[24];
-__device__ __forceinline__
-static void keccak_block(uint2 *s)
-{
-	size_t i;
-	uint2 t[5], u[5], v, w;
-	for (i = 0; i < 24; i++) {
-		/* theta: c = a[0,i] ^ a[1,i] ^ ..
a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROL2(t[1], 1); - u[1] = t[0] ^ ROL2(t[2], 1); - u[2] = t[1] ^ ROL2(t[3], 1); - u[3] = t[2] ^ ROL2(t[4], 1); - u[4] = t[3] ^ ROL2(t[0], 1); - - /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; +/* << from alexis */ + +__constant__ +uint2 keccak_round_constants[24] = { + { 0x00000001, 0x00000000 }, { 0x00008082, 0x00000000 }, { 0x0000808a, 0x80000000 }, { 0x80008000, 0x80000000 }, + { 0x0000808b, 0x00000000 }, { 0x80000001, 0x00000000 }, { 0x80008081, 0x80000000 }, { 0x00008009, 0x80000000 }, + { 0x0000008a, 0x00000000 }, { 0x00000088, 0x00000000 }, { 0x80008009, 0x00000000 }, { 0x8000000a, 0x00000000 }, + { 0x8000808b, 0x00000000 }, { 0x0000008b, 0x80000000 }, { 0x00008089, 0x80000000 }, { 0x00008003, 0x80000000 }, + { 0x00008002, 0x80000000 }, { 0x00000080, 0x80000000 }, { 0x0000800a, 0x00000000 }, { 0x8000000a, 0x80000000 }, + { 0x80008081, 0x80000000 }, { 0x00008080, 0x80000000 }, { 0x80000001, 0x00000000 }, { 0x80008008, 0x80000000 } +}; + +#if __CUDA_ARCH__ > 500 +__global__ __launch_bounds__(TPB52,7) +#else +__global__ __launch_bounds__(TPB50,3) +#endif +void quark_keccak512_gpu_hash_64(uint32_t threads, uint2* g_hash, uint32_t *g_nonceVector){ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint2 t[5], u[5], v, w; + uint2 s[25]; + if (thread < threads){ + + const uint32_t hashPosition = (g_nonceVector == NULL) ? thread : g_nonceVector[thread]; + + uint2x4* d_hash = (uint2x4 *)&g_hash[hashPosition * 8]; + + #if __CUDA_ARCH__ > 500 + *(uint2x4*)&s[ 0] = __ldg4(&d_hash[ 0]); + *(uint2x4*)&s[ 4] = __ldg4(&d_hash[ 1]); + #else + *(uint2x4*)&s[ 0] = d_hash[ 0]; + *(uint2x4*)&s[ 4] = d_hash[ 1]; + #endif + + s[8] = make_uint2(1,0x80000000); + + /*theta*/ + t[ 0] = vectorize(devectorize(s[ 0])^devectorize(s[ 5])); + t[ 1] = vectorize(devectorize(s[ 1])^devectorize(s[ 6])); + t[ 2] = vectorize(devectorize(s[ 2])^devectorize(s[ 7])); + t[ 3] = vectorize(devectorize(s[ 3])^devectorize(s[ 8])); + t[ 4] = s[4]; + + /*theta*/ + #pragma unroll 5 + for(int j=0;j<5;j++){ + u[ j] = ROL2(t[ j], 1); + } + + s[ 4] = xor3x(s[ 4], t[3], u[ 0]); + s[24] = s[19] = s[14] = s[ 9] = t[ 3] ^ u[ 0]; + + s[ 0] = xor3x(s[ 0], t[4], u[ 1]); + s[ 5] = xor3x(s[ 5], t[4], u[ 1]); + s[20] = s[15] = s[10] = t[4] ^ u[ 1]; + + s[ 1] = xor3x(s[ 1], t[0], u[ 2]); + s[ 6] = xor3x(s[ 6], t[0], u[ 2]); + s[21] = s[16] = s[11] = t[0] ^ u[ 2]; + + s[ 2] = xor3x(s[ 2], t[1], u[ 3]); + s[ 7] = xor3x(s[ 7], t[1], u[ 3]); + s[22] = s[17] = s[12] = t[1] ^ u[ 3]; + + s[ 3] = xor3x(s[ 3], t[2], u[ 4]);s[ 8] = xor3x(s[ 8], t[2], u[ 4]); + s[23] = s[18] = s[13] = t[2] ^ u[ 4]; /* rho pi: b[..] = rotl(a[..], ..) 
*/ v = s[1]; s[1] = ROL2(s[6], 44); @@ -65,8 +118,8 @@ static void keccak_block(uint2 *s) s[20] = ROL2(s[2], 62); s[2] = ROL2(s[12], 43); s[12] = ROL2(s[13], 25); - s[13] = ROL2(s[19], 8); - s[19] = ROL2(s[23], 56); + s[13] = ROL8(s[19]); + s[19] = ROR8(s[23]); s[23] = ROL2(s[15], 41); s[15] = ROL2(s[4], 27); s[4] = ROL2(s[24], 14); @@ -81,179 +134,327 @@ static void keccak_block(uint2 *s) s[11] = ROL2(s[7], 6); s[7] = ROL2(s[10], 3); s[10] = ROL2(v, 1); - /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w; - v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - + #pragma unroll 5 + for(int j=0;j<25;j+=5){ + v=s[j];w=s[j + 1];s[j] = chi(v,w,s[j+2]);s[j+1] = chi(w,s[j+2],s[j+3]);s[j+2]=chi(s[j+2],s[j+3],s[j+4]);s[j+3]=chi(s[j+3],s[j+4],v);s[j+4]=chi(s[j+4],v,w); + } /* iota: a[0,0] ^= round constant */ - s[0] ^= vectorize(d_keccak_round_constants[i]); - } -} - -__global__ -void quark_keccak512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - off_t hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[hashPosition * 8]; - uint2 keccak_gpu_state[25]; - - for (int i = 0; i<8; i++) { - keccak_gpu_state[i] = vectorize(inpHash[i]); + s[0] ^= keccak_round_constants[ 0]; + + #if __CUDA_ARCH__ > 500 + #pragma unroll 4 + #else + #pragma unroll 3 + #endif + for (int i = 1; i < 23; i++) { + /*theta*/ + #pragma unroll 5 + for(int j=0;j<5;j++){ + t[ j] = vectorize(xor5(devectorize(s[ j]),devectorize(s[j+5]),devectorize(s[j+10]),devectorize(s[j+15]),devectorize(s[j+20]))); + } + + /*theta*/ + #pragma unroll 5 + for(int j=0;j<5;j++){ + u[ j] = ROL2(t[ j], 1); + } + s[ 4] = xor3x(s[ 4], t[3], u[ 0]);s[ 9] = xor3x(s[ 9], t[3], u[ 0]);s[14] = xor3x(s[14], t[3], u[ 0]);s[19] = xor3x(s[19], t[3], u[ 0]);s[24] = xor3x(s[24], t[3], u[ 0]); + s[ 0] = xor3x(s[ 0], t[4], u[ 1]);s[ 5] = xor3x(s[ 5], t[4], u[ 1]);s[10] = xor3x(s[10], t[4], u[ 1]);s[15] = xor3x(s[15], t[4], u[ 1]);s[20] = xor3x(s[20], t[4], u[ 1]); + s[ 1] = xor3x(s[ 1], t[0], u[ 2]);s[ 6] = xor3x(s[ 6], t[0], u[ 2]);s[11] = xor3x(s[11], t[0], u[ 2]);s[16] = xor3x(s[16], t[0], u[ 2]);s[21] = xor3x(s[21], t[0], u[ 2]); + s[ 2] = xor3x(s[ 2], t[1], u[ 3]);s[ 7] = xor3x(s[ 7], t[1], u[ 3]);s[12] = xor3x(s[12], t[1], u[ 3]);s[17] = xor3x(s[17], t[1], u[ 3]);s[22] = xor3x(s[22], t[1], u[ 3]); + s[ 3] = xor3x(s[ 3], t[2], u[ 4]);s[ 8] = xor3x(s[ 8], t[2], u[ 4]);s[13] = xor3x(s[13], t[2], u[ 4]);s[18] = xor3x(s[18], t[2], u[ 4]);s[23] = xor3x(s[23], t[2], u[ 4]); + + /* rho pi: b[..] = rotl(a[..], ..) 
*/ + v = s[1]; + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL8(s[19]); + s[19] = ROR8(s[23]); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + #pragma unroll 5 + for(int j=0;j<25;j+=5){ + v=s[j];w=s[j + 1];s[j] = chi(v,w,s[j+2]);s[j+1] = chi(w,s[j+2],s[j+3]);s[j+2]=chi(s[j+2],s[j+3],s[j+4]);s[j+3]=chi(s[j+3],s[j+4],v);s[j+4]=chi(s[j+4],v,w); + } + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; } - keccak_gpu_state[8] = vectorize(0x8000000000000001ULL); - - for (int i=9; i<25; i++) { - keccak_gpu_state[i] = make_uint2(0, 0); + /*theta*/ + #pragma unroll 5 + for(int j=0;j<5;j++){ + t[ j] = xor3x(xor3x(s[j+0],s[j+5],s[j+10]),s[j+15],s[j+20]); } - keccak_block(keccak_gpu_state); - - for(int i=0; i<8; i++) { - inpHash[i] = devectorize(keccak_gpu_state[i]); + /*theta*/ + #pragma unroll 5 + for(int j=0;j<5;j++){ + u[ j] = ROL2(t[ j], 1); } + s[ 9] = xor3x(s[ 9], t[3], u[ 0]); + s[24] = xor3x(s[24], t[3], u[ 0]); + s[ 0] = xor3x(s[ 0], t[4], u[ 1]); + s[10] = xor3x(s[10], t[4], u[ 1]); + s[ 6] = xor3x(s[ 6], t[0], u[ 2]); + s[16] = xor3x(s[16], t[0], u[ 2]); + s[12] = xor3x(s[12], t[1], u[ 3]); + s[22] = xor3x(s[22], t[1], u[ 3]); + s[ 3] = xor3x(s[ 3], t[2], u[ 4]); + s[18] = xor3x(s[18], t[2], u[ 4]); + /* rho pi: b[..] = rotl(a[..], ..) */ + s[ 1] = ROL2(s[ 6], 44); + s[ 2] = ROL2(s[12], 43); + s[ 5] = ROL2(s[ 3], 28); + s[ 7] = ROL2(s[10], 3); + s[ 3] = ROL2(s[18], 21); + s[ 4] = ROL2(s[24], 14); + s[ 6] = ROL2(s[ 9], 20); + s[ 8] = ROL2(s[16], 45); + s[ 9] = ROL2(s[22], 61); + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + v=s[ 0];w=s[ 1];s[ 0] = chi(v,w,s[ 2]);s[ 1] = chi(w,s[ 2],s[ 3]);s[ 2]=chi(s[ 2],s[ 3],s[ 4]);s[ 3]=chi(s[ 3],s[ 4],v);s[ 4]=chi(s[ 4],v,w); + v=s[ 5];w=s[ 6];s[ 5] = chi(v,w,s[ 7]);s[ 6] = chi(w,s[ 7],s[ 8]);s[ 7]=chi(s[ 7],s[ 8],s[ 9]); + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[23]; + + d_hash[0] = *(uint2x4*)&s[0]; + d_hash[1] = *(uint2x4*)&s[4]; + } } -__device__ __forceinline__ -static void keccak_block_v30(uint64_t *s, const uint32_t *in) -{ - size_t i; - uint64_t t[5], u[5], v, w; - - #pragma unroll 9 - for (i = 0; i < 72 / 8; i++, in += 2) - s[i] ^= U32TO64_LE(in); - - for (i = 0; i < 24; i++) { - /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */ - t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20]; - t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21]; - t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22]; - t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23]; - t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24]; - - /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */ - u[0] = t[4] ^ ROTL64(t[1], 1); - u[1] = t[0] ^ ROTL64(t[2], 1); - u[2] = t[1] ^ ROTL64(t[3], 1); - u[3] = t[2] ^ ROTL64(t[4], 1); - u[4] = t[3] ^ ROTL64(t[0], 1); - - /* theta: a[0,i], a[1,i], .. 
a[4,i] ^= d[i] */ - s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0]; - s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1]; - s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2]; - s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3]; - s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4]; - +#if __CUDA_ARCH__ > 500 +__global__ __launch_bounds__(TPB52,6) +#else +__global__ __launch_bounds__(TPB50,3) +#endif +void quark_keccak512_gpu_hash_64_final(uint32_t threads, uint2* g_hash, uint32_t* g_nonceVector, uint32_t *resNonce, const uint64_t target){ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint2 t[5], u[5], v, w; + uint2 s[25]; + if (thread < threads){ + + const uint32_t hashPosition = g_nonceVector[thread]; + + uint2x4* d_hash = (uint2x4 *)&g_hash[hashPosition * 8]; + + #if __CUDA_ARCH__ > 500 + *(uint2x4*)&s[ 0] = __ldg4(&d_hash[ 0]); + *(uint2x4*)&s[ 4] = __ldg4(&d_hash[ 1]); + #else + *(uint2x4*)&s[ 0] = d_hash[ 0]; + *(uint2x4*)&s[ 4] = d_hash[ 1]; + #endif + + s[8] = make_uint2(1,0x80000000); + + /*theta*/ + t[ 0] = vectorize(devectorize(s[ 0])^devectorize(s[ 5])); + t[ 1] = vectorize(devectorize(s[ 1])^devectorize(s[ 6])); + t[ 2] = vectorize(devectorize(s[ 2])^devectorize(s[ 7])); + t[ 3] = vectorize(devectorize(s[ 3])^devectorize(s[ 8])); + t[ 4] = s[4]; + + /*theta*/ + #pragma unroll 5 + for(int j=0;j<5;j++){ + u[ j] = ROL2(t[ j], 1); + } + + s[ 4] = xor3x(s[ 4], t[3], u[ 0]); + s[24] = s[19] = s[14] = s[ 9] = t[ 3] ^ u[ 0]; + + s[ 0] = xor3x(s[ 0], t[4], u[ 1]); + s[ 5] = xor3x(s[ 5], t[4], u[ 1]); + s[20] = s[15] = s[10] = t[4] ^ u[ 1]; + + s[ 1] = xor3x(s[ 1], t[0], u[ 2]); + s[ 6] = xor3x(s[ 6], t[0], u[ 2]); + s[21] = s[16] = s[11] = t[0] ^ u[ 2]; + + s[ 2] = xor3x(s[ 2], t[1], u[ 3]); + s[ 7] = xor3x(s[ 7], t[1], u[ 3]); + s[22] = s[17] = s[12] = t[1] ^ u[ 3]; + + s[ 3] = xor3x(s[ 3], t[2], u[ 4]);s[ 8] = xor3x(s[ 8], t[2], u[ 4]); + s[23] = s[18] = s[13] = t[2] ^ u[ 4]; /* rho pi: b[..] = rotl(a[..], ..) 
*/ - v = s[ 1]; - s[ 1] = ROTL64(s[ 6], 44); - s[ 6] = ROTL64(s[ 9], 20); - s[ 9] = ROTL64(s[22], 61); - s[22] = ROTL64(s[14], 39); - s[14] = ROTL64(s[20], 18); - s[20] = ROTL64(s[ 2], 62); - s[ 2] = ROTL64(s[12], 43); - s[12] = ROTL64(s[13], 25); - s[13] = ROTL64(s[19], 8); - s[19] = ROTL64(s[23], 56); - s[23] = ROTL64(s[15], 41); - s[15] = ROTL64(s[ 4], 27); - s[ 4] = ROTL64(s[24], 14); - s[24] = ROTL64(s[21], 2); - s[21] = ROTL64(s[ 8], 55); - s[ 8] = ROTL64(s[16], 45); - s[16] = ROTL64(s[ 5], 36); - s[ 5] = ROTL64(s[ 3], 28); - s[ 3] = ROTL64(s[18], 21); - s[18] = ROTL64(s[17], 15); - s[17] = ROTL64(s[11], 10); - s[11] = ROTL64(s[ 7], 6); - s[ 7] = ROTL64(s[10], 3); - s[10] = ROTL64( v, 1); - + v = s[1]; + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL8(s[19]); + s[19] = ROR8(s[23]); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(v, 1); /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ - v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w; - v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w; - v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w; - v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w; - v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w; - + #pragma unroll 5 + for(int j=0;j<25;j+=5){ + v=s[j];w=s[j + 1];s[j] = chi(v,w,s[j+2]);s[j+1] = chi(w,s[j+2],s[j+3]);s[j+2]=chi(s[j+2],s[j+3],s[j+4]);s[j+3]=chi(s[j+3],s[j+4],v);s[j+4]=chi(s[j+4],v,w); + } /* iota: a[0,0] ^= round constant */ - s[0] ^= d_keccak_round_constants[i]; + s[0] ^= keccak_round_constants[ 0]; + + #if __CUDA_ARCH__ > 500 + #pragma unroll 4 + #else + #pragma unroll 3 + #endif + for (int i = 1; i < 23; i++) { + /*theta*/ + #pragma unroll 5 + for(int j=0;j<5;j++){ + t[ j] = vectorize(xor5(devectorize(s[ j]),devectorize(s[j+5]),devectorize(s[j+10]),devectorize(s[j+15]),devectorize(s[j+20]))); + } + + /*theta*/ + #pragma unroll 5 + for(int j=0;j<5;j++){ + u[ j] = ROL2(t[ j], 1); + } + s[ 4] = xor3x(s[ 4], t[3], u[ 0]);s[ 9] = xor3x(s[ 9], t[3], u[ 0]);s[14] = xor3x(s[14], t[3], u[ 0]);s[19] = xor3x(s[19], t[3], u[ 0]);s[24] = xor3x(s[24], t[3], u[ 0]); + s[ 0] = xor3x(s[ 0], t[4], u[ 1]);s[ 5] = xor3x(s[ 5], t[4], u[ 1]);s[10] = xor3x(s[10], t[4], u[ 1]);s[15] = xor3x(s[15], t[4], u[ 1]);s[20] = xor3x(s[20], t[4], u[ 1]); + s[ 1] = xor3x(s[ 1], t[0], u[ 2]);s[ 6] = xor3x(s[ 6], t[0], u[ 2]);s[11] = xor3x(s[11], t[0], u[ 2]);s[16] = xor3x(s[16], t[0], u[ 2]);s[21] = xor3x(s[21], t[0], u[ 2]); + s[ 2] = xor3x(s[ 2], t[1], u[ 3]);s[ 7] = xor3x(s[ 7], t[1], u[ 3]);s[12] = xor3x(s[12], t[1], u[ 3]);s[17] = xor3x(s[17], t[1], u[ 3]);s[22] = xor3x(s[22], t[1], u[ 3]); + s[ 3] = xor3x(s[ 3], t[2], u[ 4]);s[ 8] = xor3x(s[ 8], 
t[2], u[ 4]);s[13] = xor3x(s[13], t[2], u[ 4]);s[18] = xor3x(s[18], t[2], u[ 4]);s[23] = xor3x(s[23], t[2], u[ 4]); + + /* rho pi: b[..] = rotl(a[..], ..) */ + v = s[1]; + s[1] = ROL2(s[6], 44); + s[6] = ROL2(s[9], 20); + s[9] = ROL2(s[22], 61); + s[22] = ROL2(s[14], 39); + s[14] = ROL2(s[20], 18); + s[20] = ROL2(s[2], 62); + s[2] = ROL2(s[12], 43); + s[12] = ROL2(s[13], 25); + s[13] = ROL8(s[19]); + s[19] = ROR8(s[23]); + s[23] = ROL2(s[15], 41); + s[15] = ROL2(s[4], 27); + s[4] = ROL2(s[24], 14); + s[24] = ROL2(s[21], 2); + s[21] = ROL2(s[8], 55); + s[8] = ROL2(s[16], 45); + s[16] = ROL2(s[5], 36); + s[5] = ROL2(s[3], 28); + s[3] = ROL2(s[18], 21); + s[18] = ROL2(s[17], 15); + s[17] = ROL2(s[11], 10); + s[11] = ROL2(s[7], 6); + s[7] = ROL2(s[10], 3); + s[10] = ROL2(v, 1); + + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + #pragma unroll 5 + for(int j=0;j<25;j+=5){ + v=s[j];w=s[j + 1];s[j] = chi(v,w,s[j+2]);s[j+1] = chi(w,s[j+2],s[j+3]);s[j+2]=chi(s[j+2],s[j+3],s[j+4]);s[j+3]=chi(s[j+3],s[j+4],v);s[j+4]=chi(s[j+4],v,w); + } + + /* iota: a[0,0] ^= round constant */ + s[0] ^= keccak_round_constants[i]; + } + /*theta*/ + #pragma unroll 5 + for(int j=0;j<5;j++){ + t[ j] = xor3x(xor3x(s[j+0],s[j+5],s[j+10]),s[j+15],s[j+20]); + } + /*theta*/ + u[ 0] = ROL2(t[ 0],1); + u[ 1] = ROL2(t[ 1],1); + s[18] = xor3x(s[18], t[2], ROL2(t[ 4],1)); + s[24] = xor3x(s[24], t[3], u[ 0]); + s[ 0] = xor3x(s[ 0], t[4], u[ 1]); + /* rho pi: b[..] = rotl(a[..], ..) */ + s[ 3] = ROL2(s[18], 21); + s[ 4] = ROL2(s[24], 14); + /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */ + if(devectorize(chi(s[ 3],s[ 4],s[ 0])) <= target){ + const uint32_t tmp = atomicExch(&resNonce[0], hashPosition); + if (tmp != UINT32_MAX) + resNonce[1] = tmp; + } } } -__global__ -void quark_keccak512_gpu_hash_64_v30(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +/* +__host__ +void quark_keccak512_cpu_init(int thr_id, uint32_t threads) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = (g_nonceVector != NULL) ? 
g_nonceVector[thread] : (startNounce + thread);
-
-		off_t hashPosition = nounce - startNounce;
-		uint32_t *inpHash = (uint32_t*)&g_hash[hashPosition * 8];
-
-		uint32_t message[18];
-		#pragma unroll 16
-		for(int i=0;i<16;i++)
-			message[i] = inpHash[i];
-
-		message[16] = 0x01;
-		message[17] = 0x80000000;
-		uint64_t keccak_gpu_state[25];
-		#pragma unroll 25
-		for (int i=0; i<25; i++)
-			keccak_gpu_state[i] = 0;
-
-		keccak_block_v30(keccak_gpu_state, message);
+}
+*/
-		uint32_t hash[16];
-		#pragma unroll 8
-		for (size_t i = 0; i < 64; i += 8) {
-			U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]);
-		}
+__host__
+void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads,uint32_t *d_nonceVector, uint32_t *d_hash)
+{
+	uint32_t tpb = TPB52;
+	int dev_id = device_map[thr_id];
+	if (device_sm[dev_id] <= 500) tpb = TPB50;
+	const dim3 grid((threads + tpb-1)/tpb);
+	const dim3 block(tpb);
-		uint32_t *outpHash = (uint32_t*)&g_hash[hashPosition * 8];
-		#pragma unroll 16
-		for(int i=0; i<16; i++)
-			outpHash[i] = hash[i];
-	}
+	quark_keccak512_gpu_hash_64<<<grid, block>>>(threads, (uint2*)d_hash, d_nonceVector);
 }
 __host__
-void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+void quark_keccak512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_hash, uint64_t target, uint32_t *d_resNonce)
 {
-	const uint32_t threadsperblock = 256;
+	uint32_t tpb = TPB52;
+	int dev_id = device_map[thr_id];
+	if (device_sm[dev_id] <= 500) tpb = TPB50;
+	const dim3 grid((threads + tpb-1)/tpb);
+	const dim3 block(tpb);
+
+	quark_keccak512_gpu_hash_64_final<<<grid, block>>>(threads, (uint2*)d_hash, d_nonceVector, d_resNonce, target);
+}
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
-	int dev_id = device_map[thr_id];
-	if (device_sm[dev_id] >= 320)
-		quark_keccak512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
-	else
-		quark_keccak512_gpu_hash_64_v30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
-	MyStreamSynchronize(NULL, order, thr_id);
-}
 void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
 void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
 void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
+
 __host__
 void quark_keccak512_cpu_init(int thr_id, uint32_t threads)
 {
@@ -264,6 +465,7 @@ void quark_keccak512_cpu_init(int thr_id, uint32_t threads)
 	jackpot_keccak512_cpu_init(thr_id, threads);
 }
+
 __host__
 void keccak512_setBlock_80(int thr_id, uint32_t *endiandata)
 {
@@ -275,3 +477,4 @@ void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint
 {
 	jackpot_keccak512_cpu_hash(thr_id, threads, startNounce, d_hash, 0);
 }
+
diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu
index 7c4d99ab3a..dcab123efd 100644
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@@ -754,6 +754,307 @@ void quark_skein512_gpu_hash_64(const uint32_t threads, const uint32_t startNonc
 	}
 }
+
+__global__ __launch_bounds__(512, 3)
+void quark_skein512_gpu_hash_64_final(const uint32_t threads, uint64_t* g_hash, uint32_t* resNonce, uint64_t target)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	if (thread < threads){
+
+		// Skein
+		uint2 p[8], h[9];
+
+		const uint32_t hashPosition = thread;
+
+		uint64_t *Hash = &g_hash[hashPosition << 3];
+
+		uint2x4 *phash = (uint2x4*)Hash;
+		*(uint2x4*)&p[0] = __ldg4(&phash[0]);
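/* [Editor's note, not part of the patch] p[] holds the 512-bit input as eight
   uint2 words: x is the low and y the high 32 bits of each 64-bit lane, i.e.
   devectorize(w) == ((uint64_t)w.y << 32) | w.x in ccminer's helpers. uint2x4
   groups four such words (32 bytes), so the two __ldg4() loads around this
   note pull the whole 64-byte hash through the read-only (ld.global.nc) cache
   as wide vector accesses. */
+		*(uint2x4*)&p[4] =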
__ldg4(&phash[1]); + + h[0] = p[0]; h[1] = p[1]; h[2] = p[2]; h[3] = p[3]; + h[4] = p[4]; h[5] = p[5]; h[6] = p[6]; h[7] = p[7]; + + p[0] += buffer[0]; p[1] += buffer[1]; p[2] += buffer[2]; p[3] += buffer[3]; + p[4] += buffer[4]; p[5] += buffer[5]; p[6] += buffer[6]; p[7] += buffer[7]; + macro1(); + p[0] += buffer[8]; p[1] += buffer[9]; p[2] += buffer[10]; p[3] += buffer[11]; + p[4] += buffer[12]; p[5] += buffer[13]; p[6] += buffer[14]; p[7] += buffer[15]; + macro2(); + p[0] += buffer[16]; p[1] += buffer[17]; p[2] += buffer[18]; p[3] += buffer[19]; + p[4] += buffer[20]; p[5] += buffer[21]; p[6] += buffer[22]; p[7] += buffer[23]; + macro1(); + p[0] += buffer[24]; p[1] += buffer[25]; p[2] += buffer[26]; p[3] += buffer[27]; + p[4] += buffer[28]; p[5] += buffer[29]; p[6] += buffer[30]; p[7] += buffer[31]; + macro2(); + p[0] += buffer[32]; p[1] += buffer[33]; p[2] += buffer[34]; p[3] += buffer[35]; + p[4] += buffer[36]; p[5] += buffer[37]; p[6] += buffer[38]; p[7] += buffer[39]; + macro1(); + p[0] += buffer[40]; p[1] += buffer[41]; p[2] += buffer[42]; p[3] += buffer[43]; + p[4] += buffer[44]; p[5] += buffer[45]; p[6] += buffer[46]; p[7] += buffer[47]; + macro2(); + p[0] += buffer[48]; p[1] += buffer[49]; p[2] += buffer[50]; p[3] += buffer[51]; + p[4] += buffer[52]; p[5] += buffer[53]; p[6] += buffer[54]; p[7] += buffer[55]; + macro1(); + p[0] += buffer[56]; p[1] += buffer[57]; p[2] += buffer[58]; p[3] += buffer[59]; + p[4] += buffer[60]; p[5] += buffer[61]; p[6] += buffer[62]; p[7] += buffer[63]; + macro2(); + p[0] += buffer[64]; p[1] += buffer[65]; p[2] += buffer[66]; p[3] += buffer[67]; + p[4] += buffer[68]; p[5] += buffer[69]; p[6] += buffer[70]; p[7] += buffer[71]; + macro1(); + p[0] += buffer[72]; p[1] += buffer[73]; p[2] += buffer[74]; p[3] += buffer[75]; + p[4] += buffer[76]; p[5] += buffer[77]; p[6] += buffer[78]; p[7] += buffer[79]; + macro2(); + p[0] += buffer[80]; p[1] += buffer[81]; p[2] += buffer[82]; p[3] += buffer[83]; + p[4] += buffer[84]; p[5] += buffer[85]; p[6] += buffer[86]; p[7] += buffer[87]; + macro1(); + p[0] += buffer[88]; p[1] += buffer[89]; p[2] += buffer[90]; p[3] += buffer[91]; + p[4] += buffer[92]; p[5] += buffer[93]; p[6] += buffer[94]; p[7] += buffer[95]; + macro2(); + p[0] += buffer[96]; p[1] += buffer[97]; p[2] += buffer[98]; p[3] += buffer[99]; + p[4] += buffer[100]; p[5] += buffer[101]; p[6] += buffer[102]; p[7] += buffer[103]; + macro1(); + p[0] += buffer[104]; p[1] += buffer[105]; p[2] += buffer[106]; p[3] += buffer[107]; + p[4] += buffer[108]; p[5] += buffer[109]; p[6] += buffer[110]; p[7] += buffer[111]; + macro2(); + p[0] += make_uint2(0xA9D5C3F4, 0xEABE394C); p[1] += make_uint2(0x1A75B523, 0x991112C7); + p[2] += make_uint2(0x660FCC33, 0xAE18A40B); p[3] += make_uint2(0x98173EC4, 0xCAB2076D); + p[4] += make_uint2(0x749C51CE, 0x4903ADFF); p[5] += make_uint2(0x9746DF43, 0xFD95DE39); + p[6] += make_uint2(0x27C79C0E, 0x8FD19341); p[7] += make_uint2(0xFF352CBF, 0x9A255629); + macro1(); + p[0] += make_uint2(0x1A75B523, 0x991112C7); p[1] += make_uint2(0x660FCC33, 0xAE18A40B); + p[2] += make_uint2(0x98173EC4, 0xCAB2076D); p[3] += make_uint2(0x749C51CE, 0x4903ADFF); + p[4] += make_uint2(0x9746DF03, 0x0D95DE39); p[5] += make_uint2(0x27C79C0E, 0x8FD19341); + p[6] += make_uint2(0xFF352CB1, 0x8A255629); p[7] += make_uint2(0xDF6CA7BF, 0x5DB62599); + macro2(); + p[0] += vectorize(0xAE18A40B660FCC33); p[1] += vectorize(0xcab2076d98173ec4); + p[2] += vectorize(0x4903ADFF749C51CE); p[3] += vectorize(0x0D95DE399746DF03); + p[4] += vectorize(0x8FD1934127C79BCE); p[5] += 
vectorize(0x8A255629FF352CB1); + p[6] += vectorize(0x4DB62599DF6CA7F0); p[7] += vectorize(0xEABE394CA9D5C3F4 + 16); + macro1(); + p[0] += vectorize(0xcab2076d98173ec4); p[1] += vectorize(0x4903ADFF749C51CE); + p[2] += vectorize(0x0D95DE399746DF03); p[3] += vectorize(0x8FD1934127C79BCE); + p[4] += vectorize(0x9A255629FF352CB1); p[5] += vectorize(0x4DB62599DF6CA7F0); + p[6] += vectorize(0xEABE394CA9D5C3F4 + 0x0000000000000040); + p[7] += vectorize(0x991112C71A75B523 + 17); + macro2(); + p[0] += vectorize(0x4903ADFF749C51CE); p[1] += vectorize(0x0D95DE399746DF03); + p[2] += vectorize(0x8FD1934127C79BCE); p[3] += vectorize(0x9A255629FF352CB1); + p[4] += vectorize(0x5DB62599DF6CA7B0); p[5] += vectorize(0xEABE394CA9D5C3F4 + 0x0000000000000040); + p[6] += vectorize(0x891112C71A75B523); p[7] += vectorize(0xAE18A40B660FCC33 + 18); + +#define h0 p[0] +#define h1 p[1] +#define h2 p[2] +#define h3 p[3] +#define h4 p[4] +#define h5 p[5] +#define h6 p[6] +#define h7 p[7] + + h0 ^= h[0]; h1 ^= h[1]; h2 ^= h[2]; h3 ^= h[3]; + h4 ^= h[4]; h5 ^= h[5]; h6 ^= h[6]; h7 ^= h[7]; + + uint2 skein_h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ vectorize(0x1BD11BDAA9FC1A22); + + uint2 hash64[8]; + + hash64[5] = h5 + 8; + + hash64[0] = h0 + h1; + hash64[1] = ROL2(h1, 46) ^ hash64[0]; + hash64[2] = h2 + h3; + hash64[3] = ROL2(h3, 36) ^ hash64[2]; + hash64[4] = h4 + hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] = (h6 + h7 + make_uint2(0, 0xff000000)); + hash64[7] = ROL2(h7, 37) ^ hash64[6]; + hash64[2] += hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4] += hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6] += hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0] += hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4] += hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6] += hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0] += hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2] += hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6] += hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0] += hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2] += hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4] += hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + + hash64[0] += h1; hash64[1] += h2; hash64[2] += h3; hash64[3] += h4; + hash64[4] += h5; + hash64[5] += h6 + make_uint2(0, 0xff000000); + hash64[6] += h7 + vectorize(0xff00000000000008); + hash64[7] += skein_h8 + 1; + macro3(); + hash64[0] += h2; hash64[1] += h3; hash64[2] += h4; hash64[3] += h5; + hash64[4] += h6; + hash64[5] += h7 + vectorize(0xff00000000000008); + hash64[6] += skein_h8 + 8; + hash64[7] += h0 + 2; + macro4(); + hash64[0] = (hash64[0] + h3); hash64[1] = (hash64[1] + h4); + hash64[2] = (hash64[2] + h5); hash64[3] = (hash64[3] + h6); + hash64[4] = (hash64[4] + h7); hash64[5] = (hash64[5] + skein_h8 + 8); + hash64[6] = (hash64[6] + h0 + make_uint2(0, 0xff000000)); + hash64[7] = (hash64[7] + h1 + 3); + macro3(); + hash64[0] = (hash64[0] + h4); hash64[1] = (hash64[1] + h5); + hash64[2] = (hash64[2] + h6); hash64[3] = (hash64[3] + h7); + hash64[4] = (hash64[4] + skein_h8); hash64[5] = (hash64[5] + h0 + make_uint2(0, 0xff000000)); + hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008)); + hash64[7] = (hash64[7] + h2 + 4); + macro4(); + hash64[0] = (hash64[0] + h5); hash64[1] = (hash64[1] + h6); + hash64[2] = (hash64[2] + h7); hash64[3] 
= (hash64[3] + skein_h8); + hash64[4] = (hash64[4] + h0); hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008)); + hash64[6] = (hash64[6] + h2 + 8); hash64[7] = (hash64[7] + h3 + 5); + macro3(); + hash64[0] = (hash64[0] + h6); hash64[1] = (hash64[1] + h7); + hash64[2] = (hash64[2] + skein_h8); hash64[3] = (hash64[3] + h0); + hash64[4] = (hash64[4] + h1); hash64[5] = (hash64[5] + h2 + 8); + hash64[6] = (hash64[6] + h3 + make_uint2(0, 0xff000000)); + hash64[7] = (hash64[7] + h4 + 6); + macro4(); + hash64[0] = (hash64[0] + h7); hash64[1] = (hash64[1] + skein_h8); + hash64[2] = (hash64[2] + h0); hash64[3] = (hash64[3] + h1); + hash64[4] = (hash64[4] + h2); hash64[5] = (hash64[5] + h3 + make_uint2(0, 0xff000000)); + hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008)); + hash64[7] = (hash64[7] + h5 + 7); + macro3(); + hash64[0] = (hash64[0] + skein_h8); hash64[1] = (hash64[1] + h0); + hash64[2] = (hash64[2] + h1); hash64[3] = (hash64[3] + h2); + hash64[4] = (hash64[4] + h3); hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008)); + hash64[6] = (hash64[6] + h5 + 8); hash64[7] = (hash64[7] + h6 + 8); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h0)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h1)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h2)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h3)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h4)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h5) + 8); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h6) + 0xff00000000000000); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h7) + 9); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h1)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h2)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h3)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h4)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h5)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h6) + 0xff00000000000000); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h7) + 0xff00000000000008); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(skein_h8) + 10); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h2)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h3)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h4)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h5)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h6)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h7) + 0xff00000000000008); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(skein_h8) + 8); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h0) + 11); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h3)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h4)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h5)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h6)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h7)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(skein_h8) + 8); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h0) + 0xff00000000000000); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h1) + 12); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + 
devectorize(h4)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h5)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h6)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h7)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(skein_h8)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h0) + 0xff00000000000000); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h1) + 0xff00000000000008); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h2) + 13); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h5)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h6)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h7)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(skein_h8)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h0)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h1) + 0xff00000000000008); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h2) + 8); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h3) + 14); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h6)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h7)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(skein_h8)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h0)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h1)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h2) + 8); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h3) + 0xff00000000000000); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h4) + 15); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h7)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(skein_h8)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h0)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h1)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h2)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h3) + 0xff00000000000000); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h4) + 0xff00000000000008); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h5) + 16); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(skein_h8)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h0)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h1)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h2)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h3)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h4) + 0xff00000000000008); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h5) + 8); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h6) + 17); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h0)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h1)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h2)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h3)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h4)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h5) + 8); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h6) + 0xff00000000000000); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h7) + 18); + +// phash = (uint2x4*)hash64; +// uint2x4 *outpt = (uint2x4*)Hash; +// outpt[0] = 
phash[0];
+//	outpt[1] = phash[1];
+
+#undef h0
+#undef h1
+#undef h2
+#undef h3
+#undef h4
+#undef h5
+#undef h6
+#undef h7
+
+	if (devectorize(hash64[3]) <= target)
+	{
+		const uint32_t tmp = atomicExch(&resNonce[0], hashPosition);
+		if (tmp != UINT32_MAX)
+			resNonce[1] = tmp;
+	}
+	}
+}
+
+
 __host__
 //void quark_skein512_cpu_hash_64(int thr_id,uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_hash)
 void quark_skein512_cpu_hash_64(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
@@ -768,6 +1069,20 @@ void quark_skein512_cpu_hash_64(int thr_id, const uint32_t threads, const uint32
 }
 
+__host__
+void quark_skein512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t *d_hash, uint64_t target, uint32_t *d_resNonce)
+{
+	uint32_t tpb = TPB52;
+	int dev_id = device_map[thr_id];
+
+	if (device_sm[dev_id] <= 500) tpb = TPB50;
+	const dim3 grid((threads + tpb - 1) / tpb);
+	const dim3 block(tpb);
+	quark_skein512_gpu_hash_64_final << <grid, block> > >(threads, (uint64_t*)d_hash, d_resNonce, target);
+}
+
+
+
 // 120 * 8 = 960 ... too big ?
 static __constant__ uint2 c_buffer[120]; // padded message (80 bytes + 72*8 bytes midstate + align)
 
diff --git a/quark/nist5.cu b/quark/nist5.cu
index 25aff74311..ee53f35489 100644
--- a/quark/nist5.cu
+++ b/quark/nist5.cu
@@ -109,7 +109,7 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++;
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
 		*hashes_done = pdata[19] - first_nonce + throughput;
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index 445b1cfebf..c9dbf7102d 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -154,7 +154,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
-		quark_keccak512_cpu_init(thr_id, throughput);
+		//quark_keccak512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_compactTest_cpu_init(thr_id, throughput);
 
@@ -214,7 +214,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 			quark_bmw512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 
 			// this is the unconditional branch for Keccak512
-			quark_keccak512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+			quark_keccak512_cpu_hash_64(thr_id, nrm3, NULL, d_hash[thr_id]); order++;
 
 			// this is the unconditional branch for Skein512
 			quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
@@ -225,7 +225,8 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 				d_branch2Nonces[thr_id], &nrm2,
 				order++);
 
-			quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
+			quark_keccak512_cpu_hash_64(thr_id, nrm1, d_branch1Nonces[thr_id], d_hash[thr_id]); order++;
+
 			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19],
d_branch2Nonces[thr_id], d_hash[thr_id], order++); work->nonces[0] = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); @@ -250,13 +251,15 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce, quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); TRACE("perm2 :"); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; + TRACE("keccak :"); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("skein :"); quark_filter_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); quark_merge_cpu_sm2(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); TRACE("perm3 :"); diff --git a/qubit/deep.cu b/qubit/deep.cu index 0de2a9ce32..72615917a5 100644 --- a/qubit/deep.cu +++ b/qubit/deep.cu @@ -73,7 +73,7 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); qubit_luffa512_cpu_init(thr_id, throughput); - x11_cubehash512_cpu_init(thr_id, throughput); + //x11_cubehash512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput); cuda_check_cpu_init(thr_id, throughput); @@ -91,7 +91,7 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, int order = 0; qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); *hashes_done = pdata[19] - first_nonce + throughput; diff --git a/qubit/qubit.cu b/qubit/qubit.cu index 9520ea367f..660beb16f9 100644 --- a/qubit/qubit.cu +++ b/qubit/qubit.cu @@ -82,7 +82,7 @@ extern "C" int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); qubit_luffa512_cpu_init(thr_id, throughput); - x11_cubehash512_cpu_init(thr_id, throughput); + //x11_cubehash512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput); @@ -105,7 +105,7 @@ extern "C" int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, // Hash with CUDA qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/res/ccminer.rc b/res/ccminer.rc index 7ad908379c..6168e57ab9 100644 --- a/res/ccminer.rc +++ b/res/ccminer.rc @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" // 
VS_VERSION_INFO VERSIONINFO
- FILEVERSION 0,5,0,0
- PRODUCTVERSION 0,5,0,0
+ FILEVERSION 0,5,1,0
+ PRODUCTVERSION 0,5,1,0
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x21L
@@ -76,10 +76,10 @@ BEGIN
 BEGIN
 BLOCK "040904e4"
 BEGIN
- VALUE "FileVersion", "0.5.0"
+ VALUE "FileVersion", "0.5.1"
 VALUE "LegalCopyright", "Copyright (C) 2022"
 VALUE "ProductName", "ccminer-fancyIX"
- VALUE "ProductVersion", "0.5.0"
+ VALUE "ProductVersion", "0.5.1"
 END
 END
 BLOCK "VarFileInfo"
diff --git a/skunk/cuda_skunk_streebog.cu b/skunk/cuda_skunk_streebog.cu
index 36ec7923c0..e1403bfffd 100644
--- a/skunk/cuda_skunk_streebog.cu
+++ b/skunk/cuda_skunk_streebog.cu
@@ -201,6 +201,105 @@ static void GOST_E12(const uint2 shared[8][256],uint2 *const __restrict__ K, uin
 	}
 }
 
+
+#define TPB 256
+__global__
+#if __CUDA_ARCH__ > 500
+__launch_bounds__(TPB, 2)
+#else
+__launch_bounds__(TPB, 3)
+#endif
+void streebog_gpu_hash_64_alexis(uint64_t *g_hash){
+
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint2 buf[8], t[8], temp[8], K0[8], hash[8];
+
+	__shared__ uint2 shared[7][256];
+	shared[0][threadIdx.x] = __ldg(&T02[threadIdx.x]);
+	shared[1][threadIdx.x] = __ldg(&T12[threadIdx.x]);
+	shared[2][threadIdx.x] = __ldg(&T22[threadIdx.x]);
+	shared[3][threadIdx.x] = __ldg(&T32[threadIdx.x]);
+	shared[4][threadIdx.x] = __ldg(&T42[threadIdx.x]);
+	shared[5][threadIdx.x] = __ldg(&T52[threadIdx.x]);
+	shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]);
+	//shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]);
+
+//	if (thread < threads)
+//	{
+	uint64_t* inout = &g_hash[thread<<3];
+
+	*(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]);
+	*(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]);
+
+	__syncthreads();
+
+	#pragma unroll
+	for(int i = 0; i < 8; i++) buf[i] = vectorize(0x74a5d4ce2efc83b3) ^ hash[i];
+
+	#pragma nounroll
+	for(int i = 0; i < 12; i++) {
+		GOST_FS(shared, buf, temp);
+		#pragma unroll
+		for(uint32_t j = 0; j < 8; j++) buf[j] = temp[j] ^ *(uint2*)&precomputed_values[i][j];
+	}
+
+	#pragma unroll
+	for(int j = 0; j < 8; j++) buf[j] ^= hash[j];
+
+	#pragma unroll
+	for(int j = 0; j < 8; j++) K0[j] = buf[j];
+	K0[7].y ^= 0x00020000;
+
+	GOST_FS(shared, K0, t);
+
+	#pragma unroll
+	for(int i = 0; i < 8; i++) K0[i] = t[i];
+
+	t[7].y ^= 0x01000000;
+
+	GOST_E12(shared, K0, t);
+
+	#pragma unroll
+	for(int j = 0; j < 8; j++) buf[j] ^= t[j];
+
+	buf[7].y ^= 0x01000000;
+
+	GOST_FS(shared, buf,K0);
+
+	buf[7].y ^= 0x00020000;
+
+	#pragma unroll
+	for(int j = 0; j < 8; j++) t[j] = K0[j];
+
+	t[7].y ^= 0x00020000;
+
+	GOST_E12(shared, K0, t);
+
+	#pragma unroll
+	for(int j = 0; j < 8; j++) buf[j] ^= t[j];
+
+	GOST_FS(shared, buf,K0); // K = F(h)
+
+	hash[7]+= vectorize(0x0100000000000000);
+
+	#pragma unroll
+	for(int j = 0; j < 8; j++) t[j] = K0[j] ^ hash[j];
+
+	GOST_E12(shared, K0, t);
+
+	*(uint2x4*)&inout[ 0] = *(uint2x4*)&t[ 0] ^ *(uint2x4*)&hash[0] ^ *(uint2x4*)&buf[0];
+	*(uint2x4*)&inout[ 4] = *(uint2x4*)&t[ 4] ^ *(uint2x4*)&hash[4] ^ *(uint2x4*)&buf[4];
+}
+
+__host__
+void streebog_cpu_hash_64_alexis(int thr_id, uint32_t threads, uint32_t *d_hash)
+{
+	dim3 grid((threads + TPB-1) / TPB);
+	dim3 block(TPB);
+
+	streebog_gpu_hash_64_alexis<<<grid, block>>>((uint64_t*)d_hash);
+}
+
 __constant__ uint64_t target64[4];
 
 __host__
@@ -368,4 +467,4 @@ void skunk_cuda_streebog(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_
 	dim3 block(TPB);
 
 	skunk_streebog_gpu_final_64 <<< grid, block >>> ((uint64_t*)d_hash, d_resNonce);
-}
+}
\ No newline at end of file
diff --git a/skunk/skunk.cu b/skunk/skunk.cu
index 
c1add50303..ef6946a133 100644
--- a/skunk/skunk.cu
+++ b/skunk/skunk.cu
@@ -19,7 +19,7 @@ extern "C" {
 // compatibility kernels
 extern void skein512_cpu_setBlock_80(void *pdata);
 extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
-extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x13_fugue512_cpu_free(int thr_id);
@@ -127,7 +127,7 @@ extern "C" int scanhash_skunk(int thr_id, struct work* work, uint32_t max_nonce,
 	int order = 0;
 	if (use_compat_kernels[thr_id]) {
 		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		streebog_sm3_hash_64_final(thr_id, throughput, d_hash[thr_id], d_resNonce[thr_id]);
 	} else {
diff --git a/sph/sha3d.c b/sph/sha3d.c
new file mode 100644
index 0000000000..9adbbec845
--- /dev/null
+++ b/sph/sha3d.c
@@ -0,0 +1,1824 @@
+/* $Id: keccak.c 259 2011-07-19 22:11:27Z tp $ */
+/*
+ * Keccak implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010  Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_sha3d.h"
+
+#ifdef __cplusplus
+extern "C"{
+#endif
+
+/*
+ * Parameters:
+ *
+ *  SPH_KECCAK_64          use a 64-bit type
+ *  SPH_KECCAK_UNROLL      number of loops to unroll (0/undef for full unroll)
+ *  SPH_KECCAK_INTERLEAVE  use bit-interleaving (32-bit type only)
+ *  SPH_KECCAK_NOCOPY      do not copy the state into local variables
+ *
+ * If there is no usable 64-bit type, the code automatically switches
+ * back to the 32-bit implementation.
+ * + * Some tests on an Intel Core2 Q6600 (both 64-bit and 32-bit, 32 kB L1 + * code cache), a PowerPC (G3, 32 kB L1 code cache), an ARM920T core + * (16 kB L1 code cache), and a small MIPS-compatible CPU (Broadcom BCM3302, + * 8 kB L1 code cache), seem to show that the following are optimal: + * + * -- x86, 64-bit: use the 64-bit implementation, unroll 8 rounds, + * do not copy the state; unrolling 2, 6 or all rounds also provides + * near-optimal performance. + * -- x86, 32-bit: use the 32-bit implementation, unroll 6 rounds, + * interleave, do not copy the state. Unrolling 1, 2, 4 or 8 rounds + * also provides near-optimal performance. + * -- PowerPC: use the 64-bit implementation, unroll 8 rounds, + * copy the state. Unrolling 4 or 6 rounds is near-optimal. + * -- ARM: use the 64-bit implementation, unroll 2 or 4 rounds, + * copy the state. + * -- MIPS: use the 64-bit implementation, unroll 2 rounds, copy + * the state. Unrolling only 1 round is also near-optimal. + * + * Also, interleaving does not always yield actual improvements when + * using a 32-bit implementation; in particular when the architecture + * does not offer a native rotation opcode (interleaving replaces one + * 64-bit rotation with two 32-bit rotations, which is a gain only if + * there is a native 32-bit rotation opcode and not a native 64-bit + * rotation opcode; also, interleaving implies a small overhead when + * processing input words). + * + * To sum up: + * -- when possible, use the 64-bit code + * -- exception: on 32-bit x86, use 32-bit code + * -- when using 32-bit code, use interleaving + * -- copy the state, except on x86 + * -- unroll 8 rounds on "big" machine, 2 rounds on "small" machines + */ + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_KECCAK +#define SPH_SMALL_FOOTPRINT_KECCAK 1 +#endif + +/* + * By default, we select the 64-bit implementation if a 64-bit type + * is available, unless a 32-bit x86 is detected. + */ +#if !defined SPH_KECCAK_64 && SPH_64 \ + && !(defined __i386__ || SPH_I386_GCC || SPH_I386_MSVC) +#define SPH_KECCAK_64 1 +#endif + +/* + * If using a 32-bit implementation, we prefer to interleave. + */ +#if !SPH_KECCAK_64 && !defined SPH_KECCAK_INTERLEAVE +#define SPH_KECCAK_INTERLEAVE 1 +#endif + +/* + * Unroll 8 rounds on big systems, 2 rounds on small systems. + */ +#ifndef SPH_KECCAK_UNROLL +#if SPH_SMALL_FOOTPRINT_KECCAK +#define SPH_KECCAK_UNROLL 2 +#else +#define SPH_KECCAK_UNROLL 8 +#endif +#endif + +/* + * We do not want to copy the state to local variables on x86 (32-bit + * and 64-bit alike). 
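+ *
+ * (Editorial note, illustrative only: this knob, like SPH_KECCAK_UNROLL
+ * above, honors any pre-existing definition thanks to the #ifndef guards,
+ * so a build may simply predefine the macros, e.g.
+ *
+ *   #define SPH_KECCAK_UNROLL  2
+ *   #define SPH_KECCAK_NOCOPY  0
+ *
+ * or pass the equivalent -D flags to the compiler, to force the
+ * small-footprint behavior regardless of the autodetection below.)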
+ */ +#ifndef SPH_KECCAK_NOCOPY +#if defined __i386__ || defined __x86_64 || SPH_I386_MSVC || SPH_I386_GCC +#define SPH_KECCAK_NOCOPY 1 +#else +#define SPH_KECCAK_NOCOPY 0 +#endif +#endif + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#if SPH_KECCAK_64 + +static const sph_u64 RC[] = { + SPH_C64(0x0000000000000001), SPH_C64(0x0000000000008082), + SPH_C64(0x800000000000808A), SPH_C64(0x8000000080008000), + SPH_C64(0x000000000000808B), SPH_C64(0x0000000080000001), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008009), + SPH_C64(0x000000000000008A), SPH_C64(0x0000000000000088), + SPH_C64(0x0000000080008009), SPH_C64(0x000000008000000A), + SPH_C64(0x000000008000808B), SPH_C64(0x800000000000008B), + SPH_C64(0x8000000000008089), SPH_C64(0x8000000000008003), + SPH_C64(0x8000000000008002), SPH_C64(0x8000000000000080), + SPH_C64(0x000000000000800A), SPH_C64(0x800000008000000A), + SPH_C64(0x8000000080008081), SPH_C64(0x8000000000008080), + SPH_C64(0x0000000080000001), SPH_C64(0x8000000080008008) +}; + +#if SPH_KECCAK_NOCOPY + +#define a00 (kc->u.wide[ 0]) +#define a10 (kc->u.wide[ 1]) +#define a20 (kc->u.wide[ 2]) +#define a30 (kc->u.wide[ 3]) +#define a40 (kc->u.wide[ 4]) +#define a01 (kc->u.wide[ 5]) +#define a11 (kc->u.wide[ 6]) +#define a21 (kc->u.wide[ 7]) +#define a31 (kc->u.wide[ 8]) +#define a41 (kc->u.wide[ 9]) +#define a02 (kc->u.wide[10]) +#define a12 (kc->u.wide[11]) +#define a22 (kc->u.wide[12]) +#define a32 (kc->u.wide[13]) +#define a42 (kc->u.wide[14]) +#define a03 (kc->u.wide[15]) +#define a13 (kc->u.wide[16]) +#define a23 (kc->u.wide[17]) +#define a33 (kc->u.wide[18]) +#define a43 (kc->u.wide[19]) +#define a04 (kc->u.wide[20]) +#define a14 (kc->u.wide[21]) +#define a24 (kc->u.wide[22]) +#define a34 (kc->u.wide[23]) +#define a44 (kc->u.wide[24]) + +#define DECL_STATE +#define READ_STATE(sc) +#define WRITE_STATE(sc) + +#define INPUT_BUF(size) do { \ + size_t j; \ + for (j = 0; j < (size); j += 8) { \ + kc->u.wide[j >> 3] ^= sph_dec64le_aligned(buf + j); \ + } \ + } while (0) + +#define INPUT_BUF144 INPUT_BUF(144) +#define INPUT_BUF136 INPUT_BUF(136) +#define INPUT_BUF104 INPUT_BUF(104) +#define INPUT_BUF72 INPUT_BUF(72) + +#else + +#define DECL_STATE \ + sph_u64 a00, a01, a02, a03, a04; \ + sph_u64 a10, a11, a12, a13, a14; \ + sph_u64 a20, a21, a22, a23, a24; \ + sph_u64 a30, a31, a32, a33, a34; \ + sph_u64 a40, a41, a42, a43, a44; + +#define READ_STATE(state) do { \ + a00 = (state)->u.wide[ 0]; \ + a10 = (state)->u.wide[ 1]; \ + a20 = (state)->u.wide[ 2]; \ + a30 = (state)->u.wide[ 3]; \ + a40 = (state)->u.wide[ 4]; \ + a01 = (state)->u.wide[ 5]; \ + a11 = (state)->u.wide[ 6]; \ + a21 = (state)->u.wide[ 7]; \ + a31 = (state)->u.wide[ 8]; \ + a41 = (state)->u.wide[ 9]; \ + a02 = (state)->u.wide[10]; \ + a12 = (state)->u.wide[11]; \ + a22 = (state)->u.wide[12]; \ + a32 = (state)->u.wide[13]; \ + a42 = (state)->u.wide[14]; \ + a03 = (state)->u.wide[15]; \ + a13 = (state)->u.wide[16]; \ + a23 = (state)->u.wide[17]; \ + a33 = (state)->u.wide[18]; \ + a43 = (state)->u.wide[19]; \ + a04 = (state)->u.wide[20]; \ + a14 = (state)->u.wide[21]; \ + a24 = (state)->u.wide[22]; \ + a34 = (state)->u.wide[23]; \ + a44 = (state)->u.wide[24]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->u.wide[ 0] = a00; \ + (state)->u.wide[ 1] = a10; \ + (state)->u.wide[ 2] = a20; \ + (state)->u.wide[ 3] = a30; \ + (state)->u.wide[ 4] = a40; \ + (state)->u.wide[ 5] = a01; \ + (state)->u.wide[ 6] = a11; \ + (state)->u.wide[ 7] = a21; \ + (state)->u.wide[ 8] = a31; \ + 
(state)->u.wide[ 9] = a41; \ + (state)->u.wide[10] = a02; \ + (state)->u.wide[11] = a12; \ + (state)->u.wide[12] = a22; \ + (state)->u.wide[13] = a32; \ + (state)->u.wide[14] = a42; \ + (state)->u.wide[15] = a03; \ + (state)->u.wide[16] = a13; \ + (state)->u.wide[17] = a23; \ + (state)->u.wide[18] = a33; \ + (state)->u.wide[19] = a43; \ + (state)->u.wide[20] = a04; \ + (state)->u.wide[21] = a14; \ + (state)->u.wide[22] = a24; \ + (state)->u.wide[23] = a34; \ + (state)->u.wide[24] = a44; \ + } while (0) + +#define INPUT_BUF144 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + a23 ^= sph_dec64le_aligned(buf + 136); \ + } while (0) + +#define INPUT_BUF136 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + } while (0) + +#define INPUT_BUF104 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + a41 ^= sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + } while (0) + +#define INPUT_BUF72 do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + } while (0) + +#define INPUT_BUF(lim) do { \ + a00 ^= sph_dec64le_aligned(buf + 0); \ + a10 ^= sph_dec64le_aligned(buf + 8); \ + a20 ^= sph_dec64le_aligned(buf + 16); \ + a30 ^= sph_dec64le_aligned(buf + 24); \ + a40 ^= sph_dec64le_aligned(buf + 32); \ + a01 ^= sph_dec64le_aligned(buf + 40); \ + a11 ^= sph_dec64le_aligned(buf + 48); \ + a21 ^= sph_dec64le_aligned(buf + 56); \ + a31 ^= sph_dec64le_aligned(buf + 64); \ + if ((lim) == 72) \ + break; \ + a41 ^= 
sph_dec64le_aligned(buf + 72); \ + a02 ^= sph_dec64le_aligned(buf + 80); \ + a12 ^= sph_dec64le_aligned(buf + 88); \ + a22 ^= sph_dec64le_aligned(buf + 96); \ + if ((lim) == 104) \ + break; \ + a32 ^= sph_dec64le_aligned(buf + 104); \ + a42 ^= sph_dec64le_aligned(buf + 112); \ + a03 ^= sph_dec64le_aligned(buf + 120); \ + a13 ^= sph_dec64le_aligned(buf + 128); \ + if ((lim) == 136) \ + break; \ + a23 ^= sph_dec64le_aligned(buf + 136); \ + } while (0) + +#endif + +#define DECL64(x) sph_u64 x +#define MOV64(d, s) (d = s) +#define XOR64(d, a, b) (d = a ^ b) +#define AND64(d, a, b) (d = a & b) +#define OR64(d, a, b) (d = a | b) +#define NOT64(d, s) (d = SPH_T64(~s)) +#define ROL64(d, v, n) (d = SPH_ROTL64(v, n)) +#define XOR64_IOTA XOR64 + +#else + +static const struct { + sph_u32 high, low; +} RC[] = { +#if SPH_KECCAK_INTERLEAVE + { SPH_C32(0x00000000), SPH_C32(0x00000001) }, + { SPH_C32(0x00000089), SPH_C32(0x00000000) }, + { SPH_C32(0x8000008B), SPH_C32(0x00000000) }, + { SPH_C32(0x80008080), SPH_C32(0x00000000) }, + { SPH_C32(0x0000008B), SPH_C32(0x00000001) }, + { SPH_C32(0x00008000), SPH_C32(0x00000001) }, + { SPH_C32(0x80008088), SPH_C32(0x00000001) }, + { SPH_C32(0x80000082), SPH_C32(0x00000001) }, + { SPH_C32(0x0000000B), SPH_C32(0x00000000) }, + { SPH_C32(0x0000000A), SPH_C32(0x00000000) }, + { SPH_C32(0x00008082), SPH_C32(0x00000001) }, + { SPH_C32(0x00008003), SPH_C32(0x00000000) }, + { SPH_C32(0x0000808B), SPH_C32(0x00000001) }, + { SPH_C32(0x8000000B), SPH_C32(0x00000001) }, + { SPH_C32(0x8000008A), SPH_C32(0x00000001) }, + { SPH_C32(0x80000081), SPH_C32(0x00000001) }, + { SPH_C32(0x80000081), SPH_C32(0x00000000) }, + { SPH_C32(0x80000008), SPH_C32(0x00000000) }, + { SPH_C32(0x00000083), SPH_C32(0x00000000) }, + { SPH_C32(0x80008003), SPH_C32(0x00000000) }, + { SPH_C32(0x80008088), SPH_C32(0x00000001) }, + { SPH_C32(0x80000088), SPH_C32(0x00000000) }, + { SPH_C32(0x00008000), SPH_C32(0x00000001) }, + { SPH_C32(0x80008082), SPH_C32(0x00000000) } +#else + { SPH_C32(0x00000000), SPH_C32(0x00000001) }, + { SPH_C32(0x00000000), SPH_C32(0x00008082) }, + { SPH_C32(0x80000000), SPH_C32(0x0000808A) }, + { SPH_C32(0x80000000), SPH_C32(0x80008000) }, + { SPH_C32(0x00000000), SPH_C32(0x0000808B) }, + { SPH_C32(0x00000000), SPH_C32(0x80000001) }, + { SPH_C32(0x80000000), SPH_C32(0x80008081) }, + { SPH_C32(0x80000000), SPH_C32(0x00008009) }, + { SPH_C32(0x00000000), SPH_C32(0x0000008A) }, + { SPH_C32(0x00000000), SPH_C32(0x00000088) }, + { SPH_C32(0x00000000), SPH_C32(0x80008009) }, + { SPH_C32(0x00000000), SPH_C32(0x8000000A) }, + { SPH_C32(0x00000000), SPH_C32(0x8000808B) }, + { SPH_C32(0x80000000), SPH_C32(0x0000008B) }, + { SPH_C32(0x80000000), SPH_C32(0x00008089) }, + { SPH_C32(0x80000000), SPH_C32(0x00008003) }, + { SPH_C32(0x80000000), SPH_C32(0x00008002) }, + { SPH_C32(0x80000000), SPH_C32(0x00000080) }, + { SPH_C32(0x00000000), SPH_C32(0x0000800A) }, + { SPH_C32(0x80000000), SPH_C32(0x8000000A) }, + { SPH_C32(0x80000000), SPH_C32(0x80008081) }, + { SPH_C32(0x80000000), SPH_C32(0x00008080) }, + { SPH_C32(0x00000000), SPH_C32(0x80000001) }, + { SPH_C32(0x80000000), SPH_C32(0x80008008) } +#endif +}; + +#if SPH_KECCAK_INTERLEAVE + +#define INTERLEAVE(xl, xh) do { \ + sph_u32 l, h, t; \ + l = (xl); h = (xh); \ + t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ + t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ + t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ + t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ + t = (l ^ (l >> 4)) & 
SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ + t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ + t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ + t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ + t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ + l ^= t; h ^= t >> 16; \ + (xl) = l; (xh) = h; \ + } while (0) + +#define UNINTERLEAVE(xl, xh) do { \ + sph_u32 l, h, t; \ + l = (xl); h = (xh); \ + t = (l ^ SPH_T32(h << 16)) & SPH_C32(0xFFFF0000); \ + l ^= t; h ^= t >> 16; \ + t = (l ^ (l >> 8)) & SPH_C32(0x0000FF00); l ^= t ^ (t << 8); \ + t = (h ^ (h >> 8)) & SPH_C32(0x0000FF00); h ^= t ^ (t << 8); \ + t = (l ^ (l >> 4)) & SPH_C32(0x00F000F0); l ^= t ^ (t << 4); \ + t = (h ^ (h >> 4)) & SPH_C32(0x00F000F0); h ^= t ^ (t << 4); \ + t = (l ^ (l >> 2)) & SPH_C32(0x0C0C0C0C); l ^= t ^ (t << 2); \ + t = (h ^ (h >> 2)) & SPH_C32(0x0C0C0C0C); h ^= t ^ (t << 2); \ + t = (l ^ (l >> 1)) & SPH_C32(0x22222222); l ^= t ^ (t << 1); \ + t = (h ^ (h >> 1)) & SPH_C32(0x22222222); h ^= t ^ (t << 1); \ + (xl) = l; (xh) = h; \ + } while (0) + +#else + +#define INTERLEAVE(l, h) +#define UNINTERLEAVE(l, h) + +#endif + +#if SPH_KECCAK_NOCOPY + +#define a00l (kc->u.narrow[2 * 0 + 0]) +#define a00h (kc->u.narrow[2 * 0 + 1]) +#define a10l (kc->u.narrow[2 * 1 + 0]) +#define a10h (kc->u.narrow[2 * 1 + 1]) +#define a20l (kc->u.narrow[2 * 2 + 0]) +#define a20h (kc->u.narrow[2 * 2 + 1]) +#define a30l (kc->u.narrow[2 * 3 + 0]) +#define a30h (kc->u.narrow[2 * 3 + 1]) +#define a40l (kc->u.narrow[2 * 4 + 0]) +#define a40h (kc->u.narrow[2 * 4 + 1]) +#define a01l (kc->u.narrow[2 * 5 + 0]) +#define a01h (kc->u.narrow[2 * 5 + 1]) +#define a11l (kc->u.narrow[2 * 6 + 0]) +#define a11h (kc->u.narrow[2 * 6 + 1]) +#define a21l (kc->u.narrow[2 * 7 + 0]) +#define a21h (kc->u.narrow[2 * 7 + 1]) +#define a31l (kc->u.narrow[2 * 8 + 0]) +#define a31h (kc->u.narrow[2 * 8 + 1]) +#define a41l (kc->u.narrow[2 * 9 + 0]) +#define a41h (kc->u.narrow[2 * 9 + 1]) +#define a02l (kc->u.narrow[2 * 10 + 0]) +#define a02h (kc->u.narrow[2 * 10 + 1]) +#define a12l (kc->u.narrow[2 * 11 + 0]) +#define a12h (kc->u.narrow[2 * 11 + 1]) +#define a22l (kc->u.narrow[2 * 12 + 0]) +#define a22h (kc->u.narrow[2 * 12 + 1]) +#define a32l (kc->u.narrow[2 * 13 + 0]) +#define a32h (kc->u.narrow[2 * 13 + 1]) +#define a42l (kc->u.narrow[2 * 14 + 0]) +#define a42h (kc->u.narrow[2 * 14 + 1]) +#define a03l (kc->u.narrow[2 * 15 + 0]) +#define a03h (kc->u.narrow[2 * 15 + 1]) +#define a13l (kc->u.narrow[2 * 16 + 0]) +#define a13h (kc->u.narrow[2 * 16 + 1]) +#define a23l (kc->u.narrow[2 * 17 + 0]) +#define a23h (kc->u.narrow[2 * 17 + 1]) +#define a33l (kc->u.narrow[2 * 18 + 0]) +#define a33h (kc->u.narrow[2 * 18 + 1]) +#define a43l (kc->u.narrow[2 * 19 + 0]) +#define a43h (kc->u.narrow[2 * 19 + 1]) +#define a04l (kc->u.narrow[2 * 20 + 0]) +#define a04h (kc->u.narrow[2 * 20 + 1]) +#define a14l (kc->u.narrow[2 * 21 + 0]) +#define a14h (kc->u.narrow[2 * 21 + 1]) +#define a24l (kc->u.narrow[2 * 22 + 0]) +#define a24h (kc->u.narrow[2 * 22 + 1]) +#define a34l (kc->u.narrow[2 * 23 + 0]) +#define a34h (kc->u.narrow[2 * 23 + 1]) +#define a44l (kc->u.narrow[2 * 24 + 0]) +#define a44h (kc->u.narrow[2 * 24 + 1]) + +#define DECL_STATE +#define READ_STATE(state) +#define WRITE_STATE(state) + +#define INPUT_BUF(size) do { \ + size_t j; \ + for (j = 0; j < (size); j += 8) { \ + sph_u32 tl, th; \ + tl = sph_dec32le_aligned(buf + j + 0); \ + th = sph_dec32le_aligned(buf + j + 4); \ + INTERLEAVE(tl, th); \ + kc->u.narrow[(j >> 2) + 0] ^= tl; \ + 
kc->u.narrow[(j >> 2) + 1] ^= th; \ + } \ + } while (0) + +#define INPUT_BUF144 INPUT_BUF(144) +#define INPUT_BUF136 INPUT_BUF(136) +#define INPUT_BUF104 INPUT_BUF(104) +#define INPUT_BUF72 INPUT_BUF(72) + +#else + +#define DECL_STATE \ + sph_u32 a00l, a00h, a01l, a01h, a02l, a02h, a03l, a03h, a04l, a04h; \ + sph_u32 a10l, a10h, a11l, a11h, a12l, a12h, a13l, a13h, a14l, a14h; \ + sph_u32 a20l, a20h, a21l, a21h, a22l, a22h, a23l, a23h, a24l, a24h; \ + sph_u32 a30l, a30h, a31l, a31h, a32l, a32h, a33l, a33h, a34l, a34h; \ + sph_u32 a40l, a40h, a41l, a41h, a42l, a42h, a43l, a43h, a44l, a44h; + +#define READ_STATE(state) do { \ + a00l = (state)->u.narrow[2 * 0 + 0]; \ + a00h = (state)->u.narrow[2 * 0 + 1]; \ + a10l = (state)->u.narrow[2 * 1 + 0]; \ + a10h = (state)->u.narrow[2 * 1 + 1]; \ + a20l = (state)->u.narrow[2 * 2 + 0]; \ + a20h = (state)->u.narrow[2 * 2 + 1]; \ + a30l = (state)->u.narrow[2 * 3 + 0]; \ + a30h = (state)->u.narrow[2 * 3 + 1]; \ + a40l = (state)->u.narrow[2 * 4 + 0]; \ + a40h = (state)->u.narrow[2 * 4 + 1]; \ + a01l = (state)->u.narrow[2 * 5 + 0]; \ + a01h = (state)->u.narrow[2 * 5 + 1]; \ + a11l = (state)->u.narrow[2 * 6 + 0]; \ + a11h = (state)->u.narrow[2 * 6 + 1]; \ + a21l = (state)->u.narrow[2 * 7 + 0]; \ + a21h = (state)->u.narrow[2 * 7 + 1]; \ + a31l = (state)->u.narrow[2 * 8 + 0]; \ + a31h = (state)->u.narrow[2 * 8 + 1]; \ + a41l = (state)->u.narrow[2 * 9 + 0]; \ + a41h = (state)->u.narrow[2 * 9 + 1]; \ + a02l = (state)->u.narrow[2 * 10 + 0]; \ + a02h = (state)->u.narrow[2 * 10 + 1]; \ + a12l = (state)->u.narrow[2 * 11 + 0]; \ + a12h = (state)->u.narrow[2 * 11 + 1]; \ + a22l = (state)->u.narrow[2 * 12 + 0]; \ + a22h = (state)->u.narrow[2 * 12 + 1]; \ + a32l = (state)->u.narrow[2 * 13 + 0]; \ + a32h = (state)->u.narrow[2 * 13 + 1]; \ + a42l = (state)->u.narrow[2 * 14 + 0]; \ + a42h = (state)->u.narrow[2 * 14 + 1]; \ + a03l = (state)->u.narrow[2 * 15 + 0]; \ + a03h = (state)->u.narrow[2 * 15 + 1]; \ + a13l = (state)->u.narrow[2 * 16 + 0]; \ + a13h = (state)->u.narrow[2 * 16 + 1]; \ + a23l = (state)->u.narrow[2 * 17 + 0]; \ + a23h = (state)->u.narrow[2 * 17 + 1]; \ + a33l = (state)->u.narrow[2 * 18 + 0]; \ + a33h = (state)->u.narrow[2 * 18 + 1]; \ + a43l = (state)->u.narrow[2 * 19 + 0]; \ + a43h = (state)->u.narrow[2 * 19 + 1]; \ + a04l = (state)->u.narrow[2 * 20 + 0]; \ + a04h = (state)->u.narrow[2 * 20 + 1]; \ + a14l = (state)->u.narrow[2 * 21 + 0]; \ + a14h = (state)->u.narrow[2 * 21 + 1]; \ + a24l = (state)->u.narrow[2 * 22 + 0]; \ + a24h = (state)->u.narrow[2 * 22 + 1]; \ + a34l = (state)->u.narrow[2 * 23 + 0]; \ + a34h = (state)->u.narrow[2 * 23 + 1]; \ + a44l = (state)->u.narrow[2 * 24 + 0]; \ + a44h = (state)->u.narrow[2 * 24 + 1]; \ + } while (0) + +#define WRITE_STATE(state) do { \ + (state)->u.narrow[2 * 0 + 0] = a00l; \ + (state)->u.narrow[2 * 0 + 1] = a00h; \ + (state)->u.narrow[2 * 1 + 0] = a10l; \ + (state)->u.narrow[2 * 1 + 1] = a10h; \ + (state)->u.narrow[2 * 2 + 0] = a20l; \ + (state)->u.narrow[2 * 2 + 1] = a20h; \ + (state)->u.narrow[2 * 3 + 0] = a30l; \ + (state)->u.narrow[2 * 3 + 1] = a30h; \ + (state)->u.narrow[2 * 4 + 0] = a40l; \ + (state)->u.narrow[2 * 4 + 1] = a40h; \ + (state)->u.narrow[2 * 5 + 0] = a01l; \ + (state)->u.narrow[2 * 5 + 1] = a01h; \ + (state)->u.narrow[2 * 6 + 0] = a11l; \ + (state)->u.narrow[2 * 6 + 1] = a11h; \ + (state)->u.narrow[2 * 7 + 0] = a21l; \ + (state)->u.narrow[2 * 7 + 1] = a21h; \ + (state)->u.narrow[2 * 8 + 0] = a31l; \ + (state)->u.narrow[2 * 8 + 1] = a31h; \ + (state)->u.narrow[2 * 9 + 0] = a41l; \ + 
(state)->u.narrow[2 * 9 + 1] = a41h; \ + (state)->u.narrow[2 * 10 + 0] = a02l; \ + (state)->u.narrow[2 * 10 + 1] = a02h; \ + (state)->u.narrow[2 * 11 + 0] = a12l; \ + (state)->u.narrow[2 * 11 + 1] = a12h; \ + (state)->u.narrow[2 * 12 + 0] = a22l; \ + (state)->u.narrow[2 * 12 + 1] = a22h; \ + (state)->u.narrow[2 * 13 + 0] = a32l; \ + (state)->u.narrow[2 * 13 + 1] = a32h; \ + (state)->u.narrow[2 * 14 + 0] = a42l; \ + (state)->u.narrow[2 * 14 + 1] = a42h; \ + (state)->u.narrow[2 * 15 + 0] = a03l; \ + (state)->u.narrow[2 * 15 + 1] = a03h; \ + (state)->u.narrow[2 * 16 + 0] = a13l; \ + (state)->u.narrow[2 * 16 + 1] = a13h; \ + (state)->u.narrow[2 * 17 + 0] = a23l; \ + (state)->u.narrow[2 * 17 + 1] = a23h; \ + (state)->u.narrow[2 * 18 + 0] = a33l; \ + (state)->u.narrow[2 * 18 + 1] = a33h; \ + (state)->u.narrow[2 * 19 + 0] = a43l; \ + (state)->u.narrow[2 * 19 + 1] = a43h; \ + (state)->u.narrow[2 * 20 + 0] = a04l; \ + (state)->u.narrow[2 * 20 + 1] = a04h; \ + (state)->u.narrow[2 * 21 + 0] = a14l; \ + (state)->u.narrow[2 * 21 + 1] = a14h; \ + (state)->u.narrow[2 * 22 + 0] = a24l; \ + (state)->u.narrow[2 * 22 + 1] = a24h; \ + (state)->u.narrow[2 * 23 + 0] = a34l; \ + (state)->u.narrow[2 * 23 + 1] = a34h; \ + (state)->u.narrow[2 * 24 + 0] = a44l; \ + (state)->u.narrow[2 * 24 + 1] = a44h; \ + } while (0) + +#define READ64(d, off) do { \ + sph_u32 tl, th; \ + tl = sph_dec32le_aligned(buf + (off)); \ + th = sph_dec32le_aligned(buf + (off) + 4); \ + INTERLEAVE(tl, th); \ + d ## l ^= tl; \ + d ## h ^= th; \ + } while (0) + +#define INPUT_BUF144 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + READ64(a23, 136); \ + } while (0) + +#define INPUT_BUF136 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + } while (0) + +#define INPUT_BUF104 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + } while (0) + +#define INPUT_BUF72 do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + } while (0) + +#define INPUT_BUF(lim) do { \ + READ64(a00, 0); \ + READ64(a10, 8); \ + READ64(a20, 16); \ + READ64(a30, 24); \ + READ64(a40, 32); \ + READ64(a01, 40); \ + READ64(a11, 48); \ + READ64(a21, 56); \ + READ64(a31, 64); \ + if ((lim) == 72) \ + break; \ + READ64(a41, 72); \ + READ64(a02, 80); \ + READ64(a12, 88); \ + READ64(a22, 96); \ + if ((lim) == 104) \ + break; \ + READ64(a32, 104); \ + READ64(a42, 112); \ + READ64(a03, 120); \ + READ64(a13, 128); \ + if ((lim) == 136) \ + break; \ + READ64(a23, 136); \ + } while (0) + +#endif + +#define DECL64(x) sph_u64 x ## l, x ## h +#define MOV64(d, s) (d ## l = s ## l, d ## h = s ## h) +#define 
XOR64(d, a, b) (d ## l = a ## l ^ b ## l, d ## h = a ## h ^ b ## h) +#define AND64(d, a, b) (d ## l = a ## l & b ## l, d ## h = a ## h & b ## h) +#define OR64(d, a, b) (d ## l = a ## l | b ## l, d ## h = a ## h | b ## h) +#define NOT64(d, s) (d ## l = SPH_T32(~s ## l), d ## h = SPH_T32(~s ## h)) +#define ROL64(d, v, n) ROL64_ ## n(d, v) + +#if SPH_KECCAK_INTERLEAVE + +#define ROL64_odd1(d, v) do { \ + sph_u32 tmp; \ + tmp = v ## l; \ + d ## l = SPH_T32(v ## h << 1) | (v ## h >> 31); \ + d ## h = tmp; \ + } while (0) + +#define ROL64_odd63(d, v) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << 31) | (v ## l >> 1); \ + d ## l = v ## h; \ + d ## h = tmp; \ + } while (0) + +#define ROL64_odd(d, v, n) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << (n - 1)) | (v ## l >> (33 - n)); \ + d ## l = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ + d ## h = tmp; \ + } while (0) + +#define ROL64_even(d, v, n) do { \ + d ## l = SPH_T32(v ## l << n) | (v ## l >> (32 - n)); \ + d ## h = SPH_T32(v ## h << n) | (v ## h >> (32 - n)); \ + } while (0) + +#define ROL64_0(d, v) +#define ROL64_1(d, v) ROL64_odd1(d, v) +#define ROL64_2(d, v) ROL64_even(d, v, 1) +#define ROL64_3(d, v) ROL64_odd( d, v, 2) +#define ROL64_4(d, v) ROL64_even(d, v, 2) +#define ROL64_5(d, v) ROL64_odd( d, v, 3) +#define ROL64_6(d, v) ROL64_even(d, v, 3) +#define ROL64_7(d, v) ROL64_odd( d, v, 4) +#define ROL64_8(d, v) ROL64_even(d, v, 4) +#define ROL64_9(d, v) ROL64_odd( d, v, 5) +#define ROL64_10(d, v) ROL64_even(d, v, 5) +#define ROL64_11(d, v) ROL64_odd( d, v, 6) +#define ROL64_12(d, v) ROL64_even(d, v, 6) +#define ROL64_13(d, v) ROL64_odd( d, v, 7) +#define ROL64_14(d, v) ROL64_even(d, v, 7) +#define ROL64_15(d, v) ROL64_odd( d, v, 8) +#define ROL64_16(d, v) ROL64_even(d, v, 8) +#define ROL64_17(d, v) ROL64_odd( d, v, 9) +#define ROL64_18(d, v) ROL64_even(d, v, 9) +#define ROL64_19(d, v) ROL64_odd( d, v, 10) +#define ROL64_20(d, v) ROL64_even(d, v, 10) +#define ROL64_21(d, v) ROL64_odd( d, v, 11) +#define ROL64_22(d, v) ROL64_even(d, v, 11) +#define ROL64_23(d, v) ROL64_odd( d, v, 12) +#define ROL64_24(d, v) ROL64_even(d, v, 12) +#define ROL64_25(d, v) ROL64_odd( d, v, 13) +#define ROL64_26(d, v) ROL64_even(d, v, 13) +#define ROL64_27(d, v) ROL64_odd( d, v, 14) +#define ROL64_28(d, v) ROL64_even(d, v, 14) +#define ROL64_29(d, v) ROL64_odd( d, v, 15) +#define ROL64_30(d, v) ROL64_even(d, v, 15) +#define ROL64_31(d, v) ROL64_odd( d, v, 16) +#define ROL64_32(d, v) ROL64_even(d, v, 16) +#define ROL64_33(d, v) ROL64_odd( d, v, 17) +#define ROL64_34(d, v) ROL64_even(d, v, 17) +#define ROL64_35(d, v) ROL64_odd( d, v, 18) +#define ROL64_36(d, v) ROL64_even(d, v, 18) +#define ROL64_37(d, v) ROL64_odd( d, v, 19) +#define ROL64_38(d, v) ROL64_even(d, v, 19) +#define ROL64_39(d, v) ROL64_odd( d, v, 20) +#define ROL64_40(d, v) ROL64_even(d, v, 20) +#define ROL64_41(d, v) ROL64_odd( d, v, 21) +#define ROL64_42(d, v) ROL64_even(d, v, 21) +#define ROL64_43(d, v) ROL64_odd( d, v, 22) +#define ROL64_44(d, v) ROL64_even(d, v, 22) +#define ROL64_45(d, v) ROL64_odd( d, v, 23) +#define ROL64_46(d, v) ROL64_even(d, v, 23) +#define ROL64_47(d, v) ROL64_odd( d, v, 24) +#define ROL64_48(d, v) ROL64_even(d, v, 24) +#define ROL64_49(d, v) ROL64_odd( d, v, 25) +#define ROL64_50(d, v) ROL64_even(d, v, 25) +#define ROL64_51(d, v) ROL64_odd( d, v, 26) +#define ROL64_52(d, v) ROL64_even(d, v, 26) +#define ROL64_53(d, v) ROL64_odd( d, v, 27) +#define ROL64_54(d, v) ROL64_even(d, v, 27) +#define ROL64_55(d, v) ROL64_odd( d, v, 28) +#define ROL64_56(d, v) 
ROL64_even(d, v, 28) +#define ROL64_57(d, v) ROL64_odd( d, v, 29) +#define ROL64_58(d, v) ROL64_even(d, v, 29) +#define ROL64_59(d, v) ROL64_odd( d, v, 30) +#define ROL64_60(d, v) ROL64_even(d, v, 30) +#define ROL64_61(d, v) ROL64_odd( d, v, 31) +#define ROL64_62(d, v) ROL64_even(d, v, 31) +#define ROL64_63(d, v) ROL64_odd63(d, v) + +#else + +#define ROL64_small(d, v, n) do { \ + sph_u32 tmp; \ + tmp = SPH_T32(v ## l << n) | (v ## h >> (32 - n)); \ + d ## h = SPH_T32(v ## h << n) | (v ## l >> (32 - n)); \ + d ## l = tmp; \ + } while (0) + +#define ROL64_0(d, v) 0 +#define ROL64_1(d, v) ROL64_small(d, v, 1) +#define ROL64_2(d, v) ROL64_small(d, v, 2) +#define ROL64_3(d, v) ROL64_small(d, v, 3) +#define ROL64_4(d, v) ROL64_small(d, v, 4) +#define ROL64_5(d, v) ROL64_small(d, v, 5) +#define ROL64_6(d, v) ROL64_small(d, v, 6) +#define ROL64_7(d, v) ROL64_small(d, v, 7) +#define ROL64_8(d, v) ROL64_small(d, v, 8) +#define ROL64_9(d, v) ROL64_small(d, v, 9) +#define ROL64_10(d, v) ROL64_small(d, v, 10) +#define ROL64_11(d, v) ROL64_small(d, v, 11) +#define ROL64_12(d, v) ROL64_small(d, v, 12) +#define ROL64_13(d, v) ROL64_small(d, v, 13) +#define ROL64_14(d, v) ROL64_small(d, v, 14) +#define ROL64_15(d, v) ROL64_small(d, v, 15) +#define ROL64_16(d, v) ROL64_small(d, v, 16) +#define ROL64_17(d, v) ROL64_small(d, v, 17) +#define ROL64_18(d, v) ROL64_small(d, v, 18) +#define ROL64_19(d, v) ROL64_small(d, v, 19) +#define ROL64_20(d, v) ROL64_small(d, v, 20) +#define ROL64_21(d, v) ROL64_small(d, v, 21) +#define ROL64_22(d, v) ROL64_small(d, v, 22) +#define ROL64_23(d, v) ROL64_small(d, v, 23) +#define ROL64_24(d, v) ROL64_small(d, v, 24) +#define ROL64_25(d, v) ROL64_small(d, v, 25) +#define ROL64_26(d, v) ROL64_small(d, v, 26) +#define ROL64_27(d, v) ROL64_small(d, v, 27) +#define ROL64_28(d, v) ROL64_small(d, v, 28) +#define ROL64_29(d, v) ROL64_small(d, v, 29) +#define ROL64_30(d, v) ROL64_small(d, v, 30) +#define ROL64_31(d, v) ROL64_small(d, v, 31) + +#define ROL64_32(d, v) do { \ + sph_u32 tmp; \ + tmp = v ## l; \ + d ## l = v ## h; \ + d ## h = tmp; \ + } while (0) + +#define ROL64_big(d, v, n) do { \ + sph_u32 trl, trh; \ + ROL64_small(tr, v, n); \ + d ## h = trl; \ + d ## l = trh; \ + } while (0) + +#define ROL64_33(d, v) ROL64_big(d, v, 1) +#define ROL64_34(d, v) ROL64_big(d, v, 2) +#define ROL64_35(d, v) ROL64_big(d, v, 3) +#define ROL64_36(d, v) ROL64_big(d, v, 4) +#define ROL64_37(d, v) ROL64_big(d, v, 5) +#define ROL64_38(d, v) ROL64_big(d, v, 6) +#define ROL64_39(d, v) ROL64_big(d, v, 7) +#define ROL64_40(d, v) ROL64_big(d, v, 8) +#define ROL64_41(d, v) ROL64_big(d, v, 9) +#define ROL64_42(d, v) ROL64_big(d, v, 10) +#define ROL64_43(d, v) ROL64_big(d, v, 11) +#define ROL64_44(d, v) ROL64_big(d, v, 12) +#define ROL64_45(d, v) ROL64_big(d, v, 13) +#define ROL64_46(d, v) ROL64_big(d, v, 14) +#define ROL64_47(d, v) ROL64_big(d, v, 15) +#define ROL64_48(d, v) ROL64_big(d, v, 16) +#define ROL64_49(d, v) ROL64_big(d, v, 17) +#define ROL64_50(d, v) ROL64_big(d, v, 18) +#define ROL64_51(d, v) ROL64_big(d, v, 19) +#define ROL64_52(d, v) ROL64_big(d, v, 20) +#define ROL64_53(d, v) ROL64_big(d, v, 21) +#define ROL64_54(d, v) ROL64_big(d, v, 22) +#define ROL64_55(d, v) ROL64_big(d, v, 23) +#define ROL64_56(d, v) ROL64_big(d, v, 24) +#define ROL64_57(d, v) ROL64_big(d, v, 25) +#define ROL64_58(d, v) ROL64_big(d, v, 26) +#define ROL64_59(d, v) ROL64_big(d, v, 27) +#define ROL64_60(d, v) ROL64_big(d, v, 28) +#define ROL64_61(d, v) ROL64_big(d, v, 29) +#define ROL64_62(d, v) ROL64_big(d, v, 30) 
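+
+/*
+ * (Editorial note: ROL64_33..ROL64_63 pass the rotation count minus 32
+ * to ROL64_big, which rotates by that reduced count via ROL64_small and
+ * then swaps the two 32-bit halves, since rotating a 64-bit word by
+ * exactly 32 is just a half swap.)
+ */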
+#define ROL64_63(d, v) ROL64_big(d, v, 31) + +#endif + +#define XOR64_IOTA(d, s, k) \ + (d ## l = s ## l ^ k.low, d ## h = s ## h ^ k.high) + +#endif + +#define TH_ELT(t, c0, c1, c2, c3, c4, d0, d1, d2, d3, d4) do { \ + DECL64(tt0); \ + DECL64(tt1); \ + DECL64(tt2); \ + DECL64(tt3); \ + XOR64(tt0, d0, d1); \ + XOR64(tt1, d2, d3); \ + XOR64(tt0, tt0, d4); \ + XOR64(tt0, tt0, tt1); \ + ROL64(tt0, tt0, 1); \ + XOR64(tt2, c0, c1); \ + XOR64(tt3, c2, c3); \ + XOR64(tt0, tt0, c4); \ + XOR64(tt2, tt2, tt3); \ + XOR64(t, tt0, tt2); \ + } while (0) + +#define THETA(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(t0); \ + DECL64(t1); \ + DECL64(t2); \ + DECL64(t3); \ + DECL64(t4); \ + TH_ELT(t0, b40, b41, b42, b43, b44, b10, b11, b12, b13, b14); \ + TH_ELT(t1, b00, b01, b02, b03, b04, b20, b21, b22, b23, b24); \ + TH_ELT(t2, b10, b11, b12, b13, b14, b30, b31, b32, b33, b34); \ + TH_ELT(t3, b20, b21, b22, b23, b24, b40, b41, b42, b43, b44); \ + TH_ELT(t4, b30, b31, b32, b33, b34, b00, b01, b02, b03, b04); \ + XOR64(b00, b00, t0); \ + XOR64(b01, b01, t0); \ + XOR64(b02, b02, t0); \ + XOR64(b03, b03, t0); \ + XOR64(b04, b04, t0); \ + XOR64(b10, b10, t1); \ + XOR64(b11, b11, t1); \ + XOR64(b12, b12, t1); \ + XOR64(b13, b13, t1); \ + XOR64(b14, b14, t1); \ + XOR64(b20, b20, t2); \ + XOR64(b21, b21, t2); \ + XOR64(b22, b22, t2); \ + XOR64(b23, b23, t2); \ + XOR64(b24, b24, t2); \ + XOR64(b30, b30, t3); \ + XOR64(b31, b31, t3); \ + XOR64(b32, b32, t3); \ + XOR64(b33, b33, t3); \ + XOR64(b34, b34, t3); \ + XOR64(b40, b40, t4); \ + XOR64(b41, b41, t4); \ + XOR64(b42, b42, t4); \ + XOR64(b43, b43, t4); \ + XOR64(b44, b44, t4); \ + } while (0) + +#define RHO(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + /* ROL64(b00, b00, 0); */ \ + ROL64(b01, b01, 36); \ + ROL64(b02, b02, 3); \ + ROL64(b03, b03, 41); \ + ROL64(b04, b04, 18); \ + ROL64(b10, b10, 1); \ + ROL64(b11, b11, 44); \ + ROL64(b12, b12, 10); \ + ROL64(b13, b13, 45); \ + ROL64(b14, b14, 2); \ + ROL64(b20, b20, 62); \ + ROL64(b21, b21, 6); \ + ROL64(b22, b22, 43); \ + ROL64(b23, b23, 15); \ + ROL64(b24, b24, 61); \ + ROL64(b30, b30, 28); \ + ROL64(b31, b31, 55); \ + ROL64(b32, b32, 25); \ + ROL64(b33, b33, 21); \ + ROL64(b34, b34, 56); \ + ROL64(b40, b40, 27); \ + ROL64(b41, b41, 20); \ + ROL64(b42, b42, 39); \ + ROL64(b43, b43, 8); \ + ROL64(b44, b44, 14); \ + } while (0) + +/* + * The KHI macro integrates the "lane complement" optimization. On input, + * some words are complemented: + * a00 a01 a02 a04 a13 a20 a21 a22 a30 a33 a34 a43 + * On output, the following words are complemented: + * a04 a10 a20 a22 a23 a31 + * + * The (implicit) permutation and the theta expansion will bring back + * the input mask for the next round. 
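+ *
+ * For example, if the state holds cn = ~c, the chi term
+ * d = a ^ (~b & c) can be produced in complemented form as
+ * ~d = a ^ (b | cn), since ~b & c = ~(b | ~c) and
+ * x ^ ~y = ~(x ^ y). That a ^ (b | c) shape is what KHI_XO
+ * computes (KHI_XA covers the a ^ (b & c) shape), so a single
+ * NOT64 per plane (bnn below) is enough.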
+ */ + +#define KHI_XO(d, a, b, c) do { \ + DECL64(kt); \ + OR64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#define KHI_XA(d, a, b, c) do { \ + DECL64(kt); \ + AND64(kt, b, c); \ + XOR64(d, a, kt); \ + } while (0) + +#define KHI(b00, b01, b02, b03, b04, b10, b11, b12, b13, b14, \ + b20, b21, b22, b23, b24, b30, b31, b32, b33, b34, \ + b40, b41, b42, b43, b44) \ + do { \ + DECL64(c0); \ + DECL64(c1); \ + DECL64(c2); \ + DECL64(c3); \ + DECL64(c4); \ + DECL64(bnn); \ + NOT64(bnn, b20); \ + KHI_XO(c0, b00, b10, b20); \ + KHI_XO(c1, b10, bnn, b30); \ + KHI_XA(c2, b20, b30, b40); \ + KHI_XO(c3, b30, b40, b00); \ + KHI_XA(c4, b40, b00, b10); \ + MOV64(b00, c0); \ + MOV64(b10, c1); \ + MOV64(b20, c2); \ + MOV64(b30, c3); \ + MOV64(b40, c4); \ + NOT64(bnn, b41); \ + KHI_XO(c0, b01, b11, b21); \ + KHI_XA(c1, b11, b21, b31); \ + KHI_XO(c2, b21, b31, bnn); \ + KHI_XO(c3, b31, b41, b01); \ + KHI_XA(c4, b41, b01, b11); \ + MOV64(b01, c0); \ + MOV64(b11, c1); \ + MOV64(b21, c2); \ + MOV64(b31, c3); \ + MOV64(b41, c4); \ + NOT64(bnn, b32); \ + KHI_XO(c0, b02, b12, b22); \ + KHI_XA(c1, b12, b22, b32); \ + KHI_XA(c2, b22, bnn, b42); \ + KHI_XO(c3, bnn, b42, b02); \ + KHI_XA(c4, b42, b02, b12); \ + MOV64(b02, c0); \ + MOV64(b12, c1); \ + MOV64(b22, c2); \ + MOV64(b32, c3); \ + MOV64(b42, c4); \ + NOT64(bnn, b33); \ + KHI_XA(c0, b03, b13, b23); \ + KHI_XO(c1, b13, b23, b33); \ + KHI_XO(c2, b23, bnn, b43); \ + KHI_XA(c3, bnn, b43, b03); \ + KHI_XO(c4, b43, b03, b13); \ + MOV64(b03, c0); \ + MOV64(b13, c1); \ + MOV64(b23, c2); \ + MOV64(b33, c3); \ + MOV64(b43, c4); \ + NOT64(bnn, b14); \ + KHI_XA(c0, b04, bnn, b24); \ + KHI_XO(c1, bnn, b24, b34); \ + KHI_XA(c2, b24, b34, b44); \ + KHI_XO(c3, b34, b44, b04); \ + KHI_XA(c4, b44, b04, b14); \ + MOV64(b04, c0); \ + MOV64(b14, c1); \ + MOV64(b24, c2); \ + MOV64(b34, c3); \ + MOV64(b44, c4); \ + } while (0) + +#define IOTA(r) XOR64_IOTA(a00, a00, r) + +#define P0 a00, a01, a02, a03, a04, a10, a11, a12, a13, a14, a20, a21, \ + a22, a23, a24, a30, a31, a32, a33, a34, a40, a41, a42, a43, a44 +#define P1 a00, a30, a10, a40, a20, a11, a41, a21, a01, a31, a22, a02, \ + a32, a12, a42, a33, a13, a43, a23, a03, a44, a24, a04, a34, a14 +#define P2 a00, a33, a11, a44, a22, a41, a24, a02, a30, a13, a32, a10, \ + a43, a21, a04, a23, a01, a34, a12, a40, a14, a42, a20, a03, a31 +#define P3 a00, a23, a41, a14, a32, a24, a42, a10, a33, a01, a43, a11, \ + a34, a02, a20, a12, a30, a03, a21, a44, a31, a04, a22, a40, a13 +#define P4 a00, a12, a24, a31, a43, a42, a04, a11, a23, a30, a34, a41, \ + a03, a10, a22, a21, a33, a40, a02, a14, a13, a20, a32, a44, a01 +#define P5 a00, a21, a42, a13, a34, a04, a20, a41, a12, a33, a03, a24, \ + a40, a11, a32, a02, a23, a44, a10, a31, a01, a22, a43, a14, a30 +#define P6 a00, a02, a04, a01, a03, a20, a22, a24, a21, a23, a40, a42, \ + a44, a41, a43, a10, a12, a14, a11, a13, a30, a32, a34, a31, a33 +#define P7 a00, a10, a20, a30, a40, a22, a32, a42, a02, a12, a44, a04, \ + a14, a24, a34, a11, a21, a31, a41, a01, a33, a43, a03, a13, a23 +#define P8 a00, a11, a22, a33, a44, a32, a43, a04, a10, a21, a14, a20, \ + a31, a42, a03, a41, a02, a13, a24, a30, a23, a34, a40, a01, a12 +#define P9 a00, a41, a32, a23, a14, a43, a34, a20, a11, a02, a31, a22, \ + a13, a04, a40, a24, a10, a01, a42, a33, a12, a03, a44, a30, a21 +#define P10 a00, a24, a43, a12, a31, a34, a03, a22, a41, a10, a13, a32, \ + a01, a20, a44, a42, a11, a30, a04, a23, a21, a40, a14, a33, a02 +#define P11 a00, a42, a34, a21, a13, a03, a40, a32, a24, a11, a01, a43, \ + a30, a22, a14, a04, 
a41, a33, a20, a12, a02, a44, a31, a23, a10 +#define P12 a00, a04, a03, a02, a01, a40, a44, a43, a42, a41, a30, a34, \ + a33, a32, a31, a20, a24, a23, a22, a21, a10, a14, a13, a12, a11 +#define P13 a00, a20, a40, a10, a30, a44, a14, a34, a04, a24, a33, a03, \ + a23, a43, a13, a22, a42, a12, a32, a02, a11, a31, a01, a21, a41 +#define P14 a00, a22, a44, a11, a33, a14, a31, a03, a20, a42, a23, a40, \ + a12, a34, a01, a32, a04, a21, a43, a10, a41, a13, a30, a02, a24 +#define P15 a00, a32, a14, a41, a23, a31, a13, a40, a22, a04, a12, a44, \ + a21, a03, a30, a43, a20, a02, a34, a11, a24, a01, a33, a10, a42 +#define P16 a00, a43, a31, a24, a12, a13, a01, a44, a32, a20, a21, a14, \ + a02, a40, a33, a34, a22, a10, a03, a41, a42, a30, a23, a11, a04 +#define P17 a00, a34, a13, a42, a21, a01, a30, a14, a43, a22, a02, a31, \ + a10, a44, a23, a03, a32, a11, a40, a24, a04, a33, a12, a41, a20 +#define P18 a00, a03, a01, a04, a02, a30, a33, a31, a34, a32, a10, a13, \ + a11, a14, a12, a40, a43, a41, a44, a42, a20, a23, a21, a24, a22 +#define P19 a00, a40, a30, a20, a10, a33, a23, a13, a03, a43, a11, a01, \ + a41, a31, a21, a44, a34, a24, a14, a04, a22, a12, a02, a42, a32 +#define P20 a00, a44, a33, a22, a11, a23, a12, a01, a40, a34, a41, a30, \ + a24, a13, a02, a14, a03, a42, a31, a20, a32, a21, a10, a04, a43 +#define P21 a00, a14, a23, a32, a41, a12, a21, a30, a44, a03, a24, a33, \ + a42, a01, a10, a31, a40, a04, a13, a22, a43, a02, a11, a20, a34 +#define P22 a00, a31, a12, a43, a24, a21, a02, a33, a14, a40, a42, a23, \ + a04, a30, a11, a13, a44, a20, a01, a32, a34, a10, a41, a22, a03 +#define P23 a00, a13, a21, a34, a42, a02, a10, a23, a31, a44, a04, a12, \ + a20, a33, a41, a01, a14, a22, a30, a43, a03, a11, a24, a32, a40 + +#define P1_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a30); \ + MOV64(a30, a33); \ + MOV64(a33, a23); \ + MOV64(a23, a12); \ + MOV64(a12, a21); \ + MOV64(a21, a02); \ + MOV64(a02, a10); \ + MOV64(a10, a11); \ + MOV64(a11, a41); \ + MOV64(a41, a24); \ + MOV64(a24, a42); \ + MOV64(a42, a04); \ + MOV64(a04, a20); \ + MOV64(a20, a22); \ + MOV64(a22, a32); \ + MOV64(a32, a43); \ + MOV64(a43, a34); \ + MOV64(a34, a03); \ + MOV64(a03, a40); \ + MOV64(a40, a44); \ + MOV64(a44, a14); \ + MOV64(a14, a31); \ + MOV64(a31, a13); \ + MOV64(a13, t); \ + } while (0) + +#define P2_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a33); \ + MOV64(a33, a12); \ + MOV64(a12, a02); \ + MOV64(a02, a11); \ + MOV64(a11, a24); \ + MOV64(a24, a04); \ + MOV64(a04, a22); \ + MOV64(a22, a43); \ + MOV64(a43, a03); \ + MOV64(a03, a44); \ + MOV64(a44, a31); \ + MOV64(a31, t); \ + MOV64(t, a10); \ + MOV64(a10, a41); \ + MOV64(a41, a42); \ + MOV64(a42, a20); \ + MOV64(a20, a32); \ + MOV64(a32, a34); \ + MOV64(a34, a40); \ + MOV64(a40, a14); \ + MOV64(a14, a13); \ + MOV64(a13, a30); \ + MOV64(a30, a23); \ + MOV64(a23, a21); \ + MOV64(a21, t); \ + } while (0) + +#define P4_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a12); \ + MOV64(a12, a11); \ + MOV64(a11, a04); \ + MOV64(a04, a43); \ + MOV64(a43, a44); \ + MOV64(a44, t); \ + MOV64(t, a02); \ + MOV64(a02, a24); \ + MOV64(a24, a22); \ + MOV64(a22, a03); \ + MOV64(a03, a31); \ + MOV64(a31, a33); \ + MOV64(a33, t); \ + MOV64(t, a10); \ + MOV64(a10, a42); \ + MOV64(a42, a32); \ + MOV64(a32, a40); \ + MOV64(a40, a13); \ + MOV64(a13, a23); \ + MOV64(a23, t); \ + MOV64(t, a14); \ + MOV64(a14, a30); \ + MOV64(a30, a21); \ + MOV64(a21, a41); \ + MOV64(a41, a20); \ + MOV64(a20, a34); \ + MOV64(a34, t); \ + } while (0) + +#define P6_TO_P0 
do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a02); \ + MOV64(a02, a04); \ + MOV64(a04, a03); \ + MOV64(a03, t); \ + MOV64(t, a10); \ + MOV64(a10, a20); \ + MOV64(a20, a40); \ + MOV64(a40, a30); \ + MOV64(a30, t); \ + MOV64(t, a11); \ + MOV64(a11, a22); \ + MOV64(a22, a44); \ + MOV64(a44, a33); \ + MOV64(a33, t); \ + MOV64(t, a12); \ + MOV64(a12, a24); \ + MOV64(a24, a43); \ + MOV64(a43, a31); \ + MOV64(a31, t); \ + MOV64(t, a13); \ + MOV64(a13, a21); \ + MOV64(a21, a42); \ + MOV64(a42, a34); \ + MOV64(a34, t); \ + MOV64(t, a14); \ + MOV64(a14, a23); \ + MOV64(a23, a41); \ + MOV64(a41, a32); \ + MOV64(a32, t); \ + } while (0) + +#define P8_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a11); \ + MOV64(a11, a43); \ + MOV64(a43, t); \ + MOV64(t, a02); \ + MOV64(a02, a22); \ + MOV64(a22, a31); \ + MOV64(a31, t); \ + MOV64(t, a03); \ + MOV64(a03, a33); \ + MOV64(a33, a24); \ + MOV64(a24, t); \ + MOV64(t, a04); \ + MOV64(a04, a44); \ + MOV64(a44, a12); \ + MOV64(a12, t); \ + MOV64(t, a10); \ + MOV64(a10, a32); \ + MOV64(a32, a13); \ + MOV64(a13, t); \ + MOV64(t, a14); \ + MOV64(a14, a21); \ + MOV64(a21, a20); \ + MOV64(a20, t); \ + MOV64(t, a23); \ + MOV64(a23, a42); \ + MOV64(a42, a40); \ + MOV64(a40, t); \ + MOV64(t, a30); \ + MOV64(a30, a41); \ + MOV64(a41, a34); \ + MOV64(a34, t); \ + } while (0) + +#define P12_TO_P0 do { \ + DECL64(t); \ + MOV64(t, a01); \ + MOV64(a01, a04); \ + MOV64(a04, t); \ + MOV64(t, a02); \ + MOV64(a02, a03); \ + MOV64(a03, t); \ + MOV64(t, a10); \ + MOV64(a10, a40); \ + MOV64(a40, t); \ + MOV64(t, a11); \ + MOV64(a11, a44); \ + MOV64(a44, t); \ + MOV64(t, a12); \ + MOV64(a12, a43); \ + MOV64(a43, t); \ + MOV64(t, a13); \ + MOV64(a13, a42); \ + MOV64(a42, t); \ + MOV64(t, a14); \ + MOV64(a14, a41); \ + MOV64(a41, t); \ + MOV64(t, a20); \ + MOV64(a20, a30); \ + MOV64(a30, t); \ + MOV64(t, a21); \ + MOV64(a21, a34); \ + MOV64(a34, t); \ + MOV64(t, a22); \ + MOV64(a22, a33); \ + MOV64(a33, t); \ + MOV64(t, a23); \ + MOV64(a23, a32); \ + MOV64(a32, t); \ + MOV64(t, a24); \ + MOV64(a24, a31); \ + MOV64(a31, t); \ + } while (0) + +#define LPAR ( +#define RPAR ) + +#define KF_ELT(r, s, k) do { \ + THETA LPAR P ## r RPAR; \ + RHO LPAR P ## r RPAR; \ + KHI LPAR P ## s RPAR; \ + IOTA(k); \ + } while (0) + +#define DO(x) x + +#define KECCAK_F_1600 DO(KECCAK_F_1600_) + +#if SPH_KECCAK_UNROLL == 1 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j ++) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + P1_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 2 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 2) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + P2_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 4 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 4) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + P4_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 6 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 6) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + P6_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 8 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 8) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j 
+ 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + KF_ELT( 6, 7, RC[j + 6]); \ + KF_ELT( 7, 8, RC[j + 7]); \ + P8_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 12 + +#define KECCAK_F_1600_ do { \ + int j; \ + for (j = 0; j < 24; j += 12) { \ + KF_ELT( 0, 1, RC[j + 0]); \ + KF_ELT( 1, 2, RC[j + 1]); \ + KF_ELT( 2, 3, RC[j + 2]); \ + KF_ELT( 3, 4, RC[j + 3]); \ + KF_ELT( 4, 5, RC[j + 4]); \ + KF_ELT( 5, 6, RC[j + 5]); \ + KF_ELT( 6, 7, RC[j + 6]); \ + KF_ELT( 7, 8, RC[j + 7]); \ + KF_ELT( 8, 9, RC[j + 8]); \ + KF_ELT( 9, 10, RC[j + 9]); \ + KF_ELT(10, 11, RC[j + 10]); \ + KF_ELT(11, 12, RC[j + 11]); \ + P12_TO_P0; \ + } \ + } while (0) + +#elif SPH_KECCAK_UNROLL == 0 + +#define KECCAK_F_1600_ do { \ + KF_ELT( 0, 1, RC[ 0]); \ + KF_ELT( 1, 2, RC[ 1]); \ + KF_ELT( 2, 3, RC[ 2]); \ + KF_ELT( 3, 4, RC[ 3]); \ + KF_ELT( 4, 5, RC[ 4]); \ + KF_ELT( 5, 6, RC[ 5]); \ + KF_ELT( 6, 7, RC[ 6]); \ + KF_ELT( 7, 8, RC[ 7]); \ + KF_ELT( 8, 9, RC[ 8]); \ + KF_ELT( 9, 10, RC[ 9]); \ + KF_ELT(10, 11, RC[10]); \ + KF_ELT(11, 12, RC[11]); \ + KF_ELT(12, 13, RC[12]); \ + KF_ELT(13, 14, RC[13]); \ + KF_ELT(14, 15, RC[14]); \ + KF_ELT(15, 16, RC[15]); \ + KF_ELT(16, 17, RC[16]); \ + KF_ELT(17, 18, RC[17]); \ + KF_ELT(18, 19, RC[18]); \ + KF_ELT(19, 20, RC[19]); \ + KF_ELT(20, 21, RC[20]); \ + KF_ELT(21, 22, RC[21]); \ + KF_ELT(22, 23, RC[22]); \ + KF_ELT(23, 0, RC[23]); \ + } while (0) + +#else + +#error Unimplemented unroll count for Keccak. + +#endif + +static void +keccak_init(sph_keccak_context *kc, unsigned out_size) +{ + int i; + +#if SPH_KECCAK_64 + for (i = 0; i < 25; i ++) + kc->u.wide[i] = 0; + /* + * Initialization for the "lane complement". + */ + kc->u.wide[ 1] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[ 2] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[ 8] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[12] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[17] = SPH_C64(0xFFFFFFFFFFFFFFFF); + kc->u.wide[20] = SPH_C64(0xFFFFFFFFFFFFFFFF); +#else + + for (i = 0; i < 50; i ++) + kc->u.narrow[i] = 0; + /* + * Initialization for the "lane complement". + * Note: since we set to all-one full 64-bit words, + * interleaving (if applicable) is a no-op. 
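+ *
+ * In the narrow layout, 64-bit lane i occupies narrow[2 * i] and
+ * narrow[2 * i + 1], so the assignments below complement lanes
+ * 1, 2, 8, 12, 17 and 20, matching the wide branch above.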
+ */ + kc->u.narrow[ 2] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 3] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 4] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[ 5] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[16] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[17] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[24] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[25] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[34] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[35] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[40] = SPH_C32(0xFFFFFFFF); + kc->u.narrow[41] = SPH_C32(0xFFFFFFFF); +#endif + kc->ptr = 0; + kc->lim = 200 - (out_size >> 2); +} + +static void +keccak_core(sph_keccak_context *kc, const void *data, size_t len, size_t lim) +{ + unsigned char *buf; + size_t ptr; + DECL_STATE + + buf = kc->buf; + ptr = kc->ptr; + + if (len < (lim - ptr)) { + memcpy(buf + ptr, data, len); + kc->ptr = ptr + len; + return; + } + + READ_STATE(kc); + while (len > 0) { + size_t clen; + + clen = (lim - ptr); + if (clen > len) + clen = len; + memcpy(buf + ptr, data, clen); + ptr += clen; + data = (const unsigned char *)data + clen; + len -= clen; + if (ptr == lim) { + INPUT_BUF(lim); + KECCAK_F_1600; + ptr = 0; + } + } + WRITE_STATE(kc); + kc->ptr = ptr; +} + +#if SPH_KECCAK_64 + +#define DEFCLOSE(d, lim) \ + static void keccak_close ## d( \ + sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ + { \ + unsigned eb; \ + union { \ + unsigned char tmp[lim + 1]; \ + sph_u64 dummy; /* for alignment */ \ + } u; \ + size_t j; \ + \ + eb = 0x6; \ + if (kc->ptr == (lim - 1)) { \ + if (n == 7) { \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, lim - 1); \ + u.tmp[lim] = 0x80; \ + j = 1 + lim; \ + } else { \ + u.tmp[0] = eb | 0x80; \ + j = 1; \ + } \ + } else { \ + j = lim - kc->ptr; \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, j - 2); \ + u.tmp[j - 1] = 0x80; \ + } \ + keccak_core(kc, u.tmp, j, lim); \ + /* Finalize the "lane complement" */ \ + kc->u.wide[ 1] = ~kc->u.wide[ 1]; \ + kc->u.wide[ 2] = ~kc->u.wide[ 2]; \ + kc->u.wide[ 8] = ~kc->u.wide[ 8]; \ + kc->u.wide[12] = ~kc->u.wide[12]; \ + kc->u.wide[17] = ~kc->u.wide[17]; \ + kc->u.wide[20] = ~kc->u.wide[20]; \ + for (j = 0; j < d; j += 8) \ + sph_enc64le_aligned(u.tmp + j, kc->u.wide[j >> 3]); \ + memcpy(dst, u.tmp, d); \ + keccak_init(kc, (unsigned)d << 3); \ + } \ + +#else + +#define DEFCLOSE(d, lim) \ + static void keccak_close ## d( \ + sph_keccak_context *kc, unsigned ub, unsigned n, void *dst) \ + { \ + unsigned eb; \ + union { \ + unsigned char tmp[lim + 1]; \ + sph_u64 dummy; /* for alignment */ \ + } u; \ + size_t j; \ + \ + eb = 0x6; /* SHA-3 domain padding, matching the 64-bit branch */ \ + if (kc->ptr == (lim - 1)) { \ + if (n == 7) { \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, lim - 1); \ + u.tmp[lim] = 0x80; \ + j = 1 + lim; \ + } else { \ + u.tmp[0] = eb | 0x80; \ + j = 1; \ + } \ + } else { \ + j = lim - kc->ptr; \ + u.tmp[0] = eb; \ + memset(u.tmp + 1, 0, j - 2); \ + u.tmp[j - 1] = 0x80; \ + } \ + keccak_core(kc, u.tmp, j, lim); \ + /* Finalize the "lane complement" */ \ + kc->u.narrow[ 2] = ~kc->u.narrow[ 2]; \ + kc->u.narrow[ 3] = ~kc->u.narrow[ 3]; \ + kc->u.narrow[ 4] = ~kc->u.narrow[ 4]; \ + kc->u.narrow[ 5] = ~kc->u.narrow[ 5]; \ + kc->u.narrow[16] = ~kc->u.narrow[16]; \ + kc->u.narrow[17] = ~kc->u.narrow[17]; \ + kc->u.narrow[24] = ~kc->u.narrow[24]; \ + kc->u.narrow[25] = ~kc->u.narrow[25]; \ + kc->u.narrow[34] = ~kc->u.narrow[34]; \ + kc->u.narrow[35] = ~kc->u.narrow[35]; \ + kc->u.narrow[40] = ~kc->u.narrow[40]; \ + kc->u.narrow[41] = ~kc->u.narrow[41]; \ + /* un-interleave */ \ + for (j = 0; j < 50; j += 2) \ +
UNINTERLEAVE(kc->u.narrow[j], kc->u.narrow[j + 1]); \ + for (j = 0; j < d; j += 4) \ + sph_enc32le_aligned(u.tmp + j, kc->u.narrow[j >> 2]); \ + memcpy(dst, u.tmp, d); \ + keccak_init(kc, (unsigned)d << 3); \ + } \ + +#endif + +DEFCLOSE(28, 144) +DEFCLOSE(32, 136) +DEFCLOSE(48, 104) +DEFCLOSE(64, 72) + +/* see sph_keccak.h */ +void +sph_sha3d224_init(void *cc) +{ + keccak_init(cc, 224); +} + +/* see sph_keccak.h */ +void +sph_sha3d224(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 144); +} + +/* see sph_keccak.h */ +void +sph_sha3d224_close(void *cc, void *dst) +{ + sph_sha3d224_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_sha3d224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close28(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_sha3d256_init(void *cc) +{ + keccak_init(cc, 256); +} + +/* see sph_keccak.h */ +void +sph_sha3d256(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 136); +} + +/* see sph_keccak.h */ +void +sph_sha3d256_close(void *cc, void *dst) +{ + sph_sha3d256_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_sha3d256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close32(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_sha3d384_init(void *cc) +{ + keccak_init(cc, 384); +} + +/* see sph_keccak.h */ +void +sph_sha3d384(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 104); +} + +/* see sph_keccak.h */ +void +sph_sha3d384_close(void *cc, void *dst) +{ + sph_sha3d384_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_sha3d384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close48(cc, ub, n, dst); +} + +/* see sph_keccak.h */ +void +sph_sha3d512_init(void *cc) +{ + keccak_init(cc, 512); +} + +/* see sph_keccak.h */ +void +sph_sha3d512(void *cc, const void *data, size_t len) +{ + keccak_core(cc, data, len, 72); +} + +/* see sph_keccak.h */ +void +sph_sha3d512_close(void *cc, void *dst) +{ + sph_sha3d512_addbits_and_close(cc, 0, 0, dst); +} + +/* see sph_keccak.h */ +void +sph_sha3d512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + keccak_close64(cc, ub, n, dst); +} + + +#ifdef __cplusplus +} +#endif diff --git a/sph/sph_sha3d.h b/sph/sph_sha3d.h new file mode 100644 index 0000000000..46d5ca84c8 --- /dev/null +++ b/sph/sph_sha3d.h @@ -0,0 +1,293 @@ +/* $Id: sph_keccak.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * Keccak interface. This is the interface for Keccak with the + * recommended parameters for SHA-3, with output lengths 224, 256, + * 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_sha3d.h + * @author Thomas Pornin + */ + +#ifndef SPH_SHA3D_H__ +#define SPH_SHA3D_H__ + +#ifdef __cplusplus +extern "C"{ +#endif + +#include <stddef.h> +#include "sph_types.h" + +/** + * Output size (in bits) for Keccak-224. + */ +#define SPH_SIZE_keccak224 224 + +/** + * Output size (in bits) for Keccak-256. + */ +#define SPH_SIZE_keccak256 256 + +/** + * Output size (in bits) for Keccak-384. + */ +#define SPH_SIZE_keccak384 384 + +/** + * Output size (in bits) for Keccak-512. + */ +#define SPH_SIZE_keccak512 512 + +/** + * This structure is a context for Keccak computations: it contains the + * intermediate values and some data from the last entered block. Once a + * Keccak computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running Keccak computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[144]; /* first field, for alignment */ + size_t ptr, lim; + union { +#if SPH_64 + sph_u64 wide[25]; +#endif + sph_u32 narrow[50]; + } u; +#endif +} sph_keccak_context; + +/** + * Type for a Keccak-224 context (identical to the common context). + */ +typedef sph_keccak_context sph_sha3d224_context; + +/** + * Type for a Keccak-256 context (identical to the common context). + */ +typedef sph_keccak_context sph_sha3d256_context; + +/** + * Type for a Keccak-384 context (identical to the common context). + */ +typedef sph_keccak_context sph_sha3d384_context; + +/** + * Type for a Keccak-512 context (identical to the common context). + */ +typedef sph_keccak_context sph_sha3d512_context; + +/** + * Initialize a Keccak-224 context. This process performs no memory allocation. + * + * @param cc the Keccak-224 context (pointer to a + * sph_sha3d224_context) + */ +void sph_sha3d224_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-224 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_sha3d224(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-224 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (28 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-224 context + * @param dst the destination buffer + */ +void sph_sha3d224_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (28 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level); for example, with n = 1 the extra bit is bit 7 of ub, so + * ub = 0x80 appends a single 1 bit.
The context is automatically reinitialized. + * + * @param cc the Keccak-224 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_sha3d224_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-256 context. This process performs no memory allocation. + * + * @param cc the Keccak-256 context (pointer to a + * sph_sha3d256_context) + */ +void sph_sha3d256_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-256 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_sha3d256(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-256 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (32 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-256 context + * @param dst the destination buffer + */ +void sph_sha3d256_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (32 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-256 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_sha3d256_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-384 context. This process performs no memory allocation. + * + * @param cc the Keccak-384 context (pointer to a + * sph_sha3d384_context) + */ +void sph_sha3d384_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-384 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_sha3d384(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-384 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (48 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-384 context + * @param dst the destination buffer + */ +void sph_sha3d384_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (48 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-384 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_sha3d384_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +/** + * Initialize a Keccak-512 context. This process performs no memory allocation.
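+ *
+ * Minimal usage sketch (data and len stand for the caller's
+ * input; the sketch is illustrative only). Because each close
+ * reinitializes the context, two passes can be chained on a
+ * single context, e.g. for a double-hash construction:
+ *
+ *   sph_sha3d512_context cc;
+ *   unsigned char h[64];
+ *   sph_sha3d512_init(&cc);
+ *   sph_sha3d512(&cc, data, len);
+ *   sph_sha3d512_close(&cc, h);
+ *   sph_sha3d512(&cc, h, sizeof h);
+ *   sph_sha3d512_close(&cc, h);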
+ * + * @param cc the Keccak-512 context (pointer to a + * sph_sha3d512_context) + */ +void sph_sha3d512_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Keccak-512 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_sha3d512(void *cc, const void *data, size_t len); + +/** + * Terminate the current Keccak-512 computation and output the result into + * the provided buffer. The destination buffer must be wide enough to + * accommodate the result (64 bytes). The context is automatically + * reinitialized. + * + * @param cc the Keccak-512 context + * @param dst the destination buffer + */ +void sph_sha3d512_close(void *cc, void *dst); + +/** + * Add a few additional bits (0 to 7) to the current computation, then + * terminate it and output the result in the provided buffer, which must + * be wide enough to accommodate the result (64 bytes). If bit number i + * in ub has value 2^i, then the extra bits are those + * numbered 7 down to 8-n (this is the big-endian convention at the byte + * level). The context is automatically reinitialized. + * + * @param cc the Keccak-512 context + * @param ub the extra bits + * @param n the number of extra bits (0 to 7) + * @param dst the destination buffer + */ +void sph_sha3d512_addbits_and_close( + void *cc, unsigned ub, unsigned n, void *dst); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/sph/sph_tiger.h b/sph/sph_tiger.h new file mode 100644 index 0000000000..1a58685329 --- /dev/null +++ b/sph/sph_tiger.h @@ -0,0 +1,191 @@ +/* $Id: sph_tiger.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * Tiger / Tiger-2 interface. + * + * Tiger has been published in: R. Anderson, E. Biham, "Tiger: A Fast + * New Hash Function", Fast Software Encryption - FSE'96, LNCS 1039, + * Springer (1996), pp. 89--97. + * + * Tiger2 has never been formally published, but it was described as + * identical to Tiger, except for the padding which is the same in + * Tiger2 as it is in MD4. Fortunately, an implementation of Tiger2 + * was submitted to NESSIE, which produced test vectors; the sphlib + * implementation of Tiger2 is compatible with the NESSIE test vectors. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * + * ===========================(LICENSE END)============================= + * + * @file sph_tiger.h + * @author Thomas Pornin + */ + +#ifndef SPH_TIGER_H__ +#define SPH_TIGER_H__ + +#include <stddef.h> +#include "sph_types.h" + +#if SPH_64 + +/** + * Output size (in bits) for Tiger. + */ +#define SPH_SIZE_tiger 192 + +/** + * Output size (in bits) for Tiger2. + */ +#define SPH_SIZE_tiger2 192 + +/** + * This structure is a context for Tiger computations: it contains the + * intermediate values and some data from the last entered block. Once + * a Tiger computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running Tiger computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + sph_u64 val[3]; + sph_u64 count; +#endif +} sph_tiger_context; + +/** + * Initialize a Tiger context. This process performs no memory allocation. + * + * @param cc the Tiger context (pointer to + * a sph_tiger_context) + */ +void sph_tiger_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the Tiger context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_tiger(void *cc, const void *data, size_t len); + +/** + * Terminate the current Tiger computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accommodate the result (24 bytes). The context is automatically + * reinitialized. + * + * @param cc the Tiger context + * @param dst the destination buffer + */ +void sph_tiger_close(void *cc, void *dst); + +/** + * Apply the Tiger compression function on the provided data. The + * msg parameter contains the 8 64-bit input blocks, + * as numerical values (hence after the little-endian decoding). The + * val parameter contains the 3 64-bit input blocks for + * the compression function; the output is written in place in this + * array. + * + * @param msg the message block (8 values) + * @param val the function 192-bit input and output + */ +void sph_tiger_comp(const sph_u64 msg[8], sph_u64 val[3]); + +/** + * This structure is a context for Tiger2 computations. It is identical + * to the Tiger context, and they may be freely exchanged, since the + * difference between Tiger and Tiger2 resides solely in the padding, which + * is computed only in the last computation step. + */ +typedef sph_tiger_context sph_tiger2_context; + +#ifdef DOXYGEN_IGNORE +/** + * Initialize a Tiger2 context. This function is identical to + * sph_tiger_init(). + * + * @param cc the Tiger2 context (pointer to + * a sph_tiger2_context) + */ +void sph_tiger2_init(void *cc); +#endif + +#ifndef DOXYGEN_IGNORE +#define sph_tiger2_init sph_tiger_init +#endif + +#ifdef DOXYGEN_IGNORE +/** + * Process some data bytes. This function is identical to + * sph_tiger(). + * + * @param cc the Tiger2 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_tiger2(void *cc, const void *data, size_t len); +#endif + +#ifndef DOXYGEN_IGNORE +#define sph_tiger2 sph_tiger +#endif + +/** + * Terminate the current Tiger2 computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accommodate the result (24 bytes). The context is automatically + * reinitialized.
Note that this function is NOT identical to + * sph_tiger_close(): this is the exact and unique point + * where Tiger and Tiger2 differ. + * + * @param cc the Tiger context + * @param dst the destination buffer + */ +void sph_tiger2_close(void *cc, void *dst); + +#ifdef DOXYGEN_IGNORE +/** + * Apply the Tiger2 compression function, which is identical to the Tiger + * compression function. + * + * @param msg the message block (8 values) + * @param val the function 192-bit input and output + */ +void sph_tiger2_comp(const sph_u64 msg[8], sph_u64 val[3]); +#endif + +#ifndef DOXYGEN_IGNORE +#define sph_tiger2_comp sph_tiger_comp +#endif + +#endif + +#endif \ No newline at end of file diff --git a/sph/tiger.c b/sph/tiger.c new file mode 100644 index 0000000000..f22fcb9716 --- /dev/null +++ b/sph/tiger.c @@ -0,0 +1,697 @@ +/* $Id: tiger.c 216 2010-06-08 09:46:57Z tp $ */ +/* + * Tiger / Tiger2 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_tiger.h" + +#if SPH_64 + +static const sph_u64 T1[256] = { + SPH_C64(0x02AAB17CF7E90C5E), SPH_C64(0xAC424B03E243A8EC), + SPH_C64(0x72CD5BE30DD5FCD3), SPH_C64(0x6D019B93F6F97F3A), + SPH_C64(0xCD9978FFD21F9193), SPH_C64(0x7573A1C9708029E2), + SPH_C64(0xB164326B922A83C3), SPH_C64(0x46883EEE04915870), + SPH_C64(0xEAACE3057103ECE6), SPH_C64(0xC54169B808A3535C), + SPH_C64(0x4CE754918DDEC47C), SPH_C64(0x0AA2F4DFDC0DF40C), + SPH_C64(0x10B76F18A74DBEFA), SPH_C64(0xC6CCB6235AD1AB6A), + SPH_C64(0x13726121572FE2FF), SPH_C64(0x1A488C6F199D921E), + SPH_C64(0x4BC9F9F4DA0007CA), SPH_C64(0x26F5E6F6E85241C7), + SPH_C64(0x859079DBEA5947B6), SPH_C64(0x4F1885C5C99E8C92), + SPH_C64(0xD78E761EA96F864B), SPH_C64(0x8E36428C52B5C17D), + SPH_C64(0x69CF6827373063C1), SPH_C64(0xB607C93D9BB4C56E), + SPH_C64(0x7D820E760E76B5EA), SPH_C64(0x645C9CC6F07FDC42), + SPH_C64(0xBF38A078243342E0), SPH_C64(0x5F6B343C9D2E7D04), + SPH_C64(0xF2C28AEB600B0EC6), SPH_C64(0x6C0ED85F7254BCAC), + SPH_C64(0x71592281A4DB4FE5), SPH_C64(0x1967FA69CE0FED9F), + SPH_C64(0xFD5293F8B96545DB), SPH_C64(0xC879E9D7F2A7600B), + SPH_C64(0x860248920193194E), SPH_C64(0xA4F9533B2D9CC0B3), + SPH_C64(0x9053836C15957613), SPH_C64(0xDB6DCF8AFC357BF1), + SPH_C64(0x18BEEA7A7A370F57), SPH_C64(0x037117CA50B99066), + SPH_C64(0x6AB30A9774424A35), SPH_C64(0xF4E92F02E325249B), + SPH_C64(0x7739DB07061CCAE1), SPH_C64(0xD8F3B49CECA42A05), + SPH_C64(0xBD56BE3F51382F73), SPH_C64(0x45FAED5843B0BB28), + SPH_C64(0x1C813D5C11BF1F83), SPH_C64(0x8AF0E4B6D75FA169), + SPH_C64(0x33EE18A487AD9999), SPH_C64(0x3C26E8EAB1C94410), + SPH_C64(0xB510102BC0A822F9), SPH_C64(0x141EEF310CE6123B), + SPH_C64(0xFC65B90059DDB154), SPH_C64(0xE0158640C5E0E607), + SPH_C64(0x884E079826C3A3CF), SPH_C64(0x930D0D9523C535FD), + SPH_C64(0x35638D754E9A2B00), SPH_C64(0x4085FCCF40469DD5), + SPH_C64(0xC4B17AD28BE23A4C), SPH_C64(0xCAB2F0FC6A3E6A2E), + SPH_C64(0x2860971A6B943FCD), SPH_C64(0x3DDE6EE212E30446), + SPH_C64(0x6222F32AE01765AE), SPH_C64(0x5D550BB5478308FE), + SPH_C64(0xA9EFA98DA0EDA22A), SPH_C64(0xC351A71686C40DA7), + SPH_C64(0x1105586D9C867C84), SPH_C64(0xDCFFEE85FDA22853), + SPH_C64(0xCCFBD0262C5EEF76), SPH_C64(0xBAF294CB8990D201), + SPH_C64(0xE69464F52AFAD975), SPH_C64(0x94B013AFDF133E14), + SPH_C64(0x06A7D1A32823C958), SPH_C64(0x6F95FE5130F61119), + SPH_C64(0xD92AB34E462C06C0), SPH_C64(0xED7BDE33887C71D2), + SPH_C64(0x79746D6E6518393E), SPH_C64(0x5BA419385D713329), + SPH_C64(0x7C1BA6B948A97564), SPH_C64(0x31987C197BFDAC67), + SPH_C64(0xDE6C23C44B053D02), SPH_C64(0x581C49FED002D64D), + SPH_C64(0xDD474D6338261571), SPH_C64(0xAA4546C3E473D062), + SPH_C64(0x928FCE349455F860), SPH_C64(0x48161BBACAAB94D9), + SPH_C64(0x63912430770E6F68), SPH_C64(0x6EC8A5E602C6641C), + SPH_C64(0x87282515337DDD2B), SPH_C64(0x2CDA6B42034B701B), + SPH_C64(0xB03D37C181CB096D), SPH_C64(0xE108438266C71C6F), + SPH_C64(0x2B3180C7EB51B255), SPH_C64(0xDF92B82F96C08BBC), + SPH_C64(0x5C68C8C0A632F3BA), SPH_C64(0x5504CC861C3D0556), + SPH_C64(0xABBFA4E55FB26B8F), SPH_C64(0x41848B0AB3BACEB4), + SPH_C64(0xB334A273AA445D32), SPH_C64(0xBCA696F0A85AD881), + SPH_C64(0x24F6EC65B528D56C), SPH_C64(0x0CE1512E90F4524A), + SPH_C64(0x4E9DD79D5506D35A), SPH_C64(0x258905FAC6CE9779), + SPH_C64(0x2019295B3E109B33), SPH_C64(0xF8A9478B73A054CC), + SPH_C64(0x2924F2F934417EB0), SPH_C64(0x3993357D536D1BC4), + SPH_C64(0x38A81AC21DB6FF8B), SPH_C64(0x47C4FBF17D6016BF), + SPH_C64(0x1E0FAADD7667E3F5), 
SPH_C64(0x7ABCFF62938BEB96), + SPH_C64(0xA78DAD948FC179C9), SPH_C64(0x8F1F98B72911E50D), + SPH_C64(0x61E48EAE27121A91), SPH_C64(0x4D62F7AD31859808), + SPH_C64(0xECEBA345EF5CEAEB), SPH_C64(0xF5CEB25EBC9684CE), + SPH_C64(0xF633E20CB7F76221), SPH_C64(0xA32CDF06AB8293E4), + SPH_C64(0x985A202CA5EE2CA4), SPH_C64(0xCF0B8447CC8A8FB1), + SPH_C64(0x9F765244979859A3), SPH_C64(0xA8D516B1A1240017), + SPH_C64(0x0BD7BA3EBB5DC726), SPH_C64(0xE54BCA55B86ADB39), + SPH_C64(0x1D7A3AFD6C478063), SPH_C64(0x519EC608E7669EDD), + SPH_C64(0x0E5715A2D149AA23), SPH_C64(0x177D4571848FF194), + SPH_C64(0xEEB55F3241014C22), SPH_C64(0x0F5E5CA13A6E2EC2), + SPH_C64(0x8029927B75F5C361), SPH_C64(0xAD139FABC3D6E436), + SPH_C64(0x0D5DF1A94CCF402F), SPH_C64(0x3E8BD948BEA5DFC8), + SPH_C64(0xA5A0D357BD3FF77E), SPH_C64(0xA2D12E251F74F645), + SPH_C64(0x66FD9E525E81A082), SPH_C64(0x2E0C90CE7F687A49), + SPH_C64(0xC2E8BCBEBA973BC5), SPH_C64(0x000001BCE509745F), + SPH_C64(0x423777BBE6DAB3D6), SPH_C64(0xD1661C7EAEF06EB5), + SPH_C64(0xA1781F354DAACFD8), SPH_C64(0x2D11284A2B16AFFC), + SPH_C64(0xF1FC4F67FA891D1F), SPH_C64(0x73ECC25DCB920ADA), + SPH_C64(0xAE610C22C2A12651), SPH_C64(0x96E0A810D356B78A), + SPH_C64(0x5A9A381F2FE7870F), SPH_C64(0xD5AD62EDE94E5530), + SPH_C64(0xD225E5E8368D1427), SPH_C64(0x65977B70C7AF4631), + SPH_C64(0x99F889B2DE39D74F), SPH_C64(0x233F30BF54E1D143), + SPH_C64(0x9A9675D3D9A63C97), SPH_C64(0x5470554FF334F9A8), + SPH_C64(0x166ACB744A4F5688), SPH_C64(0x70C74CAAB2E4AEAD), + SPH_C64(0xF0D091646F294D12), SPH_C64(0x57B82A89684031D1), + SPH_C64(0xEFD95A5A61BE0B6B), SPH_C64(0x2FBD12E969F2F29A), + SPH_C64(0x9BD37013FEFF9FE8), SPH_C64(0x3F9B0404D6085A06), + SPH_C64(0x4940C1F3166CFE15), SPH_C64(0x09542C4DCDF3DEFB), + SPH_C64(0xB4C5218385CD5CE3), SPH_C64(0xC935B7DC4462A641), + SPH_C64(0x3417F8A68ED3B63F), SPH_C64(0xB80959295B215B40), + SPH_C64(0xF99CDAEF3B8C8572), SPH_C64(0x018C0614F8FCB95D), + SPH_C64(0x1B14ACCD1A3ACDF3), SPH_C64(0x84D471F200BB732D), + SPH_C64(0xC1A3110E95E8DA16), SPH_C64(0x430A7220BF1A82B8), + SPH_C64(0xB77E090D39DF210E), SPH_C64(0x5EF4BD9F3CD05E9D), + SPH_C64(0x9D4FF6DA7E57A444), SPH_C64(0xDA1D60E183D4A5F8), + SPH_C64(0xB287C38417998E47), SPH_C64(0xFE3EDC121BB31886), + SPH_C64(0xC7FE3CCC980CCBEF), SPH_C64(0xE46FB590189BFD03), + SPH_C64(0x3732FD469A4C57DC), SPH_C64(0x7EF700A07CF1AD65), + SPH_C64(0x59C64468A31D8859), SPH_C64(0x762FB0B4D45B61F6), + SPH_C64(0x155BAED099047718), SPH_C64(0x68755E4C3D50BAA6), + SPH_C64(0xE9214E7F22D8B4DF), SPH_C64(0x2ADDBF532EAC95F4), + SPH_C64(0x32AE3909B4BD0109), SPH_C64(0x834DF537B08E3450), + SPH_C64(0xFA209DA84220728D), SPH_C64(0x9E691D9B9EFE23F7), + SPH_C64(0x0446D288C4AE8D7F), SPH_C64(0x7B4CC524E169785B), + SPH_C64(0x21D87F0135CA1385), SPH_C64(0xCEBB400F137B8AA5), + SPH_C64(0x272E2B66580796BE), SPH_C64(0x3612264125C2B0DE), + SPH_C64(0x057702BDAD1EFBB2), SPH_C64(0xD4BABB8EACF84BE9), + SPH_C64(0x91583139641BC67B), SPH_C64(0x8BDC2DE08036E024), + SPH_C64(0x603C8156F49F68ED), SPH_C64(0xF7D236F7DBEF5111), + SPH_C64(0x9727C4598AD21E80), SPH_C64(0xA08A0896670A5FD7), + SPH_C64(0xCB4A8F4309EBA9CB), SPH_C64(0x81AF564B0F7036A1), + SPH_C64(0xC0B99AA778199ABD), SPH_C64(0x959F1EC83FC8E952), + SPH_C64(0x8C505077794A81B9), SPH_C64(0x3ACAAF8F056338F0), + SPH_C64(0x07B43F50627A6778), SPH_C64(0x4A44AB49F5ECCC77), + SPH_C64(0x3BC3D6E4B679EE98), SPH_C64(0x9CC0D4D1CF14108C), + SPH_C64(0x4406C00B206BC8A0), SPH_C64(0x82A18854C8D72D89), + SPH_C64(0x67E366B35C3C432C), SPH_C64(0xB923DD61102B37F2), + SPH_C64(0x56AB2779D884271D), SPH_C64(0xBE83E1B0FF1525AF), + SPH_C64(0xFB7C65D4217E49A9), 
SPH_C64(0x6BDBE0E76D48E7D4), + SPH_C64(0x08DF828745D9179E), SPH_C64(0x22EA6A9ADD53BD34), + SPH_C64(0xE36E141C5622200A), SPH_C64(0x7F805D1B8CB750EE), + SPH_C64(0xAFE5C7A59F58E837), SPH_C64(0xE27F996A4FB1C23C), + SPH_C64(0xD3867DFB0775F0D0), SPH_C64(0xD0E673DE6E88891A), + SPH_C64(0x123AEB9EAFB86C25), SPH_C64(0x30F1D5D5C145B895), + SPH_C64(0xBB434A2DEE7269E7), SPH_C64(0x78CB67ECF931FA38), + SPH_C64(0xF33B0372323BBF9C), SPH_C64(0x52D66336FB279C74), + SPH_C64(0x505F33AC0AFB4EAA), SPH_C64(0xE8A5CD99A2CCE187), + SPH_C64(0x534974801E2D30BB), SPH_C64(0x8D2D5711D5876D90), + SPH_C64(0x1F1A412891BC038E), SPH_C64(0xD6E2E71D82E56648), + SPH_C64(0x74036C3A497732B7), SPH_C64(0x89B67ED96361F5AB), + SPH_C64(0xFFED95D8F1EA02A2), SPH_C64(0xE72B3BD61464D43D), + SPH_C64(0xA6300F170BDC4820), SPH_C64(0xEBC18760ED78A77A), +}; + +static const sph_u64 T2[256] = { + SPH_C64(0xE6A6BE5A05A12138), SPH_C64(0xB5A122A5B4F87C98), + SPH_C64(0x563C6089140B6990), SPH_C64(0x4C46CB2E391F5DD5), + SPH_C64(0xD932ADDBC9B79434), SPH_C64(0x08EA70E42015AFF5), + SPH_C64(0xD765A6673E478CF1), SPH_C64(0xC4FB757EAB278D99), + SPH_C64(0xDF11C6862D6E0692), SPH_C64(0xDDEB84F10D7F3B16), + SPH_C64(0x6F2EF604A665EA04), SPH_C64(0x4A8E0F0FF0E0DFB3), + SPH_C64(0xA5EDEEF83DBCBA51), SPH_C64(0xFC4F0A2A0EA4371E), + SPH_C64(0xE83E1DA85CB38429), SPH_C64(0xDC8FF882BA1B1CE2), + SPH_C64(0xCD45505E8353E80D), SPH_C64(0x18D19A00D4DB0717), + SPH_C64(0x34A0CFEDA5F38101), SPH_C64(0x0BE77E518887CAF2), + SPH_C64(0x1E341438B3C45136), SPH_C64(0xE05797F49089CCF9), + SPH_C64(0xFFD23F9DF2591D14), SPH_C64(0x543DDA228595C5CD), + SPH_C64(0x661F81FD99052A33), SPH_C64(0x8736E641DB0F7B76), + SPH_C64(0x15227725418E5307), SPH_C64(0xE25F7F46162EB2FA), + SPH_C64(0x48A8B2126C13D9FE), SPH_C64(0xAFDC541792E76EEA), + SPH_C64(0x03D912BFC6D1898F), SPH_C64(0x31B1AAFA1B83F51B), + SPH_C64(0xF1AC2796E42AB7D9), SPH_C64(0x40A3A7D7FCD2EBAC), + SPH_C64(0x1056136D0AFBBCC5), SPH_C64(0x7889E1DD9A6D0C85), + SPH_C64(0xD33525782A7974AA), SPH_C64(0xA7E25D09078AC09B), + SPH_C64(0xBD4138B3EAC6EDD0), SPH_C64(0x920ABFBE71EB9E70), + SPH_C64(0xA2A5D0F54FC2625C), SPH_C64(0xC054E36B0B1290A3), + SPH_C64(0xF6DD59FF62FE932B), SPH_C64(0x3537354511A8AC7D), + SPH_C64(0xCA845E9172FADCD4), SPH_C64(0x84F82B60329D20DC), + SPH_C64(0x79C62CE1CD672F18), SPH_C64(0x8B09A2ADD124642C), + SPH_C64(0xD0C1E96A19D9E726), SPH_C64(0x5A786A9B4BA9500C), + SPH_C64(0x0E020336634C43F3), SPH_C64(0xC17B474AEB66D822), + SPH_C64(0x6A731AE3EC9BAAC2), SPH_C64(0x8226667AE0840258), + SPH_C64(0x67D4567691CAECA5), SPH_C64(0x1D94155C4875ADB5), + SPH_C64(0x6D00FD985B813FDF), SPH_C64(0x51286EFCB774CD06), + SPH_C64(0x5E8834471FA744AF), SPH_C64(0xF72CA0AEE761AE2E), + SPH_C64(0xBE40E4CDAEE8E09A), SPH_C64(0xE9970BBB5118F665), + SPH_C64(0x726E4BEB33DF1964), SPH_C64(0x703B000729199762), + SPH_C64(0x4631D816F5EF30A7), SPH_C64(0xB880B5B51504A6BE), + SPH_C64(0x641793C37ED84B6C), SPH_C64(0x7B21ED77F6E97D96), + SPH_C64(0x776306312EF96B73), SPH_C64(0xAE528948E86FF3F4), + SPH_C64(0x53DBD7F286A3F8F8), SPH_C64(0x16CADCE74CFC1063), + SPH_C64(0x005C19BDFA52C6DD), SPH_C64(0x68868F5D64D46AD3), + SPH_C64(0x3A9D512CCF1E186A), SPH_C64(0x367E62C2385660AE), + SPH_C64(0xE359E7EA77DCB1D7), SPH_C64(0x526C0773749ABE6E), + SPH_C64(0x735AE5F9D09F734B), SPH_C64(0x493FC7CC8A558BA8), + SPH_C64(0xB0B9C1533041AB45), SPH_C64(0x321958BA470A59BD), + SPH_C64(0x852DB00B5F46C393), SPH_C64(0x91209B2BD336B0E5), + SPH_C64(0x6E604F7D659EF19F), SPH_C64(0xB99A8AE2782CCB24), + SPH_C64(0xCCF52AB6C814C4C7), SPH_C64(0x4727D9AFBE11727B), + SPH_C64(0x7E950D0C0121B34D), SPH_C64(0x756F435670AD471F), + 
SPH_C64(0xF5ADD442615A6849), SPH_C64(0x4E87E09980B9957A), + SPH_C64(0x2ACFA1DF50AEE355), SPH_C64(0xD898263AFD2FD556), + SPH_C64(0xC8F4924DD80C8FD6), SPH_C64(0xCF99CA3D754A173A), + SPH_C64(0xFE477BACAF91BF3C), SPH_C64(0xED5371F6D690C12D), + SPH_C64(0x831A5C285E687094), SPH_C64(0xC5D3C90A3708A0A4), + SPH_C64(0x0F7F903717D06580), SPH_C64(0x19F9BB13B8FDF27F), + SPH_C64(0xB1BD6F1B4D502843), SPH_C64(0x1C761BA38FFF4012), + SPH_C64(0x0D1530C4E2E21F3B), SPH_C64(0x8943CE69A7372C8A), + SPH_C64(0xE5184E11FEB5CE66), SPH_C64(0x618BDB80BD736621), + SPH_C64(0x7D29BAD68B574D0B), SPH_C64(0x81BB613E25E6FE5B), + SPH_C64(0x071C9C10BC07913F), SPH_C64(0xC7BEEB7909AC2D97), + SPH_C64(0xC3E58D353BC5D757), SPH_C64(0xEB017892F38F61E8), + SPH_C64(0xD4EFFB9C9B1CC21A), SPH_C64(0x99727D26F494F7AB), + SPH_C64(0xA3E063A2956B3E03), SPH_C64(0x9D4A8B9A4AA09C30), + SPH_C64(0x3F6AB7D500090FB4), SPH_C64(0x9CC0F2A057268AC0), + SPH_C64(0x3DEE9D2DEDBF42D1), SPH_C64(0x330F49C87960A972), + SPH_C64(0xC6B2720287421B41), SPH_C64(0x0AC59EC07C00369C), + SPH_C64(0xEF4EAC49CB353425), SPH_C64(0xF450244EEF0129D8), + SPH_C64(0x8ACC46E5CAF4DEB6), SPH_C64(0x2FFEAB63989263F7), + SPH_C64(0x8F7CB9FE5D7A4578), SPH_C64(0x5BD8F7644E634635), + SPH_C64(0x427A7315BF2DC900), SPH_C64(0x17D0C4AA2125261C), + SPH_C64(0x3992486C93518E50), SPH_C64(0xB4CBFEE0A2D7D4C3), + SPH_C64(0x7C75D6202C5DDD8D), SPH_C64(0xDBC295D8E35B6C61), + SPH_C64(0x60B369D302032B19), SPH_C64(0xCE42685FDCE44132), + SPH_C64(0x06F3DDB9DDF65610), SPH_C64(0x8EA4D21DB5E148F0), + SPH_C64(0x20B0FCE62FCD496F), SPH_C64(0x2C1B912358B0EE31), + SPH_C64(0xB28317B818F5A308), SPH_C64(0xA89C1E189CA6D2CF), + SPH_C64(0x0C6B18576AAADBC8), SPH_C64(0xB65DEAA91299FAE3), + SPH_C64(0xFB2B794B7F1027E7), SPH_C64(0x04E4317F443B5BEB), + SPH_C64(0x4B852D325939D0A6), SPH_C64(0xD5AE6BEEFB207FFC), + SPH_C64(0x309682B281C7D374), SPH_C64(0xBAE309A194C3B475), + SPH_C64(0x8CC3F97B13B49F05), SPH_C64(0x98A9422FF8293967), + SPH_C64(0x244B16B01076FF7C), SPH_C64(0xF8BF571C663D67EE), + SPH_C64(0x1F0D6758EEE30DA1), SPH_C64(0xC9B611D97ADEB9B7), + SPH_C64(0xB7AFD5887B6C57A2), SPH_C64(0x6290AE846B984FE1), + SPH_C64(0x94DF4CDEACC1A5FD), SPH_C64(0x058A5BD1C5483AFF), + SPH_C64(0x63166CC142BA3C37), SPH_C64(0x8DB8526EB2F76F40), + SPH_C64(0xE10880036F0D6D4E), SPH_C64(0x9E0523C9971D311D), + SPH_C64(0x45EC2824CC7CD691), SPH_C64(0x575B8359E62382C9), + SPH_C64(0xFA9E400DC4889995), SPH_C64(0xD1823ECB45721568), + SPH_C64(0xDAFD983B8206082F), SPH_C64(0xAA7D29082386A8CB), + SPH_C64(0x269FCD4403B87588), SPH_C64(0x1B91F5F728BDD1E0), + SPH_C64(0xE4669F39040201F6), SPH_C64(0x7A1D7C218CF04ADE), + SPH_C64(0x65623C29D79CE5CE), SPH_C64(0x2368449096C00BB1), + SPH_C64(0xAB9BF1879DA503BA), SPH_C64(0xBC23ECB1A458058E), + SPH_C64(0x9A58DF01BB401ECC), SPH_C64(0xA070E868A85F143D), + SPH_C64(0x4FF188307DF2239E), SPH_C64(0x14D565B41A641183), + SPH_C64(0xEE13337452701602), SPH_C64(0x950E3DCF3F285E09), + SPH_C64(0x59930254B9C80953), SPH_C64(0x3BF299408930DA6D), + SPH_C64(0xA955943F53691387), SPH_C64(0xA15EDECAA9CB8784), + SPH_C64(0x29142127352BE9A0), SPH_C64(0x76F0371FFF4E7AFB), + SPH_C64(0x0239F450274F2228), SPH_C64(0xBB073AF01D5E868B), + SPH_C64(0xBFC80571C10E96C1), SPH_C64(0xD267088568222E23), + SPH_C64(0x9671A3D48E80B5B0), SPH_C64(0x55B5D38AE193BB81), + SPH_C64(0x693AE2D0A18B04B8), SPH_C64(0x5C48B4ECADD5335F), + SPH_C64(0xFD743B194916A1CA), SPH_C64(0x2577018134BE98C4), + SPH_C64(0xE77987E83C54A4AD), SPH_C64(0x28E11014DA33E1B9), + SPH_C64(0x270CC59E226AA213), SPH_C64(0x71495F756D1A5F60), + SPH_C64(0x9BE853FB60AFEF77), SPH_C64(0xADC786A7F7443DBF), + 
SPH_C64(0x0904456173B29A82), SPH_C64(0x58BC7A66C232BD5E), + SPH_C64(0xF306558C673AC8B2), SPH_C64(0x41F639C6B6C9772A), + SPH_C64(0x216DEFE99FDA35DA), SPH_C64(0x11640CC71C7BE615), + SPH_C64(0x93C43694565C5527), SPH_C64(0xEA038E6246777839), + SPH_C64(0xF9ABF3CE5A3E2469), SPH_C64(0x741E768D0FD312D2), + SPH_C64(0x0144B883CED652C6), SPH_C64(0xC20B5A5BA33F8552), + SPH_C64(0x1AE69633C3435A9D), SPH_C64(0x97A28CA4088CFDEC), + SPH_C64(0x8824A43C1E96F420), SPH_C64(0x37612FA66EEEA746), + SPH_C64(0x6B4CB165F9CF0E5A), SPH_C64(0x43AA1C06A0ABFB4A), + SPH_C64(0x7F4DC26FF162796B), SPH_C64(0x6CBACC8E54ED9B0F), + SPH_C64(0xA6B7FFEFD2BB253E), SPH_C64(0x2E25BC95B0A29D4F), + SPH_C64(0x86D6A58BDEF1388C), SPH_C64(0xDED74AC576B6F054), + SPH_C64(0x8030BDBC2B45805D), SPH_C64(0x3C81AF70E94D9289), + SPH_C64(0x3EFF6DDA9E3100DB), SPH_C64(0xB38DC39FDFCC8847), + SPH_C64(0x123885528D17B87E), SPH_C64(0xF2DA0ED240B1B642), + SPH_C64(0x44CEFADCD54BF9A9), SPH_C64(0x1312200E433C7EE6), + SPH_C64(0x9FFCC84F3A78C748), SPH_C64(0xF0CD1F72248576BB), + SPH_C64(0xEC6974053638CFE4), SPH_C64(0x2BA7B67C0CEC4E4C), + SPH_C64(0xAC2F4DF3E5CE32ED), SPH_C64(0xCB33D14326EA4C11), + SPH_C64(0xA4E9044CC77E58BC), SPH_C64(0x5F513293D934FCEF), + SPH_C64(0x5DC9645506E55444), SPH_C64(0x50DE418F317DE40A), + SPH_C64(0x388CB31A69DDE259), SPH_C64(0x2DB4A83455820A86), + SPH_C64(0x9010A91E84711AE9), SPH_C64(0x4DF7F0B7B1498371), + SPH_C64(0xD62A2EABC0977179), SPH_C64(0x22FAC097AA8D5C0E), +}; + +static const sph_u64 T3[256] = { + SPH_C64(0xF49FCC2FF1DAF39B), SPH_C64(0x487FD5C66FF29281), + SPH_C64(0xE8A30667FCDCA83F), SPH_C64(0x2C9B4BE3D2FCCE63), + SPH_C64(0xDA3FF74B93FBBBC2), SPH_C64(0x2FA165D2FE70BA66), + SPH_C64(0xA103E279970E93D4), SPH_C64(0xBECDEC77B0E45E71), + SPH_C64(0xCFB41E723985E497), SPH_C64(0xB70AAA025EF75017), + SPH_C64(0xD42309F03840B8E0), SPH_C64(0x8EFC1AD035898579), + SPH_C64(0x96C6920BE2B2ABC5), SPH_C64(0x66AF4163375A9172), + SPH_C64(0x2174ABDCCA7127FB), SPH_C64(0xB33CCEA64A72FF41), + SPH_C64(0xF04A4933083066A5), SPH_C64(0x8D970ACDD7289AF5), + SPH_C64(0x8F96E8E031C8C25E), SPH_C64(0xF3FEC02276875D47), + SPH_C64(0xEC7BF310056190DD), SPH_C64(0xF5ADB0AEBB0F1491), + SPH_C64(0x9B50F8850FD58892), SPH_C64(0x4975488358B74DE8), + SPH_C64(0xA3354FF691531C61), SPH_C64(0x0702BBE481D2C6EE), + SPH_C64(0x89FB24057DEDED98), SPH_C64(0xAC3075138596E902), + SPH_C64(0x1D2D3580172772ED), SPH_C64(0xEB738FC28E6BC30D), + SPH_C64(0x5854EF8F63044326), SPH_C64(0x9E5C52325ADD3BBE), + SPH_C64(0x90AA53CF325C4623), SPH_C64(0xC1D24D51349DD067), + SPH_C64(0x2051CFEEA69EA624), SPH_C64(0x13220F0A862E7E4F), + SPH_C64(0xCE39399404E04864), SPH_C64(0xD9C42CA47086FCB7), + SPH_C64(0x685AD2238A03E7CC), SPH_C64(0x066484B2AB2FF1DB), + SPH_C64(0xFE9D5D70EFBF79EC), SPH_C64(0x5B13B9DD9C481854), + SPH_C64(0x15F0D475ED1509AD), SPH_C64(0x0BEBCD060EC79851), + SPH_C64(0xD58C6791183AB7F8), SPH_C64(0xD1187C5052F3EEE4), + SPH_C64(0xC95D1192E54E82FF), SPH_C64(0x86EEA14CB9AC6CA2), + SPH_C64(0x3485BEB153677D5D), SPH_C64(0xDD191D781F8C492A), + SPH_C64(0xF60866BAA784EBF9), SPH_C64(0x518F643BA2D08C74), + SPH_C64(0x8852E956E1087C22), SPH_C64(0xA768CB8DC410AE8D), + SPH_C64(0x38047726BFEC8E1A), SPH_C64(0xA67738B4CD3B45AA), + SPH_C64(0xAD16691CEC0DDE19), SPH_C64(0xC6D4319380462E07), + SPH_C64(0xC5A5876D0BA61938), SPH_C64(0x16B9FA1FA58FD840), + SPH_C64(0x188AB1173CA74F18), SPH_C64(0xABDA2F98C99C021F), + SPH_C64(0x3E0580AB134AE816), SPH_C64(0x5F3B05B773645ABB), + SPH_C64(0x2501A2BE5575F2F6), SPH_C64(0x1B2F74004E7E8BA9), + SPH_C64(0x1CD7580371E8D953), SPH_C64(0x7F6ED89562764E30), + SPH_C64(0xB15926FF596F003D), 
SPH_C64(0x9F65293DA8C5D6B9), + SPH_C64(0x6ECEF04DD690F84C), SPH_C64(0x4782275FFF33AF88), + SPH_C64(0xE41433083F820801), SPH_C64(0xFD0DFE409A1AF9B5), + SPH_C64(0x4325A3342CDB396B), SPH_C64(0x8AE77E62B301B252), + SPH_C64(0xC36F9E9F6655615A), SPH_C64(0x85455A2D92D32C09), + SPH_C64(0xF2C7DEA949477485), SPH_C64(0x63CFB4C133A39EBA), + SPH_C64(0x83B040CC6EBC5462), SPH_C64(0x3B9454C8FDB326B0), + SPH_C64(0x56F56A9E87FFD78C), SPH_C64(0x2DC2940D99F42BC6), + SPH_C64(0x98F7DF096B096E2D), SPH_C64(0x19A6E01E3AD852BF), + SPH_C64(0x42A99CCBDBD4B40B), SPH_C64(0xA59998AF45E9C559), + SPH_C64(0x366295E807D93186), SPH_C64(0x6B48181BFAA1F773), + SPH_C64(0x1FEC57E2157A0A1D), SPH_C64(0x4667446AF6201AD5), + SPH_C64(0xE615EBCACFB0F075), SPH_C64(0xB8F31F4F68290778), + SPH_C64(0x22713ED6CE22D11E), SPH_C64(0x3057C1A72EC3C93B), + SPH_C64(0xCB46ACC37C3F1F2F), SPH_C64(0xDBB893FD02AAF50E), + SPH_C64(0x331FD92E600B9FCF), SPH_C64(0xA498F96148EA3AD6), + SPH_C64(0xA8D8426E8B6A83EA), SPH_C64(0xA089B274B7735CDC), + SPH_C64(0x87F6B3731E524A11), SPH_C64(0x118808E5CBC96749), + SPH_C64(0x9906E4C7B19BD394), SPH_C64(0xAFED7F7E9B24A20C), + SPH_C64(0x6509EADEEB3644A7), SPH_C64(0x6C1EF1D3E8EF0EDE), + SPH_C64(0xB9C97D43E9798FB4), SPH_C64(0xA2F2D784740C28A3), + SPH_C64(0x7B8496476197566F), SPH_C64(0x7A5BE3E6B65F069D), + SPH_C64(0xF96330ED78BE6F10), SPH_C64(0xEEE60DE77A076A15), + SPH_C64(0x2B4BEE4AA08B9BD0), SPH_C64(0x6A56A63EC7B8894E), + SPH_C64(0x02121359BA34FEF4), SPH_C64(0x4CBF99F8283703FC), + SPH_C64(0x398071350CAF30C8), SPH_C64(0xD0A77A89F017687A), + SPH_C64(0xF1C1A9EB9E423569), SPH_C64(0x8C7976282DEE8199), + SPH_C64(0x5D1737A5DD1F7ABD), SPH_C64(0x4F53433C09A9FA80), + SPH_C64(0xFA8B0C53DF7CA1D9), SPH_C64(0x3FD9DCBC886CCB77), + SPH_C64(0xC040917CA91B4720), SPH_C64(0x7DD00142F9D1DCDF), + SPH_C64(0x8476FC1D4F387B58), SPH_C64(0x23F8E7C5F3316503), + SPH_C64(0x032A2244E7E37339), SPH_C64(0x5C87A5D750F5A74B), + SPH_C64(0x082B4CC43698992E), SPH_C64(0xDF917BECB858F63C), + SPH_C64(0x3270B8FC5BF86DDA), SPH_C64(0x10AE72BB29B5DD76), + SPH_C64(0x576AC94E7700362B), SPH_C64(0x1AD112DAC61EFB8F), + SPH_C64(0x691BC30EC5FAA427), SPH_C64(0xFF246311CC327143), + SPH_C64(0x3142368E30E53206), SPH_C64(0x71380E31E02CA396), + SPH_C64(0x958D5C960AAD76F1), SPH_C64(0xF8D6F430C16DA536), + SPH_C64(0xC8FFD13F1BE7E1D2), SPH_C64(0x7578AE66004DDBE1), + SPH_C64(0x05833F01067BE646), SPH_C64(0xBB34B5AD3BFE586D), + SPH_C64(0x095F34C9A12B97F0), SPH_C64(0x247AB64525D60CA8), + SPH_C64(0xDCDBC6F3017477D1), SPH_C64(0x4A2E14D4DECAD24D), + SPH_C64(0xBDB5E6D9BE0A1EEB), SPH_C64(0x2A7E70F7794301AB), + SPH_C64(0xDEF42D8A270540FD), SPH_C64(0x01078EC0A34C22C1), + SPH_C64(0xE5DE511AF4C16387), SPH_C64(0x7EBB3A52BD9A330A), + SPH_C64(0x77697857AA7D6435), SPH_C64(0x004E831603AE4C32), + SPH_C64(0xE7A21020AD78E312), SPH_C64(0x9D41A70C6AB420F2), + SPH_C64(0x28E06C18EA1141E6), SPH_C64(0xD2B28CBD984F6B28), + SPH_C64(0x26B75F6C446E9D83), SPH_C64(0xBA47568C4D418D7F), + SPH_C64(0xD80BADBFE6183D8E), SPH_C64(0x0E206D7F5F166044), + SPH_C64(0xE258A43911CBCA3E), SPH_C64(0x723A1746B21DC0BC), + SPH_C64(0xC7CAA854F5D7CDD3), SPH_C64(0x7CAC32883D261D9C), + SPH_C64(0x7690C26423BA942C), SPH_C64(0x17E55524478042B8), + SPH_C64(0xE0BE477656A2389F), SPH_C64(0x4D289B5E67AB2DA0), + SPH_C64(0x44862B9C8FBBFD31), SPH_C64(0xB47CC8049D141365), + SPH_C64(0x822C1B362B91C793), SPH_C64(0x4EB14655FB13DFD8), + SPH_C64(0x1ECBBA0714E2A97B), SPH_C64(0x6143459D5CDE5F14), + SPH_C64(0x53A8FBF1D5F0AC89), SPH_C64(0x97EA04D81C5E5B00), + SPH_C64(0x622181A8D4FDB3F3), SPH_C64(0xE9BCD341572A1208), + SPH_C64(0x1411258643CCE58A), 
SPH_C64(0x9144C5FEA4C6E0A4), + SPH_C64(0x0D33D06565CF620F), SPH_C64(0x54A48D489F219CA1), + SPH_C64(0xC43E5EAC6D63C821), SPH_C64(0xA9728B3A72770DAF), + SPH_C64(0xD7934E7B20DF87EF), SPH_C64(0xE35503B61A3E86E5), + SPH_C64(0xCAE321FBC819D504), SPH_C64(0x129A50B3AC60BFA6), + SPH_C64(0xCD5E68EA7E9FB6C3), SPH_C64(0xB01C90199483B1C7), + SPH_C64(0x3DE93CD5C295376C), SPH_C64(0xAED52EDF2AB9AD13), + SPH_C64(0x2E60F512C0A07884), SPH_C64(0xBC3D86A3E36210C9), + SPH_C64(0x35269D9B163951CE), SPH_C64(0x0C7D6E2AD0CDB5FA), + SPH_C64(0x59E86297D87F5733), SPH_C64(0x298EF221898DB0E7), + SPH_C64(0x55000029D1A5AA7E), SPH_C64(0x8BC08AE1B5061B45), + SPH_C64(0xC2C31C2B6C92703A), SPH_C64(0x94CC596BAF25EF42), + SPH_C64(0x0A1D73DB22540456), SPH_C64(0x04B6A0F9D9C4179A), + SPH_C64(0xEFFDAFA2AE3D3C60), SPH_C64(0xF7C8075BB49496C4), + SPH_C64(0x9CC5C7141D1CD4E3), SPH_C64(0x78BD1638218E5534), + SPH_C64(0xB2F11568F850246A), SPH_C64(0xEDFABCFA9502BC29), + SPH_C64(0x796CE5F2DA23051B), SPH_C64(0xAAE128B0DC93537C), + SPH_C64(0x3A493DA0EE4B29AE), SPH_C64(0xB5DF6B2C416895D7), + SPH_C64(0xFCABBD25122D7F37), SPH_C64(0x70810B58105DC4B1), + SPH_C64(0xE10FDD37F7882A90), SPH_C64(0x524DCAB5518A3F5C), + SPH_C64(0x3C9E85878451255B), SPH_C64(0x4029828119BD34E2), + SPH_C64(0x74A05B6F5D3CECCB), SPH_C64(0xB610021542E13ECA), + SPH_C64(0x0FF979D12F59E2AC), SPH_C64(0x6037DA27E4F9CC50), + SPH_C64(0x5E92975A0DF1847D), SPH_C64(0xD66DE190D3E623FE), + SPH_C64(0x5032D6B87B568048), SPH_C64(0x9A36B7CE8235216E), + SPH_C64(0x80272A7A24F64B4A), SPH_C64(0x93EFED8B8C6916F7), + SPH_C64(0x37DDBFF44CCE1555), SPH_C64(0x4B95DB5D4B99BD25), + SPH_C64(0x92D3FDA169812FC0), SPH_C64(0xFB1A4A9A90660BB6), + SPH_C64(0x730C196946A4B9B2), SPH_C64(0x81E289AA7F49DA68), + SPH_C64(0x64669A0F83B1A05F), SPH_C64(0x27B3FF7D9644F48B), + SPH_C64(0xCC6B615C8DB675B3), SPH_C64(0x674F20B9BCEBBE95), + SPH_C64(0x6F31238275655982), SPH_C64(0x5AE488713E45CF05), + SPH_C64(0xBF619F9954C21157), SPH_C64(0xEABAC46040A8EAE9), + SPH_C64(0x454C6FE9F2C0C1CD), SPH_C64(0x419CF6496412691C), + SPH_C64(0xD3DC3BEF265B0F70), SPH_C64(0x6D0E60F5C3578A9E), +}; + +static const sph_u64 T4[256] = { + SPH_C64(0x5B0E608526323C55), SPH_C64(0x1A46C1A9FA1B59F5), + SPH_C64(0xA9E245A17C4C8FFA), SPH_C64(0x65CA5159DB2955D7), + SPH_C64(0x05DB0A76CE35AFC2), SPH_C64(0x81EAC77EA9113D45), + SPH_C64(0x528EF88AB6AC0A0D), SPH_C64(0xA09EA253597BE3FF), + SPH_C64(0x430DDFB3AC48CD56), SPH_C64(0xC4B3A67AF45CE46F), + SPH_C64(0x4ECECFD8FBE2D05E), SPH_C64(0x3EF56F10B39935F0), + SPH_C64(0x0B22D6829CD619C6), SPH_C64(0x17FD460A74DF2069), + SPH_C64(0x6CF8CC8E8510ED40), SPH_C64(0xD6C824BF3A6ECAA7), + SPH_C64(0x61243D581A817049), SPH_C64(0x048BACB6BBC163A2), + SPH_C64(0xD9A38AC27D44CC32), SPH_C64(0x7FDDFF5BAAF410AB), + SPH_C64(0xAD6D495AA804824B), SPH_C64(0xE1A6A74F2D8C9F94), + SPH_C64(0xD4F7851235DEE8E3), SPH_C64(0xFD4B7F886540D893), + SPH_C64(0x247C20042AA4BFDA), SPH_C64(0x096EA1C517D1327C), + SPH_C64(0xD56966B4361A6685), SPH_C64(0x277DA5C31221057D), + SPH_C64(0x94D59893A43ACFF7), SPH_C64(0x64F0C51CCDC02281), + SPH_C64(0x3D33BCC4FF6189DB), SPH_C64(0xE005CB184CE66AF1), + SPH_C64(0xFF5CCD1D1DB99BEA), SPH_C64(0xB0B854A7FE42980F), + SPH_C64(0x7BD46A6A718D4B9F), SPH_C64(0xD10FA8CC22A5FD8C), + SPH_C64(0xD31484952BE4BD31), SPH_C64(0xC7FA975FCB243847), + SPH_C64(0x4886ED1E5846C407), SPH_C64(0x28CDDB791EB70B04), + SPH_C64(0xC2B00BE2F573417F), SPH_C64(0x5C9590452180F877), + SPH_C64(0x7A6BDDFFF370EB00), SPH_C64(0xCE509E38D6D9D6A4), + SPH_C64(0xEBEB0F00647FA702), SPH_C64(0x1DCC06CF76606F06), + SPH_C64(0xE4D9F28BA286FF0A), SPH_C64(0xD85A305DC918C262), + 
SPH_C64(0x475B1D8732225F54), SPH_C64(0x2D4FB51668CCB5FE), + SPH_C64(0xA679B9D9D72BBA20), SPH_C64(0x53841C0D912D43A5), + SPH_C64(0x3B7EAA48BF12A4E8), SPH_C64(0x781E0E47F22F1DDF), + SPH_C64(0xEFF20CE60AB50973), SPH_C64(0x20D261D19DFFB742), + SPH_C64(0x16A12B03062A2E39), SPH_C64(0x1960EB2239650495), + SPH_C64(0x251C16FED50EB8B8), SPH_C64(0x9AC0C330F826016E), + SPH_C64(0xED152665953E7671), SPH_C64(0x02D63194A6369570), + SPH_C64(0x5074F08394B1C987), SPH_C64(0x70BA598C90B25CE1), + SPH_C64(0x794A15810B9742F6), SPH_C64(0x0D5925E9FCAF8C6C), + SPH_C64(0x3067716CD868744E), SPH_C64(0x910AB077E8D7731B), + SPH_C64(0x6A61BBDB5AC42F61), SPH_C64(0x93513EFBF0851567), + SPH_C64(0xF494724B9E83E9D5), SPH_C64(0xE887E1985C09648D), + SPH_C64(0x34B1D3C675370CFD), SPH_C64(0xDC35E433BC0D255D), + SPH_C64(0xD0AAB84234131BE0), SPH_C64(0x08042A50B48B7EAF), + SPH_C64(0x9997C4EE44A3AB35), SPH_C64(0x829A7B49201799D0), + SPH_C64(0x263B8307B7C54441), SPH_C64(0x752F95F4FD6A6CA6), + SPH_C64(0x927217402C08C6E5), SPH_C64(0x2A8AB754A795D9EE), + SPH_C64(0xA442F7552F72943D), SPH_C64(0x2C31334E19781208), + SPH_C64(0x4FA98D7CEAEE6291), SPH_C64(0x55C3862F665DB309), + SPH_C64(0xBD0610175D53B1F3), SPH_C64(0x46FE6CB840413F27), + SPH_C64(0x3FE03792DF0CFA59), SPH_C64(0xCFE700372EB85E8F), + SPH_C64(0xA7BE29E7ADBCE118), SPH_C64(0xE544EE5CDE8431DD), + SPH_C64(0x8A781B1B41F1873E), SPH_C64(0xA5C94C78A0D2F0E7), + SPH_C64(0x39412E2877B60728), SPH_C64(0xA1265EF3AFC9A62C), + SPH_C64(0xBCC2770C6A2506C5), SPH_C64(0x3AB66DD5DCE1CE12), + SPH_C64(0xE65499D04A675B37), SPH_C64(0x7D8F523481BFD216), + SPH_C64(0x0F6F64FCEC15F389), SPH_C64(0x74EFBE618B5B13C8), + SPH_C64(0xACDC82B714273E1D), SPH_C64(0xDD40BFE003199D17), + SPH_C64(0x37E99257E7E061F8), SPH_C64(0xFA52626904775AAA), + SPH_C64(0x8BBBF63A463D56F9), SPH_C64(0xF0013F1543A26E64), + SPH_C64(0xA8307E9F879EC898), SPH_C64(0xCC4C27A4150177CC), + SPH_C64(0x1B432F2CCA1D3348), SPH_C64(0xDE1D1F8F9F6FA013), + SPH_C64(0x606602A047A7DDD6), SPH_C64(0xD237AB64CC1CB2C7), + SPH_C64(0x9B938E7225FCD1D3), SPH_C64(0xEC4E03708E0FF476), + SPH_C64(0xFEB2FBDA3D03C12D), SPH_C64(0xAE0BCED2EE43889A), + SPH_C64(0x22CB8923EBFB4F43), SPH_C64(0x69360D013CF7396D), + SPH_C64(0x855E3602D2D4E022), SPH_C64(0x073805BAD01F784C), + SPH_C64(0x33E17A133852F546), SPH_C64(0xDF4874058AC7B638), + SPH_C64(0xBA92B29C678AA14A), SPH_C64(0x0CE89FC76CFAADCD), + SPH_C64(0x5F9D4E0908339E34), SPH_C64(0xF1AFE9291F5923B9), + SPH_C64(0x6E3480F60F4A265F), SPH_C64(0xEEBF3A2AB29B841C), + SPH_C64(0xE21938A88F91B4AD), SPH_C64(0x57DFEFF845C6D3C3), + SPH_C64(0x2F006B0BF62CAAF2), SPH_C64(0x62F479EF6F75EE78), + SPH_C64(0x11A55AD41C8916A9), SPH_C64(0xF229D29084FED453), + SPH_C64(0x42F1C27B16B000E6), SPH_C64(0x2B1F76749823C074), + SPH_C64(0x4B76ECA3C2745360), SPH_C64(0x8C98F463B91691BD), + SPH_C64(0x14BCC93CF1ADE66A), SPH_C64(0x8885213E6D458397), + SPH_C64(0x8E177DF0274D4711), SPH_C64(0xB49B73B5503F2951), + SPH_C64(0x10168168C3F96B6B), SPH_C64(0x0E3D963B63CAB0AE), + SPH_C64(0x8DFC4B5655A1DB14), SPH_C64(0xF789F1356E14DE5C), + SPH_C64(0x683E68AF4E51DAC1), SPH_C64(0xC9A84F9D8D4B0FD9), + SPH_C64(0x3691E03F52A0F9D1), SPH_C64(0x5ED86E46E1878E80), + SPH_C64(0x3C711A0E99D07150), SPH_C64(0x5A0865B20C4E9310), + SPH_C64(0x56FBFC1FE4F0682E), SPH_C64(0xEA8D5DE3105EDF9B), + SPH_C64(0x71ABFDB12379187A), SPH_C64(0x2EB99DE1BEE77B9C), + SPH_C64(0x21ECC0EA33CF4523), SPH_C64(0x59A4D7521805C7A1), + SPH_C64(0x3896F5EB56AE7C72), SPH_C64(0xAA638F3DB18F75DC), + SPH_C64(0x9F39358DABE9808E), SPH_C64(0xB7DEFA91C00B72AC), + SPH_C64(0x6B5541FD62492D92), SPH_C64(0x6DC6DEE8F92E4D5B), + 
SPH_C64(0x353F57ABC4BEEA7E), SPH_C64(0x735769D6DA5690CE), + SPH_C64(0x0A234AA642391484), SPH_C64(0xF6F9508028F80D9D), + SPH_C64(0xB8E319A27AB3F215), SPH_C64(0x31AD9C1151341A4D), + SPH_C64(0x773C22A57BEF5805), SPH_C64(0x45C7561A07968633), + SPH_C64(0xF913DA9E249DBE36), SPH_C64(0xDA652D9B78A64C68), + SPH_C64(0x4C27A97F3BC334EF), SPH_C64(0x76621220E66B17F4), + SPH_C64(0x967743899ACD7D0B), SPH_C64(0xF3EE5BCAE0ED6782), + SPH_C64(0x409F753600C879FC), SPH_C64(0x06D09A39B5926DB6), + SPH_C64(0x6F83AEB0317AC588), SPH_C64(0x01E6CA4A86381F21), + SPH_C64(0x66FF3462D19F3025), SPH_C64(0x72207C24DDFD3BFB), + SPH_C64(0x4AF6B6D3E2ECE2EB), SPH_C64(0x9C994DBEC7EA08DE), + SPH_C64(0x49ACE597B09A8BC4), SPH_C64(0xB38C4766CF0797BA), + SPH_C64(0x131B9373C57C2A75), SPH_C64(0xB1822CCE61931E58), + SPH_C64(0x9D7555B909BA1C0C), SPH_C64(0x127FAFDD937D11D2), + SPH_C64(0x29DA3BADC66D92E4), SPH_C64(0xA2C1D57154C2ECBC), + SPH_C64(0x58C5134D82F6FE24), SPH_C64(0x1C3AE3515B62274F), + SPH_C64(0xE907C82E01CB8126), SPH_C64(0xF8ED091913E37FCB), + SPH_C64(0x3249D8F9C80046C9), SPH_C64(0x80CF9BEDE388FB63), + SPH_C64(0x1881539A116CF19E), SPH_C64(0x5103F3F76BD52457), + SPH_C64(0x15B7E6F5AE47F7A8), SPH_C64(0xDBD7C6DED47E9CCF), + SPH_C64(0x44E55C410228BB1A), SPH_C64(0xB647D4255EDB4E99), + SPH_C64(0x5D11882BB8AAFC30), SPH_C64(0xF5098BBB29D3212A), + SPH_C64(0x8FB5EA14E90296B3), SPH_C64(0x677B942157DD025A), + SPH_C64(0xFB58E7C0A390ACB5), SPH_C64(0x89D3674C83BD4A01), + SPH_C64(0x9E2DA4DF4BF3B93B), SPH_C64(0xFCC41E328CAB4829), + SPH_C64(0x03F38C96BA582C52), SPH_C64(0xCAD1BDBD7FD85DB2), + SPH_C64(0xBBB442C16082AE83), SPH_C64(0xB95FE86BA5DA9AB0), + SPH_C64(0xB22E04673771A93F), SPH_C64(0x845358C9493152D8), + SPH_C64(0xBE2A488697B4541E), SPH_C64(0x95A2DC2DD38E6966), + SPH_C64(0xC02C11AC923C852B), SPH_C64(0x2388B1990DF2A87B), + SPH_C64(0x7C8008FA1B4F37BE), SPH_C64(0x1F70D0C84D54E503), + SPH_C64(0x5490ADEC7ECE57D4), SPH_C64(0x002B3C27D9063A3A), + SPH_C64(0x7EAEA3848030A2BF), SPH_C64(0xC602326DED2003C0), + SPH_C64(0x83A7287D69A94086), SPH_C64(0xC57A5FCB30F57A8A), + SPH_C64(0xB56844E479EBE779), SPH_C64(0xA373B40F05DCBCE9), + SPH_C64(0xD71A786E88570EE2), SPH_C64(0x879CBACDBDE8F6A0), + SPH_C64(0x976AD1BCC164A32F), SPH_C64(0xAB21E25E9666D78B), + SPH_C64(0x901063AAE5E5C33C), SPH_C64(0x9818B34448698D90), + SPH_C64(0xE36487AE3E1E8ABB), SPH_C64(0xAFBDF931893BDCB4), + SPH_C64(0x6345A0DC5FBBD519), SPH_C64(0x8628FE269B9465CA), + SPH_C64(0x1E5D01603F9C51EC), SPH_C64(0x4DE44006A15049B7), + SPH_C64(0xBF6C70E5F776CBB1), SPH_C64(0x411218F2EF552BED), + SPH_C64(0xCB0C0708705A36A3), SPH_C64(0xE74D14754F986044), + SPH_C64(0xCD56D9430EA8280E), SPH_C64(0xC12591D7535F5065), + SPH_C64(0xC83223F1720AEF96), SPH_C64(0xC3A0396F7363A51F), +}; + +#define PASS(a, b, c, mul) do { \ + ROUND(a, b, c, X0, mul); \ + ROUND(b, c, a, X1, mul); \ + ROUND(c, a, b, X2, mul); \ + ROUND(a, b, c, X3, mul); \ + ROUND(b, c, a, X4, mul); \ + ROUND(c, a, b, X5, mul); \ + ROUND(a, b, c, X6, mul); \ + ROUND(b, c, a, X7, mul); \ + } while (0) + +#define ROUND(a, b, c, x, mul) do { \ + c ^= x; \ + a = SPH_T64(a - (T1[c & 0xFF] ^ T2[(c >> 16) & 0xFF] \ + ^ T3[(c >> 32) & 0xFF] ^ T4[(c >> 48) & 0xFF])); \ + b = SPH_T64(b + (T4[(c >> 8) & 0xFF] ^ T3[(c >> 24) & 0xFF] \ + ^ T2[(c >> 40) & 0xFF] ^ T1[(c >> 56) & 0xFF])); \ + b = mul(b); \ + } while (0) + +#define MUL5(x) SPH_T64((x) * SPH_C64(5)) +#define MUL7(x) SPH_T64((x) * SPH_C64(7)) +#define MUL9(x) SPH_T64((x) * SPH_C64(9)) + +#define KSCHED do { \ + X0 = SPH_T64(X0 - (X7 ^ SPH_C64(0xA5A5A5A5A5A5A5A5))); \ + X1 ^= X0; \ + X2 = SPH_T64(X2 + X1); \ + 
X3 = SPH_T64(X3 - (X2 ^ (~X1 << 19))); \ + X4 ^= X3; \ + X5 = SPH_T64(X5 + X4); \ + X6 = SPH_T64(X6 - (X5 ^ (~X4 >> 23))); \ + X7 ^= X6; \ + X0 = SPH_T64(X0 + X7); \ + X1 = SPH_T64(X1 - (X0 ^ (~X7 << 19))); \ + X2 ^= X1; \ + X3 = SPH_T64(X3 + X2); \ + X4 = SPH_T64(X4 - (X3 ^ (~X2 >> 23))); \ + X5 ^= X4; \ + X6 = SPH_T64(X6 + X5); \ + X7 = SPH_T64(X7 - (X6 ^ SPH_C64(0x0123456789ABCDEF))); \ + } while (0) + +#define TIGER_ROUND_BODY(in, r) do { \ + sph_u64 A, B, C; \ + sph_u64 X0, X1, X2, X3, X4, X5, X6, X7; \ + \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + \ + X0 = (in(0)); \ + X1 = (in(1)); \ + X2 = (in(2)); \ + X3 = (in(3)); \ + X4 = (in(4)); \ + X5 = (in(5)); \ + X6 = (in(6)); \ + X7 = (in(7)); \ + PASS(A, B, C, MUL5); \ + KSCHED; \ + PASS(C, A, B, MUL7); \ + KSCHED; \ + PASS(B, C, A, MUL9); \ + \ + (r)[0] ^= A; \ + (r)[1] = SPH_T64(B - (r)[1]); \ + (r)[2] = SPH_T64(C + (r)[2]); \ + } while (0) + +/* + * One round of Tiger. The data must be aligned for 64-bit access. + */ +static void +tiger_round(const unsigned char *data, sph_u64 r[3]) +{ +#define TIGER_IN(i) sph_dec64le_aligned(data + 8 * (i)) + TIGER_ROUND_BODY(TIGER_IN, r); +#undef TIGER_IN +} + +/* see sph_tiger.h */ +void +sph_tiger_init(void *cc) +{ + sph_tiger_context *sc; + + sc = (sph_tiger_context*)cc; + sc->val[0] = SPH_C64(0x0123456789ABCDEF); + sc->val[1] = SPH_C64(0xFEDCBA9876543210); + sc->val[2] = SPH_C64(0xF096A5B4C3B2E187); + sc->count = 0; +} + +#define RFUN tiger_round +#define HASH tiger +#define LE64 1 +#define BLEN 64U +#define PW01 1 +#define PLW1 1 +#include "md_helper.c" + +/* see sph_tiger.h */ +void +sph_tiger_close(void *cc, void *dst) +{ + tiger_close(cc, dst, 3); + sph_tiger_init(cc); +} + +/* see sph_tiger.h */ +void +sph_tiger_comp(const sph_u64 msg[8], sph_u64 val[3]) +{ +#define TIGER_IN(i) msg[i] + TIGER_ROUND_BODY(TIGER_IN, val); +#undef TIGER_IN +} + +#undef HASH +#define HASH tiger2 +#undef PW01 +#define CLOSE_ONLY 1 +#include "md_helper.c" + +/* see sph_tiger.h */ +void +sph_tiger2_close(void *cc, void *dst) +{ + tiger2_close(cc, dst, 3); + sph_tiger2_init(cc); +} + +#endif \ No newline at end of file diff --git a/tribus/tribus.cu b/tribus/tribus.cu index 4516e7d69c..955b486d0f 100644 --- a/tribus/tribus.cu +++ b/tribus/tribus.cu @@ -105,7 +105,7 @@ extern "C" int scanhash_tribus(int thr_id, struct work *work, uint32_t max_nonce do { int order = 1; jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; if (use_compat_kernels[thr_id]) { x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/util.cpp b/util.cpp index 11b4773805..812aff7100 100644 --- a/util.cpp +++ b/util.cpp @@ -2181,6 +2181,9 @@ void print_hash_tests(void) bmw_hash(&hash[0], &buf[0]); printpfx("bmw", hash); + bmw512_hash(&hash[0], &buf[0]); + printpfx("bmw512", hash); + c11hash(&hash[0], &buf[0]); printpfx("c11", hash); @@ -2203,6 +2206,9 @@ void print_hash_tests(void) fugue256_hash(&hash[0], &buf[0], 32); printpfx("fugue256", hash); + gostd_hash(&hash[0], &buf[0]); + printpfx("gostcoin", hash); + groestlhash(&hash[0], &buf[0]); printpfx("groestl", hash); @@ -2213,7 +2219,7 @@ void print_hash_tests(void) printpfx("hmq1725", hash); hsr_hash(&hash[0], &buf[0]); - printpfx("hsr", hash); + printpfx("hsr", hash); jha_hash(&hash[0], &buf[0]); printpfx("jha", hash); @@ -2221,6 +2227,9 @@ void 
print_hash_tests(void) keccak256_hash(&hash[0], &buf[0]); printpfx("keccak", hash); + sha3d_hash(&hash[0], &buf[0]); + printpfx("sha3d", hash); + memset(buf, 0, 128); lbry_hash(&hash[0], &buf[0]); printpfx("lbry", hash); @@ -2324,9 +2333,24 @@ void print_hash_tests(void) x15hash(&hash[0], &buf[0]); printpfx("X15", hash); + x16r_hash(&hash[0], &buf[0]); + printpfx("X16R", hash); + + x16rt_hash(&hash[0], &buf[0]); + printpfx("X16RT", hash); + + x16rv2_hash(&hash[0], &buf[0]); + printpfx("X16Rv2", hash); + + x16s_hash(&hash[0], &buf[0]); + printpfx("X16S", hash); + x17hash(&hash[0], &buf[0]); printpfx("X17", hash); + x21s_hash(&hash[0], &buf[0]); + printpfx("X21S", hash); + //memcpy(buf, zrtest, 80); zr5hash(&hash[0], &buf[0]); //zr5hash_pok(&hash[0], (uint32_t*) &buf[0]); diff --git a/x11/0x10.cu b/x11/0x10.cu index e8b062c43f..5120ecfa35 100644 --- a/x11/0x10.cu +++ b/x11/0x10.cu @@ -98,7 +98,7 @@ extern "C" int scanhash_hash0x10(int thr_id, struct work* work, uint32_t max_non uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19; + int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 21 : 20; uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8; //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); @@ -123,7 +123,6 @@ extern "C" int scanhash_hash0x10(int thr_id, struct work* work, uint32_t max_non quark_jh512_cpu_init(thr_id, throughput); x11_luffa512_cpu_init(thr_id, throughput); quark_keccak512_cpu_init(thr_id, throughput); - x11_cubehash512_cpu_init(thr_id, throughput); if (x11_simd512_cpu_init(thr_id, throughput) != 0) { return 0; } @@ -148,28 +147,17 @@ extern "C" int scanhash_hash0x10(int thr_id, struct work* work, uint32_t max_non // Hash with CUDA quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; - TRACE("blake :"); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("skein :"); quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("bmw :"); quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("groestl:"); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("jh512 :"); x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("luffa+c:"); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("keccak :"); - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("cubehash :"); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; + x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("simd :"); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("shavite:"); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - TRACE("echo => "); - + *hashes_done = pdata[19] - first_nonce + throughput; work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); diff --git a/x11/bitcore.cu b/x11/bitcore.cu index 78739679c3..ede1d169ca 100644 --- a/x11/bitcore.cu +++ b/x11/bitcore.cu @@ -278,10 +278,10 @@ extern "C" 
int scanhash_bitcore(int thr_id, struct work* work, uint32_t max_nonc
 	quark_bmw512_cpu_init(thr_id, throughput);
 	quark_groestl512_cpu_init(thr_id, throughput);
 	quark_skein512_cpu_init(thr_id, throughput);
-	quark_keccak512_cpu_init(thr_id, throughput);
+	//quark_keccak512_cpu_init(thr_id, throughput);
 	quark_jh512_cpu_init(thr_id, throughput);
 	x11_luffa512_cpu_init(thr_id, throughput);
-	x11_cubehash512_cpu_init(thr_id, throughput);
+	//x11_cubehash512_cpu_init(thr_id, throughput);
 	x11_shavite512_cpu_init(thr_id, throughput);
 	if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
 		return 0;
@@ -346,7 +346,7 @@ int scanhash_bitcore(int thr_id, struct work* work, uint32_t max_nonc
 			TRACE("jh512 :"); break;
 		case KECCAK:
-			quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i);
+			quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
 			TRACE("keccak :"); break;
 		case LUFFA:
@@ -354,7 +354,7 @@ int scanhash_bitcore(int thr_id, struct work* work, uint32_t max_nonc
 			TRACE("luffa :"); break;
 		case CUBEHASH:
-			x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], i);
+			x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
 			TRACE("cube :"); break;
 		case SHAVITE:
diff --git a/x11/c11.cu b/x11/c11.cu
index 8f8f6663b2..2f6b8f6078 100644
--- a/x11/c11.cu
+++ b/x11/c11.cu
@@ -178,7 +178,7 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
 	TRACE("groestl:");
 	quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 	TRACE("jh512 :");
-	quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+	quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++;
 	TRACE("keccak :");
 	quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 	TRACE("skein :");
diff --git a/x11/cuda_x11.h b/x11/cuda_x11.h
index 8552157941..8b47f6f91e 100644
--- a/x11/cuda_x11.h
+++ b/x11/cuda_x11.h
@@ -7,7 +7,7 @@ extern void x11_luffa512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_luffa512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads);
-extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
 extern void x11_shavite512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
diff --git a/x11/cuda_x11_aes_sp.cuh b/x11/cuda_x11_aes_sp.cuh
new file mode 100644
index 0000000000..b99296698a
--- /dev/null
+++ b/x11/cuda_x11_aes_sp.cuh
@@ -0,0 +1,585 @@
+/*
+Based upon the two Christians', klaus_t's, Tanguy Pruvot's and SP's work (2013-2016)
+Provos Alexis - 2016
+optimized by sp - 2018 (+50% faster on the gtx 1080ti)
+*/
+#include "miner.h"
+
+__device__ static uint32_t __align__(16) d_AES0[256] = {
+	0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591, 0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC,
+	0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB, 0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23,
0xF7A4A453, 0x967272E4, 0x5BC0C09B, + 0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83, 0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A, + 0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F, 0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA, + 0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B, 0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413, + 0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6, 0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85, + 0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511, 0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B, + 0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1, 0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF, + 0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E, 0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6, + 0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B, 0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD, + 0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8, 0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2, + 0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949, 0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810, + 0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697, 0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F, + 0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C, 0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27, + 0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433, 0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5, + 0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0, 0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C +}; + +__device__ static uint32_t __align__(16) d_AES3[256] = { + 0xC6A56363, 0xF8847C7C, 0xEE997777, 0xF68D7B7B, 0xFF0DF2F2, 0xD6BD6B6B, 0xDEB16F6F, 0x9154C5C5, 0x60503030, 0x02030101, 0xCEA96767, 0x567D2B2B, 0xE719FEFE, 0xB562D7D7, 0x4DE6ABAB, 0xEC9A7676, + 0x8F45CACA, 0x1F9D8282, 0x8940C9C9, 0xFA877D7D, 0xEF15FAFA, 0xB2EB5959, 0x8EC94747, 0xFB0BF0F0, 0x41ECADAD, 0xB367D4D4, 0x5FFDA2A2, 0x45EAAFAF, 0x23BF9C9C, 0x53F7A4A4, 0xE4967272, 0x9B5BC0C0, + 0x75C2B7B7, 0xE11CFDFD, 0x3DAE9393, 0x4C6A2626, 0x6C5A3636, 0x7E413F3F, 0xF502F7F7, 0x834FCCCC, 0x685C3434, 0x51F4A5A5, 0xD134E5E5, 0xF908F1F1, 0xE2937171, 0xAB73D8D8, 0x62533131, 0x2A3F1515, + 0x080C0404, 0x9552C7C7, 0x46652323, 0x9D5EC3C3, 0x30281818, 0x37A19696, 0x0A0F0505, 0x2FB59A9A, 0x0E090707, 0x24361212, 0x1B9B8080, 0xDF3DE2E2, 0xCD26EBEB, 
0x4E692727, 0x7FCDB2B2, 0xEA9F7575, + 0x121B0909, 0x1D9E8383, 0x58742C2C, 0x342E1A1A, 0x362D1B1B, 0xDCB26E6E, 0xB4EE5A5A, 0x5BFBA0A0, 0xA4F65252, 0x764D3B3B, 0xB761D6D6, 0x7DCEB3B3, 0x527B2929, 0xDD3EE3E3, 0x5E712F2F, 0x13978484, + 0xA6F55353, 0xB968D1D1, 0x00000000, 0xC12CEDED, 0x40602020, 0xE31FFCFC, 0x79C8B1B1, 0xB6ED5B5B, 0xD4BE6A6A, 0x8D46CBCB, 0x67D9BEBE, 0x724B3939, 0x94DE4A4A, 0x98D44C4C, 0xB0E85858, 0x854ACFCF, + 0xBB6BD0D0, 0xC52AEFEF, 0x4FE5AAAA, 0xED16FBFB, 0x86C54343, 0x9AD74D4D, 0x66553333, 0x11948585, 0x8ACF4545, 0xE910F9F9, 0x04060202, 0xFE817F7F, 0xA0F05050, 0x78443C3C, 0x25BA9F9F, 0x4BE3A8A8, + 0xA2F35151, 0x5DFEA3A3, 0x80C04040, 0x058A8F8F, 0x3FAD9292, 0x21BC9D9D, 0x70483838, 0xF104F5F5, 0x63DFBCBC, 0x77C1B6B6, 0xAF75DADA, 0x42632121, 0x20301010, 0xE51AFFFF, 0xFD0EF3F3, 0xBF6DD2D2, + 0x814CCDCD, 0x18140C0C, 0x26351313, 0xC32FECEC, 0xBEE15F5F, 0x35A29797, 0x88CC4444, 0x2E391717, 0x9357C4C4, 0x55F2A7A7, 0xFC827E7E, 0x7A473D3D, 0xC8AC6464, 0xBAE75D5D, 0x322B1919, 0xE6957373, + 0xC0A06060, 0x19988181, 0x9ED14F4F, 0xA37FDCDC, 0x44662222, 0x547E2A2A, 0x3BAB9090, 0x0B838888, 0x8CCA4646, 0xC729EEEE, 0x6BD3B8B8, 0x283C1414, 0xA779DEDE, 0xBCE25E5E, 0x161D0B0B, 0xAD76DBDB, + 0xDB3BE0E0, 0x64563232, 0x744E3A3A, 0x141E0A0A, 0x92DB4949, 0x0C0A0606, 0x486C2424, 0xB8E45C5C, 0x9F5DC2C2, 0xBD6ED3D3, 0x43EFACAC, 0xC4A66262, 0x39A89191, 0x31A49595, 0xD337E4E4, 0xF28B7979, + 0xD532E7E7, 0x8B43C8C8, 0x6E593737, 0xDAB76D6D, 0x018C8D8D, 0xB164D5D5, 0x9CD24E4E, 0x49E0A9A9, 0xD8B46C6C, 0xACFA5656, 0xF307F4F4, 0xCF25EAEA, 0xCAAF6565, 0xF48E7A7A, 0x47E9AEAE, 0x10180808, + 0x6FD5BABA, 0xF0887878, 0x4A6F2525, 0x5C722E2E, 0x38241C1C, 0x57F1A6A6, 0x73C7B4B4, 0x9751C6C6, 0xCB23E8E8, 0xA17CDDDD, 0xE89C7474, 0x3E211F1F, 0x96DD4B4B, 0x61DCBDBD, 0x0D868B8B, 0x0F858A8A, + 0xE0907070, 0x7C423E3E, 0x71C4B5B5, 0xCCAA6666, 0x90D84848, 0x06050303, 0xF701F6F6, 0x1C120E0E, 0xC2A36161, 0x6A5F3535, 0xAEF95757, 0x69D0B9B9, 0x17918686, 0x9958C1C1, 0x3A271D1D, 0x27B99E9E, + 0xD938E1E1, 0xEB13F8F8, 0x2BB39898, 0x22331111, 0xD2BB6969, 0xA970D9D9, 0x07898E8E, 0x33A79494, 0x2DB69B9B, 0x3C221E1E, 0x15928787, 0xC920E9E9, 0x8749CECE, 0xAAFF5555, 0x50782828, 0xA57ADFDF, + 0x038F8C8C, 0x59F8A1A1, 0x09808989, 0x1A170D0D, 0x65DABFBF, 0xD731E6E6, 0x84C64242, 0xD0B86868, 0x82C34141, 0x29B09999, 0x5A772D2D, 0x1E110F0F, 0x7BCBB0B0, 0xA8FC5454, 0x6DD6BBBB, 0x2C3A1616 +}; + +__device__ __forceinline__ +void aes_gpu_init_mt_256(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x<256){ + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + sharedMemory[0][threadIdx.x] = temp; + sharedMemory[1][threadIdx.x] = ROL8(temp); + sharedMemory[2][threadIdx.x] = ROL16(temp); + sharedMemory[3][threadIdx.x] = ROR8(temp); + } +} + +__device__ __forceinline__ +void aes_gpu_init256(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill a uint32 */ + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + sharedMemory[0][threadIdx.x] = temp; + sharedMemory[1][threadIdx.x] = ROL8(temp); + sharedMemory[2][threadIdx.x] = ROL16(temp); + sharedMemory[3][threadIdx.x] = ROR8(temp); +} + +__device__ __forceinline__ +void aes_gpu_init128(uint32_t sharedMemory[4][256]) +{ + /* each thread startup will fill 2 uint32 */ + uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]); + + sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x; + sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y; + sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); + sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); + 
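	/* rows 2 and 3 below hold the 16- and 24-bit rotations, so rows 0-3 together provide the four AES T-tables derived from the single d_AES0 source */
+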
	sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x);
+	sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y);
+	sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x);
+	sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y);
+}
+
+
+__device__ __forceinline__
+void aes_gpu_init64(uint32_t sharedMemory[4][256])
+{
+	if (threadIdx.x < 64)
+	{
+		/* threads 0-63 each load 4 table words (2 uint2) and fill all four rows */
+		uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]);
+		uint2 temp2 = __ldg(&((uint2*)&d_AES0)[threadIdx.x + 64]);
+
+		sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x;
+		sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y;
+		sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x);
+		sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y);
+		sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x);
+		sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y);
+		sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x);
+		sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y);
+
+		sharedMemory[0][128 + (threadIdx.x << 1) + 0] = temp2.x;
+		sharedMemory[0][128 + (threadIdx.x << 1) + 1] = temp2.y;
+		sharedMemory[1][128 + (threadIdx.x << 1) + 0] = ROL8(temp2.x);
+		sharedMemory[1][128 + (threadIdx.x << 1) + 1] = ROL8(temp2.y);
+		sharedMemory[2][128 + (threadIdx.x << 1) + 0] = ROL16(temp2.x);
+		sharedMemory[2][128 + (threadIdx.x << 1) + 1] = ROL16(temp2.y);
+		sharedMemory[3][128 + (threadIdx.x << 1) + 0] = ROR8(temp2.x);
+		sharedMemory[3][128 + (threadIdx.x << 1) + 1] = ROR8(temp2.y);
+	}
+}
+
+
+__device__ __forceinline__
+void aes_gpu_init32(uint32_t sharedMemory[4][256])
+{
+	if (threadIdx.x < 32)
+	{
+
+		/* threads 0-31 each load 8 table words (4 uint2) and fill all four rows */
+		uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]);
+		uint2 temp1 = __ldg(&((uint2*)&d_AES0)[threadIdx.x + 32]);
+		uint2 temp2 = __ldg(&((uint2*)&d_AES0)[threadIdx.x + 64]);
+		uint2 temp3 = __ldg(&((uint2*)&d_AES0)[threadIdx.x + 96]);
+
+		sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x;
+		sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y;
+		sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x);
+		sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y);
+		sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x);
+		sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y);
+		sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x);
+		sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y);
+
+		sharedMemory[0][64 + (threadIdx.x << 1) + 0] = temp1.x;
+		sharedMemory[0][64 + (threadIdx.x << 1) + 1] = temp1.y;
+		sharedMemory[1][64 + (threadIdx.x << 1) + 0] = ROL8(temp1.x);
+		sharedMemory[1][64 + (threadIdx.x << 1) + 1] = ROL8(temp1.y);
+		sharedMemory[2][64 + (threadIdx.x << 1) + 0] = ROL16(temp1.x);
+		sharedMemory[2][64 + (threadIdx.x << 1) + 1] = ROL16(temp1.y);
+		sharedMemory[3][64 + (threadIdx.x << 1) + 0] = ROR8(temp1.x);
+		sharedMemory[3][64 + (threadIdx.x << 1) + 1] = ROR8(temp1.y);
+
+		sharedMemory[0][128 + (threadIdx.x << 1) + 0] = temp2.x;
+		sharedMemory[0][128 + (threadIdx.x << 1) + 1] = temp2.y;
+		sharedMemory[1][128 + (threadIdx.x << 1) + 0] = ROL8(temp2.x);
+		sharedMemory[1][128 + (threadIdx.x << 1) + 1] = ROL8(temp2.y);
+		sharedMemory[2][128 + (threadIdx.x << 1) + 0] = ROL16(temp2.x);
+		sharedMemory[2][128 + (threadIdx.x << 1) + 1] = ROL16(temp2.y);
+		sharedMemory[3][128 + (threadIdx.x << 1) + 0] = ROR8(temp2.x);
+		sharedMemory[3][128 + (threadIdx.x << 1) + 1] = ROR8(temp2.y);
+
+		sharedMemory[0][192 + (threadIdx.x << 1) + 0] = temp3.x;
+		sharedMemory[0][192 + (threadIdx.x << 1) + 1] = temp3.y;
+		sharedMemory[1][192 + (threadIdx.x << 1) + 0] = ROL8(temp3.x);
+		sharedMemory[1][192 + (threadIdx.x << 1) + 1] = ROL8(temp3.y);
+		sharedMemory[2][192 + (threadIdx.x << 1) + 0] = ROL16(temp3.x);
+		sharedMemory[2][192 + (threadIdx.x << 1) + 1] = ROL16(temp3.y);
+		sharedMemory[3][192 + (threadIdx.x << 1) + 0] = ROR8(temp3.x);
+		sharedMemory[3][192 + (threadIdx.x << 1) + 1] = ROR8(temp3.y);
+	}
+}
+
+
+__device__ __forceinline__
+void aes_gpu_init_lt_256(uint32_t sharedMemory[4][256])
+{
+	if (threadIdx.x<128){
+		/* each thread startup will fill 2 uint32 */
+		uint2 temp = __ldg(&((uint2*)&d_AES0)[threadIdx.x]);
+
+		sharedMemory[0][(threadIdx.x << 1) + 0] = temp.x;
+		sharedMemory[0][(threadIdx.x << 1) + 1] = temp.y;
+		sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x);
+		sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y);
+		sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x);
+		sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y);
+		sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x);
+		sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y);
+	}
+}
+
+
+__device__ __forceinline__
+static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){
+
+	y0 = __ldg(&d_AES0[__byte_perm(x0, 0, 0x4440)]);
+	y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
+	y0 ^= k0;
+	y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
+	y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
+
+	y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
+	y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
+	y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
+	y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)];
+
+	y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
+#ifdef INTENSIVE_GMF
+	y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
+#else
+	y2 ^= sharedMemory[3][__byte_perm(x1, 0, 0x4443)];
+#endif
+	y2 ^= __ldg(&d_AES0[__byte_perm(x2, 0, 0x4440)]);
+	y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
+
+	y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
+	y1 ^= sharedMemory[0][__byte_perm(x1, 0, 0x4440)];
+	y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
+	y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
+}
+
+__device__ __forceinline__
+static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){
+
+	y0 = __ldg(&d_AES0[x0 & 0xff]);
+	y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
+	y0 ^= k0;
+	y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
+	y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
+
+	y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
+	y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
+	y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
+	y3 ^= __ldg(&d_AES0[x3 & 0xff]);
+
+	y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
+	y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
+	y2 ^= __ldg(&d_AES0[x2 & 0xff]);
+	y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
+
+	y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
+	y1 ^= sharedMemory[0][x1 & 0xff];
+	y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
+	y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
+}
+
+
+__device__ __forceinline__
+static void aes_round(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
+{
+
+	y0 = sharedMemory[0][x0 & 0xff];
+	y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
+	y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
+	y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
+
+	y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
+	y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
+	y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
+	y3 ^= sharedMemory[0][x3 & 0xff];
+	y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
+	y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
+	y2 ^= sharedMemory[0][x2 & 0xff];
+	y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
+	y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
+#ifdef INTENSIVE_GMF
+	y1 ^= __ldg(&d_AES0[x1 & 0xff]);
+#else
+	y1 ^= sharedMemory[0][x1 & 0xff];
+#endif
+	y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
+	y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
+}
+
+
+__device__ __forceinline__
+static void aes_round_LDG(const uint32_t sharedMemory[4][256], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3){
+
+	y0 = sharedMemory[0][__byte_perm(x0, 0, 0x4440)];
+	y0 ^= sharedMemory[1][__byte_perm(x1, 0, 0x4441)];
+	y0 ^= sharedMemory[2][__byte_perm(x2, 0, 0x4442)];
+	y0 ^= __ldg(&d_AES3[__byte_perm(x3, 0, 0x4443)]);
+
+	y3 = sharedMemory[1][__byte_perm(x0, 0, 0x4441)];
+	y3 ^= sharedMemory[2][__byte_perm(x1, 0, 0x4442)];
+	y3 ^= __ldg(&d_AES3[__byte_perm(x2, 0, 0x4443)]);
+	y3 ^= sharedMemory[0][__byte_perm(x3, 0, 0x4440)];
+
+	y2 = sharedMemory[2][__byte_perm(x0, 0, 0x4442)];
+	y2 ^= __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)]);
+	y2 ^= sharedMemory[0][__byte_perm(x2, 0, 0x4440)];
+	y2 ^= sharedMemory[1][__byte_perm(x3, 0, 0x4441)];
+
+	y1 = __ldg(&d_AES3[__byte_perm(x0, 0, 0x4443)]);
+	y1 ^= __ldg(&d_AES0[__byte_perm(x1, 0, 0x4440)]);
+	y1 ^= sharedMemory[1][__byte_perm(x2, 0, 0x4441)];
+	y1 ^= sharedMemory[2][__byte_perm(x3, 0, 0x4442)];
+}
+
+
+__device__ __forceinline__ void AES_2ROUND(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0)
+{
+	uint32_t y0, y1, y2, y3;
+
+	aes_round(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3);
+
+	aes_round(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3);
+
+	// here we will need a carry (or maybe not)
+	k0++;
+}
+
+__device__ __forceinline__ void AES_2ROUND_LDG(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0)
+{
+	uint32_t y0, y1, y2, y3;
+
+	aes_round_LDG(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3);
+	aes_round_LDG(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3);
+
+	// here we will need a carry (or maybe not)
+	k0++;
+}
+
+__device__ __forceinline__ void AES_2ROUND_LDG2(const uint32_t sharedMemory[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0)
+{
+	uint32_t y0, y1, y2, y3;
+
+	aes_round(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3);
+
+	aes_round_LDG(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3);
+
+	// here we will need a carry (or maybe not)
+	k0++;
+}
+
+
+__device__ __forceinline__
+static void AES_ROUND_NOKEY(const uint32_t sharedMemory[4][256], uint4* x){
+
+	uint32_t y0, y1, y2, y3;
+	aes_round(sharedMemory, x->x, x->y, x->z, x->w, y0, y1, y2, y3);
+
+	x->x = y0;
+	x->y = y1;
+	x->z = y2;
+	x->w = y3;
+}
+
+__device__ __forceinline__
+static void KEY_EXPAND_ELT(const uint32_t sharedMemory[4][256], uint32_t *k)
+{
+
+	uint32_t y0, y1, y2, y3;
+	aes_round(sharedMemory, k[0], k[1], k[2], k[3], y0, y1, y2, y3);
+
+	k[0] = y1;
+	k[1] = y2;
+	k[2] = y3;
+	k[3] = y0;
+}
+
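+/*
+ The [1024 * 8] table used by the *_32 helpers below is the warp-replicated
+ layout written by aes_gpu_init256_32 / aes_gpu_init128_32 further down:
+ each of the 256 d_AES0 words is copied into 32 consecutive slots, so the
+ entry for byte value b is read as
+
+	sharedMemory[(threadIdx.x & 31) + (b << 5)]
+
+ and every lane of a warp hits its own shared-memory bank, keeping the S-box
+ lookups bank-conflict free. Only the AES0 table is stored; the three rotated
+ T-tables are derived on the fly with ROL8/ROL16/ROR8.
+*/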
+__device__ __forceinline__
+static void aes_round_32(const uint32_t sharedMemory[1024 * 8], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
+{
+	y0 = sharedMemory[(threadIdx.x & 31) + ((x0 & 0xff) << 5)];
+	y3 = ROL8(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x0, 0, 0x4441) << 5))]);
+	y2 = ROL16(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x0, 0, 0x4442) << 5)]);
+	y1 = ROR8(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x0, 0, 0x4443) << 5)]);
+
+	y0 ^= ROL8(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x1, 0, 0x4441) << 5))]);
+	y3 ^= ROL16(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x1, 0, 0x4442) << 5))]);
+	y2 ^= ROR8(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x1, 0, 0x4443) << 5)]);
+	y1 ^= sharedMemory[(threadIdx.x & 31) + ((x1 & 0xff) << 5)];
+
+	y0 ^= k0;
+	y0 ^= ROL16(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x2, 0, 0x4442) << 5))]);
+	y3 ^= ROR8(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x2, 0, 0x4443) << 5))]);
+	y2 ^= sharedMemory[(threadIdx.x & 31) + ((x2 & 0xff) << 5)];
+	y1 ^= ROL8(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x2, 0, 0x4441) << 5)]);
+
+	y0 ^= ROR8(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x3, 0, 0x4443) << 5))]);
+	y3 ^= sharedMemory[(threadIdx.x & 31) + ((x3 & 0xff) << 5)];
+	y2 ^= ROL8(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x3, 0, 0x4441) << 5)]);
+	y1 ^= ROL16(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x3, 0, 0x4442) << 5)]);
+}
+
+__device__ __forceinline__
+static void aes_round_32(const uint32_t sharedMemory[1024 * 8], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3)
+{
+	y0 = sharedMemory[(threadIdx.x & 31) + ((x0 & 0xff) << 5)];
+	y0 ^= ROL8(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x1, 0, 0x4441) << 5))]);
+	y0 ^= ROL16(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x2, 0, 0x4442) << 5))]);
+	y0 ^= ROR8(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x3, 0, 0x4443) << 5))]);
+
+	y3 = ROL8(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x0, 0, 0x4441) << 5))]);
+	y3 ^= ROL16(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x1, 0, 0x4442) << 5))]);
+	y3 ^= ROR8(sharedMemory[(threadIdx.x & 31) + ((__byte_perm(x2, 0, 0x4443) << 5))]);
+	y3 ^= sharedMemory[(threadIdx.x & 31) + ((x3 & 0xff) << 5)];
+
+	y2 = ROL16(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x0, 0, 0x4442) << 5)]);
+	y2 ^= ROR8(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x1, 0, 0x4443) << 5)]);
+	y2 ^= sharedMemory[(threadIdx.x & 31) + ((x2 & 0xff) << 5)];
+	y2 ^= ROL8(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x3, 0, 0x4441) << 5)]);
+
+	y1 = ROR8(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x0, 0, 0x4443) << 5)]);
+	y1 ^= sharedMemory[(threadIdx.x & 31) + ((x1 & 0xff) << 5)];
+	y1 ^= ROL8(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x2, 0, 0x4441) << 5)]);
+	y1 ^= ROL16(sharedMemory[(threadIdx.x & 31) + (__byte_perm(x3, 0, 0x4442) << 5)]);
+}
+
+__device__ __forceinline__ void AES_2ROUND_32(const uint32_t sharedMemory[1024 * 8], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &k0)
+{
+	uint32_t y0, y1, y2, y3;
+
+	aes_round_32(sharedMemory, x0, x1, x2, x3, k0, y0, y1, y2, y3);
+
+	aes_round_32(sharedMemory, y0, y1, y2, y3, x0, x1, x2, x3);
+
+	// here we will need a carry (or maybe not)
+	k0++;
+}
+__device__ __forceinline__
+static void AES_ROUND_NOKEY_32(const uint32_t sharedMemory[1024 * 8], uint4* x){
+
+	uint32_t y0, y1, y2, y3;
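+	// one keyless AES round: substitute and mix the four state words in place
+	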
aes_round_32(sharedMemory, x->x, x->y, x->z, x->w, y0, y1, y2, y3); + + x->x = y0; + x->y = y1; + x->z = y2; + x->w = y3; +} + +__device__ __forceinline__ +static void AES_ROUND_NOKEY_32(const uint32_t sharedMemory[1024 * 8], uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4) +{ + uint32_t y0, y1, y2, y3; + aes_round_32(sharedMemory, x1, x2, x3, x4, y0, y1, y2, y3); + + x1 = y0; + x2 = y1; + x3 = y2; + x4 = y3; +} + + + +__device__ __forceinline__ +static void KEY_EXPAND_ELT_32(const uint32_t sharedMemory[1024 * 8], uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4) +{ + + uint32_t y0, y1, y2, y3; + aes_round_32(sharedMemory, x1, x2, x3, x4, y0, y1, y2, y3); + + x1 = y1; + x2 = y2; + x3 = y3; + x4 = y0; +} + + +__device__ __forceinline__ +static void KEY_EXPAND_ELT_32(const uint32_t sharedMemory[1024 * 8], uint32_t *k) +{ + + uint32_t y0, y1, y2, y3; + aes_round_32(sharedMemory, k[0], k[1], k[2], k[3], y0, y1, y2, y3); + + k[0] = y1; + k[1] = y2; + k[2] = y3; + k[3] = y0; +} + + +__device__ __forceinline__ +void aes_gpu_init256_32(uint32_t sharedMemory[1024 * 8]) +{ + /* each thread startup will fill a uint32 */ + const uint32_t thread = threadIdx.x << 5; + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + sharedMemory[thread] = temp; + sharedMemory[1 + thread] = temp; + sharedMemory[2 + thread] = temp; + sharedMemory[3 + thread] = temp; + sharedMemory[4 + thread] = temp; + sharedMemory[5 + thread] = temp; + sharedMemory[6 + thread] = temp; + sharedMemory[7 + thread] = temp; + sharedMemory[8 + thread] = temp; + sharedMemory[9 + thread] = temp; + sharedMemory[10 + thread] = temp; + sharedMemory[11 + thread] = temp; + sharedMemory[12 + thread] = temp; + sharedMemory[13 + thread] = temp; + sharedMemory[14 + thread] = temp; + sharedMemory[15 + thread] = temp; + sharedMemory[16 + thread] = temp; + sharedMemory[17 + thread] = temp; + sharedMemory[18 + thread] = temp; + sharedMemory[19 + thread] = temp; + sharedMemory[20 + thread] = temp; + sharedMemory[21 + thread] = temp; + sharedMemory[22 + thread] = temp; + sharedMemory[23 + thread] = temp; + sharedMemory[24 + thread] = temp; + sharedMemory[25 + thread] = temp; + sharedMemory[26 + thread] = temp; + sharedMemory[27 + thread] = temp; + sharedMemory[28 + thread] = temp; + sharedMemory[29 + thread] = temp; + sharedMemory[30 + thread] = temp; + sharedMemory[31 + thread] = temp; +} + +__device__ __forceinline__ +void aes_gpu_init128_32(uint32_t sharedMemory[1024 * 8]) +{ + /* each thread startup will fill a uint32 */ + const uint32_t thread = threadIdx.x << 5; + const uint32_t thread2 = ((threadIdx.x + 128) << 5); + + uint32_t temp = __ldg(&d_AES0[threadIdx.x]); + uint32_t temp2 = __ldg(&d_AES0[threadIdx.x + 128]); + sharedMemory[thread] = temp; + sharedMemory[1 + thread] = temp; + sharedMemory[2 + thread] = temp; + sharedMemory[3 + thread] = temp; + sharedMemory[4 + thread] = temp; + sharedMemory[5 + thread] = temp; + sharedMemory[6 + thread] = temp; + sharedMemory[7 + thread] = temp; + sharedMemory[8 + thread] = temp; + sharedMemory[9 + thread] = temp; + sharedMemory[10 + thread] = temp; + sharedMemory[11 + thread] = temp; + sharedMemory[12 + thread] = temp; + sharedMemory[13 + thread] = temp; + sharedMemory[14 + thread] = temp; + sharedMemory[15 + thread] = temp; + sharedMemory[16 + thread] = temp; + sharedMemory[17 + thread] = temp; + sharedMemory[18 + thread] = temp; + sharedMemory[19 + thread] = temp; + sharedMemory[20 + thread] = temp; + sharedMemory[21 + thread] = temp; + sharedMemory[22 + thread] = temp; + sharedMemory[23 + thread] 
= temp; + sharedMemory[24 + thread] = temp; + sharedMemory[25 + thread] = temp; + sharedMemory[26 + thread] = temp; + sharedMemory[27 + thread] = temp; + sharedMemory[28 + thread] = temp; + sharedMemory[29 + thread] = temp; + sharedMemory[30 + thread] = temp; + sharedMemory[31 + thread] = temp; + + sharedMemory[thread2] = temp2; + sharedMemory[1 + thread2] = temp2; + sharedMemory[2 + thread2] = temp2; + sharedMemory[3 + thread2] = temp2; + sharedMemory[4 + thread2] = temp2; + sharedMemory[5 + thread2] = temp2; + sharedMemory[6 + thread2] = temp2; + sharedMemory[7 + thread2] = temp2; + sharedMemory[8 + thread2] = temp2; + sharedMemory[9 + thread2] = temp2; + sharedMemory[10 + thread2] = temp2; + sharedMemory[11 + thread2] = temp2; + sharedMemory[12 + thread2] = temp2; + sharedMemory[13 + thread2] = temp2; + sharedMemory[14 + thread2] = temp2; + sharedMemory[15 + thread2] = temp2; + sharedMemory[16 + thread2] = temp2; + sharedMemory[17 + thread2] = temp2; + sharedMemory[18 + thread2] = temp2; + sharedMemory[19 + thread2] = temp2; + sharedMemory[20 + thread2] = temp2; + sharedMemory[21 + thread2] = temp2; + sharedMemory[22 + thread2] = temp2; + sharedMemory[23 + thread2] = temp2; + sharedMemory[24 + thread2] = temp2; + sharedMemory[25 + thread2] = temp2; + sharedMemory[26 + thread2] = temp2; + sharedMemory[27 + thread2] = temp2; + sharedMemory[28 + thread2] = temp2; + sharedMemory[29 + thread2] = temp2; + sharedMemory[30 + thread2] = temp2; + sharedMemory[31 + thread2] = temp2; + +} \ No newline at end of file diff --git a/x11/cuda_x11_cubehash512.cu b/x11/cuda_x11_cubehash512.cu index f7ce97c4b8..c630a9e719 100644 --- a/x11/cuda_x11_cubehash512.cu +++ b/x11/cuda_x11_cubehash512.cu @@ -1,320 +1,863 @@ -#include -#include - -#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ -#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ - -#if __CUDA_ARCH__ < 350 -#define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) -#else -#define LROT(x, bits) __funnelshift_l(x, x, bits) -#endif - -#define ROTATEUPWARDS7(a) LROT(a,7) -#define ROTATEUPWARDS11(a) LROT(a,11) - -#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } - -__device__ __constant__ -static const uint32_t c_IV_512[32] = { - 0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E, - 0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695, - 0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537, - 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE, - 0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532, - 0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9, - 0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576, - 0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44 +/* + Based on Tanguy Pruvot's repo + Provos Alexis - 2016 + Optimized for nvidia pascal/volta by sp (2018/2019) +*/ + +#include "cuda_helper_alexis.h" +#include "cuda_vectors_alexis.h" + +#define AESx(x) (x ##UL) /* SPH_C32(x) */ + +__constant__ static uint32_t __align__(16) d_AES0[256] = { + 0xA56363C6, 0x847C7CF8, 0x997777EE, 0x8D7B7BF6, 0x0DF2F2FF, 0xBD6B6BD6, 0xB16F6FDE, 0x54C5C591, 0x50303060, 0x03010102, 0xA96767CE, 0x7D2B2B56, 0x19FEFEE7, 0x62D7D7B5, 0xE6ABAB4D, 0x9A7676EC, + 0x45CACA8F, 0x9D82821F, 0x40C9C989, 0x877D7DFA, 0x15FAFAEF, 0xEB5959B2, 0xC947478E, 0x0BF0F0FB, 0xECADAD41, 0x67D4D4B3, 0xFDA2A25F, 0xEAAFAF45, 0xBF9C9C23, 0xF7A4A453, 0x967272E4, 0x5BC0C09B, + 0xC2B7B775, 0x1CFDFDE1, 0xAE93933D, 0x6A26264C, 0x5A36366C, 0x413F3F7E, 0x02F7F7F5, 0x4FCCCC83, 0x5C343468, 0xF4A5A551, 0x34E5E5D1, 0x08F1F1F9, 0x937171E2, 0x73D8D8AB, 0x53313162, 0x3F15152A, + 0x0C040408, 0x52C7C795, 0x65232346, 0x5EC3C39D, 
0x28181830, 0xA1969637, 0x0F05050A, 0xB59A9A2F, 0x0907070E, 0x36121224, 0x9B80801B, 0x3DE2E2DF, 0x26EBEBCD, 0x6927274E, 0xCDB2B27F, 0x9F7575EA, + 0x1B090912, 0x9E83831D, 0x742C2C58, 0x2E1A1A34, 0x2D1B1B36, 0xB26E6EDC, 0xEE5A5AB4, 0xFBA0A05B, 0xF65252A4, 0x4D3B3B76, 0x61D6D6B7, 0xCEB3B37D, 0x7B292952, 0x3EE3E3DD, 0x712F2F5E, 0x97848413, + 0xF55353A6, 0x68D1D1B9, 0x00000000, 0x2CEDEDC1, 0x60202040, 0x1FFCFCE3, 0xC8B1B179, 0xED5B5BB6, 0xBE6A6AD4, 0x46CBCB8D, 0xD9BEBE67, 0x4B393972, 0xDE4A4A94, 0xD44C4C98, 0xE85858B0, 0x4ACFCF85, + 0x6BD0D0BB, 0x2AEFEFC5, 0xE5AAAA4F, 0x16FBFBED, 0xC5434386, 0xD74D4D9A, 0x55333366, 0x94858511, 0xCF45458A, 0x10F9F9E9, 0x06020204, 0x817F7FFE, 0xF05050A0, 0x443C3C78, 0xBA9F9F25, 0xE3A8A84B, + 0xF35151A2, 0xFEA3A35D, 0xC0404080, 0x8A8F8F05, 0xAD92923F, 0xBC9D9D21, 0x48383870, 0x04F5F5F1, 0xDFBCBC63, 0xC1B6B677, 0x75DADAAF, 0x63212142, 0x30101020, 0x1AFFFFE5, 0x0EF3F3FD, 0x6DD2D2BF, + 0x4CCDCD81, 0x140C0C18, 0x35131326, 0x2FECECC3, 0xE15F5FBE, 0xA2979735, 0xCC444488, 0x3917172E, 0x57C4C493, 0xF2A7A755, 0x827E7EFC, 0x473D3D7A, 0xAC6464C8, 0xE75D5DBA, 0x2B191932, 0x957373E6, + 0xA06060C0, 0x98818119, 0xD14F4F9E, 0x7FDCDCA3, 0x66222244, 0x7E2A2A54, 0xAB90903B, 0x8388880B, 0xCA46468C, 0x29EEEEC7, 0xD3B8B86B, 0x3C141428, 0x79DEDEA7, 0xE25E5EBC, 0x1D0B0B16, 0x76DBDBAD, + 0x3BE0E0DB, 0x56323264, 0x4E3A3A74, 0x1E0A0A14, 0xDB494992, 0x0A06060C, 0x6C242448, 0xE45C5CB8, 0x5DC2C29F, 0x6ED3D3BD, 0xEFACAC43, 0xA66262C4, 0xA8919139, 0xA4959531, 0x37E4E4D3, 0x8B7979F2, + 0x32E7E7D5, 0x43C8C88B, 0x5937376E, 0xB76D6DDA, 0x8C8D8D01, 0x64D5D5B1, 0xD24E4E9C, 0xE0A9A949, 0xB46C6CD8, 0xFA5656AC, 0x07F4F4F3, 0x25EAEACF, 0xAF6565CA, 0x8E7A7AF4, 0xE9AEAE47, 0x18080810, + 0xD5BABA6F, 0x887878F0, 0x6F25254A, 0x722E2E5C, 0x241C1C38, 0xF1A6A657, 0xC7B4B473, 0x51C6C697, 0x23E8E8CB, 0x7CDDDDA1, 0x9C7474E8, 0x211F1F3E, 0xDD4B4B96, 0xDCBDBD61, 0x868B8B0D, 0x858A8A0F, + 0x907070E0, 0x423E3E7C, 0xC4B5B571, 0xAA6666CC, 0xD8484890, 0x05030306, 0x01F6F6F7, 0x120E0E1C, 0xA36161C2, 0x5F35356A, 0xF95757AE, 0xD0B9B969, 0x91868617, 0x58C1C199, 0x271D1D3A, 0xB99E9E27, + 0x38E1E1D9, 0x13F8F8EB, 0xB398982B, 0x33111122, 0xBB6969D2, 0x70D9D9A9, 0x898E8E07, 0xA7949433, 0xB69B9B2D, 0x221E1E3C, 0x92878715, 0x20E9E9C9, 0x49CECE87, 0xFF5555AA, 0x78282850, 0x7ADFDFA5, + 0x8F8C8C03, 0xF8A1A159, 0x80898909, 0x170D0D1A, 0xDABFBF65, 0x31E6E6D7, 0xC6424284, 0xB86868D0, 0xC3414182, 0xB0999929, 0x772D2D5A, 0x110F0F1E, 0xCBB0B07B, 0xFC5454A8, 0xD6BBBB6D, 0x3A16162C }; -__device__ __forceinline__ -static void rrounds(uint32_t x[2][2][2][2][2]) + + +__device__ __forceinline__ void aes_round(const uint32_t sharedMemory[256][32], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) { - int r; - int j; - int k; - int l; - int m; - -//#pragma unroll 16 - for (r = 0;r < CUBEHASH_ROUNDS;++r) { - - /* "add x_0jklm into x_1jklmn modulo 2^32" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 7 bits" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); - - /* "swap x_00klm with x_01klm" */ -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma 
unroll 2 - for (m = 0;m < 2;++m) - SWAP(x[0][0][k][l][m],x[0][1][k][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jk0m with x_1jk1m" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (m = 0;m < 2;++m) - SWAP(x[1][j][k][0][m],x[1][j][k][1][m]) - - /* "add x_0jklm into x_1jklm modulo 2^32" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; - - /* "rotate x_0jklm upwards by 11 bits" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); - - /* "swap x_0j0lm with x_0j1lm" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - SWAP(x[0][j][0][l][m],x[0][j][1][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) -#pragma unroll 2 - for (m = 0;m < 2;++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jkl0 with x_1jkl1" */ -#pragma unroll 2 - for (j = 0;j < 2;++j) -#pragma unroll 2 - for (k = 0;k < 2;++k) -#pragma unroll 2 - for (l = 0;l < 2;++l) - SWAP(x[1][j][k][l][0],x[1][j][k][l][1]) - - } + const uint32_t index = threadIdx.x & 0x1f; + + y0 = sharedMemory[__byte_perm(x0, 0, 0x4440)][index]; + y3 = ROL8(sharedMemory[__byte_perm(x0, 0, 0x4441)][index]); + y2 = ROL16(sharedMemory[__byte_perm(x0, 0, 0x4442)][index]); + y1 = ROR8(sharedMemory[__byte_perm(x0, 0, 0x4443)][index]); + + y1 ^= sharedMemory[__byte_perm(x1, 0, 0x4440)][index]; + y0 ^= ROL8(sharedMemory[__byte_perm(x1, 0, 0x4441)][index]); + y3 ^= ROL16(sharedMemory[__byte_perm(x1, 0, 0x4442)][index]); + y2 ^= ROR8(sharedMemory[__byte_perm(x1, 0, 0x4443)][index]); + + y0 ^= k0; + + y2 ^= sharedMemory[__byte_perm(x2, 0, 0x4440)][index]; + y1 ^= ROL8(sharedMemory[__byte_perm(x2, 0, 0x4441)][index]); + y0 ^= ROL16(sharedMemory[__byte_perm(x2, 0, 0x4442)][index]); + y3 ^= ROR8(sharedMemory[__byte_perm(x2, 0, 0x4443)][index]); + + y3 ^= sharedMemory[__byte_perm(x3, 0, 0x4440)][index]; + y2 ^= ROL8(sharedMemory[__byte_perm(x3, 0, 0x4441)][index]); + y1 ^= ROL16(sharedMemory[__byte_perm(x3, 0, 0x4442)][index]); + y0 ^= ROR8(sharedMemory[__byte_perm(x3, 0, 0x4443)][index]); } -__device__ __forceinline__ -static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2]) +__device__ __forceinline__ void aes_round(const uint32_t sharedMemory[256][32], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) { - // read 32 bytes input from global mem with uint2 chunks - AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]); - AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]); - AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]); - AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]); + const uint32_t index = threadIdx.x & 0x1f; + + y0 = sharedMemory[__byte_perm(x0, 0, 0x4440)][index]; + y3 = ROL8(sharedMemory[__byte_perm(x0, 0, 0x4441)][index]); + y2 = ROL16(sharedMemory[__byte_perm(x0, 0, 
0x4442)][index]); + y1 = ROR8(sharedMemory[__byte_perm(x0, 0, 0x4443)][index]); + + y1 ^= sharedMemory[__byte_perm(x1, 0, 0x4440)][index]; + y0 ^= ROL8(sharedMemory[__byte_perm(x1, 0, 0x4441)][index]); + y3 ^= ROL16(sharedMemory[__byte_perm(x1, 0, 0x4442)][index]); + y2 ^= ROR8(sharedMemory[__byte_perm(x1, 0, 0x4443)][index]); + + y2 ^= sharedMemory[__byte_perm(x2, 0, 0x4440)][index]; + y1 ^= ROL8(sharedMemory[__byte_perm(x2, 0, 0x4441)][index]); + y0 ^= ROL16(sharedMemory[__byte_perm(x2, 0, 0x4442)][index]); + y3 ^= ROR8(sharedMemory[__byte_perm(x2, 0, 0x4443)][index]); + + y3 ^= sharedMemory[__byte_perm(x3, 0, 0x4440)][index]; + y2 ^= ROL8(sharedMemory[__byte_perm(x3, 0, 0x4441)][index]); + y1 ^= ROL16(sharedMemory[__byte_perm(x3, 0, 0x4442)][index]); + y0 ^= ROR8(sharedMemory[__byte_perm(x3, 0, 0x4443)][index]); } -__device__ __forceinline__ -static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2]) + +__device__ __forceinline__ void AES_ROUND_NOKEY(const uint32_t sharedMemory[256][32], uint4* x){ + + uint32_t y0, y1, y2, y3; + aes_round(sharedMemory, x->x, x->y, x->z, x->w, y0, y1, y2, y3); + + x->x = y0; + x->y = y1; + x->z = y2; + x->w = y3; +} + +__device__ __forceinline__ void KEY_EXPAND_ELT(const uint32_t sharedMemory[256][32], uint32_t *k) { - // used to write final hash to global mem - AS_UINT2(&hash[ 0]) = AS_UINT2(x[0][0][0][0]); - AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]); - AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]); - AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]); - AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]); - AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]); - AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]); - AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]); + + uint32_t y0, y1, y2, y3; + aes_round(sharedMemory, k[0], k[1], k[2], k[3], y0, y1, y2, y3); + + k[0] = y1; + k[1] = y2; + k[2] = y3; + k[3] = y0; } -#define Init(x) \ - AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \ - AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \ - AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \ - AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \ - AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \ - AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \ - AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \ - AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \ - AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \ - AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \ - AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \ - AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \ - AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \ - AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \ - AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \ - AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]); __device__ __forceinline__ -static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data) +void aes_gpu_init256(uint32_t sharedMemory[256][32]) { - /* "xor the block into the first b bytes of the state" */ - block_tox(data, x); - /* "and then transform the state invertibly through r identical rounds" */ - rrounds(x); + uint32_t temp = d_AES0[threadIdx.x]; + + sharedMemory[threadIdx.x][0] = temp; + sharedMemory[threadIdx.x][1] = temp; + sharedMemory[threadIdx.x][2] = temp; + sharedMemory[threadIdx.x][3] = temp; + sharedMemory[threadIdx.x][4] = temp; + sharedMemory[threadIdx.x][5] = temp; + sharedMemory[threadIdx.x][6] = temp; + sharedMemory[threadIdx.x][7] = temp; + sharedMemory[threadIdx.x][8] = temp; + sharedMemory[threadIdx.x][9] = temp; + 
sharedMemory[threadIdx.x][10] = temp; + sharedMemory[threadIdx.x][11] = temp; + sharedMemory[threadIdx.x][12] = temp; + sharedMemory[threadIdx.x][13] = temp; + sharedMemory[threadIdx.x][14] = temp; + sharedMemory[threadIdx.x][15] = temp; + sharedMemory[threadIdx.x][16] = temp; + sharedMemory[threadIdx.x][17] = temp; + sharedMemory[threadIdx.x][18] = temp; + sharedMemory[threadIdx.x][19] = temp; + sharedMemory[threadIdx.x][20] = temp; + sharedMemory[threadIdx.x][21] = temp; + sharedMemory[threadIdx.x][22] = temp; + sharedMemory[threadIdx.x][23] = temp; + sharedMemory[threadIdx.x][24] = temp; + sharedMemory[threadIdx.x][25] = temp; + sharedMemory[threadIdx.x][26] = temp; + sharedMemory[threadIdx.x][27] = temp; + sharedMemory[threadIdx.x][28] = temp; + sharedMemory[threadIdx.x][29] = temp; + sharedMemory[threadIdx.x][30] = temp; + sharedMemory[threadIdx.x][31] = temp; + + /* sharedMemory[(threadIdx.x << 1) + 0][0] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][1] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][2] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][3] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][4] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][5] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][6] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][7] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][8] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][9] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][10] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][11] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][12] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][13] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][14] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][15] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][16] = temp.x; + + sharedMemory[(threadIdx.x << 1) + 1][0] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][1] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][2] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][3] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][4] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][5] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][6] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][7] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][8] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][9] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][10] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][11] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][12] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][13] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][14] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][15] = temp.y; + */ + + /* sharedMemory[1][(threadIdx.x << 1) + 0] = ROL8(temp.x); + sharedMemory[1][(threadIdx.x << 1) + 1] = ROL8(temp.y); + sharedMemory[2][(threadIdx.x << 1) + 0] = ROL16(temp.x); + sharedMemory[2][(threadIdx.x << 1) + 1] = ROL16(temp.y); + sharedMemory[3][(threadIdx.x << 1) + 0] = ROR8(temp.x); + sharedMemory[3][(threadIdx.x << 1) + 1] = ROR8(temp.y); + */ } __device__ __forceinline__ -static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) +void aes_gpu_init128(uint32_t sharedMemory[256][32]) { - /* "the integer 1 is xored into the last state word x_11111" */ - x[1][1][1][1][1] ^= 1; + uint32_t temp = d_AES0[threadIdx.x<<1]; + uint32_t temp2 = d_AES0[(threadIdx.x << 1) +1]; + + sharedMemory[threadIdx.x << 1][0] = temp; + sharedMemory[threadIdx.x << 1][1] = temp; + sharedMemory[threadIdx.x << 1][2] = temp; + sharedMemory[threadIdx.x << 1][3] = temp; + sharedMemory[threadIdx.x << 1][4] = temp; + 
sharedMemory[threadIdx.x << 1][5] = temp; + sharedMemory[threadIdx.x << 1][6] = temp; + sharedMemory[threadIdx.x << 1][7] = temp; + sharedMemory[threadIdx.x << 1][8] = temp; + sharedMemory[threadIdx.x << 1][9] = temp; + sharedMemory[threadIdx.x << 1][10] = temp; + sharedMemory[threadIdx.x << 1][11] = temp; + sharedMemory[threadIdx.x << 1][12] = temp; + sharedMemory[threadIdx.x << 1][13] = temp; + sharedMemory[threadIdx.x << 1][14] = temp; + sharedMemory[threadIdx.x << 1][15] = temp; + sharedMemory[threadIdx.x << 1][16] = temp; + sharedMemory[threadIdx.x << 1][17] = temp; + sharedMemory[threadIdx.x << 1][18] = temp; + sharedMemory[threadIdx.x << 1][19] = temp; + sharedMemory[threadIdx.x << 1][20] = temp; + sharedMemory[threadIdx.x << 1][21] = temp; + sharedMemory[threadIdx.x << 1][22] = temp; + sharedMemory[threadIdx.x << 1][23] = temp; + sharedMemory[threadIdx.x << 1][24] = temp; + sharedMemory[threadIdx.x << 1][25] = temp; + sharedMemory[threadIdx.x << 1][26] = temp; + sharedMemory[threadIdx.x << 1][27] = temp; + sharedMemory[threadIdx.x << 1][28] = temp; + sharedMemory[threadIdx.x << 1][29] = temp; + sharedMemory[threadIdx.x << 1][30] = temp; + sharedMemory[threadIdx.x << 1][31] = temp; + + sharedMemory[(threadIdx.x << 1) + 1][0] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][1] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][2] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][3] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][4] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][5] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][6] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][7] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][8] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][9] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][10] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][11] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][12] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][13] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][14] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][15] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][16] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][17] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][18] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][19] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][20] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][21] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][22] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][23] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][24] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][25] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][26] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][27] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][28] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][29] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][30] = temp2; + sharedMemory[(threadIdx.x << 1) + 1][31] = temp2; + + /* sharedMemory[(threadIdx.x << 1) + 0][0] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][1] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][2] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][3] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][4] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][5] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][6] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][7] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][8] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][9] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][10] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][11] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][12] = temp.x; + 
sharedMemory[(threadIdx.x << 1) + 0][13] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][14] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][15] = temp.x; + sharedMemory[(threadIdx.x << 1) + 0][16] = temp.x; + + sharedMemory[(threadIdx.x << 1) + 1][0] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][1] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][2] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][3] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][4] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][5] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][6] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][7] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][8] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][9] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][10] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][11] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][12] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][13] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][14] = temp.y; + sharedMemory[(threadIdx.x << 1) + 1][15] = temp.y; + */ +} - /* "the state is then transformed invertibly through 10r identical rounds" */ - #pragma unroll 10 - for (int i = 0; i < 10; i++) rrounds(x); - /* "output the first h/8 bytes of the state" */ - hash_fromx(hashval, x); +__device__ __forceinline__ void round_3_7_11(const uint32_t sharedMemory[256][32], uint32_t* r, uint4 *p, uint4 &x){ + KEY_EXPAND_ELT(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + x = p[2] ^ *(uint4*)&r[0]; + KEY_EXPAND_ELT(sharedMemory, &r[4]); + r[4] ^= r[0]; + r[5] ^= r[1]; + r[6] ^= r[2]; + r[7] ^= r[3]; + AES_ROUND_NOKEY(sharedMemory, &x); + x.x ^= r[4]; + x.y ^= r[5]; + x.z ^= r[6]; + x.w ^= r[7]; + KEY_EXPAND_ELT(sharedMemory, &r[8]); + r[8] ^= r[4]; + r[9] ^= r[5]; + r[10] ^= r[6]; + r[11] ^= r[7]; + AES_ROUND_NOKEY(sharedMemory, &x); + x.x ^= r[8]; + x.y ^= r[9]; + x.z ^= r[10]; + x.w ^= r[11]; + KEY_EXPAND_ELT(sharedMemory, &r[12]); + r[12] ^= r[8]; + r[13] ^= r[9]; + r[14] ^= r[10]; + r[15] ^= r[11]; + AES_ROUND_NOKEY(sharedMemory, &x); + x.x ^= r[12]; + x.y ^= r[13]; + x.z ^= r[14]; + x.w ^= r[15]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[1].x ^= x.x; + p[1].y ^= x.y; + p[1].z ^= x.z; + p[1].w ^= x.w; + KEY_EXPAND_ELT(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[0] ^ *(uint4*)&r[16]; + KEY_EXPAND_ELT(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[20]; + KEY_EXPAND_ELT(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[24]; + KEY_EXPAND_ELT(sharedMemory, &r[28]); + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[3] ^= x; } +__device__ __forceinline__ +void round_4_8_12(const uint32_t sharedMemory[256][32], uint32_t* r, uint4 *p, uint4 &x){ + *(uint4*)&r[0] ^= *(uint4*)&r[25]; + x = p[1] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &x); + + r[4] ^= r[29]; r[5] ^= r[30]; + r[6] ^= r[31]; r[7] ^= r[0]; + + x ^= *(uint4*)&r[4]; + *(uint4*)&r[8] ^= *(uint4*)&r[1]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[8]; + *(uint4*)&r[12] ^= *(uint4*)&r[5]; + AES_ROUND_NOKEY(sharedMemory, &x); + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &x); + p[0] ^= x; + *(uint4*)&r[16] ^= *(uint4*)&r[9]; + x = p[3] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &x); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, 
&x);
+	*(uint4*)&r[24] ^= *(uint4*)&r[17];
+	x ^= *(uint4*)&r[24];
+	*(uint4*)&r[28] ^= *(uint4*)&r[21];
+	AES_ROUND_NOKEY(sharedMemory, &x);
+	x ^= *(uint4*)&r[28];
+	AES_ROUND_NOKEY(sharedMemory, &x);
+	p[2] ^= x;
+}
 
-/***************************************************/
+//--END OF SHAVITE MACROS------------------------------------
 
-__global__
-void x11_cubehash512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
+
+#define TPB 1024
+
+__device__ __forceinline__
+static void rrounds(uint32_t *x)
 {
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
+	#pragma unroll 1
+	for (int r = 0; r < 16; r++)
 	{
-		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
+		/* "add x_0jklm into x_1jklm modulo 2^32; rotate x_0jklm upwards by 7 bits" */
+		x[16] = x[16] + x[ 0]; x[ 0] = ROTL32(x[ 0], 7);x[17] = x[17] + x[ 1];x[ 1] = ROTL32(x[ 1], 7);
+		x[18] = x[18] + x[ 2]; x[ 2] = ROTL32(x[ 2], 7);x[19] = x[19] + x[ 3];x[ 3] = ROTL32(x[ 3], 7);
+		x[20] = x[20] + x[ 4]; x[ 4] = ROTL32(x[ 4], 7);x[21] = x[21] + x[ 5];x[ 5] = ROTL32(x[ 5], 7);
+		x[22] = x[22] + x[ 6]; x[ 6] = ROTL32(x[ 6], 7);x[23] = x[23] + x[ 7];x[ 7] = ROTL32(x[ 7], 7);
+		x[24] = x[24] + x[ 8]; x[ 8] = ROTL32(x[ 8], 7);x[25] = x[25] + x[ 9];x[ 9] = ROTL32(x[ 9], 7);
+		x[26] = x[26] + x[10]; x[10] = ROTL32(x[10], 7);x[27] = x[27] + x[11];x[11] = ROTL32(x[11], 7);
+		x[28] = x[28] + x[12]; x[12] = ROTL32(x[12], 7);x[29] = x[29] + x[13];x[13] = ROTL32(x[13], 7);
+		x[30] = x[30] + x[14]; x[14] = ROTL32(x[14], 7);x[31] = x[31] + x[15];x[15] = ROTL32(x[15], 7);
+		/* "swap x_00klm with x_01klm", fused with "xor x_1jklm into x_0jklm" */
+		xchg(x[0], x[8]); x[0] ^= x[16]; x[8] ^= x[24]; xchg(x[1], x[9]); x[1] ^= x[17]; x[9] ^= x[25];
+		xchg(x[2], x[10]); x[2] ^= x[18]; x[10] ^= x[26]; xchg(x[3], x[11]); x[3] ^= x[19]; x[11] ^= x[27];
+		xchg(x[4], x[12]); x[4] ^= x[20]; x[12] ^= x[28]; xchg(x[5], x[13]); x[5] ^= x[21]; x[13] ^= x[29];
+		xchg(x[6], x[14]); x[6] ^= x[22]; x[14] ^= x[30]; xchg(x[7], x[15]); x[7] ^= x[23]; x[15] ^= x[31];
+		/* "swap x_1jk0m with x_1jk1m" */
+		xchg(x[16], x[18]); xchg(x[17], x[19]); xchg(x[20], x[22]); xchg(x[21], x[23]); xchg(x[24], x[26]); xchg(x[25], x[27]); xchg(x[28], x[30]); xchg(x[29], x[31]);
+		/* "add x_0jklm into x_1jklm modulo 2^32; rotate x_0jklm upwards by 11 bits" */
+		x[16] = x[16] + x[ 0]; x[ 0] = ROTL32(x[ 0],11);x[17] = x[17] + x[ 1];x[ 1] = ROTL32(x[ 1],11);
+		x[18] = x[18] + x[ 2]; x[ 2] = ROTL32(x[ 2],11);x[19] = x[19] + x[ 3];x[ 3] = ROTL32(x[ 3],11);
+		x[20] = x[20] + x[ 4]; x[ 4] = ROTL32(x[ 4],11);x[21] = x[21] + x[ 5];x[ 5] = ROTL32(x[ 5],11);
+		x[22] = x[22] + x[ 6]; x[ 6] = ROTL32(x[ 6],11);x[23] = x[23] + x[ 7];x[ 7] = ROTL32(x[ 7],11);
+		x[24] = x[24] + x[ 8]; x[ 8] = ROTL32(x[ 8],11);x[25] = x[25] + x[ 9];x[ 9] = ROTL32(x[ 9],11);
+		x[26] = x[26] + x[10]; x[10] = ROTL32(x[10],11);x[27] = x[27] + x[11];x[11] = ROTL32(x[11],11);
+		x[28] = x[28] + x[12]; x[12] = ROTL32(x[12],11);x[29] = x[29] + x[13];x[13] = ROTL32(x[13],11);
+		x[30] = x[30] + x[14]; x[14] = ROTL32(x[14],11);x[31] = x[31] + x[15];x[15] = ROTL32(x[15],11);
+		/* "swap x_0j0lm with x_0j1lm", fused with "xor x_1jklm into x_0jklm" */
+		xchg(x[0], x[4]); x[0] ^= x[16]; x[4] ^= x[20]; xchg(x[1], x[5]); x[1] ^= x[17]; x[5] ^= x[21];
+		xchg(x[2], x[6]); x[2] ^= x[18]; x[6] ^= x[22]; xchg(x[3], x[7]); x[3] ^= x[19]; x[7] ^= x[23];
+		xchg(x[8], x[12]); x[8] ^= x[24]; x[12] ^= x[28]; xchg(x[9], x[13]); x[9] ^= x[25]; x[13] ^= x[29];
+		xchg(x[10], x[14]); x[10] ^= x[26]; x[14] ^= x[30]; xchg(x[11], 
x[15]); x[11] ^= x[27]; x[15] ^= x[31];
+		/* "swap x_1jkl0 with x_1jkl1" */
+		xchg(x[16], x[17]); xchg(x[18], x[19]); xchg(x[20], x[21]); xchg(x[22], x[23]); xchg(x[24], x[25]); xchg(x[26], x[27]); xchg(x[28], x[29]); xchg(x[30], x[31]);
+	}
+}
 
-	int hashPosition = nounce - startNounce;
-	uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
+/***************************************************/
+// GPU Hash Function
+__global__
+void x11_cubehash512_gpu_hash_64(uint32_t threads, uint64_t *g_hash){
 
-	uint32_t x[2][2][2][2][2];
-	Init(x);
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 
-	Update32(x, &Hash[0]);
-	Update32(x, &Hash[8]);
+	if (thread < threads){
+
+		uint32_t *Hash = (uint32_t*)&g_hash[8 * thread];
+
+		uint32_t x[32] = {
+			0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
+			0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
+			0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537,
+			0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
+			0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532,
+			0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
+			0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576,
+			0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
+		};
+
+		// first half of the hash (32 bytes)
+		//Update32(x, (const BitSequence*)Hash);
+		*(uint2x4*)&x[ 0] ^= __ldg4((uint2x4*)&Hash[0]);
+
+		rrounds(x);
+
+		// second half of the hash (32 bytes)
+		// Update32(x, (const BitSequence*)(Hash+8));
+		*(uint2x4*)&x[ 0] ^= __ldg4((uint2x4*)&Hash[8]);
+
+		rrounds(x);
 
 		// Padding Block
-	uint32_t last[8];
-	last[0] = 0x80;
-	#pragma unroll 7
-	for (int i=1; i < 8; i++) last[i] = 0;
-	Update32(x, last);
-
-	Final(x, Hash);
+		x[ 0] ^= 0x80;
+		rrounds(x);
+
+		// Final(x, (BitSequence*)Hash);
+		x[31] ^= 1;
+
+		/* "the state is then transformed invertibly through 10r identical rounds" */
+		#pragma unroll 10
+		for (int i = 0;i < 10;++i)
+			rrounds(x);
+
+		/* "output the first h/8 bytes of the state" */
+		*(uint2x4*)&Hash[ 0] = *(uint2x4*)&x[ 0];
+		*(uint2x4*)&Hash[ 8] = *(uint2x4*)&x[ 8];
+	}
+}
 
-__host__
-void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
-{
-	const uint32_t threadsperblock = 256;
+__global__
+__launch_bounds__(448,2)
+void x11_cubehashShavite512_gpu_hash_64(uint32_t threads, uint32_t *g_hash){
 
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 
-	size_t shared_size = 0;
+	__shared__ uint32_t sharedMemory[256][32];
 
-	x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
-}
+	if (threadIdx.x<256) aes_gpu_init256(sharedMemory);
 
-__host__
-void x11_cubehash512_cpu_init(int thr_id, uint32_t threads) { }
+	if (thread < threads)
+	{
+		uint32_t *const hash = &g_hash[thread * 16U];
 
-/***************************************************/
+		//Cubehash
 
-#define WANT_CUBEHASH80
-#ifdef WANT_CUBEHASH80
+		uint32_t x[32] = {
+			0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E, 0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
+			0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537, 0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
+			0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532, 0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
+			0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576, 0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
+		};
+		uint32_t Hash[16];
+		*(uint2x4*)&Hash[0] = __ldg4((uint2x4*)&hash[0]);
+		*(uint2x4*)&Hash[8] = __ldg4((uint2x4*)&hash[8]);
 
-__constant__
-static uint32_t c_PaddedMessage80[20];
+		
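// absorb the message in 32-byte blocks: xor each block into x[0..7]
+		// and apply the 16 CubeHash rounds (inline form of the former Update32())
+		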
*(uint2x4*)&x[0] ^= *(uint2x4*)&Hash[0];
 
-__host__
-void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata)
-{
-	cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
-}
+		rrounds(x);
 
-__global__
-void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash)
-{
-	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		const uint32_t nonce = startNounce + thread;
-
-		uint32_t x[2][2][2][2][2];
-		Init(x);
-
-		uint32_t message[8];
-		// first 32 bytes
-		AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]);
-		AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[4]);
-		Update32(x, message);
-
-		// second 32 bytes
-		AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[8]);
-		AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[12]);
-		Update32(x, message);
-
-		// last 16 bytes + Padding
-		AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[16]);
-		message[3] = cuda_swab32(nonce);
-		message[4] = 0x80;
-		message[5] = 0;
-		message[6] = 0;
-		message[7] = 0;
-		Update32(x, message);
-
-		uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]);
-		Final(x, output);
+		*(uint2x4*)&x[0] ^= *(uint2x4*)&Hash[8];
+
+		rrounds(x);
+		x[0] ^= 0x80;
+
+		rrounds(x);
+		x[31] ^= 1;
+
+		/* "the state is then transformed invertibly through 10r identical rounds" */
+		for (int i = 0; i < 10; ++i)
+			rrounds(x);
+
+		uint4 y;
+		uint32_t r[32];
+		uint4 msg[4];
+		// copy the initial chaining state
+		uint4 p[4];
+		const uint32_t state[16] = {
+			0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
+			0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
+		};
+		*(uint2x4*)&p[0] = *(uint2x4*)&state[0];
+		*(uint2x4*)&p[2] = *(uint2x4*)&state[8];
+
+#pragma unroll 4
+		for (int i = 0; i < 4; i++){
+			*(uint4*)&msg[i] = *(uint4*)&x[i << 2];
+			*(uint4*)&r[i << 2] = *(uint4*)&x[i << 2];
+		}
+		r[16] = 0x80; r[17] = 0; r[18] = 0; r[19] = 0;
+		r[20] = 0; r[21] = 0; r[22] = 0; r[23] = 0;
+		r[24] = 0; r[25] = 0; r[26] = 0; r[27] = 0x02000000;
+		r[28] = 0; r[29] = 0; r[30] = 0; r[31] = 0x02000000;
+		y = p[1] ^ msg[0];
+		__syncthreads();
+
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		y ^= msg[1];
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		y ^= msg[2];
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		y ^= msg[3];
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		p[0] ^= y;
+		y = p[3];
+		y.x ^= 0x80;
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		y.w ^= 0x02000000;
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		y.w ^= 0x02000000;
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		p[2] ^= y;
+
+		// 1
+		KEY_EXPAND_ELT(sharedMemory, &r[0]);
+		*(uint4*)&r[0] ^= *(uint4*)&r[28];
+		r[0] ^= 0x200;
+		r[3] ^= 0xFFFFFFFF;
+		y = p[0] ^ *(uint4*)&r[0];
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		KEY_EXPAND_ELT(sharedMemory, &r[4]);
+		*(uint4*)&r[4] ^= *(uint4*)&r[0];
+		y ^= *(uint4*)&r[4];
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		KEY_EXPAND_ELT(sharedMemory, &r[8]);
+		*(uint4*)&r[8] ^= *(uint4*)&r[4];
+		y ^= *(uint4*)&r[8];
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		KEY_EXPAND_ELT(sharedMemory, &r[12]);
+		*(uint4*)&r[12] ^= *(uint4*)&r[8];
+		y ^= *(uint4*)&r[12];
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		p[3] ^= y;
+		KEY_EXPAND_ELT(sharedMemory, &r[16]);
+		*(uint4*)&r[16] ^= *(uint4*)&r[12];
+		y = p[2] ^ *(uint4*)&r[16];
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		KEY_EXPAND_ELT(sharedMemory, &r[20]);
+		*(uint4*)&r[20] ^= *(uint4*)&r[16];
+		y ^= *(uint4*)&r[20];
+		AES_ROUND_NOKEY(sharedMemory, &y);
+		KEY_EXPAND_ELT(sharedMemory, 
&r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + y ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + y ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[1] ^= y; + *(uint4*)&r[0] ^= *(uint4*)&r[25]; + y = p[3] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &y); + + r[4] ^= r[29]; r[5] ^= r[30]; + r[6] ^= r[31]; r[7] ^= r[0]; + + y ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[8] ^= *(uint4*)&r[1]; + y ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[12] ^= *(uint4*)&r[5]; + y ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[2] ^= y; + *(uint4*)&r[16] ^= *(uint4*)&r[9]; + y = p[1] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + y ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[24] ^= *(uint4*)&r[17]; + y ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + y ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &y); + + p[0] ^= y; + + /* round 3, 7, 11 */ + round_3_7_11(sharedMemory, r, p, y); + + + /* round 4, 8, 12 */ + round_4_8_12(sharedMemory, r, p, y); + + // 2 + KEY_EXPAND_ELT(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + y = p[0] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[4]); + *(uint4*)&r[4] ^= *(uint4*)&r[0]; + r[7] ^= (~0x200); + y ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[8]); + *(uint4*)&r[8] ^= *(uint4*)&r[4]; + y ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[8]; + y ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[3] ^= y; + KEY_EXPAND_ELT(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + y = p[2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + y ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + y ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + y ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[1] ^= y; + + *(uint4*)&r[0] ^= *(uint4*)&r[25]; + y = p[3] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &y); + r[4] ^= r[29]; + r[5] ^= r[30]; + r[6] ^= r[31]; + r[7] ^= r[0]; + y ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[8] ^= *(uint4*)&r[1]; + y ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[12] ^= *(uint4*)&r[5]; + y ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[2] ^= y; + *(uint4*)&r[16] ^= *(uint4*)&r[9]; + y = p[1] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + y ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[24] ^= *(uint4*)&r[17]; + y ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + y ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[0] ^= y; + + /* round 3, 7, 11 */ + round_3_7_11(sharedMemory, r, p, y); + + /* round 4, 8, 12 */ + round_4_8_12(sharedMemory, r, p, y); + + // 3 + KEY_EXPAND_ELT(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + y = p[0] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &y); + 
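// expand the next three 128-bit round-key blocks (r[4..15]) and fold each
+	// into the running AES state y; p[3] then absorbs the result below
+	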
KEY_EXPAND_ELT(sharedMemory, &r[4]); + *(uint4*)&r[4] ^= *(uint4*)&r[0]; + y ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[8]); + *(uint4*)&r[8] ^= *(uint4*)&r[4]; + y ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[8]; + y ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[3] ^= y; + KEY_EXPAND_ELT(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + y = p[2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + y ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + y ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + r[30] ^= 0x200; + r[31] ^= 0xFFFFFFFF; + y ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[1] ^= y; + + *(uint4*)&r[0] ^= *(uint4*)&r[25]; + y = p[3] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &y); + r[4] ^= r[29]; + r[5] ^= r[30]; + r[6] ^= r[31]; + r[7] ^= r[0]; + y ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[8] ^= *(uint4*)&r[1]; + y ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[12] ^= *(uint4*)&r[5]; + y ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[2] ^= y; + *(uint4*)&r[16] ^= *(uint4*)&r[9]; + y = p[1] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + y ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[24] ^= *(uint4*)&r[17]; + y ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &y); + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + y ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[0] ^= y; + + /* round 3, 7, 11 */ + round_3_7_11(sharedMemory, r, p, y); + + /* round 4, 8, 12 */ + round_4_8_12(sharedMemory, r, p, y); + + /* round 13 */ + KEY_EXPAND_ELT(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + y = p[0] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[4]); + *(uint4*)&r[4] ^= *(uint4*)&r[0]; + y ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[8]); + *(uint4*)&r[8] ^= *(uint4*)&r[4]; + y ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[8]; + y ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[3] ^= y; + KEY_EXPAND_ELT(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + y = p[2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + y ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + r[25] ^= 0x200; + r[27] ^= 0xFFFFFFFF; + y ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY(sharedMemory, &y); + KEY_EXPAND_ELT(sharedMemory, &r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + y ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY(sharedMemory, &y); + p[1] ^= y; + + *(uint2x4*)&hash[0] = *(uint2x4*)&state[0] ^ *(uint2x4*)&p[2]; + *(uint2x4*)&hash[8] = *(uint2x4*)&state[8] ^ *(uint2x4*)&p[0]; } } + __host__ -void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) -{ - const uint32_t threadsperblock = 256; - dim3 grid((threads + 
threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
+void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){
+
+	// compute how many thread blocks we need
+	dim3 grid((threads + TPB-1)/TPB);
+	dim3 block(TPB);
 
-	cubehash512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*) d_hash);
+	x11_cubehash512_gpu_hash_64<<<grid, block>>>(threads, (uint64_t*)d_hash);
 }
-#endif
\ No newline at end of file
+__host__
+void x11_cubehash_shavite512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
+{
+	dim3 grid((threads + 256 - 1) / 256);
+	dim3 block(256);
+
+	x11_cubehashShavite512_gpu_hash_64 <<<grid, block>>> (threads, d_hash);
+}
diff --git a/x11/cuda_x11_cubehash512_80.cu b/x11/cuda_x11_cubehash512_80.cu
new file mode 100644
index 0000000000..f07cfe2820
--- /dev/null
+++ b/x11/cuda_x11_cubehash512_80.cu
@@ -0,0 +1,322 @@
+// (c) SP may 2016
+
+#include
+#include
+
+#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
+#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
+
+#if __CUDA_ARCH__ < 350
+#define LROT(x,bits) ((x << bits) | (x >> (32 - bits)))
+#else
+#define LROT(x, bits) __funnelshift_l(x, x, bits)
+#endif
+
+#define ROTATEUPWARDS7(a) LROT(a,7)
+#define ROTATEUPWARDS11(a) LROT(a,11)
+
+#define SWAP(a,b) xchg(a,b);
+
+__device__ __constant__
+static const uint32_t c_IV_512[32] = {
+	0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
+	0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
+	0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537,
+	0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
+	0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532,
+	0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
+	0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576,
+	0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
+};
+
+static const uint32_t c_IV_512_c[32] = {
+	0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
+	0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
+	0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537,
+	0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
+	0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532,
+	0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
+	0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576,
+	0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
+};
+
+__device__ __host__ __forceinline__
+static void rrounds(uint32_t x[2][2][2][2][2])
+{
+	int r;
+	int j;
+	int k;
+	int l;
+	int m;
+
+	#pragma unroll 16
+	for (r = 0;r < CUBEHASH_ROUNDS;++r) {
+
+		/* "add x_0jklm into x_1jklm modulo 2^32" */
+#pragma unroll 2
+		for (j = 0;j < 2;++j)
+#pragma unroll 2
+		for (k = 0;k < 2;++k)
+#pragma unroll 2
+		for (l = 0;l < 2;++l)
+#pragma unroll 2
+		for (m = 0;m < 2;++m)
+			x[1][j][k][l][m] += x[0][j][k][l][m];
+
+		/* "rotate x_0jklm upwards by 7 bits" */
+#pragma unroll 2
+		for (j = 0;j < 2;++j)
+#pragma unroll 2
+		for (k = 0;k < 2;++k)
+#pragma unroll 2
+		for (l = 0;l < 2;++l)
+#pragma unroll 2
+		for (m = 0;m < 2;++m)
+			x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);
+
+		/* "swap x_00klm with x_01klm" */
+#pragma unroll 2
+		for (k = 0;k < 2;++k)
+#pragma unroll 2
+		for (l = 0;l < 2;++l)
+#pragma unroll 2
+		for (m = 0;m < 2;++m)
+			SWAP(x[0][0][k][l][m],x[0][1][k][l][m])
+
+		/* "xor x_1jklm into x_0jklm" */
+#pragma unroll 2
+		for (j = 0;j < 2;++j)
+#pragma unroll 2
+		for (k = 0;k < 2;++k)
+#pragma unroll 2
+		for (l = 0;l < 2;++l)
+#pragma unroll 2
+		for (m = 0;m < 2;++m)
+			x[0][j][k][l][m] ^= x[1][j][k][l][m];
+
+		/* "swap x_1jk0m with x_1jk1m" */
+#pragma unroll 2
+		for (j = 0;j < 2;++j)
+#pragma unroll 2
+		for (k = 0;k < 2;++k)
+#pragma unroll 2
+		for (m = 0;m < 2;++m)
+			
SWAP(x[1][j][k][0][m],x[1][j][k][1][m]) + + /* "add x_0jklm into x_1jklm modulo 2^32" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[1][j][k][l][m] += x[0][j][k][l][m]; + + /* "rotate x_0jklm upwards by 11 bits" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); + + /* "swap x_0j0lm with x_0j1lm" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + SWAP(x[0][j][0][l][m],x[0][j][1][l][m]) + + /* "xor x_1jklm into x_0jklm" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) +#pragma unroll 2 + for (m = 0;m < 2;++m) + x[0][j][k][l][m] ^= x[1][j][k][l][m]; + + /* "swap x_1jkl0 with x_1jkl1" */ +#pragma unroll 2 + for (j = 0;j < 2;++j) +#pragma unroll 2 + for (k = 0;k < 2;++k) +#pragma unroll 2 + for (l = 0;l < 2;++l) + SWAP(x[1][j][k][l][0],x[1][j][k][l][1]) + + } +} + +__device__ __host__ __forceinline__ +static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2]) +{ + // read 32 bytes input from global mem with uint2 chunks + AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]); + AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]); + AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]); + AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]); +} + +__device__ __host__ __forceinline__ +static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2]) +{ + // used to write final hash to global mem + AS_UINT2(&hash[ 0]) = AS_UINT2(x[0][0][0][0]); + AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]); + AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]); + AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]); + AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]); + AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]); + AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]); + AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]); +} + +#define Init(x) \ + AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \ + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \ + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \ + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \ + AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \ + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \ + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \ + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \ + AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \ + AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \ + AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \ + AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \ + AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \ + AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \ + AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \ + AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]); + +#define Init_cpu(x) \ + AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512_c[ 0]); \ + AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512_c[ 2]); \ + AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512_c[ 4]); \ + AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512_c[ 6]); \ + AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512_c[ 8]); \ + AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512_c[10]); \ + AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512_c[12]); \ + AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512_c[14]); \ + 
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512_c[16]); \
+	AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512_c[18]); \
+	AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512_c[20]); \
+	AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512_c[22]); \
+	AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512_c[24]); \
+	AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512_c[26]); \
+	AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512_c[28]); \
+	AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512_c[30]);
+
+
+__device__ __host__ __forceinline__
+static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data)
+{
+	/* "xor the block into the first b bytes of the state" */
+	block_tox(data, x);
+	/* "and then transform the state invertibly through r identical rounds" */
+	rrounds(x);
+}
+
+__device__ __forceinline__
+static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
+{
+	/* "the integer 1 is xored into the last state word x_11111" */
+	x[1][1][1][1][1] ^= 1;
+
+	/* "the state is then transformed invertibly through 10r identical rounds" */
+	#pragma unroll 10
+	for (int i = 0; i < 10; i++) rrounds(x);
+
+	/* "output the first h/8 bytes of the state" */
+	hash_fromx(hashval, x);
+}
+
+
+/***************************************************/
+
+
+#define WANT_CUBEHASH80
+#ifdef WANT_CUBEHASH80
+
+__constant__ static uint32_t c_x[32];
+__constant__ static uint32_t c_message[3];
+
+__host__
+void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata)
+{
+	uint32_t x[2][2][2][2][2];
+	Init_cpu(x);
+
+	uint32_t message[8];
+	// first 32 bytes
+	AS_UINT4(&message[0]) = AS_UINT4(&endiandata[0]);
+	AS_UINT4(&message[4]) = AS_UINT4(&endiandata[4]);
+	Update32(x, message);
+
+	// second 32 bytes
+	AS_UINT4(&message[0]) = AS_UINT4(&endiandata[8]);
+	AS_UINT4(&message[4]) = AS_UINT4(&endiandata[12]);
+	Update32(x, message);
+
+	AS_UINT4(&message[0]) = AS_UINT4(&endiandata[16]);
+	message[4] = 0x80;
+	message[5] = 0;
+	message[6] = 0;
+	message[7] = 0;
+
+	cudaMemcpyToSymbol(c_x, x, sizeof(c_x), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(c_message, message, sizeof(c_message), 0, cudaMemcpyHostToDevice);
+}
+
+__global__
+void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t nonce = startNounce + thread;
+
+		uint32_t x[2][2][2][2][2];
+		uint32_t *xx = (uint32_t *)&x[0][0][0][0][0];
+		uint32_t message[8];
+
+		*(uint2x4*)&xx[0] = *(uint2x4*)&c_x[0];
+		*(uint2x4*)&xx[8] = *(uint2x4*)&c_x[8];
+		*(uint2x4*)&xx[16] = *(uint2x4*)&c_x[16];
+		*(uint2x4*)&xx[24] = *(uint2x4*)&c_x[24];
+		*(uint2*)&message[0] = *(uint2*)&c_message[0];
+		message[2] = c_message[2];
+
+		message[3] = cuda_swab32(nonce);
+		message[4] = 0x80;
+		message[5] = 0;
+		message[6] = 0;
+		message[7] = 0;
+		Update32(x, message);
+
+		uint32_t* output = (uint32_t*)(&g_outhash[(size_t)8 * thread]);
+		Final(x, output);
+	}
+}
+
+__host__
+void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = 512;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	cubehash512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash);
+}
+
+#endif
diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu
index fa5c4f7885..723b4ec3d5 100644
--- a/x11/cuda_x11_echo.cu
+++ b/x11/cuda_x11_echo.cu
@@ -315,5 +315,5 @@ void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, 
dim3 block(threadsperblock);
 
 	x11_echo512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
-	MyStreamSynchronize(NULL, order, thr_id);
+	//MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/x11/cuda_x11_echo_sp.cu b/x11/cuda_x11_echo_sp.cu
new file mode 100644
index 0000000000..ee5660c021
--- /dev/null
+++ b/x11/cuda_x11_echo_sp.cu
@@ -0,0 +1,420 @@
+/*
+	Based on Tanguy Pruvot's repo
+	Provos Alexis - 2016
+*/
+
+#include "cuda_helper_alexis.h"
+#include "cuda_vectors_alexis.h"
+
+#define INTENSIVE_GMF
+#include "cuda_x11_aes_sp.cuh"
+
+__device__
+static void echo_round_sp(const uint32_t sharedMemory[8 * 1024], uint32_t *W, uint32_t &k0){
+	// Big Sub Words
+#pragma unroll 16
+	for (int idx = 0; idx < 16; idx++)
+		AES_2ROUND_32(sharedMemory, W[(idx << 2) + 0], W[(idx << 2) + 1], W[(idx << 2) + 2], W[(idx << 2) + 3], k0);
+
+	// Shift Rows
+#pragma unroll 4
+	for (int i = 0; i < 4; i++){
+		uint32_t t[4];
+		/// 1, 5, 9, 13
+		t[0] = W[i + 4];
+		t[1] = W[i + 8];
+		t[2] = W[i + 24];
+		t[3] = W[i + 60];
+		W[i + 4] = W[i + 20];
+		W[i + 8] = W[i + 40];
+		W[i + 24] = W[i + 56];
+		W[i + 60] = W[i + 44];
+
+		W[i + 20] = W[i + 36];
+		W[i + 40] = t[1];
+		W[i + 56] = t[2];
+		W[i + 44] = W[i + 28];
+
+		W[i + 28] = W[i + 12];
+		W[i + 12] = t[3];
+		W[i + 36] = W[i + 52];
+		W[i + 52] = t[0];
+	}
+	// Mix Columns
+#pragma unroll 4
+	for (int i = 0; i < 4; i++){ // loop, two uint32_t at a time
+#pragma unroll 4
+		for (int idx = 0; idx < 64; idx += 16){ // loop over the elements
+			uint32_t a[4];
+			a[0] = W[idx + i];
+			a[1] = W[idx + i + 4];
+			a[2] = W[idx + i + 8];
+			a[3] = W[idx + i + 12];
+
+			uint32_t ab = a[0] ^ a[1];
+			uint32_t bc = a[1] ^ a[2];
+			uint32_t cd = a[2] ^ a[3];
+
+			uint32_t t, t2, t3;
+			t = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+			uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[idx + i] = (bc^ a[3] ^ abx);
+			W[idx + i + 4] = xor3(a[0], cd, bcx);
+			W[idx + i + 8] = xor3(ab, a[3], cdx);
+			W[idx + i + 12] = xor3(ab, a[2], xor3(abx, bcx, cdx));
+		}
+	}
+}
+
+__global__ __launch_bounds__(256, 3) /* will force 80 registers */
+void x11_echo512_gpu_hash_64_final_sp(uint32_t threads, uint64_t *g_hash, uint32_t* resNonce, const uint64_t target)
+{
+	__shared__ __align__(16) uint32_t sharedMemory[8 * 1024];
+
+	aes_gpu_init256_32(sharedMemory);
+
+
+	const uint32_t P[48] = {
+		0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		//8-12
+		0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		//21-25
+		0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		//34-38
+		0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968,
+		0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
+		//58-61
+	};
+	uint32_t k0;
+	uint32_t h[16];
+
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	if (thread < threads){
+
+		const uint32_t *hash = (uint32_t*)&g_hash[thread << 3];
+
+		*(uint2x4*)&h[0] = __ldg4((uint2x4*)&hash[0]);
+		*(uint2x4*)&h[8] = __ldg4((uint2x4*)&hash[8]);
+
+		uint64_t backup = *(uint64_t*)&h[6];
+
+		k0 = 512 + 8;
+
+		__threadfence_block();
+
+#pragma unroll 4
+		for (uint32_t idx = 0; idx < 16; idx += 4)
+			
AES_2ROUND_32(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0);
+
+		k0 += 4;
+
+		uint32_t W[64];
+
+#pragma unroll 4
+		for (uint32_t i = 0; i < 4; i++){
+			uint32_t a = P[i];
+			uint32_t b = P[i + 4];
+			uint32_t c = h[i + 8];
+			uint32_t d = P[i + 8];
+
+			uint32_t ab = a ^ b;
+			uint32_t bc = b ^ c;
+			uint32_t cd = c ^ d;
+
+
+			uint32_t t = ((a ^ b) & 0x80808080);
+			uint32_t t2 = ((b ^ c) & 0x80808080);
+			uint32_t t3 = ((c ^ d) & 0x80808080);
+
+			uint32_t abx = ((t >> 7) * 27U) ^ ((ab^t) << 1);
+			uint32_t bcx = ((t2 >> 7) * 27U) ^ ((bc^t2) << 1);
+			uint32_t cdx = ((t3 >> 7) * 27U) ^ ((cd^t3) << 1);
+
+			W[0 + i] = bc ^ d ^ abx;
+			W[4 + i] = a ^ cd ^ bcx;
+			W[8 + i] = ab ^ d ^ cdx;
+			W[12 + i] = abx ^ bcx ^ cdx ^ ab ^ c;
+
+			a = P[12 + i];
+			b = h[i + 4];
+			c = P[12 + i + 4];
+			d = P[12 + i + 8];
+
+			ab = a ^ b;
+			bc = b ^ c;
+			cd = c ^ d;
+
+
+			t = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[16 + i] = abx ^ bc ^ d;
+			W[16 + i + 4] = bcx ^ a ^ cd;
+			W[16 + i + 8] = cdx ^ ab ^ d;
+			W[16 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
+
+			a = h[i];
+			b = P[24 + i];
+			c = P[24 + i + 4];
+			d = P[24 + i + 8];
+
+			ab = a ^ b;
+			bc = b ^ c;
+			cd = c ^ d;
+
+
+			t = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[32 + i] = abx ^ bc ^ d;
+			W[32 + i + 4] = bcx ^ a ^ cd;
+			W[32 + i + 8] = cdx ^ ab ^ d;
+			W[32 + i + 12] = abx ^ bcx ^ cdx ^ ab ^ c;
+
+			a = P[36 + i];
+			b = P[36 + i + 4];
+			c = P[36 + i + 8];
+			d = h[i + 12];
+
+			ab = a ^ b;
+			bc = b ^ c;
+			cd = c ^ d;
+
+			t = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[48 + i] = abx ^ bc ^ d;
+			W[48 + i + 4] = xor3(bcx , a , cd);
+			W[48 + i + 8] = xor3(cdx , ab, d);
+			W[48 + i + 12] = xor3(abx , bcx , xor3(cdx, ab, c));
+
+
+		}
+
+		for (int k = 1; k < 10; k++)
+			echo_round_sp(sharedMemory, W, k0);
+
+#pragma unroll 4
+		for (int i = 0; i < 16; i += 4)
+		{
+			W[i] ^= W[32 + i] ^ 512;
+			W[i + 1] ^= W[32 + i + 1];
+			W[i + 2] ^= W[32 + i + 2];
+			W[i + 3] ^= W[32 + i + 3];
+		}
+		uint64_t check = ((uint64_t*)hash)[3] ^ ((uint64_t*)W)[3];
+
+		if (check <= target)
+		{
+			uint32_t tmp = atomicExch(&resNonce[0], thread);
+			if (tmp != UINT32_MAX)
+				resNonce[1] = tmp;
+		}
+	}
+}
+
+__host__
+void x11_echo512_cpu_hash_64_final_sp(int thr_id, uint32_t threads, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target)
+{
+	const uint32_t threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	x11_echo512_gpu_hash_64_final_sp<<<grid, block>>>(threads, (uint64_t*)d_hash,d_resNonce,target);
+}
+
+__global__ __launch_bounds__(384, 2)
+static void x11_echo512_gpu_hash_64_sp(uint32_t threads, uint32_t *g_hash)
+{
+	__shared__ uint32_t sharedMemory[8 * 1024];
+
+	// if (threadIdx.x < 256)
+	// {
+		aes_gpu_init256_32(sharedMemory);
+	// }
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32_t k0;
+	uint32_t h[16];
+	uint32_t hash[16];
+	if (thread < threads){
+
+		uint32_t *Hash = &g_hash[thread << 4];
+
+		*(uint2x4*)&h[0] = __ldg4((uint2x4*)&Hash[0]);
+		*(uint2x4*)&h[8] = __ldg4((uint2x4*)&Hash[8]);
+
+		// *(uint2x4*)&hash[0] = 
*(uint2x4*)&h[0]; + // *(uint2x4*)&hash[8] = *(uint2x4*)&h[8]; + + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //8-12 + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //21-25 + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //34-38 + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + //58-61 + }; + + k0 = 520; + __threadfence_block(); + +#pragma unroll 4 + for (uint32_t idx = 0; idx < 16; idx += 4) + { + AES_2ROUND_32(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); + } + k0 += 4; + + uint32_t W[64]; + +#pragma unroll 4 + for (uint32_t i = 0; i < 4; i++) + { + uint32_t a = P[i]; + uint32_t b = P[i + 4]; + uint32_t c = h[i + 8]; + uint32_t d = P[i + 8]; + + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; + + + uint32_t t = (ab & 0x80808080); + uint32_t t2 = (bc & 0x80808080); + uint32_t t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[i] = abx ^ bc ^ d; + W[i + 4] = bcx ^ a ^ cd; + W[i + 8] = cdx ^ ab ^ d; + W[i + 12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[i + 12]; + b = h[i + 4]; + c = P[i + 16]; + d = P[i + 20]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[16 + i] = bc ^ d ^ abx; + W[16 + i + 4] = a ^ cd ^ bcx; + W[16 + i + 8] = d ^ ab ^ cdx; + W[16 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; + + a = h[i]; + b = P[24 + i + 0]; + c = P[24 + i + 4]; + d = P[24 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[32 + i] = bc ^ d ^ abx; + W[32 + i + 4] = a ^ cd ^ bcx; + W[32 + i + 8] = d ^ ab ^ cdx; + W[32 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; + + a = P[36 + i]; + b = P[36 + i + 4]; + c = P[36 + i + 8]; + d = h[i + 12]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[48 + i] = (bc ^ d ^ abx); + W[48 + i + 4] = (a ^ cd ^ bcx); + W[48 + i + 8] = (d^ ab^ cdx); + W[48 + i + 12] = (c ^ ab ^ (abx ^ bcx ^cdx)); + + } + + for (int k = 1; k < 10; k++) + echo_round_sp(sharedMemory, W, k0); + +#pragma unroll 4 + for (int i = 0; i < 16; i += 4) + { + W[i] ^= W[32 + i] ^ 512; + W[i + 1] ^= W[32 + i + 1]; + W[i + 2] ^= W[32 + i + 2]; + W[i + 3] ^= W[32 + i + 3]; + } + *(uint2x4*)&Hash[0] = *(uint2x4*)&Hash[0] ^ *(uint2x4*)&W[0]; + *(uint2x4*)&Hash[8] = *(uint2x4*)&Hash[8] ^ *(uint2x4*)&W[8]; + } +} + +__host__ +void x11_echo512_cpu_hash_64_sp(int thr_id, uint32_t threads, uint32_t *d_hash){ + + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 
block(threadsperblock);
 
+	x11_echo512_gpu_hash_64_sp<<<grid, block>>>(threads, d_hash);
+}
diff --git a/x11/cuda_x11_shavite512.cu b/x11/cuda_x11_shavite512.cu
index cfebf0d8ee..a4dbbb0132 100644
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@@ -8,12 +8,350 @@ extern __device__ __device_builtin__ void __threadfence_block(void);
 
 __constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
 
-#include "cuda_x11_aes.cuh"
+#define AESx(x) (x ##UL) /* SPH_C32(x) */
+
+#define TPB 128
+__device__ __align__(64) uint32_t d_AES0[256] = {
+	AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6),
+	AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591),
+	AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56),
+	AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC),
+	AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA),
+	AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB),
+	AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45),
+	AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B),
+	AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C),
+	AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83),
+	AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9),
+	AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A),
+	AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D),
+	AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F),
+	AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF),
+	AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA),
+	AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34),
+	AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B),
+	AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D),
+	AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413),
+	AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1),
+	AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6),
+	AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972),
+	AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85),
+	AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED),
+	AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511),
+	AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE),
+	AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B),
+	AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05),
+	AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1),
+	AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142),
+	AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF),
+	AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3),
+	AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E),
+	AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A),
+	AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6),
+	AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3),
+	AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B),
+	AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428),
+	AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD),
+	AESx(0x3BE0E0DB), 
AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), + AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), + AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), + AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), + AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), + AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), + AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), + AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), + AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), + AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), + AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), + AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), + AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), + AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), + AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), + AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), + AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), + AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), + AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), + AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), + AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), + AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), + AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), + AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) +}; + +__constant__ __align__(64) uint32_t d_AES1[256] = { + AESx(0x6363C6A5), AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), + AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), + AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), + AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), + AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), + AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), + AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), + AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), + AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), + AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), + AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), + AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), + AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), + AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), + AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), + AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), + AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), + AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), + AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), + AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), + AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), + AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), + AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), + AESx(0x4A4A94DE), AESx(0x4C4C98D4), 
AESx(0x5858B0E8), AESx(0xCFCF854A), + AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), + AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), + AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), + AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), + AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), + AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), + AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), + AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), + AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), + AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), + AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), + AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), + AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), + AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), + AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), + AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), + AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), + AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), + AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), + AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), + AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), + AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), + AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), + AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), + AESx(0xBABA6FD5), AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), + AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), + AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), + AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), + AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), + AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), + AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), + AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), + AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), + AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), + AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), + AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), + AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), + AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), + AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), + AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +}; + +__constant__ __align__(64) uint32_t d_AES2[256] = { + AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), + AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), + AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), + AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), + AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), + AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), + AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), 
AESx(0xAF45EAAF), + AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), + AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), + AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), + AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), + AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), + AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), + AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), + AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), + AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), + AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), + AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), + AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), + AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), + AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), + AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), + AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), + AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), + AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), + AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), + AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), + AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), + AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), + AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), + AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), + AESx(0x10203010), AESx(0xFFE51AFF), AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), + AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), + AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), + AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), + AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), + AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), + AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), + AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), + AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), + AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), + AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), + AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), + AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), + AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), + AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), + AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), + AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), + AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), + AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), + AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), + AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), + AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), + AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), + AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), 
AESx(0xB969D0B9), + AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), + AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), + AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), + AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), + AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), + AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), + AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), + AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), + AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) +}; + +__constant__ __align__(64) uint32_t d_AES3[256] = { + AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), + AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), + AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), + AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), + AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), + AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), + AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), + AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), + AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), + AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), + AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), + AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), + AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), + AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), + AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), AESx(0xDF3DE2E2), + AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), + AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), + AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), + AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), + AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), + AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), + AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), + AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), + AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), + AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), + AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), + AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), + AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), + AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), + AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), + AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), + AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), + AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), + AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), + AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), + AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), + AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), + AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), 
+ AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), + AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), + AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), + AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), + AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), + AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), + AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), + AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), + AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), + AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), + AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), + AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), + AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), + AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), + AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), + AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), + AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), + AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), + AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), + AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), + AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), + AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), + AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), + AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), + AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), AESx(0x1E110F0F), + AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) +}; + +#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d)); + + +__device__ +static void aes_round( +const uint32_t *sharedMemory, +uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0, +uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + __ldg(&d_AES0[x0 & 0xff]), //sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + __ldg(&d_AES0[x1 & 0xff]), //sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + __ldg(&d_AES0[x2 & 0xff]), //sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 + + y0 ^= k0; + + y3 = xor4_32( + __ldg(&d_AES0[x3 & 0xff]), //sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 +} + +__device__ +static void aes_round( +const uint32_t *sharedMemory, +uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, +uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + __ldg(&d_AES0[x0 & 0xff]),//sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + 
sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + __ldg(&d_AES0[x1 & 0xff]),//sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + __ldg(&d_AES0[x2 & 0xff]), //sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x1, 0, 0x4443) + 768]); // ^k2 + + y3 = xor4_32( + __ldg(&d_AES0[x3 & 0xff]), //sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 +} + + __device__ __forceinline__ static void AES_ROUND_NOKEY( - const uint32_t* __restrict__ sharedMemory, - uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3) +const uint32_t* __restrict__ sharedMemory, +uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3) { uint32_t y0, y1, y2, y3; aes_round(sharedMemory, @@ -28,8 +366,8 @@ static void AES_ROUND_NOKEY( __device__ __forceinline__ static void KEY_EXPAND_ELT( - const uint32_t* __restrict__ sharedMemory, - uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3) +const uint32_t* __restrict__ sharedMemory, +uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3) { uint32_t y0, y1, y2, y3; aes_round(sharedMemory, @@ -1464,7 +1802,7 @@ void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun __host__ void x11_shavite512_cpu_init(int thr_id, uint32_t threads) { - aes_cpu_init(thr_id); +// aes_cpu_init(thr_id); } __host__ diff --git a/x11/cuda_x11_shavite512_sp.cu b/x11/cuda_x11_shavite512_sp.cu new file mode 100644 index 0000000000..0c165f4dde --- /dev/null +++ b/x11/cuda_x11_shavite512_sp.cu @@ -0,0 +1,987 @@ +/* + Based on Tanguy Pruvot's repo + Provos Alexis - 2016 + optimized by sp - 2018/2019 +*/ +#include "cuda_helper_alexis.h" +#include "cuda_vectors_alexis.h" + +#define INTENSIVE_GMF +#include "cuda_x11_aes_sp.cuh" +__constant__ uint32_t c_PaddedMessage80[20]; // padded message (80 bytes + padding) + +__device__ __forceinline__ void aes_round_s(const uint32_t sharedMemory[256][32], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, const uint32_t k0, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + const uint32_t index = threadIdx.x & 0x1f; + + y0 = sharedMemory[__byte_perm(x0, 0, 0x4440)][index]; + y3 = ROL8(sharedMemory[__byte_perm(x0, 0, 0x4441)][index]); + y2 = ROL16(sharedMemory[__byte_perm(x0, 0, 0x4442)][index]); + y1 = ROR8(sharedMemory[__byte_perm(x0, 0, 0x4443)][index]); + + y1 ^= sharedMemory[__byte_perm(x1, 0, 0x4440)][index]; + y0 ^= ROL8(sharedMemory[__byte_perm(x1, 0, 0x4441)][index]); + y3 ^= ROL16(sharedMemory[__byte_perm(x1, 0, 0x4442)][index]); + y2 ^= ROR8(sharedMemory[__byte_perm(x1, 0, 0x4443)][index]); + + y0 ^= k0; + + y2 ^= sharedMemory[__byte_perm(x2, 0, 0x4440)][index]; + y1 ^= ROL8(sharedMemory[__byte_perm(x2, 0, 0x4441)][index]); + y0 ^= ROL16(sharedMemory[__byte_perm(x2, 0, 0x4442)][index]); + y3 ^= ROR8(sharedMemory[__byte_perm(x2, 0, 0x4443)][index]); + + y3 ^= sharedMemory[__byte_perm(x3, 0, 0x4440)][index]; + y2 ^= ROL8(sharedMemory[__byte_perm(x3, 0, 0x4441)][index]); + y1 ^= ROL16(sharedMemory[__byte_perm(x3, 0, 0x4442)][index]); + y0 ^= ROR8(sharedMemory[__byte_perm(x3, 0, 0x4443)][index]); +} + +__device__ __forceinline__ void aes_round_s(const uint32_t 
sharedMemory[256][32], const uint32_t x0, const uint32_t x1, const uint32_t x2, const uint32_t x3, uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + const uint32_t index = threadIdx.x & 0x1f; + + y0 = sharedMemory[__byte_perm(x0, 0, 0x4440)][index]; + y3 = ROL8(sharedMemory[__byte_perm(x0, 0, 0x4441)][index]); + y2 = ROL16(sharedMemory[__byte_perm(x0, 0, 0x4442)][index]); + y1 = ROR8(sharedMemory[__byte_perm(x0, 0, 0x4443)][index]); + + y1 ^= sharedMemory[__byte_perm(x1, 0, 0x4440)][index]; + y0 ^= ROL8(sharedMemory[__byte_perm(x1, 0, 0x4441)][index]); + y3 ^= ROL16(sharedMemory[__byte_perm(x1, 0, 0x4442)][index]); + y2 ^= ROR8(sharedMemory[__byte_perm(x1, 0, 0x4443)][index]); + + y2 ^= sharedMemory[__byte_perm(x2, 0, 0x4440)][index]; + y1 ^= ROL8(sharedMemory[__byte_perm(x2, 0, 0x4441)][index]); + y0 ^= ROL16(sharedMemory[__byte_perm(x2, 0, 0x4442)][index]); + y3 ^= ROR8(sharedMemory[__byte_perm(x2, 0, 0x4443)][index]); + + y3 ^= sharedMemory[__byte_perm(x3, 0, 0x4440)][index]; + y2 ^= ROL8(sharedMemory[__byte_perm(x3, 0, 0x4441)][index]); + y1 ^= ROL16(sharedMemory[__byte_perm(x3, 0, 0x4442)][index]); + y0 ^= ROR8(sharedMemory[__byte_perm(x3, 0, 0x4443)][index]); +} + + +__device__ __forceinline__ void AES_ROUND_NOKEY_s(const uint32_t sharedMemory[256][32], uint4* x){ + + uint32_t y0, y1, y2, y3; + aes_round_s(sharedMemory, x->x, x->y, x->z, x->w, y0, y1, y2, y3); + + x->x = y0; + x->y = y1; + x->z = y2; + x->w = y3; +} + +__device__ __forceinline__ void KEY_EXPAND_ELT_s(const uint32_t sharedMemory[256][32], uint32_t *k) +{ + + uint32_t y0, y1, y2, y3; + aes_round_s(sharedMemory, k[0], k[1], k[2], k[3], y0, y1, y2, y3); + + k[0] = y1; + k[1] = y2; + k[2] = y3; + k[3] = y0; +} + + +__device__ __forceinline__ +void aes_gpu_init256_s(uint32_t sharedMemory[256][32]) +{ + uint32_t temp = d_AES0[threadIdx.x]; + + sharedMemory[threadIdx.x][0] = temp; + sharedMemory[threadIdx.x][1] = temp; + sharedMemory[threadIdx.x][2] = temp; + sharedMemory[threadIdx.x][3] = temp; + sharedMemory[threadIdx.x][4] = temp; + sharedMemory[threadIdx.x][5] = temp; + sharedMemory[threadIdx.x][6] = temp; + sharedMemory[threadIdx.x][7] = temp; + sharedMemory[threadIdx.x][8] = temp; + sharedMemory[threadIdx.x][9] = temp; + sharedMemory[threadIdx.x][10] = temp; + sharedMemory[threadIdx.x][11] = temp; + sharedMemory[threadIdx.x][12] = temp; + sharedMemory[threadIdx.x][13] = temp; + sharedMemory[threadIdx.x][14] = temp; + sharedMemory[threadIdx.x][15] = temp; + sharedMemory[threadIdx.x][16] = temp; + sharedMemory[threadIdx.x][17] = temp; + sharedMemory[threadIdx.x][18] = temp; + sharedMemory[threadIdx.x][19] = temp; + sharedMemory[threadIdx.x][20] = temp; + sharedMemory[threadIdx.x][21] = temp; + sharedMemory[threadIdx.x][22] = temp; + sharedMemory[threadIdx.x][23] = temp; + sharedMemory[threadIdx.x][24] = temp; + sharedMemory[threadIdx.x][25] = temp; + sharedMemory[threadIdx.x][26] = temp; + sharedMemory[threadIdx.x][27] = temp; + sharedMemory[threadIdx.x][28] = temp; + sharedMemory[threadIdx.x][29] = temp; + sharedMemory[threadIdx.x][30] = temp; + sharedMemory[threadIdx.x][31] = temp; +} + + +__device__ __forceinline__ void round_3_7_11_s(const uint32_t sharedMemory[256][32], uint32_t* r, uint4 *p, uint4 &x){ + KEY_EXPAND_ELT_s(sharedMemory, &r[ 0]); + *(uint4*)&r[ 0] ^= *(uint4*)&r[28]; + x = p[ 2] ^ *(uint4*)&r[ 0]; + KEY_EXPAND_ELT_s(sharedMemory, &r[ 4]); + r[4] ^= r[0]; + r[5] ^= r[1]; + r[6] ^= r[2]; + r[7] ^= r[3]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + x.x ^= r[4]; + x.y ^= r[5]; + x.z ^= r[6]; + x.w 
^= r[7];
+ KEY_EXPAND_ELT_s(sharedMemory, &r[ 8]);
+ r[8] ^= r[4];
+ r[9] ^= r[5];
+ r[10] ^= r[6];
+ r[11] ^= r[7];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ x.x ^= r[8];
+ x.y ^= r[9];
+ x.z ^= r[10];
+ x.w ^= r[11];
+ KEY_EXPAND_ELT_s(sharedMemory, &r[12]);
+ r[12] ^= r[8];
+ r[13] ^= r[9];
+ r[14] ^= r[10];
+ r[15] ^= r[11];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ x.x ^= r[12];
+ x.y ^= r[13];
+ x.z ^= r[14];
+ x.w ^= r[15];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ p[ 1].x ^= x.x;
+ p[ 1].y ^= x.y;
+ p[ 1].z ^= x.z;
+ p[ 1].w ^= x.w;
+ KEY_EXPAND_ELT_s(sharedMemory, &r[16]);
+ *(uint4*)&r[16] ^= *(uint4*)&r[12];
+ x = p[ 0] ^ *(uint4*)&r[16];
+ KEY_EXPAND_ELT_s(sharedMemory, &r[20]);
+ *(uint4*)&r[20] ^= *(uint4*)&r[16];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ x ^= *(uint4*)&r[20];
+ KEY_EXPAND_ELT_s(sharedMemory, &r[24]);
+ *(uint4*)&r[24] ^= *(uint4*)&r[20];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ x ^= *(uint4*)&r[24];
+ KEY_EXPAND_ELT_s(sharedMemory, &r[28]);
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ *(uint4*)&r[28] ^= *(uint4*)&r[24];
+ x ^= *(uint4*)&r[28];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ p[ 3] ^= x;
+}
+
+__device__ __forceinline__
+void round_4_8_12_s(const uint32_t sharedMemory[256][32], uint32_t* r, uint4 *p, uint4 &x){
+ *(uint4*)&r[ 0] ^= *(uint4*)&r[25];
+ x = p[ 1] ^ *(uint4*)&r[ 0];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+
+ r[ 4] ^= r[29]; r[ 5] ^= r[30];
+ r[ 6] ^= r[31]; r[ 7] ^= r[ 0];
+
+ x ^= *(uint4*)&r[ 4];
+ *(uint4*)&r[ 8] ^= *(uint4*)&r[ 1];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ x ^= *(uint4*)&r[ 8];
+ *(uint4*)&r[12] ^= *(uint4*)&r[ 5];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ x ^= *(uint4*)&r[12];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ p[ 0] ^= x;
+ *(uint4*)&r[16] ^= *(uint4*)&r[ 9];
+ x = p[ 3] ^ *(uint4*)&r[16];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ *(uint4*)&r[20] ^= *(uint4*)&r[13];
+ x ^= *(uint4*)&r[20];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ *(uint4*)&r[24] ^= *(uint4*)&r[17];
+ x ^= *(uint4*)&r[24];
+ *(uint4*)&r[28] ^= *(uint4*)&r[21];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ x ^= *(uint4*)&r[28];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ p[ 2] ^= x;
+}
+
+// GPU Hash
+__global__ __launch_bounds__(448, 2) /* 64 registers with 128,8 - 72 regs with 128,7 */
+void x11_shavite512_gpu_hash_64_sp(const uint32_t threads, uint64_t *g_hash)
+{
+ __shared__ uint32_t sharedMemory[256][32];
+
+ if(threadIdx.x<256) aes_gpu_init256_s(sharedMemory);
+
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+ uint4 p[ 4];
+ uint4 x;
+ uint32_t r[32];
+
+ // copy the initial state
+ const uint32_t state[16] = {
+ 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
+ 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
+ };
+ if (thread < threads)
+ {
+ uint64_t *Hash = &g_hash[thread<<3];
+
+ // fill the message with 64 bytes (the previous hash)
+ *(uint2x4*)&r[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
+ __syncthreads();
+
+ *(uint2x4*)&p[ 0] = *(uint2x4*)&state[ 0];
+ *(uint2x4*)&p[ 2] = *(uint2x4*)&state[ 8];
+ r[16] = 0x80; r[17] = 0; r[18] = 0; r[19] = 0;
+ r[20] = 0; r[21] = 0; r[22] = 0; r[23] = 0;
+ r[24] = 0; r[25] = 0; r[26] = 0; r[27] = 0x02000000;
+ r[28] = 0; r[29] = 0; r[30] = 0; r[31] = 0x02000000;
+ /* round 0 */
+ x = p[ 1] ^ *(uint4*)&r[ 0];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+ x ^= *(uint4*)&r[ 4];
+ AES_ROUND_NOKEY_s(sharedMemory, &x);
+
+ // the second 32 bytes of the message are loaded late, overlapping the global read with the first rounds
+ *(uint2x4*)&r[8] = __ldg4((uint2x4*)&Hash[4]);
+ x ^= *(uint4*)&r[ 8];
+ AES_ROUND_NOKEY_s(sharedMemory,
&x); + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 0] ^= x; + + x = p[ 3]; + x.x ^= 0x80; + AES_ROUND_NOKEY_s(sharedMemory, &x); + AES_ROUND_NOKEY_s(sharedMemory, &x); + x.w ^= 0x02000000; + AES_ROUND_NOKEY_s(sharedMemory, &x); + x.w ^= 0x02000000; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 2]^= x; + + + // 1 + KEY_EXPAND_ELT_s(sharedMemory, &r[ 0]); + *(uint4*)&r[ 0]^=*(uint4*)&r[28]; + r[ 0] ^= 0x200; + r[3] = ~r[3]; + + x = p[ 0] ^ *(uint4*)&r[ 0]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[ 4]); + *(uint4*)&r[ 4] ^= *(uint4*)&r[ 0]; + x ^= *(uint4*)&r[ 4]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[ 8]); + *(uint4*)&r[ 8] ^= *(uint4*)&r[ 4]; + x ^= *(uint4*)&r[ 8]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[ 8]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 3] ^= x; + KEY_EXPAND_ELT_s(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[ 2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 1] ^= x; + *(uint4*)&r[ 0] ^= *(uint4*)&r[25]; + x = p[ 3] ^ *(uint4*)&r[ 0]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + + r[ 4] ^= r[29]; r[ 5] ^= r[30]; + r[ 6] ^= r[31]; r[ 7] ^= r[ 0]; + + x ^= *(uint4*)&r[ 4]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[ 8] ^= *(uint4*)&r[ 1]; + x ^= *(uint4*)&r[ 8]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[12] ^= *(uint4*)&r[ 5]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 2] ^= x; + *(uint4*)&r[16] ^= *(uint4*)&r[ 9]; + x = p[ 1] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY_s(sharedMemory, &x); +// *(uint4*)&r[24] ^= *(uint4*)&r[17]; + r[24] ^= r[17]; + r[25] ^= r[18]; + r[26] ^= r[19]; + r[27] ^= r[20]; + + + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + + p[ 0] ^= x; + + /* round 3, 7, 11 */ + round_3_7_11_s(sharedMemory,r,p,x); + + + /* round 4, 8, 12 */ + round_4_8_12_s(sharedMemory,r,p,x); + + // 2 + KEY_EXPAND_ELT_s(sharedMemory,&r[ 0]); + *(uint4*)&r[ 0] ^= *(uint4*)&r[28]; + x = p[ 0] ^ *(uint4*)&r[ 0]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[ 4]); + *(uint4*)&r[ 4] ^= *(uint4*)&r[ 0]; + r[ 7] ^= (~0x200); + x ^= *(uint4*)&r[ 4]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[ 8]); + *(uint4*)&r[ 8] ^= *(uint4*)&r[ 4]; + x ^= *(uint4*)&r[ 8]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[ 8]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 3] ^= x; + KEY_EXPAND_ELT_s(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[ 2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + x ^= *(uint4*)&r[20]; + 
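/* note: AES_ROUND_NOKEY_s is one key-less AES round; it reads a single 256-entry T-table per column and rebuilds the other three classic tables with ROL8/ROL16/ROR8 byte rotations, so the round keys enter only through the surrounding XORs */ +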
AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory,&r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 1] ^= x; + + *(uint4*)&r[ 0] ^= *(uint4*)&r[25]; + x = p[ 3] ^ *(uint4*)&r[ 0]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + r[ 4] ^= r[29]; + r[ 5] ^= r[30]; + r[ 6] ^= r[31]; + r[ 7] ^= r[ 0]; + x ^= *(uint4*)&r[ 4]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[ 8] ^= *(uint4*)&r[ 1]; + x ^= *(uint4*)&r[ 8]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[12] ^= *(uint4*)&r[ 5]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 2] ^= x; + *(uint4*)&r[16] ^= *(uint4*)&r[ 9]; + x = p[ 1] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[24] ^= *(uint4*)&r[17]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 0] ^= x; + + /* round 3, 7, 11 */ + round_3_7_11_s(sharedMemory,r,p,x); + + /* round 4, 8, 12 */ + round_4_8_12_s(sharedMemory,r,p,x); + + // 3 + KEY_EXPAND_ELT_s(sharedMemory,&r[ 0]); + *(uint4*)&r[ 0] ^= *(uint4*)&r[28]; + x = p[ 0] ^ *(uint4*)&r[ 0]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[ 4]); + *(uint4*)&r[ 4] ^= *(uint4*)&r[ 0]; + x ^= *(uint4*)&r[ 4]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[ 8]); + *(uint4*)&r[ 8] ^= *(uint4*)&r[ 4]; + x ^= *(uint4*)&r[ 8]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[ 8]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 3] ^= x; + KEY_EXPAND_ELT_s(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[ 2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + x^=*(uint4*)&r[20]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[24]); + *(uint4*)&r[24]^=*(uint4*)&r[20]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory,&r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + r[30] ^= 0x200; + r[31] = ~r[31]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 1] ^= x; + + *(uint4*)&r[ 0] ^= *(uint4*)&r[25]; + x = p[ 3] ^ *(uint4*)&r[ 0]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + r[ 4] ^= r[29]; + r[ 5] ^= r[30]; + r[ 6] ^= r[31]; + r[ 7] ^= r[ 0]; + x ^= *(uint4*)&r[ 4]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[ 8] ^= *(uint4*)&r[ 1]; + x ^= *(uint4*)&r[ 8]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[12] ^= *(uint4*)&r[ 5]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 2] ^= x; + *(uint4*)&r[16] ^= *(uint4*)&r[ 9]; + x = p[ 1] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[24] ^= *(uint4*)&r[17]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 0] ^= x; + + /* round 3, 7, 11 */ + round_3_7_11_s(sharedMemory,r,p,x); + + /* round 4, 8, 12 */ + 
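/* note: round_4_8_12_s below folds only previously expanded key words into the state (plain XOR chains, no fresh KEY_EXPAND_ELT_s), matching SHAvite-3's alternation of AES-expanded and linearly mixed subkeys */ +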
round_4_8_12_s(sharedMemory,r,p,x); + + /* round 13 */ + KEY_EXPAND_ELT_s(sharedMemory,&r[ 0]); + *(uint4*)&r[ 0] ^= *(uint4*)&r[28]; + x = p[ 0] ^ *(uint4*)&r[ 0]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[ 4]); + *(uint4*)&r[ 4] ^= *(uint4*)&r[ 0]; + x ^= *(uint4*)&r[ 4]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[ 8]); + *(uint4*)&r[ 8] ^= *(uint4*)&r[ 4]; + x ^= *(uint4*)&r[ 8]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[ 8]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 3] ^= x; + KEY_EXPAND_ELT_s(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[ 2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + r[25] ^= 0x200; + r[27] = ~r[27]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + KEY_EXPAND_ELT_s(sharedMemory,&r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY_s(sharedMemory, &x); + p[ 1] ^= x; + + *(uint2x4*)&Hash[ 0] = *(uint2x4*)&state[ 0] ^ *(uint2x4*)&p[ 2]; + *(uint2x4*)&Hash[ 4] = *(uint2x4*)&state[ 8] ^ *(uint2x4*)&p[ 0]; + } +} + +__device__ __forceinline__ void round_3_7_11(const uint32_t sharedMemory[8*1024], uint32_t* r, uint4 *p, uint4 &x){ + KEY_EXPAND_ELT_32(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + x = p[2] ^ *(uint4*)&r[0]; + KEY_EXPAND_ELT_32(sharedMemory, &r[4]); + r[4] ^= r[0]; + r[5] ^= r[1]; + r[6] ^= r[2]; + r[7] ^= r[3]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + x.x ^= r[4]; + x.y ^= r[5]; + x.z ^= r[6]; + x.w ^= r[7]; + KEY_EXPAND_ELT_32(sharedMemory, &r[8]); + r[8] ^= r[4]; + r[9] ^= r[5]; + r[10] ^= r[6]; + r[11] ^= r[7]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + x.x ^= r[8]; + x.y ^= r[9]; + x.z ^= r[10]; + x.w ^= r[11]; + KEY_EXPAND_ELT_32(sharedMemory, &r[12]); + r[12] ^= r[8]; + r[13] ^= r[9]; + r[14] ^= r[10]; + r[15] ^= r[11]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + x.x ^= r[12]; + x.y ^= r[13]; + x.z ^= r[14]; + x.w ^= r[15]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + p[1].x ^= x.x; + p[1].y ^= x.y; + p[1].z ^= x.z; + p[1].w ^= x.w; + KEY_EXPAND_ELT_32(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[0] ^ *(uint4*)&r[16]; + KEY_EXPAND_ELT_32(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + x ^= *(uint4*)&r[20]; + KEY_EXPAND_ELT_32(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + x ^= *(uint4*)&r[24]; + KEY_EXPAND_ELT_32(sharedMemory, &r[28]); + AES_ROUND_NOKEY_32(sharedMemory, &x); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + p[3] ^= x; +} + +__device__ __forceinline__ void round_4_8_12(const uint32_t sharedMemory[8*1024], uint32_t* r, uint4 *p, uint4 &x){ + *(uint4*)&r[0] ^= *(uint4*)&r[25]; + x = p[1] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + + r[4] ^= r[29]; r[5] ^= r[30]; + r[6] ^= r[31]; r[7] ^= r[0]; + + x ^= *(uint4*)&r[4]; + *(uint4*)&r[8] ^= *(uint4*)&r[1]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + x ^= *(uint4*)&r[8]; + *(uint4*)&r[12] ^= *(uint4*)&r[5]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + p[0] ^= 
x;
+ *(uint4*)&r[16] ^= *(uint4*)&r[9];
+ x = p[3] ^ *(uint4*)&r[16];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ *(uint4*)&r[20] ^= *(uint4*)&r[13];
+ x ^= *(uint4*)&r[20];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ *(uint4*)&r[24] ^= *(uint4*)&r[17];
+ x ^= *(uint4*)&r[24];
+ *(uint4*)&r[28] ^= *(uint4*)&r[21];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ x ^= *(uint4*)&r[28];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ p[2] ^= x;
+}
+
+
+__global__ __launch_bounds__(384, 2)
+void x11_shavite512_gpu_hash_64_sp_final(const uint32_t threads, uint64_t *g_hash, uint32_t* resNonce, const uint64_t target)
+{
+ __shared__ uint32_t sharedMemory[8 * 1024];
+
+ if (threadIdx.x<256) aes_gpu_init256_32(sharedMemory);
+
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+ uint4 p[4];
+ uint4 x;
+ uint32_t r[32];
+
+ // copy the initial state
+ const uint32_t state[16] = {
+ 0x72FCCDD8, 0x79CA4727, 0x128A077B, 0x40D55AEC, 0xD1901A06, 0x430AE307, 0xB29F5CD1, 0xDF07FBFC,
+ 0x8E45D73D, 0x681AB538, 0xBDE86578, 0xDD577E47, 0xE275EADE, 0x502D9FCD, 0xB9357178, 0x022A4B9A
+ };
+ if (thread < threads)
+ {
+ uint2 *Hash = (uint2 *)&g_hash[thread << 3];
+
+ // fill the message with 64 bytes (the previous hash)
+ *(uint2x4*)&r[0] = __ldg4((uint2x4*)&Hash[0]);
+ *(uint2x4*)&r[8] = __ldg4((uint2x4*)&Hash[4]);
+ __syncthreads();
+
+ *(uint2x4*)&p[0] = *(uint2x4*)&state[0];
+ *(uint2x4*)&p[2] = *(uint2x4*)&state[8];
+ r[16] = 0x80; r[17] = 0; r[18] = 0; r[19] = 0;
+ r[20] = 0; r[21] = 0; r[22] = 0; r[23] = 0;
+ r[24] = 0; r[25] = 0; r[26] = 0; r[27] = 0x02000000;
+ r[28] = 0; r[29] = 0; r[30] = 0; r[31] = 0x02000000;
+ /* round 0 */
+ x = p[1] ^ *(uint4*)&r[0];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ x ^= *(uint4*)&r[4];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ x ^= *(uint4*)&r[8];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ x ^= *(uint4*)&r[12];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ p[0] ^= x;
+ x = p[3];
+ x.x ^= 0x80;
+
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+
+ x.w ^= 0x02000000;
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+
+ x.w ^= 0x02000000;
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ p[2] ^= x;
+ // 1
+ KEY_EXPAND_ELT_32(sharedMemory, &r[0]);
+ *(uint4*)&r[0] ^= *(uint4*)&r[28];
+ r[0] ^= 0x200;
+ r[3] = ~r[3];
+ x = p[0] ^ *(uint4*)&r[0];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[4]);
+ *(uint4*)&r[4] ^= *(uint4*)&r[0];
+ x ^= *(uint4*)&r[4];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[8]);
+ *(uint4*)&r[8] ^= *(uint4*)&r[4];
+ x ^= *(uint4*)&r[8];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[12]);
+ *(uint4*)&r[12] ^= *(uint4*)&r[8];
+ x ^= *(uint4*)&r[12];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ p[3] ^= x;
+ KEY_EXPAND_ELT_32(sharedMemory, &r[16]);
+ *(uint4*)&r[16] ^= *(uint4*)&r[12];
+ x = p[2] ^ *(uint4*)&r[16];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[20]);
+ *(uint4*)&r[20] ^= *(uint4*)&r[16];
+ x ^= *(uint4*)&r[20];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[24]);
+ *(uint4*)&r[24] ^= *(uint4*)&r[20];
+ x ^= *(uint4*)&r[24];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[28]);
+ *(uint4*)&r[28] ^= *(uint4*)&r[24];
+ x ^= *(uint4*)&r[28];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ p[1] ^= x;
+ *(uint4*)&r[0] ^= *(uint4*)&r[25];
+ x = p[3] ^ *(uint4*)&r[0];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+
+ r[4] ^= r[29]; r[5] ^= r[30];
+ r[6] ^= r[31]; r[7] ^=
r[0]; + + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + *(uint4*)&r[8] ^= *(uint4*)&r[1]; + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + *(uint4*)&r[12] ^= *(uint4*)&r[5]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + p[2] ^= x; + *(uint4*)&r[16] ^= *(uint4*)&r[9]; + x = p[1] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + *(uint4*)&r[24] ^= *(uint4*)&r[17]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + + p[0] ^= x; + + /* round 3, 7, 11 */ + round_3_7_11(sharedMemory, r, p, x); + + + /* round 4, 8, 12 */ + round_4_8_12(sharedMemory, r, p, x); + + // 2 + KEY_EXPAND_ELT_32(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + x = p[0] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + KEY_EXPAND_ELT_32(sharedMemory, &r[4]); + *(uint4*)&r[4] ^= *(uint4*)&r[0]; + r[7] ^= (~0x200); + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + KEY_EXPAND_ELT_32(sharedMemory, &r[8]); + *(uint4*)&r[8] ^= *(uint4*)&r[4]; + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + KEY_EXPAND_ELT_32(sharedMemory, &r[12]); + *(uint4*)&r[12] ^= *(uint4*)&r[8]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + p[3] ^= x; + KEY_EXPAND_ELT_32(sharedMemory, &r[16]); + *(uint4*)&r[16] ^= *(uint4*)&r[12]; + x = p[2] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + KEY_EXPAND_ELT_32(sharedMemory, &r[20]); + *(uint4*)&r[20] ^= *(uint4*)&r[16]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + KEY_EXPAND_ELT_32(sharedMemory, &r[24]); + *(uint4*)&r[24] ^= *(uint4*)&r[20]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + KEY_EXPAND_ELT_32(sharedMemory, &r[28]); + *(uint4*)&r[28] ^= *(uint4*)&r[24]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + p[1] ^= x; + + *(uint4*)&r[0] ^= *(uint4*)&r[25]; + x = p[3] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + r[4] ^= r[29]; + r[5] ^= r[30]; + r[6] ^= r[31]; + r[7] ^= r[0]; + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + *(uint4*)&r[8] ^= *(uint4*)&r[1]; + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + *(uint4*)&r[12] ^= *(uint4*)&r[5]; + x ^= *(uint4*)&r[12]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + p[2] ^= x; + *(uint4*)&r[16] ^= *(uint4*)&r[9]; + x = p[1] ^ *(uint4*)&r[16]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + *(uint4*)&r[20] ^= *(uint4*)&r[13]; + x ^= *(uint4*)&r[20]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + *(uint4*)&r[24] ^= *(uint4*)&r[17]; + x ^= *(uint4*)&r[24]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + *(uint4*)&r[28] ^= *(uint4*)&r[21]; + x ^= *(uint4*)&r[28]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + p[0] ^= x; + + /* round 3, 7, 11 */ + round_3_7_11(sharedMemory, r, p, x); + + /* round 4, 8, 12 */ + round_4_8_12(sharedMemory, r, p, x); + + // 3 + KEY_EXPAND_ELT_32(sharedMemory, &r[0]); + *(uint4*)&r[0] ^= *(uint4*)&r[28]; + x = p[0] ^ *(uint4*)&r[0]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + KEY_EXPAND_ELT_32(sharedMemory, &r[4]); + *(uint4*)&r[4] ^= *(uint4*)&r[0]; + x ^= *(uint4*)&r[4]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + KEY_EXPAND_ELT_32(sharedMemory, &r[8]); + *(uint4*)&r[8] ^= *(uint4*)&r[4]; + x ^= *(uint4*)&r[8]; + AES_ROUND_NOKEY_32(sharedMemory, &x); + KEY_EXPAND_ELT_32(sharedMemory, &r[12]); + 
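/* note: from here the _final kernel repeats the standard schedule through rounds 3/7/11 and 4/8/12, then computes only the p[3] half of round 13 that the nonce test needs */ +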
*(uint4*)&r[12] ^= *(uint4*)&r[8];
+ x ^= *(uint4*)&r[12];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ p[3] ^= x;
+ KEY_EXPAND_ELT_32(sharedMemory, &r[16]);
+ *(uint4*)&r[16] ^= *(uint4*)&r[12];
+ x = p[2] ^ *(uint4*)&r[16];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[20]);
+ *(uint4*)&r[20] ^= *(uint4*)&r[16];
+ x ^= *(uint4*)&r[20];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[24]);
+ *(uint4*)&r[24] ^= *(uint4*)&r[20];
+ x ^= *(uint4*)&r[24];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[28]);
+ *(uint4*)&r[28] ^= *(uint4*)&r[24];
+ r[30] ^= 0x200;
+ r[31] = ~r[31];
+ x ^= *(uint4*)&r[28];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ p[1] ^= x;
+
+ *(uint4*)&r[0] ^= *(uint4*)&r[25];
+ x = p[3] ^ *(uint4*)&r[0];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ r[4] ^= r[29];
+ r[5] ^= r[30];
+ r[6] ^= r[31];
+ r[7] ^= r[0];
+ x ^= *(uint4*)&r[4];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ *(uint4*)&r[8] ^= *(uint4*)&r[1];
+ x ^= *(uint4*)&r[8];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ *(uint4*)&r[12] ^= *(uint4*)&r[5];
+ x ^= *(uint4*)&r[12];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ p[2] ^= x;
+ *(uint4*)&r[16] ^= *(uint4*)&r[9];
+ x = p[1] ^ *(uint4*)&r[16];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ *(uint4*)&r[20] ^= *(uint4*)&r[13];
+ x ^= *(uint4*)&r[20];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ *(uint4*)&r[24] ^= *(uint4*)&r[17];
+ x ^= *(uint4*)&r[24];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ *(uint4*)&r[28] ^= *(uint4*)&r[21];
+ x ^= *(uint4*)&r[28];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ p[0] ^= x;
+
+ /* round 3, 7, 11 */
+ round_3_7_11(sharedMemory, r, p, x);
+
+ /* round 4, 8, 12 */
+ round_4_8_12(sharedMemory, r, p, x);
+
+ /* round 13 */
+ KEY_EXPAND_ELT_32(sharedMemory, &r[0]);
+ *(uint4*)&r[0] ^= *(uint4*)&r[28];
+ x = p[0] ^ *(uint4*)&r[0];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[4]);
+ *(uint4*)&r[4] ^= *(uint4*)&r[0];
+ x ^= *(uint4*)&r[4];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[8]);
+ *(uint4*)&r[8] ^= *(uint4*)&r[4];
+ x ^= *(uint4*)&r[8];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[12]);
+ *(uint4*)&r[12] ^= *(uint4*)&r[8];
+ x ^= *(uint4*)&r[12];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ p[3] ^= x;
+ /* only p[3] feeds the nonce test below, so the rest of round 13 stays disabled:
+ KEY_EXPAND_ELT_32(sharedMemory, &r[16]);
+ *(uint4*)&r[16] ^= *(uint4*)&r[12];
+ x = p[2] ^ *(uint4*)&r[16];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[20]);
+ *(uint4*)&r[20] ^= *(uint4*)&r[16];
+ x ^= *(uint4*)&r[20];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[24]);
+ *(uint4*)&r[24] ^= *(uint4*)&r[20];
+ r[25] ^= 0x200;
+ r[27] ^= 0xFFFFFFFF;
+ x ^= *(uint4*)&r[24];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ KEY_EXPAND_ELT_32(sharedMemory, &r[28]);
+ *(uint4*)&r[28] ^= *(uint4*)&r[24];
+ x ^= *(uint4*)&r[28];
+ AES_ROUND_NOKEY_32(sharedMemory, &x);
+ p[1] ^= x;
+ */
+ //Hash[3] =
+ uint64_t test = (((uint64_t *)state)[3] ^ devectorize(make_uint2(p[3].z, p[3].w)));
+
+ if (test <= target)
+ {
+ const uint32_t tmp = atomicExch(&resNonce[0], thread);
+ if (tmp != UINT32_MAX)
+ resNonce[1] = tmp;
+ }
+ }
+}
+
+
+__host__
+void x11_shavite512_cpu_hash_64_sp(int thr_id, uint32_t threads, uint32_t *d_hash)
+{
+ dim3 grid((threads + 256 - 1) / 256);
+ dim3 block(256);
+
+ x11_shavite512_gpu_hash_64_sp<<<grid, block>>>(threads, (uint64_t*)d_hash);
+}
+
+__host__
+void x11_shavite512_cpu_hash_64_sp_final(int thr_id, uint32_t threads, uint32_t *d_hash, const uint64_t target, uint32_t* resNonce)
+{
+ dim3 grid((threads + 384 - 1) / 384);
+ dim3 block(384);
+
+ x11_shavite512_gpu_hash_64_sp_final<<<grid, block>>>(threads, (uint64_t*)d_hash, resNonce, target);
+}
\ No newline at end of file
diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu
index 5495edadf4..28b48ea52a 100644
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@@ -1,734 +1,1720 @@
-/***************************************************************************************************
- * SIMD512 SM3+ CUDA IMPLEMENTATION (require cuda_x11_simd512_func.cuh)
- */
+/*
+Based upon the 2 Christians', klaus_t's, Tanguy Pruvot's, tsiv's and SP's work (2013-2016)
+Provos Alexis - 2016
+optimized by sp - 2018 (+20% faster on the gtx 1080ti)
+*/
 #include "miner.h"
-#include "cuda_helper.h"
+#include "cuda_helper_alexis.h"
+#include "cuda_vectors_alexis.h"
+#include "cuda_x11_aes_sp.cuh"
-#define TPB 128
+//#define INTENSIVE_GMF
+//#include "cuda_x11_aes.cuh"
+#include "x15/cuda_whirlpool_tables.cuh"
-uint32_t *d_state[MAX_GPUS];
-uint4 *d_temp4[MAX_GPUS];
+__device__ __forceinline__ uint32_t xor3(uint32_t a, uint32_t b, uint32_t c)
+{
+ asm("lop3.b32 %0, %0, %1, %2, 0x96;" : "+r"(a) : "r"(b), "r"(c)); // 0x96 = 0xF0 ^ 0xCC ^ 0xAA
+ return a;
+}
+//--------START OF WHIRLPOOL DEVICE MACROS---------------------------------------------------------------------------
+__constant__ static uint2 b0[256];
-// texture bound to d_temp4[thr_id], for read access in Compaction kernel
-texture<uint4, 1, cudaReadModeElementType> texRef1D_128;
+__constant__ static uint2 precomputed_round_key_64[72];
-#define DEVICE_DIRECT_CONSTANTS
+__constant__ uint2 InitVector_RC[10];
-#ifdef DEVICE_DIRECT_CONSTANTS
-__constant__ uint8_t c_perm[8][8] = {
-#else
-__constant__ uint8_t c_perm[8][8];
-const uint8_t h_perm[8][8] = {
-#endif
- { 2, 3, 6, 7, 0, 1, 4, 5 },
- { 6, 7, 2, 3, 4, 5, 0, 1 },
- { 7, 6, 5, 4, 3, 2, 1, 0 },
- { 1, 0, 3, 2, 5, 4, 7, 6 },
- { 0, 1, 4, 5, 6, 7, 2, 3 },
- { 6, 7, 2, 3, 0, 1, 4, 5 },
- { 6, 7, 0, 1, 4, 5, 2, 3 },
- { 4, 5, 2, 3, 6, 7, 0, 1 }
-};
+__device__ __forceinline__
+void static TRANSFER(uint2 *const __restrict__ dst, const uint2 *const __restrict__ src){
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+ dst[4] = src[4];
+ dst[5] = src[5];
+ dst[6] = src[6];
+ dst[7] = src[7];
+}
+
+__device__ __forceinline__ uint2 d_ROUND_ELT(const uint32_t index, const uint2 sharedMemory[256][16], const uint2 *const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7){
+
+ uint2 ret = sharedMemory[__byte_perm(in[i0].x, 0, 0x4440)][threadIdx.x & index]; //__ldg((uint2*)&b0[__byte_perm(in[i0].x, 0, 0x4440)]);
+ ret ^= ROL8(sharedMemory[__byte_perm(in[i1].x, 0, 0x4441)][threadIdx.x & index]);
+ ret ^= ROL16(sharedMemory[__byte_perm(in[i2].x, 0, 0x4442)][threadIdx.x & index]);
+ ret ^= ROL24(sharedMemory[__byte_perm(in[i3].x, 0, 0x4443)][threadIdx.x & index]);
+ ret ^= SWAPUINT2(sharedMemory[__byte_perm(in[i4].y, 0, 0x4440)][threadIdx.x & index]);
+ ret ^= ROR24(sharedMemory[__byte_perm(in[i5].y, 0, 0x4441)][threadIdx.x & index]);
+ ret ^= ROR16(sharedMemory[__byte_perm(in[i6].y, 0, 0x4442)][threadIdx.x & index]); //ROR8(__ldg((uint2*)&b7[__byte_perm(in[i6].y, 0, 0x4442)]));
+ ret ^= ROR8(sharedMemory[__byte_perm(in[i7].y, 0, 0x4443)][threadIdx.x & index]); //__ldg((uint2*)&b7[__byte_perm(in[i7].y, 0, 0x4443)]);
+ return ret;
+}
+
+__device__ __forceinline__
+uint2 d_ROUND_ELT1(const uint32_t index, const uint2
sharedMemory[256][16], const uint2 *const __restrict__ in, const int i0, const int i1, const int i2, const int i3, const int i4, const int i5, const int i6, const int i7, const uint2 c0){ + uint2 ret = sharedMemory[__byte_perm(in[i0].x, 0, 0x4440)][threadIdx.x & index]; + ret ^= ROL8(sharedMemory[__byte_perm(in[i1].x, 0, 0x4441)][threadIdx.x & index]); + ret ^= ROL16(sharedMemory[__byte_perm(in[i2].x, 0, 0x4442)][threadIdx.x & index]); + ret ^= ROL24(sharedMemory[__byte_perm(in[i3].x, 0, 0x4443)][threadIdx.x & index]); + ret ^= SWAPUINT2(sharedMemory[__byte_perm(in[i4].y, 0, 0x4440)][threadIdx.x & index]); + ret ^= ROR24(sharedMemory[__byte_perm(in[i5].y, 0, 0x4441)][threadIdx.x & index]); + ret ^= ROR16(sharedMemory[__byte_perm(in[i6].y, 0, 0x4442)][threadIdx.x & index]);//sharedMemory[6][__byte_perm(in[i6].y, 0, 0x4442)] + ret ^= ROR8(sharedMemory[__byte_perm(in[i7].y, 0, 0x4443)][threadIdx.x & index]);//sharedMemory[7][__byte_perm(in[i7].y, 0, 0x4443)] + ret ^= c0; + return ret; +} +//--------END OF WHIRLPOOL DEVICE MACROS----------------------------------------------------------------------------- + +//---hamsi macros--- +__constant__ uint32_t d_alpha_n[] = { + 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00, 0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa, + 0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0 }; -/* used in cuda_x11_simd512_func.cuh (SIMD_Compress2) */ -#ifdef DEVICE_DIRECT_CONSTANTS -__constant__ uint32_t c_IV_512[32] = { -#else -__constant__ uint32_t c_IV_512[32]; -const uint32_t h_IV_512[32] = { -#endif - 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, - 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, - 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, - 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 +__constant__ uint32_t d_alpha_f[] = { + 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9, 0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0, + 0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c }; -#ifdef DEVICE_DIRECT_CONSTANTS -__constant__ short c_FFT128_8_16_Twiddle[128] = { -#else -__constant__ short c_FFT128_8_16_Twiddle[128]; -static const short h_FFT128_8_16_Twiddle[128] = { -#endif - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, - 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, - 1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2,-123, 17,-111, - 1,-118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123,-122, - 1, 116, 92,-122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, - 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, - 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79 +__constant__ uint32_t c_c[] = { + 0x73746565, 0x6c706172, 0x6b204172, 0x656e6265, 0x72672031, 0x302c2062, 0x75732032, 0x3434362c, + 0x20422d33, 0x30303120, 0x4c657576, 0x656e2d48, 
0x65766572, 0x6c65652c, 0x2042656c, 0x6769756d }; -#ifdef DEVICE_DIRECT_CONSTANTS -__constant__ short c_FFT256_2_128_Twiddle[128] = { -#else -__constant__ short c_FFT256_2_128_Twiddle[128]; -static const short h_FFT256_2_128_Twiddle[128] = { -#endif - 1, 41,-118, 45, 46, 87, -31, 14, - 60,-110, 116,-127, -67, 80, -61, 69, - 2, 82, 21, 90, 92, -83, -62, 28, - 120, 37, -25, 3, 123, -97,-122,-119, - 4, -93, 42, -77, -73, 91,-124, 56, - -17, 74, -50, 6, -11, 63, 13, 19, - 8, 71, 84, 103, 111, -75, 9, 112, - -34,-109,-100, 12, -22, 126, 26, 38, - 16,-115, -89, -51, -35, 107, 18, -33, - -68, 39, 57, 24, -44, -5, 52, 76, - 32, 27, 79,-102, -70, -43, 36, -66, - 121, 78, 114, 48, -88, -10, 104,-105, - 64, 54, -99, 53, 117, -86, 72, 125, - -15,-101, -29, 96, 81, -20, -49, 47, - 128, 108, 59, 106, -23, 85,-113, -7, - -30, 55, -58, -65, -95, -40, -98, 94 +__constant__ uint32_t d_T512[1024] = { + 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68, + 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68, 0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262, + 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5, 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f, + 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f, 0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da, + 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782, 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29, + 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29, 0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab, + 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4, 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, + 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, 0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526, + 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31, 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88, + 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88, 0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9, + 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a, 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320, + 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320, 0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a, + 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb, 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574, + 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574, 0x832800a0, 0x67420000, 0xe1170000, 
0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf, + 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758, + 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758, 0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9, + 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090, 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f, + 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f, 0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f, + 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df, 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e, + 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e, 0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1, + 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e, 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, + 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, 0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55, + 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b, 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254, + 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254, 0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f, + 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9, 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834, + 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834, 0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d, + 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80, 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7, + 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7, 0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257, + 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e, + 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e, 0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59, + 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce, 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173, + 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173, 0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd, + 
0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe, 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa, + 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa, 0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004, + 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab, 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, + 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, 0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0, + 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6, 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e, + 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e, 0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198, + 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b, 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e, + 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e, 0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55, + 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1, 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6, + 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6, 0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17, + 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce, + 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce, 0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888, + 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494, 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463, + 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463, 0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7, + 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f, 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691, + 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691, 0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e, + 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897, 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, + 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, 0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908, + 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 
0xd116c35d, 0xd256d489, 0x4e6f74de, 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539, + 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539, 0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7, + 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e, 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0, + 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0, 0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e, + 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb, 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f, + 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f, 0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4, + 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000, + 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000, 0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe, + 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b, 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7, + 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7, 0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac, + 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69, 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e, + 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e, 0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7, + 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64, 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0, + 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0, 0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 }; -/************* the round function ****************/ -#define IF(x, y, z) (((y ^ z) & x) ^ z) -#define MAJ(x, y, z) ((z &y) | ((z|y) & x)) +#define SBOX(a, b, c, d) { \ + uint32_t t; \ + t =(a); \ + a =(a & c) ^ d; \ + c =(c ^ b) ^ a; \ + d =(d | t) ^ b; \ + b = d; \ + d =((d | (t ^ c)) ^ a); \ + a&= b; \ + t^=(c ^ a); \ + b = b ^ d ^ t; \ + (a) = (c); \ + (c) = (b); \ + (b) = (d); \ + (d) = (~t); \ + } -#include "cuda_x11_simd512_sm2.cuh" -#include "cuda_x11_simd512_func.cuh" +#define HAMSI_L(a, b, c, d) { \ + (a) = ROTL32(a, 13); \ + (c) = ROTL32(c, 3); \ + (b) ^= (a) ^ (c); \ + (d) ^= (c) ^ ((a) << 3); \ + (b) = ROTL32(b, 1); \ + (d) = ROTL32(d, 7); \ + (a) = ROTL32(a ^ b ^ d, 5); \ + (c) = ROTL32(c ^ d ^ (b<<7), 22); \ + } + +#define ROUND_BIG(rc, alpha) { \ + m[ 0] ^= alpha[ 0]; \ + c[ 4] ^= alpha[ 8]; \ + m[ 8] ^= alpha[16]; \ + c[12] ^= alpha[24]; \ + m[ 1] ^= alpha[ 1] ^ (rc); 
\ + c[ 5] ^= alpha[ 9]; \ + m[ 9] ^= alpha[17]; \ + c[13] ^= alpha[25]; \ + c[ 0] ^= alpha[ 2]; \ + m[ 4] ^= alpha[10]; \ + c[ 8] ^= alpha[18]; \ + m[12] ^= alpha[26]; \ + c[ 1] ^= alpha[ 3]; \ + m[ 5] ^= alpha[11]; \ + c[ 9] ^= alpha[19]; \ + m[13] ^= alpha[27]; \ + m[ 2] ^= alpha[ 4]; \ + c[ 6] ^= alpha[12]; \ + m[10] ^= alpha[20]; \ + c[14] ^= alpha[28]; \ + m[ 3] ^= alpha[ 5]; \ + c[ 7] ^= alpha[13]; \ + m[11] ^= alpha[21]; \ + c[15] ^= alpha[29]; \ + c[ 2] ^= alpha[ 6]; \ + m[ 6] ^= alpha[14]; \ + c[10] ^= alpha[22]; \ + m[14] ^= alpha[30]; \ + c[ 3] ^= alpha[ 7]; \ + m[ 7] ^= alpha[15]; \ + c[11] ^= alpha[23]; \ + m[15] ^= alpha[31]; \ + SBOX(m[ 0], c[ 4], m[ 8], c[12]); \ + SBOX(m[ 1], c[ 5], m[ 9], c[13]); \ + SBOX(c[ 0], m[ 4], c[ 8], m[12]); \ + SBOX(c[ 1], m[ 5], c[ 9], m[13]); \ + HAMSI_L(m[ 0], c[ 5], c[ 8], m[13]); \ + SBOX(m[ 2], c[ 6], m[10], c[14]); \ + HAMSI_L(m[ 1], m[ 4], c[ 9], c[14]); \ + SBOX(m[ 3], c[ 7], m[11], c[15]); \ + HAMSI_L(c[ 0], m[ 5], m[10], c[15]); \ + SBOX(c[ 2], m[ 6], c[10], m[14]); \ + HAMSI_L(c[ 1], c[ 6], m[11], m[14]); \ + SBOX(c[ 3], m[ 7], c[11], m[15]); \ + HAMSI_L(m[ 2], c[ 7], c[10], m[15]); \ + HAMSI_L(m[ 3], m[ 6], c[11], c[12]); \ + HAMSI_L(c[ 2], m[ 7], m[ 8], c[13]); \ + HAMSI_L(c[ 3], c[ 4], m[ 9], m[12]); \ + HAMSI_L(m[ 0], c[ 0], m[ 3], c[ 3]); \ + HAMSI_L(m[ 8], c[ 9], m[11], c[10]); \ + HAMSI_L(c[ 5], m[ 5], c[ 6], m[ 6]); \ + HAMSI_L(c[13], m[12], c[14], m[15]); \ + } + + + +//------FUGUE MACROS-------------------------------------------------- +static __constant__ const uint32_t c_S[16] = { + 0x8807a57e, 0xe616af75, 0xc5d3e4db, 0xac9ab027, + 0xd915f117, 0xb6eecc54, 0x06e8020b, 0x4a92efd1, + 0xaac6e2c9, 0xddb21398, 0xcae65838, 0x437f203f, + 0x25ea78e7, 0x951fddd6, 0xda6ed11d, 0xe13e3567 +}; -#ifdef __INTELLISENSE__ -/* just for vstudio code colors */ -#define __CUDA_ARCH__ 500 -#endif +static __device__ uint32_t mixtab0[256] = { + 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39, 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, + 0x767659c3, 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed, 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, + 0x727245d3, 0xc0c0762d, 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d, 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, + 0xd8d83e4d, 0x313197c4, 0x15156b54, 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e, 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, + 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf, 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6, 0x5252a553, 0x3b3ba1ec, 0xd6d61475, + 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126, 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77, 0x6a6a0db3, 0xcbcb4701, + 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11, 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622, 0x4545c00f, + 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596, 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9, + 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865, 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 
0x4444c70b, + 0x1717655c, 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7, 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, + 0x9090dd76, 0x88889516, 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741, 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, + 0x06061218, 0x2424fc90, 0x5c5c8f6b, 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff, 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, + 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292, 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820, 0xbaba0bde, 0x787873fb, 0x2525fb94, + 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435, 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e, 0x70704bdb, 0x3e3ebaf8, + 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38, 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e, 0xe1e191a9, + 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166, 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51, + 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb, 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, + 0x16166258 +}; -#if __CUDA_ARCH__ >= 300 -/********************* Message expansion ************************/ +#define mixtab0(x) shared[0][x] +#define mixtab1(x) shared[1][x] +#define mixtab2(x) shared[2][x] +#define mixtab3(x) shared[3][x] -/* - * Reduce modulo 257; result is in [-127; 383] - * REDUCE(x) := (x&255) - (x>>8) - */ -#define REDUCE(x) \ - (((x)&255) - ((x)>>8)) +#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ + x22 ^= x00; \ + x00 = (q); \ + x08 ^= (q); \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ + } -/* - * Reduce from [-127; 383] to [-128; 128] - * EXTRA_REDUCE_S(x) := x<=128 ? x : x-257 - */ -#define EXTRA_REDUCE_S(x) \ - ((x)<=128 ? (x) : (x)-257) +#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ + } -/* - * Reduce modulo 257; result is in [-128; 128] - */ -#define REDUCE_FULL_S(x) \ - EXTRA_REDUCE_S(REDUCE(x)) - -// Parallelization: -// -// FFT_8 wird 2 times 8-fach parallel ausgeführt (in FFT_64) -// and 1 time 16-fach parallel (in FFT_128_full) -// -// STEP8_IF and STEP8_MAJ beinhalten je 2x 8-fach parallel Operations - -/** - * FFT_8 using w=4 as 8th root of unity - * Unrolled decimation in frequency (DIF) radix-2 NTT. - * Output data is in revbin_permuted order. 
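
The mixtab0/1/2/3 macros above redirect Fugue's T-table lookups into shared memory; only mixtab0 is stored as a global table because the other three columns are plain byte-rotations of it (the kernels later fill shared[1..3] with ROR8/ROL16/ROL8 of each entry). A minimal host-side sketch of that relationship; rotl32_host and build_mixtabs are hypothetical helpers, not part of this patch:

static uint32_t rotl32_host(uint32_t x, unsigned int n) { return (x << n) | (x >> (32u - n)); }

/* Builds the three derived tables that the kernels materialize in shared memory:
 * shared[1] = ROR8(mixtab0), shared[2] = ROL16(mixtab0), shared[3] = ROL8(mixtab0). */
static void build_mixtabs(const uint32_t t0[256], uint32_t t1[256], uint32_t t2[256], uint32_t t3[256])
{
	for (int i = 0; i < 256; i++) {
		t1[i] = rotl32_host(t0[i], 24); /* == ROR8(t0[i])  */
		t2[i] = rotl32_host(t0[i], 16); /* == ROL16(t0[i]) */
		t3[i] = rotl32_host(t0[i], 8);  /* == ROL8(t0[i])  */
	}
}
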
- */ __device__ __forceinline__ -void FFT_8(int *y, int stripe) -{ -#define X(i) y[stripe*i] - -#define DO_REDUCE(i) \ - X(i) = REDUCE(X(i)) - -#define DO_REDUCE_FULL_S(i) \ -do { \ - X(i) = REDUCE(X(i)); \ - X(i) = EXTRA_REDUCE_S(X(i)); \ -} while(0) - -#define BUTTERFLY(i,j,n) \ -do { \ - int u= X(i); \ - int v= X(j); \ - X(i) = u+v; \ - X(j) = (u-v) << (2*n); \ -} while(0) - - BUTTERFLY(0, 4, 0); - BUTTERFLY(1, 5, 1); - BUTTERFLY(2, 6, 2); - BUTTERFLY(3, 7, 3); - - DO_REDUCE(6); - DO_REDUCE(7); - - BUTTERFLY(0, 2, 0); - BUTTERFLY(4, 6, 0); - BUTTERFLY(1, 3, 2); - BUTTERFLY(5, 7, 2); - - DO_REDUCE(7); - - BUTTERFLY(0, 1, 0); - BUTTERFLY(2, 3, 0); - BUTTERFLY(4, 5, 0); - BUTTERFLY(6, 7, 0); - - DO_REDUCE_FULL_S(0); - DO_REDUCE_FULL_S(1); - DO_REDUCE_FULL_S(2); - DO_REDUCE_FULL_S(3); - DO_REDUCE_FULL_S(4); - DO_REDUCE_FULL_S(5); - DO_REDUCE_FULL_S(6); - DO_REDUCE_FULL_S(7); - -#undef X -#undef DO_REDUCE -#undef DO_REDUCE_FULL_S -#undef BUTTERFLY +static void SMIX_LDG(const uint32_t shared[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3){ + uint32_t c0 = __ldg(&mixtab0[__byte_perm(x0, 0, 0x4443)]); + uint32_t r1 = mixtab1(__byte_perm(x0, 0, 0x4442)); + uint32_t r2 = mixtab2(__byte_perm(x0, 0, 0x4441)); + uint32_t r3 = mixtab3(__byte_perm(x0, 0, 0x4440)); + c0 = c0 ^ r1 ^ r2 ^ r3; + uint32_t r0 = mixtab0(__byte_perm(x1, 0, 0x4443)); + uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1, 0, 0x4442)); + uint32_t tmp = mixtab2(__byte_perm(x1, 0, 0x4441)); + c1 ^= tmp; + r2 ^= tmp; + tmp = mixtab3(__byte_perm(x1, 0, 0x4440)); + c1 ^= tmp; + r3 ^= tmp; + uint32_t c2 = __ldg(&mixtab0[__byte_perm(x2, 0, 0x4443)]); + r0 ^= c2; + tmp = mixtab1(__byte_perm(x2, 0, 0x4442)); + c2 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x2, 0, 0x4441)); + c2 ^= tmp; + tmp = mixtab3(__byte_perm(x2, 0, 0x4440)); + c2 ^= tmp; + r3 ^= tmp; + uint32_t c3 = __ldg(&mixtab0[__byte_perm(x3, 0, 0x4443)]); + r0 ^= c3; + tmp = mixtab1(__byte_perm(x3, 0, 0x4442)); + c3 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x3, 0, 0x4441)); + c3 ^= tmp; + r2 ^= tmp; + tmp = mixtab3(__byte_perm(x3, 0, 0x4440)); + c3 ^= tmp; + x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF); + x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); + x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) | ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); + x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); } -#if defined(__CUDA_ARCH__) -#if __CUDA_ARCH__ < 300 - #define __shfl(var, srcLane, width) (uint32_t)(var) - // #error __shfl() not supported by SM 2.x -#endif -#endif - -/** - * FFT_16 using w=2 as 16th root of unity - * Unrolled decimation in frequency (DIF) radix-2 NTT. - * Output data is in revbin_permuted order. 
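
SMIX_LDG above derives the four table indices of each input word with __byte_perm; the selectors 0x4443..0x4440 simply pick out single bytes. A device-side sketch of the equivalence (byte_of is a hypothetical helper, shown only to document the idiom):

__device__ __forceinline__
static uint32_t byte_of(uint32_t x, uint32_t n) /* n = 0 (low byte) .. 3 (high byte) */
{
	// __byte_perm(x, 0, 0x4440 | n) places byte n of x in the result's low byte
	// and zero-fills the rest, i.e. 0x4443 -> (x >> 24), 0x4440 -> (x & 0xff).
	return __byte_perm(x, 0, 0x4440 | n);
}
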
- */ __device__ __forceinline__ -void FFT_16(int *y) -{ -#define DO_REDUCE_FULL_S(i) \ - do { \ - y[i] = REDUCE(y[i]); \ - y[i] = EXTRA_REDUCE_S(y[i]); \ - } while(0) - - int u,v; - - // BUTTERFLY(0, 8, 0); - // BUTTERFLY(1, 9, 1); - // BUTTERFLY(2, 10, 2); - // BUTTERFLY(3, 11, 3); - // BUTTERFLY(4, 12, 4); - // BUTTERFLY(5, 13, 5); - // BUTTERFLY(6, 14, 6); - // BUTTERFLY(7, 15, 7); - { - u = y[0]; // 0..7 - v = y[1]; // 8..15 - y[0] = u+v; - y[1] = (u-v) << (threadIdx.x&7); - } - - // DO_REDUCE(11); - // DO_REDUCE(12); - // DO_REDUCE(13); - // DO_REDUCE(14); - // DO_REDUCE(15); - if ((threadIdx.x&7) >=3) y[1] = REDUCE(y[1]); // 11...15 - - // BUTTERFLY( 0, 4, 0); - // BUTTERFLY( 1, 5, 2); - // BUTTERFLY( 2, 6, 4); - // BUTTERFLY( 3, 7, 6); - { - u = __shfl((int)y[0], (threadIdx.x&3),8); // 0,1,2,3 0,1,2,3 - v = __shfl((int)y[0],4+(threadIdx.x&3),8); // 4,5,6,7 4,5,6,7 - y[0] = ((threadIdx.x&7) < 4) ? (u+v) : ((u-v) << (2*(threadIdx.x&3))); - } +static void SMIX(const uint32_t shared[4][256], uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3){ + uint32_t c0 = mixtab0(__byte_perm(x0, 0, 0x4443)); + uint32_t r1 = mixtab1(__byte_perm(x0, 0, 0x4442)); + uint32_t r2 = mixtab2(__byte_perm(x0, 0, 0x4441)); + uint32_t r3 = mixtab3(__byte_perm(x0, 0, 0x4440)); + c0 = c0 ^ r1 ^ r2 ^ r3; + uint32_t r0 = mixtab0(__byte_perm(x1, 0, 0x4443)); + uint32_t c1 = r0 ^ mixtab1(__byte_perm(x1, 0, 0x4442)); + uint32_t tmp = mixtab2(__byte_perm(x1, 0, 0x4441)); + c1 ^= tmp; + r2 ^= tmp; + tmp = mixtab3(__byte_perm(x1, 0, 0x4440)); + c1 ^= tmp; + r3 ^= tmp; + uint32_t c2 = mixtab0(__byte_perm(x2, 0, 0x4443)); + r0 ^= c2; + tmp = mixtab1(__byte_perm(x2, 0, 0x4442)); + c2 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x2, 0, 0x4441)); + c2 ^= tmp; + tmp = mixtab3(__byte_perm(x2, 0, 0x4440)); + c2 ^= tmp; + r3 ^= tmp; + uint32_t c3 = mixtab0(__byte_perm(x3, 0, 0x4443)); + r0 ^= c3; + tmp = mixtab1(__byte_perm(x3, 0, 0x4442)); + c3 ^= tmp; + r1 ^= tmp; + tmp = mixtab2(__byte_perm(x3, 0, 0x4441)); + c3 ^= tmp; + r2 ^= tmp; + tmp = mixtab3(__byte_perm(x3, 0, 0x4440)); + c3 ^= tmp; + x0 = ((c0 ^ (r0 << 0)) & 0xFF000000) | ((c1 ^ (r1 << 0)) & 0x00FF0000) | ((c2 ^ (r2 << 0)) & 0x0000FF00) | ((c3 ^ (r3 << 0)) & 0x000000FF); + x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); + x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) | ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); + x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); +} - // BUTTERFLY( 8, 12, 0); - // BUTTERFLY( 9, 13, 2); - // BUTTERFLY(10, 14, 4); - // BUTTERFLY(11, 15, 6); - { - u = __shfl((int)y[1], (threadIdx.x&3),8); // 8,9,10,11 8,9,10,11 - v = __shfl((int)y[1],4+(threadIdx.x&3),8); // 12,13,14,15 12,13,14,15 - y[1] = ((threadIdx.x&7) < 4) ? 
(u+v) : ((u-v) << (2*(threadIdx.x&3))); +#define mROR3 { \ + B[ 6] = S[33], B[ 7] = S[34], B[ 8] = S[35]; \ + S[35] = S[32]; S[34] = S[31]; S[33] = S[30]; S[32] = S[29]; S[31] = S[28]; S[30] = S[27]; S[29] = S[26]; S[28] = S[25]; S[27] = S[24]; \ + S[26] = S[23]; S[25] = S[22]; S[24] = S[21]; S[23] = S[20]; S[22] = S[19]; S[21] = S[18]; S[20] = S[17]; S[19] = S[16]; S[18] = S[15]; \ + S[17] = S[14]; S[16] = S[13]; S[15] = S[12]; S[14] = S[11]; S[13] = S[10]; S[12] = S[ 9]; S[11] = S[ 8]; S[10] = S[ 7]; S[ 9] = S[ 6]; \ + S[ 8] = S[ 5]; S[ 7] = S[ 4]; S[ 6] = S[ 3]; S[ 5] = S[ 2]; S[ 4] = S[ 1]; S[ 3] = S[ 0]; S[ 2] = B[ 8]; S[ 1] = B[ 7]; S[ 0] = B[ 6]; \ } - // DO_REDUCE(5); - // DO_REDUCE(7); - // DO_REDUCE(13); - // DO_REDUCE(15); - if ((threadIdx.x&1) && (threadIdx.x&7) >= 4) { - y[0] = REDUCE(y[0]); // 5, 7 - y[1] = REDUCE(y[1]); // 13, 15 +#define mROR8 { \ + B[ 1] = S[28]; B[ 2] = S[29]; B[ 3] = S[30]; B[ 4] = S[31]; B[ 5] = S[32]; B[ 6] = S[33]; B[ 7] = S[34]; B[ 8] = S[35]; \ + S[35] = S[27]; S[34] = S[26]; S[33] = S[25]; S[32] = S[24]; S[31] = S[23]; S[30] = S[22]; S[29] = S[21]; S[28] = S[20]; S[27] = S[19]; \ + S[26] = S[18]; S[25] = S[17]; S[24] = S[16]; S[23] = S[15]; S[22] = S[14]; S[21] = S[13]; S[20] = S[12]; S[19] = S[11]; S[18] = S[10]; \ + S[17] = S[ 9]; S[16] = S[ 8]; S[15] = S[ 7]; S[14] = S[ 6]; S[13] = S[ 5]; S[12] = S[ 4]; S[11] = S[ 3]; S[10] = S[ 2]; S[ 9] = S[ 1]; \ + S[ 8] = S[ 0]; S[ 7] = B[ 8]; S[ 6] = B[ 7]; S[ 5] = B[ 6]; S[ 4] = B[ 5]; S[ 3] = B[ 4]; S[ 2] = B[ 3]; S[ 1] = B[ 2]; S[ 0] = B[ 1]; \ } - // BUTTERFLY( 0, 2, 0); - // BUTTERFLY( 1, 3, 4); - // BUTTERFLY( 4, 6, 0); - // BUTTERFLY( 5, 7, 4); - { - u = __shfl((int)y[0], (threadIdx.x&5),8); // 0,1,0,1 4,5,4,5 - v = __shfl((int)y[0],2+(threadIdx.x&5),8); // 2,3,2,3 6,7,6,7 - y[0] = ((threadIdx.x&3) < 2) ? (u+v) : ((u-v) << (4*(threadIdx.x&1))); +#define mROR9 { \ + B[ 0] = S[27]; B[ 1] = S[28]; B[ 2] = S[29]; B[ 3] = S[30]; B[ 4] = S[31]; B[ 5] = S[32]; B[ 6] = S[33]; B[ 7] = S[34]; B[ 8] = S[35]; \ + S[35] = S[26]; S[34] = S[25]; S[33] = S[24]; S[32] = S[23]; S[31] = S[22]; S[30] = S[21]; S[29] = S[20]; S[28] = S[19]; S[27] = S[18]; \ + S[26] = S[17]; S[25] = S[16]; S[24] = S[15]; S[23] = S[14]; S[22] = S[13]; S[21] = S[12]; S[20] = S[11]; S[19] = S[10]; S[18] = S[ 9]; \ + S[17] = S[ 8]; S[16] = S[ 7]; S[15] = S[ 6]; S[14] = S[ 5]; S[13] = S[ 4]; S[12] = S[ 3]; S[11] = S[ 2]; S[10] = S[ 1]; S[ 9] = S[ 0]; \ + S[ 8] = B[ 8]; S[ 7] = B[ 7]; S[ 6] = B[ 6]; S[ 5] = B[ 5]; S[ 4] = B[ 4]; S[ 3] = B[ 3]; S[ 2] = B[ 2]; S[ 1] = B[ 1]; S[ 0] = B[ 0]; \ } - // BUTTERFLY( 8, 10, 0); - // BUTTERFLY( 9, 11, 4); - // BUTTERFLY(12, 14, 0); - // BUTTERFLY(13, 15, 4); - { - u = __shfl((int)y[1], (threadIdx.x&5),8); // 8,9,8,9 12,13,12,13 - v = __shfl((int)y[1],2+(threadIdx.x&5),8); // 10,11,10,11 14,15,14,15 - y[1] = ((threadIdx.x&3) < 2) ? 
(u+v) : ((u-v) << (4*(threadIdx.x&1))); +#define FUGUE512_3(x, y, z) { \ + TIX4(x, S[ 0], S[ 1], S[ 4], S[ 7], S[ 8], S[22], S[24], S[27], S[30]); \ + CMIX36(S[33], S[34], S[35], S[ 1], S[ 2], S[ 3], S[15], S[16], S[17]); \ + SMIX(shared, S[33], S[34], S[35], S[ 0]); \ + CMIX36(S[30], S[31], S[32], S[34], S[35], S[ 0], S[12], S[13], S[14]); \ + SMIX(shared, S[30], S[31], S[32], S[33]); \ + CMIX36(S[27], S[28], S[29], S[31], S[32], S[33], S[ 9], S[10], S[11]); \ + SMIX(shared, S[27], S[28], S[29], S[30]); \ + CMIX36(S[24], S[25], S[26], S[28], S[29], S[30], S[ 6], S[ 7], S[ 8]); \ + SMIX_LDG(shared, S[24], S[25], S[26], S[27]); \ + \ + TIX4(y, S[24], S[25], S[28], S[31], S[32], S[10], S[12], S[15], S[18]); \ + CMIX36(S[21], S[22], S[23], S[25], S[26], S[27], S[ 3], S[ 4], S[ 5]); \ + SMIX(shared, S[21], S[22], S[23], S[24]); \ + CMIX36(S[18], S[19], S[20], S[22], S[23], S[24], S[ 0], S[ 1], S[ 2]); \ + SMIX_LDG(shared, S[18], S[19], S[20], S[21]); \ + CMIX36(S[15], S[16], S[17], S[19], S[20], S[21], S[33], S[34], S[35]); \ + SMIX(shared, S[15], S[16], S[17], S[18]); \ + CMIX36(S[12], S[13], S[14], S[16], S[17], S[18], S[30], S[31], S[32]); \ + SMIX_LDG(shared, S[12], S[13], S[14], S[15]); \ + \ + TIX4(z, S[12], S[13], S[16], S[19], S[20], S[34], S[ 0], S[ 3], S[ 6]); \ + CMIX36(S[ 9], S[10], S[11], S[13], S[14], S[15], S[27], S[28], S[29]); \ + SMIX(shared, S[ 9], S[10], S[11], S[12]); \ + CMIX36(S[ 6], S[ 7], S[ 8], S[10], S[11], S[12], S[24], S[25], S[26]); \ + SMIX_LDG(shared, S[ 6], S[ 7], S[ 8], S[ 9]); \ + CMIX36(S[ 3], S[ 4], S[ 5], S[ 7], S[ 8], S[ 9], S[21], S[22], S[23]); \ + SMIX_LDG(shared, S[ 3], S[ 4], S[ 5], S[ 6]); \ + CMIX36(S[ 0], S[ 1], S[ 2], S[ 4], S[ 5], S[ 6], S[18], S[19], S[20]); \ + SMIX_LDG(shared, S[ 0], S[ 1], S[ 2], S[ 3]); \ } - // BUTTERFLY( 0, 1, 0); - // BUTTERFLY( 2, 3, 0); - // BUTTERFLY( 4, 5, 0); - // BUTTERFLY( 6, 7, 0); - { - u = __shfl((int)y[0], (threadIdx.x&6),8); // 0,0,2,2 4,4,6,6 - v = __shfl((int)y[0],1+(threadIdx.x&6),8); // 1,1,3,3 5,5,7,7 - y[0] = ((threadIdx.x&1) < 1) ? (u+v) : (u-v); - } +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#endif - // BUTTERFLY( 8, 9, 0); - // BUTTERFLY(10, 11, 0); - // BUTTERFLY(12, 13, 0); - // BUTTERFLY(14, 15, 0); - { - u = __shfl((int)y[1], (threadIdx.x&6),8); // 8,8,10,10 12,12,14,14 - v = __shfl((int)y[1],1+(threadIdx.x&6),8); // 9,9,11,11 13,13,15,15 - y[1] = ((threadIdx.x&1) < 1) ? 
(u+v) : (u-v);
+#define TPB50_1 128
+#define TPB50_2 128
+#define TPB52_1 128
+#define TPB52_2 128
+
+static uint4 *d_temp4[MAX_GPUS];
+#include "cuda_x11_simd512_func.cuh"
+
+//ECHO MACROS--------------------------------
+#define SHIFT_ROW1(a, b, c, d) do { \
+	tmp0 = W[a+0]; \
+	W[a+0] = W[b+0]; \
+	W[b+0] = W[c+0]; \
+	W[c+0] = W[d+0]; \
+	W[d+0] = tmp0; \
+\
+	tmp0 = W[a+1]; \
+	W[a+1] = W[b+1]; \
+	W[b+1] = W[c+1]; \
+	W[c+1] = W[d+1]; \
+	W[d+1] = tmp0; \
+\
+	tmp0 = W[a+2]; \
+	W[a+2] = W[b+2]; \
+	W[b+2] = W[c+2]; \
+	W[c+2] = W[d+2]; \
+	W[d+2] = tmp0; \
+\
+	tmp0 = W[a+3]; \
+	W[a+3] = W[b+3]; \
+	W[b+3] = W[c+3]; \
+	W[c+3] = W[d+3]; \
+	W[d+3] = tmp0; \
+	} while (0)
+
+#define SHIFT_ROW2(a, b, c, d) do { \
+	tmp0 = W[a+0]; \
+	W[a+0] = W[c+0]; \
+	W[c+0] = tmp0; \
+\
+	tmp0 = W[a+1]; \
+	W[a+1] = W[c+1]; \
+	W[c+1] = tmp0; \
+\
+	tmp0 = W[a+2]; \
+	W[a+2] = W[c+2]; \
+	W[c+2] = tmp0; \
+\
+	tmp0 = W[a+3]; \
+	W[a+3] = W[c+3]; \
+	W[c+3] = tmp0; \
+\
+	tmp0 = W[b+0]; \
+	W[b+0] = W[d+0]; \
+	W[d+0] = tmp0; \
+\
+	tmp0 = W[b+1]; \
+	W[b+1] = W[d+1]; \
+	W[d+1] = tmp0; \
+\
+	tmp0 = W[b+2]; \
+	W[b+2] = W[d+2]; \
+	W[d+2] = tmp0; \
+\
+	tmp0 = W[b+3]; \
+	W[b+3] = W[d+3]; \
+	W[d+3] = tmp0; \
+	} while (0)
+
+#define MIX_COLUMN1(ia, ib, ic, id, n) do { \
+	tmp0 = W[ia+n]; \
+	unsigned int tmp1 = W[ic+n]; \
+	unsigned int tmp2 = tmp0 ^ W[ib+n]; \
+	unsigned int tmp3 = W[ib+n] ^ tmp1; \
+	unsigned int tmp4 = tmp1 ^ W[id+n]; \
+	unsigned int tmp5 = (((tmp2 & (0x80808080)) >> 7) * 27 ^ ((tmp2 & (0x7F7F7F7F)) << 1));\
+	unsigned int tmp6 = (((tmp3 & (0x80808080)) >> 7) * 27 ^ ((tmp3 & (0x7F7F7F7F)) << 1));\
+	unsigned int tmp7 = (((tmp4 & (0x80808080)) >> 7) * 27 ^ ((tmp4 & (0x7F7F7F7F)) << 1));\
+	W[ia+n] = tmp5 ^ tmp3 ^ W[id+n]; \
+	W[ib+n] = tmp6 ^ tmp0 ^ tmp4; \
+	W[ic+n] = tmp7 ^ tmp2 ^ W[id+n]; \
+	W[id+n] = tmp5^tmp6^tmp7^tmp2^tmp1; \
+	} while (0)
+
+#define MIX_COLUMN(a, b, c, d) do { \
+	MIX_COLUMN1(a, b, c, d, 0); \
+	MIX_COLUMN1(a, b, c, d, 1); \
+	MIX_COLUMN1(a, b, c, d, 2); \
+	MIX_COLUMN1(a, b, c, d, 3); \
+	} while (0)
+//END OF ECHO MACROS-------------------------
+
+__device__
+static void echo_round_sp(const uint32_t sharedMemory[8 * 1024], uint32_t *W, uint32_t &k0){
+	// Big Sub Words
+#pragma unroll 16
+	for (int idx = 0; idx < 16; idx++)
+		AES_2ROUND_32(sharedMemory, W[(idx << 2) + 0], W[(idx << 2) + 1], W[(idx << 2) + 2], W[(idx << 2) + 3], k0);
+
+	// Shift Rows
+#pragma unroll 4
+	for (int i = 0; i < 4; i++){
+		uint32_t t[4];
+		/// 1, 5, 9, 13
+		t[0] = W[i + 4];
+		t[1] = W[i + 8];
+		t[2] = W[i + 24];
+		t[3] = W[i + 60];
+		W[i + 4] = W[i + 20];
+		W[i + 8] = W[i + 40];
+		W[i + 24] = W[i + 56];
+		W[i + 60] = W[i + 44];
+
+		W[i + 20] = W[i + 36];
+		W[i + 40] = t[1];
+		W[i + 56] = t[2];
+		W[i + 44] = W[i + 28];
+
+		W[i + 28] = W[i + 12];
+		W[i + 12] = t[3];
+		W[i + 36] = W[i + 52];
+		W[i + 52] = t[0];
+	}
+	// Mix Columns
+#pragma unroll 4
+	for (int i = 0; i < 4; i++){ // loop over the four columns
+#pragma unroll 4
+		for (int idx = 0; idx < 64; idx += 16){ // loop over the elements, 16 words apart
+			uint32_t a[4];
+			a[0] = W[idx + i];
+			a[1] = W[idx + i + 4];
+			a[2] = W[idx + i + 8];
+			a[3] = W[idx + i + 12];
+
+			uint32_t ab = a[0] ^ a[1];
+			uint32_t bc = a[1] ^ a[2];
+			uint32_t cd = a[2] ^ a[3];
+
+			uint32_t t, t2, t3;
+			t = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+			uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[idx + i] = (bc ^ a[3] ^ abx);
+			W[idx + i + 4] = xor3(a[0], cd, bcx);
+			W[idx + i + 8] = xor3(ab, a[3], cdx);
+			W[idx + i + 12] = xor3(ab, a[2], xor3(abx, bcx, cdx));
+		}
 	}
+}
-	DO_REDUCE_FULL_S( 0); // 0...7
-	DO_REDUCE_FULL_S( 1); // 8...15
-#undef DO_REDUCE_FULL_S
+__global__ __launch_bounds__(128,5)
+static void x11_simd512_gpu_compress_64(uint32_t threads, uint32_t *g_hash,const uint4 *const __restrict__ g_fft4)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x)>>3;
+	const uint32_t thr_offset = thread << 6; // thr_id * 128 (two elements each)
+	uint32_t IV[32];
+	if (thread < threads){
+
+		uint32_t *Hash = &g_hash[thread<<4];
+//		Compression1(Hash, thread, g_fft4, g_state);
+		uint32_t A[32];
+
+		*(uint2x4*)&IV[ 0] = *(uint2x4*)&c_IV_512[ 0];
+		*(uint2x4*)&IV[ 8] = *(uint2x4*)&c_IV_512[ 8];
+		*(uint2x4*)&IV[16] = *(uint2x4*)&c_IV_512[16];
+		*(uint2x4*)&IV[24] = *(uint2x4*)&c_IV_512[24];
+
+		*(uint2x4*)&A[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
+		*(uint2x4*)&A[ 8] = __ldg4((uint2x4*)&Hash[ 8]);
+
+		#pragma unroll 16
+		for(uint32_t i=0;i<16;i++)
+			A[ i] = A[ i] ^ IV[ i];
+
+		#pragma unroll 16
+		for(uint32_t i=16;i<32;i++)
+			A[ i] = IV[ i];
+
+		Round8(A, thr_offset, g_fft4);
+
+		STEP8_IF(&IV[ 0],32, 4,13,&A[ 0],&A[ 8],&A[16],&A[24]);
+		STEP8_IF(&IV[ 8],33,13,10,&A[24],&A[ 0],&A[ 8],&A[16]);
+		STEP8_IF(&IV[16],34,10,25,&A[16],&A[24],&A[ 0],&A[ 8]);
+		STEP8_IF(&IV[24],35,25, 4,&A[ 8],&A[16],&A[24],&A[ 0]);
+
+		#pragma unroll 32
+		for(uint32_t i=0;i<32;i++){
+			IV[ i] = A[ i];
+		}
+
+		A[ 0] ^= 512;
+
+		Round8_0_final(A, 3,23,17,27);
+		Round8_1_final(A,28,19,22, 7);
+		Round8_2_final(A,29, 9,15, 5);
+		Round8_3_final(A, 4,13,10,25);
+		STEP8_IF(&IV[ 0],32, 4,13, &A[ 0], &A[ 8], &A[16], &A[24]);
+		STEP8_IF(&IV[ 8],33,13,10, &A[24], &A[ 0], &A[ 8], &A[16]);
+		STEP8_IF(&IV[16],34,10,25, &A[16], &A[24], &A[ 0], &A[ 8]);
+		STEP8_IF(&IV[24],35,25, 4, &A[ 8], &A[16], &A[24], &A[ 0]);
+
+		*(uint2x4*)&Hash[ 0] = *(uint2x4*)&A[ 0];
+		*(uint2x4*)&Hash[ 8] = *(uint2x4*)&A[ 8];
+	}
 }
-__device__ __forceinline__
-void FFT_128_full(int y[128])
+__global__
+__launch_bounds__(128, 5)
+void x11_simd512_gpu_compress_64_pascal_final(uint32_t threads, uint32_t startnonce, uint32_t *g_hash, const uint4 *const __restrict__ g_fft4, uint32_t *d_resNonce, const uint64_t target)
 {
-	int i;
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x)>>3;
+	const uint32_t thr_offset = thread << 6; // thr_id * 128 (two elements each)
+	uint32_t IV[32];
+	if (thread < threads){
-	FFT_8(y+0,2); // eight parallel FFT8's
-	FFT_8(y+1,2); // eight parallel FFT8's
+		uint32_t *Hash = &g_hash[thread << 4];
+		// Compression1(Hash, thread, g_fft4, g_state);
+		uint32_t A[32];
+
+		*(uint2x4*)&IV[0] = *(uint2x4*)&c_IV_512[0];
+		*(uint2x4*)&IV[8] = *(uint2x4*)&c_IV_512[8];
+		*(uint2x4*)&IV[16] = *(uint2x4*)&c_IV_512[16];
+		*(uint2x4*)&IV[24] = *(uint2x4*)&c_IV_512[24];
+
+		*(uint2x4*)&A[0] = __ldg4((uint2x4*)&Hash[0]);
+		*(uint2x4*)&A[8] = __ldg4((uint2x4*)&Hash[8]);
 #pragma unroll 16
-	for (i=0; i<16; i++)
-		/*if (i & 7)*/ y[i] = REDUCE(y[i]*c_FFT128_8_16_Twiddle[i*8+(threadIdx.x&7)]);
+		for (uint32_t i = 0; i<16; i++)
+			A[i] = A[i] ^ IV[i];
-#pragma unroll 8
-	for (i=0; i<8; i++)
-		FFT_16(y+2*i); // eight sequential FFT16's, each one executed in parallel by 8 threads
+#pragma unroll 16
+		for (uint32_t i = 16; i<32; i++)
+			A[i] = IV[i];
+
+		Round8(A, thr_offset, g_fft4);
+
+		STEP8_IF(&IV[0], 32, 4, 13, &A[0], &A[8], &A[16], &A[24]);
+		STEP8_IF(&IV[8], 33, 13, 10, &A[24], &A[0], &A[8], &A[16]);
+		STEP8_IF(&IV[16], 34, 10, 25, &A[16], &A[24],
&A[0], &A[8]); + STEP8_IF(&IV[24], 35, 25, 4, &A[8], &A[16], &A[24], &A[0]); + +#pragma unroll 32 + for (uint32_t i = 0; i<32; i++){ + IV[i] = A[i]; + } + + A[0] ^= 512; + + Round8_0_final(A, 3, 23, 17, 27); + Round8_1_final(A, 28, 19, 22, 7); + Round8_2_final(A, 29, 9, 15, 5); + Round8_3_final(A, 4, 13, 10, 25); + STEP8_IF(&IV[0], 32, 4, 13, &A[0], &A[8], &A[16], &A[24]); + STEP8_IF(&IV[8], 33, 13, 10, &A[24], &A[0], &A[8], &A[16]); + STEP8_IF(&IV[16], 34, 10, 25, &A[16], &A[24], &A[0], &A[8]); + STEP8_IF(&IV[24], 35, 25, 4, &A[8], &A[16], &A[24], &A[0]); + + // *(uint2x4*)&Hash[0] = *(uint2x4*)&A[0]; + // *(uint2x4*)&Hash[8] = *(uint2x4*)&A[8]; + + // *(uint64_t*)&Hash[6] = *(uint64_t*)&A[6]; + + // __syncthreads(); + + uint64_t check = ((uint64_t*)A)[3]; + uint32_t nonce = thread + startnonce; + if (check <= target) + { + uint32_t tmp = atomicExch(&d_resNonce[0], nonce); + if (tmp != UINT32_MAX) + if (tmp != d_resNonce[0] ) d_resNonce[1] = tmp; + } + + + } } -__device__ __forceinline__ -void FFT_256_halfzero(int y[256]) -{ - /* - * FFT_256 using w=41 as 256th root of unity. - * Decimation in frequency (DIF) NTT. - * Output data is in revbin_permuted order. - * In place. - */ - const int tmp = y[15]; -#pragma unroll 8 - for (int i=0; i<8; i++) - y[16+i] = REDUCE(y[i] * c_FFT256_2_128_Twiddle[8*i+(threadIdx.x&7)]); -#pragma unroll 8 - for (int i=8; i<16; i++) - y[16+i] = 0; - /* handle X^255 with an additional butterfly */ - if ((threadIdx.x&7) == 7) - { - y[15] = REDUCE(tmp + 1); - y[31] = REDUCE((tmp - 1) * c_FFT256_2_128_Twiddle[127]); + +__host__ +int x11_simd512_cpu_init(int thr_id, uint32_t threads) +{ + cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads); + + // whirlpool + uint64_t* table0 = NULL; + table0 = (uint64_t*)plain_T0; + cudaMemcpyToSymbol(InitVector_RC, plain_RC, 10 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(precomputed_round_key_64, plain_precomputed_round_key_64, 72 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(b0, table0, 256 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice); + uint64_t table7[256]; + for (int i = 0; i<256; i++){ + table7[i] = ROTR64(table0[i], 8); } - FFT_128_full(y); - FFT_128_full(y+16); + return 0; } -/***************************************************/ +__host__ +void x11_simd512_cpu_free(int thr_id){ + cudaFree(d_temp4[thr_id]); +} +//extern void x11_simd512_cpu_free(int thr_id); __device__ __forceinline__ -void Expansion(const uint32_t *data, uint4 *g_temp4) -{ - /* Message Expansion using Number Theoretical Transform similar to FFT */ - int expanded[32]; -#pragma unroll 4 - for (int i=0; i < 4; i++) { - expanded[ i] = __byte_perm(__shfl((int)data[0], 2*i, 8), __shfl((int)data[0], (2*i)+1, 8), threadIdx.x&7)&0xff; - expanded[4+i] = __byte_perm(__shfl((int)data[1], 2*i, 8), __shfl((int)data[1], (2*i)+1, 8), threadIdx.x&7)&0xff; +static void SIMD_Compress(uint32_t *A, const uint32_t thr_offset, const uint4 *const __restrict__ g_fft4){ + + uint32_t IV[32]; + + *(uint2x4*)&IV[0] = *(uint2x4*)&c_IV_512[0]; + *(uint2x4*)&IV[8] = *(uint2x4*)&c_IV_512[8]; + *(uint2x4*)&IV[16] = *(uint2x4*)&c_IV_512[16]; + *(uint2x4*)&IV[24] = *(uint2x4*)&c_IV_512[24]; + + Round8(A, thr_offset, g_fft4); + + const uint32_t a[4] = { 4, 13, 10, 25 }; + + for (int i = 0; i<4; i++) + STEP8_IF(&IV[i * 8], 32 + i, a[i], a[(i + 1) & 3], &A[(0 + i * 24) & 31], &A[(8 + i * 24) & 31], &A[(16 + i * 24) & 31], &A[(24 + i * 24) & 31]); + +#pragma unroll 32 + for (uint32_t i = 0; i<32; i++){ + IV[i] = A[i]; } -#pragma unroll 8 - for 
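
The *_final kernel above reports hits through a two-entry nonce buffer: atomicExch keeps the newest candidate in slot 0 and demotes a previous one to slot 1, which is what lets the scanhash side return up to two valid nonces per launch. A simplified sketch of the pattern; push_nonce_sketch is hypothetical, and the kernel additionally re-reads d_resNonce[0] before writing slot 1 to skip duplicates:

__device__ static void push_nonce_sketch(uint32_t *resNonce, uint32_t nonce)
{
	// assumes the host memset both slots to 0xff (UINT32_MAX) before launch,
	// as the scanhash loops do with cudaMemset
	uint32_t prev = atomicExch(&resNonce[0], nonce); // slot 0: latest hit
	if (prev != UINT32_MAX && prev != nonce)
		resNonce[1] = prev;                          // slot 1: the older hit
}
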
(int i=8; i < 16; i++) - expanded[i] = 0; - - FFT_256_halfzero(expanded); - - // store w matrices in global memory - -#define mul_185(x) ( (x)*185 ) -#define mul_233(x) ( (x)*233 ) - - uint4 vec0; - int P, Q, P1, Q1, P2, Q2; - bool even = (threadIdx.x & 1) == 0; - -// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 -// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 -// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 -// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 - - // 2 6 0 4 - - P1 = expanded[ 0]; P2 = __shfl(expanded[ 2], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); - P1 = expanded[ 8]; P2 = __shfl(expanded[10], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); - P1 = expanded[ 4]; P2 = __shfl(expanded[ 6], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); - P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[0][threadIdx.x&7], 8); - g_temp4[threadIdx.x&7] = vec0; - -// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 -// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 -// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 -// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - - // 6 2 4 0 - - P1 = expanded[ 1]; P2 = __shfl(expanded[ 3], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); - P1 = expanded[ 9]; P2 = __shfl(expanded[11], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); - P1 = expanded[ 5]; P2 = __shfl(expanded[ 7], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x-1)&7, 8); Q = even ? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); - P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x-1)&7, 8); P = even ? P1 : P2; - Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x-1)&7, 8); Q = even ? 
Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[1][threadIdx.x&7], 8); - g_temp4[8+(threadIdx.x&7)] = vec0; - -// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 -// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 -// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 -// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 - - // 7 5 3 1 - - bool hi = (threadIdx.x&7)>=4; - - P1 = hi?expanded[ 1]:expanded[ 0]; P2 = __shfl(hi?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = hi?expanded[17]:expanded[16]; Q2 = __shfl(hi?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); - P1 = hi?expanded[ 9]:expanded[ 8]; P2 = __shfl(hi?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = hi?expanded[25]:expanded[24]; Q2 = __shfl(hi?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); - P1 = hi?expanded[ 5]:expanded[ 4]; P2 = __shfl(hi?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = hi?expanded[21]:expanded[20]; Q2 = __shfl(hi?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); - P1 = hi?expanded[13]:expanded[12]; P2 = __shfl(hi?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = hi?expanded[29]:expanded[28]; Q2 = __shfl(hi?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[2][threadIdx.x&7], 8); - g_temp4[16+(threadIdx.x&7)] = vec0; - -// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 -// 1 9 5 13 3 11 7 15 17 25 21 29 19 27 23 31 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 -// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 -// 0 8 4 12 2 10 6 14 16 24 20 28 18 26 22 30 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 - - // 1 3 5 7 - - bool lo = (threadIdx.x&7)<4; - - P1 = lo?expanded[ 1]:expanded[ 0]; P2 = __shfl(lo?expanded[ 3]:expanded[ 2], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = lo?expanded[17]:expanded[16]; Q2 = __shfl(lo?expanded[19]:expanded[18], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); - P1 = lo?expanded[ 9]:expanded[ 8]; P2 = __shfl(lo?expanded[11]:expanded[10], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = lo?expanded[25]:expanded[24]; Q2 = __shfl(lo?expanded[27]:expanded[26], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); - P1 = lo?expanded[ 5]:expanded[ 4]; P2 = __shfl(lo?expanded[ 7]:expanded[ 6], (threadIdx.x+1)&7, 8); P = !even ? P1 : P2; - Q1 = lo?expanded[21]:expanded[20]; Q2 = __shfl(lo?expanded[23]:expanded[22], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); - P1 = lo?expanded[13]:expanded[12]; P2 = __shfl(lo?expanded[15]:expanded[14], (threadIdx.x+1)&7, 8); P = !even ? 
P1 : P2; - Q1 = lo?expanded[29]:expanded[28]; Q2 = __shfl(lo?expanded[31]:expanded[30], (threadIdx.x+1)&7, 8); Q = !even ? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_185(P), mul_185(Q) , 0x5410), c_perm[3][threadIdx.x&7], 8); - g_temp4[24+(threadIdx.x&7)] = vec0; - -// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 -// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 -// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 -// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 - -//{ 8, 72, 40, 104, 24, 88, 56, 120 }, { 9, 73, 41, 105, 25, 89, 57, 121 }, -//{ 4, 68, 36, 100, 20, 84, 52, 116 }, { 5, 69, 37, 101, 21, 85, 53, 117 }, -//{ 14, 78, 46, 110, 30, 94, 62, 126 }, { 15, 79, 47, 111, 31, 95, 63, 127 }, -//{ 2, 66, 34, 98, 18, 82, 50, 114 }, { 3, 67, 35, 99, 19, 83, 51, 115 }, - - bool sel = ((threadIdx.x+2)&7) >= 4; // 2,3,4,5 - - P1 = sel?expanded[0]:expanded[1]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[2]:expanded[3]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); - P1 = sel?expanded[8]:expanded[9]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[10]:expanded[11]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); - P1 = sel?expanded[4]:expanded[5]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[6]:expanded[7]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); - P1 = sel?expanded[12]:expanded[13]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[14]:expanded[15]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[4][threadIdx.x&7], 8); - - g_temp4[32+(threadIdx.x&7)] = vec0; - -// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 -// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 -// 0 8 4 12 2 10 6 14 0 8 4 12 2 10 6 14 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 -// 1 9 5 13 3 11 7 15 1 9 5 13 3 11 7 15 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 - - P1 = sel?expanded[1]:expanded[0]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[3]:expanded[2]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); - P1 = sel?expanded[9]:expanded[8]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[11]:expanded[10]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); - P1 = sel?expanded[5]:expanded[4]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[7]:expanded[6]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); - P1 = sel?expanded[13]:expanded[12]; Q1 = __shfl(P1, threadIdx.x^1, 8); - Q2 = sel?expanded[15]:expanded[14]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? 
Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[5][threadIdx.x&7], 8); - - g_temp4[40+(threadIdx.x&7)] = vec0; - -// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 -// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 -// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 -// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 - - // sel markiert threads 2,3,4,5 - - int t; - t = __shfl(expanded[17],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[16]; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[19],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[18]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); - t = __shfl(expanded[25],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[24]; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[27],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[26]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); - t = __shfl(expanded[21],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[20]; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[23],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[22]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); - t = __shfl(expanded[29],(threadIdx.x+4)&7,8); P1 = sel?t:expanded[28]; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[31],(threadIdx.x+4)&7,8); Q2 = sel?t:expanded[30]; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[6][threadIdx.x&7], 8); - - g_temp4[48+(threadIdx.x&7)] = vec0; - -// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 -// 17 25 21 29 19 27 23 31 17 25 21 29 19 27 23 31 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 -// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 -// 16 24 20 28 18 26 22 30 16 24 20 28 18 26 22 30 4 4 4 4 4 4 4 4 5 5 5 5 5 5 5 5 - - // sel markiert threads 2,3,4,5 - - t = __shfl(expanded[16],(threadIdx.x+4)&7,8); P1 = sel?expanded[17]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[18],(threadIdx.x+4)&7,8); Q2 = sel?expanded[19]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.x = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); - t = __shfl(expanded[24],(threadIdx.x+4)&7,8); P1 = sel?expanded[25]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[26],(threadIdx.x+4)&7,8); Q2 = sel?expanded[27]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? Q1 : Q2; - vec0.y = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8); - t = __shfl(expanded[20],(threadIdx.x+4)&7,8); P1 = sel?expanded[21]:t; Q1 = __shfl(P1, threadIdx.x^1, 8); - t = __shfl(expanded[22],(threadIdx.x+4)&7,8); Q2 = sel?expanded[23]:t; P2 = __shfl(Q2, threadIdx.x^1, 8); - P = even? P1 : P2; Q = even? 
Q1 : Q2;
-	vec0.z = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
-	t = __shfl(expanded[28],(threadIdx.x+4)&7,8); P1 = sel?expanded[29]:t; Q1 = __shfl(P1, threadIdx.x^1, 8);
-	t = __shfl(expanded[30],(threadIdx.x+4)&7,8); Q2 = sel?expanded[31]:t; P2 = __shfl(Q2, threadIdx.x^1, 8);
-	P = even? P1 : P2; Q = even? Q1 : Q2;
-	vec0.w = __shfl((int)__byte_perm(mul_233(P), mul_233(Q) , 0x5410), c_perm[7][threadIdx.x&7], 8);
-
-	g_temp4[56+(threadIdx.x&7)] = vec0;
-
-#undef mul_185
-#undef mul_233
+
+	A[0] ^= 512;
+
+	Round8_0_final(A, 3, 23, 17, 27);
+	Round8_1_final(A, 28, 19, 22, 7);
+	Round8_2_final(A, 29, 9, 15, 5);
+	Round8_3_final(A, 4, 13, 10, 25);
+
+	for (int i = 0; i<4; i++)
+		STEP8_IF(&IV[i * 8], 32 + i, a[i], a[(i + 1) & 3], &A[(0 + i * 24) & 31], &A[(8 + i * 24) & 31], &A[(16 + i * 24) & 31], &A[(24 + i * 24) & 31]);
+}
-/***************************************************/
-__global__ __launch_bounds__(TPB, 4)
-void x11_simd512_gpu_expand_64(uint32_t threads, uint32_t *g_hash, uint4 *g_temp4)
+
+__global__ //__launch_bounds__(128, 5)
+static void x16_simd512_gpu_compress_64_fugue512(uint32_t *g_hash, const uint4 *const __restrict__ g_fft4)
 {
-	int threadBloc = (blockDim.x * blockIdx.x + threadIdx.x) / 8;
-	if (threadBloc < threads)
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	const uint32_t thr_offset = thread << 6; // thr_id * 128 (two elements each)
+	__shared__ uint32_t shared[4][256];
+
+	if (threadIdx.x<128)
+	{
+		uint2 temp = __ldg(&((uint2*)&mixtab0)[threadIdx.x]);
+
+		shared[0][(threadIdx.x << 1) + 0] = temp.x;
+		shared[0][(threadIdx.x << 1) + 1] = temp.y;
+		shared[1][(threadIdx.x << 1) + 0] = ROR8(temp.x);
+		shared[1][(threadIdx.x << 1) + 1] = ROR8(temp.y);
+		shared[2][(threadIdx.x << 1) + 0] = ROL16(temp.x);
+		shared[2][(threadIdx.x << 1) + 1] = ROL16(temp.y);
+		shared[3][(threadIdx.x << 1) + 0] = ROL8(temp.x);
+		shared[3][(threadIdx.x << 1) + 1] = ROL8(temp.y);
+
+		/*	const uint32_t tmp = mixtab0[threadIdx.x];
+			shared[0][threadIdx.x] = tmp;
+			shared[1][threadIdx.x] = ROR8(tmp);
+			shared[2][threadIdx.x] = ROL16(tmp);
+			shared[3][threadIdx.x] = ROL8(tmp);
+		*/
+	}
+
+	const uint32_t P[48] = {
+		0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968,
+		0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
+	};
+	uint32_t k0;
+	uint32_t h[16];
+
+	// if (thread < threads){
+
+	uint32_t *Hash = &g_hash[thread << 4];
+
+	uint32_t A[32];
+
+	*(uint2x4*)&A[0] = *(uint2x4*)&c_IV_512[0] ^ __ldg4((uint2x4*)&Hash[0]);
+	*(uint2x4*)&A[8] = *(uint2x4*)&c_IV_512[8] ^ __ldg4((uint2x4*)&Hash[8]);
+	*(uint2x4*)&A[16] = *(uint2x4*)&c_IV_512[16];
+	*(uint2x4*)&A[24] = *(uint2x4*)&c_IV_512[24];
+
+	__syncthreads();
+
+	SIMD_Compress(A, thr_offset, g_fft4);
+
+	/*
+	#pragma unroll 16
+	for (int i = 0; i<16; i++){
+		h[i] = A[i];
+	}
+	*(uint2x4*)&p[0] = *(uint2x4*)&A[0];
+	*(uint2x4*)&p[4] = *(uint2x4*)&A[8];
+	*/
+
+	// uint32_t *Hash = &g_hash[thread << 4];
+	// uint8_t h1[64];
+	uint32_t c[16], m[16];
+	*(uint2x4*)&h[0] = *((uint2x4*)&A[0]);
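
x16_simd512_gpu_compress_64_fugue512 fuses the SIMD-512 compression with a complete Fugue-512 over the 64-byte intermediate state (byte order is handled by the cuda_swab32 calls in the kernel). For CPU validation the same two steps can be chained with the bundled sph primitives; a sketch, with simd_fugue512_ref a hypothetical name, analogous to how the scanhash loops recompute candidates on the host:

extern "C" {
#include "sph/sph_simd.h"
#include "sph/sph_fugue.h"
}

static void simd_fugue512_ref(void *out64, const void *in64)
{
	unsigned char tmp[64];
	sph_simd512_context ctx_simd;
	sph_fugue512_context ctx_fugue;

	sph_simd512_init(&ctx_simd);
	sph_simd512(&ctx_simd, in64, 64);      // SIMD-512 over the 64-byte input
	sph_simd512_close(&ctx_simd, tmp);

	sph_fugue512_init(&ctx_fugue);
	sph_fugue512(&ctx_fugue, tmp, 64);     // Fugue-512 over the SIMD output
	sph_fugue512_close(&ctx_fugue, out64); // what the fused kernel writes back to g_hash
}
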
+	*(uint2x4*)&h[8] = *((uint2x4*)&A[8]);
+
+	for (int i = 0; i < 16; i++)
 	{
-		int hashPosition = threadBloc * 16;
-		uint32_t *inpHash = &g_hash[hashPosition];
+		h[i] = cuda_swab32(h[i]);
+	}
+
+	__syncthreads();
-		// Read hash per 8 threads
-		uint32_t Hash[2];
-		int ndx = threadIdx.x & 7;
-		Hash[0] = inpHash[ndx];
-		Hash[1] = inpHash[ndx + 8];
+	// *(uint2x4*)&Hash[ 0] = *(uint2x4*)&h[ 0];
+	// *(uint2x4*)&Hash[ 8] = *(uint2x4*)&h[ 8];
+	uint32_t S[36];
+	uint32_t B[9];
-		// Puffer für expandierte Nachricht
-		uint4 *temp4 = &g_temp4[hashPosition * 4];
+	S[0] = S[1] = S[2] = S[3] = S[4] = S[5] = S[6] = S[7] = S[8] = S[9] = S[10] = S[11] = S[12] = S[13] = S[14] = S[15] = S[16] = S[17] = S[18] = S[19] = 0;
+	*(uint2x4*)&S[20] = *(uint2x4*)&c_S[0];
+#pragma unroll 8
+	for (int i = 0; i<8; i++){
+		S[28 + i] = c_S[i + 8];
+	}
-		Expansion(Hash, temp4);
+	FUGUE512_3(h[0x0], h[0x1], h[0x2]);
+	FUGUE512_3(h[0x3], h[0x4], h[0x5]);
+	FUGUE512_3(h[0x6], h[0x7], h[0x8]);
+	FUGUE512_3(h[0x9], h[0xA], h[0xB]);
+	FUGUE512_3(h[0xC], h[0xD], h[0xE]);
+	FUGUE512_3(h[0xF], 0U, 512U);
+
+	for (uint32_t i = 0; i < 32; i += 2){
+		mROR3;
+		CMIX36(S[0], S[1], S[2], S[4], S[5], S[6], S[18], S[19], S[20]);
+		SMIX_LDG(shared, S[0], S[1], S[2], S[3]);
+		mROR3;
+		CMIX36(S[0], S[1], S[2], S[4], S[5], S[6], S[18], S[19], S[20]);
+		SMIX_LDG(shared, S[0], S[1], S[2], S[3]);
 	}
+	#pragma unroll 11
+	for (uint32_t i = 0; i < 13; i++) {
+		S[4] ^= S[0]; S[9] ^= S[0]; S[18] ^= S[0]; S[27] ^= S[0];
+		mROR9;
+		SMIX_LDG(shared, S[0], S[1], S[2], S[3]);
+		S[4] ^= S[0]; S[10] ^= S[0]; S[18] ^= S[0]; S[27] ^= S[0];
+		mROR9;
+		SMIX(shared, S[0], S[1], S[2], S[3]);
+		S[4] ^= S[0]; S[10] ^= S[0]; S[19] ^= S[0]; S[27] ^= S[0];
+		mROR9;
+		SMIX_LDG(shared, S[0], S[1], S[2], S[3]);
+		S[4] ^= S[0]; S[10] ^= S[0]; S[19] ^= S[0]; S[28] ^= S[0];
+		mROR8;
+		SMIX_LDG(shared, S[0], S[1], S[2], S[3]);
+	}
+	S[4] ^= S[0]; S[9] ^= S[0]; S[18] ^= S[0]; S[27] ^= S[0];
+
+	S[0] = cuda_swab32(S[1]);  S[1] = cuda_swab32(S[2]);   S[2] = cuda_swab32(S[3]);   S[3] = cuda_swab32(S[4]);
+	S[4] = cuda_swab32(S[9]);  S[5] = cuda_swab32(S[10]);  S[6] = cuda_swab32(S[11]);  S[7] = cuda_swab32(S[12]);
+	S[8] = cuda_swab32(S[18]); S[9] = cuda_swab32(S[19]);  S[10] = cuda_swab32(S[20]); S[11] = cuda_swab32(S[21]);
+	S[12] = cuda_swab32(S[27]); S[13] = cuda_swab32(S[28]); S[14] = cuda_swab32(S[29]); S[15] = cuda_swab32(S[30]);
+
+	*(uint2x4*)&Hash[0] = *(uint2x4*)&S[0];
+	*(uint2x4*)&Hash[8] = *(uint2x4*)&S[8];
+	// }
 }
-__global__ __launch_bounds__(TPB, 1)
-void x11_simd512_gpu_compress1_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state)
+__global__ //__launch_bounds__(128, 5)
+static void x16_simd512_gpu_compress_64_hamsi512(uint32_t *g_hash, const uint4 *const __restrict__ g_fft4)
 {
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	const uint32_t thr_offset = thread << 6; // thr_id * 128 (two elements each)
+
+	const uint32_t P[48] = {
+		0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+		0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968,
+		0x14b8a457, 0xf5e7e9f5, 0xb3b36b23,
0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + }; + uint32_t k0; + uint32_t h[16]; + + // if (thread < threads){ + + uint32_t *Hash = &g_hash[thread << 4]; + + uint32_t A[32]; + + *(uint2x4*)&A[0] = *(uint2x4*)&c_IV_512[0] ^ __ldg4((uint2x4*)&Hash[0]); + *(uint2x4*)&A[8] = *(uint2x4*)&c_IV_512[8] ^ __ldg4((uint2x4*)&Hash[8]); + *(uint2x4*)&A[16] = *(uint2x4*)&c_IV_512[16]; + *(uint2x4*)&A[24] = *(uint2x4*)&c_IV_512[24]; + + __syncthreads(); + + SIMD_Compress(A, thr_offset, g_fft4); + + uint8_t h1[64]; + + // uint32_t c[16], m[16]; + // *(uint2x4*)&h[0] = *((uint2x4*)&A[0]); + // *(uint2x4*)&h[8] = *((uint2x4*)&A[8]); + + *(uint2x4*)&h1[0] = *(uint2x4*)&A[0]; + *(uint2x4*)&h1[32] = *(uint2x4*)&A[8]; + + uint32_t c[16], m[16]; + *(uint16*)&c[0] = *(uint16*)&c_c[0]; + *(uint16*)&h[0] = *(uint16*)&c_c[0]; + + const uint32_t *tp; + uint32_t dm; + + for (int i = 0; i < 64; i += 8) { - uint32_t *Hash = &g_hash[thread * 16]; - Compression1(Hash, thread, g_fft4, g_state); + tp = &d_T512[0]; + + dm = -(h1[i] & 1); + m[0] = dm & tp[0]; m[1] = dm & tp[1]; + m[2] = dm & tp[2]; m[3] = dm & tp[3]; + m[4] = dm & tp[4]; m[5] = dm & tp[5]; + m[6] = dm & tp[6]; m[7] = dm & tp[7]; + m[8] = dm & tp[8]; m[9] = dm & tp[9]; + m[10] = dm & tp[10]; m[11] = dm & tp[11]; + m[12] = dm & tp[12]; m[13] = dm & tp[13]; + m[14] = dm & tp[14]; m[15] = dm & tp[15]; + tp += 16; + //#pragma unroll 7 + for (int v = 1; v < 8; v++) { + dm = -((h1[i] >> v) & 1); + m[0] ^= dm & tp[0]; m[1] ^= dm & tp[1]; + m[2] ^= dm & tp[2]; m[3] ^= dm & tp[3]; + m[4] ^= dm & tp[4]; m[5] ^= dm & tp[5]; + m[6] ^= dm & tp[6]; m[7] ^= dm & tp[7]; + m[8] ^= dm & tp[8]; m[9] ^= dm & tp[9]; + m[10] ^= dm & tp[10]; m[11] ^= dm & tp[11]; + m[12] ^= dm & tp[12]; m[13] ^= dm & tp[13]; + m[14] ^= dm & tp[14]; m[15] ^= dm & tp[15]; + tp += 16; + } + + //#pragma unroll + for (int u = 1; u < 8; u++) { +#pragma unroll 8 + for (int v = 0; v < 8; v++) { + dm = -((h1[i + u] >> v) & 1); + m[0] ^= dm & tp[0]; m[1] ^= dm & tp[1]; + m[2] ^= dm & tp[2]; m[3] ^= dm & tp[3]; + m[4] ^= dm & tp[4]; m[5] ^= dm & tp[5]; + m[6] ^= dm & tp[6]; m[7] ^= dm & tp[7]; + m[8] ^= dm & tp[8]; m[9] ^= dm & tp[9]; + m[10] ^= dm & tp[10]; m[11] ^= dm & tp[11]; + m[12] ^= dm & tp[12]; m[13] ^= dm & tp[13]; + m[14] ^= dm & tp[14]; m[15] ^= dm & tp[15]; + tp += 16; + } + } + + //#pragma unroll 6 + for (int r = 0; r < 6; r++) { + ROUND_BIG(r, d_alpha_n); + } + /* order is (no more) important */ + h[0] ^= m[0]; h[1] ^= m[1]; h[2] ^= c[0]; h[3] ^= c[1]; + h[4] ^= m[2]; h[5] ^= m[3]; h[6] ^= c[2]; h[7] ^= c[3]; + h[8] ^= m[8]; h[9] ^= m[9]; h[10] ^= c[8]; h[11] ^= c[9]; + h[12] ^= m[10]; h[13] ^= m[11]; h[14] ^= c[10]; h[15] ^= c[11]; + + *(uint16*)&c[0] = *(uint16*)&h[0]; } -} -__global__ __launch_bounds__(TPB, 1) -void x11_simd512_gpu_compress2_64(uint32_t threads, uint4 *g_fft4, uint32_t *g_state) + *(uint2x4*)&m[0] = *(uint2x4*)&d_T512[112]; + *(uint2x4*)&m[8] = *(uint2x4*)&d_T512[120]; + +#pragma unroll 6 + for (int r = 0; r < 6; r++) { + ROUND_BIG(r, d_alpha_n); + } + + /* order is (no more) important */ + h[0] ^= m[0]; h[1] ^= m[1]; h[2] ^= c[0]; h[3] ^= c[1]; + h[4] ^= m[2]; h[5] ^= m[3]; h[6] ^= c[2]; h[7] ^= c[3]; + h[8] ^= m[8]; h[9] ^= m[9]; h[10] ^= c[8]; h[11] ^= c[9]; + h[12] ^= m[10]; h[13] ^= m[11]; h[14] ^= c[10]; h[15] ^= c[11]; + + *(uint16*)&c[0] = *(uint16*)&h[0]; + + *(uint2x4*)&m[0] = *(uint2x4*)&d_T512[784]; + *(uint2x4*)&m[8] = *(uint2x4*)&d_T512[792]; + +#pragma unroll 12 + for (int r = 0; r < 12; r++) + ROUND_BIG(r, d_alpha_f); + + /* order is (no more) 
important */ + h[0] ^= m[0]; h[1] ^= m[1]; h[2] ^= c[0]; h[3] ^= c[1]; + h[4] ^= m[2]; h[5] ^= m[3]; h[6] ^= c[2]; h[7] ^= c[3]; + h[8] ^= m[8]; h[9] ^= m[9]; h[10] ^= c[8]; h[11] ^= c[9]; + h[12] ^= m[10]; h[13] ^= m[11]; h[14] ^= c[10]; h[15] ^= c[11]; + + +#pragma unroll 16 + for (int i = 0; i < 16; i++) + h[i] = cuda_swab32(h[i]); + + *(uint2x4*)&Hash[0] = *(uint2x4*)&h[0]; + *(uint2x4*)&Hash[8] = *(uint2x4*)&h[8]; +} +__global__ +__launch_bounds__(256, 3) +static void x16_simd512_gpu_compress_64_echo512(uint32_t *g_hash, const uint4 *const __restrict__ g_fft4) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thr_offset = thread << 6; // thr_id * 128 (je zwei elemente) + + __shared__ uint32_t sharedMemory[8 * 1024]; + + if(threadIdx.x<256) aes_gpu_init256_32(sharedMemory); + + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + }; + uint32_t k0; + uint32_t h[16]; + + // if (thread < threads){ + + uint32_t *Hash = &g_hash[thread << 4]; + + uint32_t A[32]; + + *(uint2x4*)&A[0] = *(uint2x4*)&c_IV_512[0] ^ __ldg4((uint2x4*)&Hash[0]); + *(uint2x4*)&A[8] = *(uint2x4*)&c_IV_512[8] ^ __ldg4((uint2x4*)&Hash[8]); + *(uint2x4*)&A[16] = *(uint2x4*)&c_IV_512[16]; + *(uint2x4*)&A[24] = *(uint2x4*)&c_IV_512[24]; + + __syncthreads(); + + SIMD_Compress(A, thr_offset, g_fft4); + + *(uint2x4*)&Hash[0] = *(uint2x4*)&A[0]; + *(uint2x4*)&Hash[8] = *(uint2x4*)&A[8]; + +#pragma unroll 16 + for (int i = 0; i<16; i++){ + h[i] = A[i]; + } + + k0 = 512 + 8; + +#pragma unroll 4 + for (uint32_t idx = 0; idx < 16; idx += 4) + AES_2ROUND_32(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); + + k0 += 4; + + uint32_t W[64]; + + // #pragma unroll 4 + for (int i = 0; i < 4; i++){ + uint32_t a = P[i]; + uint32_t b = P[i + 4]; + uint32_t c = h[i + 8]; + uint32_t d = P[i + 8]; + + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; + + + uint32_t t = ((a ^ b) & 0x80808080); + uint32_t t2 = ((b ^ c) & 0x80808080); + uint32_t t3 = ((c ^ d) & 0x80808080); + + uint32_t abx = ((t >> 7) * 27U) ^ ((ab^t) << 1); + uint32_t bcx = ((t2 >> 7) * 27U) ^ ((bc^t2) << 1); + uint32_t cdx = ((t3 >> 7) * 27U) ^ ((cd^t3) << 1); + + W[0U + i] = bc ^ d ^ abx; + W[4U + i] = a ^ cd ^ bcx; + W[8U + i] = ab ^ d ^ cdx; + W[12U + i] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[12U + i]; + b = h[i + 4U]; + c = P[12U + i + 4U]; + d = P[12U + i + 8U]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[16U + i] = abx ^ bc ^ d; + W[16U + i + 4U] = bcx ^ a ^ cd; + W[16U + i + 8U] = cdx ^ ab ^ d; + W[16U + i + 12U] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = h[i]; + b = P[24U + i + 0U]; + c = P[24U + i + 4U]; + d = P[24U + i + 8U]; + + ab = a ^ b; + bc = b ^ c; + cd 
+ + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[32U + i] = abx ^ bc ^ d; + W[32U + i + 4U] = bcx ^ a ^ cd; + W[32U + i + 8U] = cdx ^ ab ^ d; + W[32U + i + 12U] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[36U + i]; + b = P[36U + i + 4U]; + c = P[36U + i + 8U]; + d = h[i + 12U]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[48U + i] = abx ^ bc ^ d; + W[48U + i + 4U] = bcx ^ a ^ cd; + W[48U + i + 8U] = cdx ^ ab ^ d; + W[48U + i + 12U] = abx ^ bcx ^ cdx ^ ab ^ c; + } + + for (int k = 1; k < 10; k++){ + echo_round_sp(sharedMemory, W, k0); + } +#pragma unroll 4 + for (uint32_t i = 0; i < 16; i += 4) { - Compression2(thread, g_fft4, g_state); + W[i] ^= W[32 + i] ^ 512; + W[i + 1] ^= W[32 + i + 1]; + W[i + 2] ^= W[32 + i + 2]; + W[i + 3] ^= W[32 + i + 3]; } + *(uint2x4*)&Hash[0] = *(uint2x4*)&Hash[0] ^ *(uint2x4*)&W[0]; + *(uint2x4*)&Hash[8] = *(uint2x4*)&Hash[8] ^ *(uint2x4*)&W[8]; } -__global__ __launch_bounds__(TPB, 2) -void x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) +__global__ __launch_bounds__(256, 3) +void x16_simd512_gpu_compress_64_maxwell_echo512_final(const uint32_t* __restrict__ g_hash, uint32_t startnonce, const uint4 *const __restrict__ g_fft4, uint32_t* resNonce, const uint64_t target) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thr_offset = thread << 6; // thr_id * 128 (two elements each) + + __shared__ uint32_t sharedMemory[1024 * 8]; + + aes_gpu_init256_32(sharedMemory); + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + }; + uint32_t k0; + uint32_t h[16]; + + // if (thread < threads){ + + const uint32_t* __restrict__ Hash = &g_hash[thread << 4]; + + uint32_t A[32]; + + *(uint2x4*)&A[0] = *(uint2x4*)&c_IV_512[0] ^ __ldg4((uint2x4*)&Hash[0]); + *(uint2x4*)&A[8] = *(uint2x4*)&c_IV_512[8] ^ __ldg4((uint2x4*)&Hash[8]); + *(uint2x4*)&A[16] = *(uint2x4*)&c_IV_512[16]; + *(uint2x4*)&A[24] = *(uint2x4*)&c_IV_512[24]; + + SIMD_Compress(A, thr_offset, g_fft4); + +#pragma unroll 16 + for (int i = 0; i<16; i++){ + h[i] = A[i]; + } + + uint64_t backup = *(uint64_t*)&h[6]; + + k0 = 512 + 8; + +#pragma unroll 3 + for (uint32_t idx = 0; idx < 16; idx += 4){ + AES_2ROUND_32(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); + idx += 4; + AES_2ROUND_32(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); + } + k0 += 4; + + uint32_t W[64]; + + // #pragma unroll 4 + for (int i = 0; i < 4; i++){ + uint32_t a = P[i]; + uint32_t b = P[i + 4]; + uint32_t c = h[i
+ 8]; + uint32_t d = P[i + 8]; + + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; + + + uint32_t t = ((a ^ b) & 0x80808080); + uint32_t t2 = ((b ^ c) & 0x80808080); + uint32_t t3 = ((c ^ d) & 0x80808080); + + uint32_t abx = ((t >> 7) * 27U) ^ ((ab^t) << 1); + uint32_t bcx = ((t2 >> 7) * 27U) ^ ((bc^t2) << 1); + uint32_t cdx = ((t3 >> 7) * 27U) ^ ((cd^t3) << 1); + + W[0U + i] = bc ^ d ^ abx; + W[4U + i] = a ^ cd ^ bcx; + W[8U + i] = ab ^ d ^ cdx; + W[12U + i] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[12U + i]; + b = h[i + 4U]; + c = P[12U + i + 4U]; + d = P[12U + i + 8U]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[16U + i] = abx ^ bc ^ d; + W[16U + i + 4U] = bcx ^ a ^ cd; + W[16U + i + 8U] = cdx ^ ab ^ d; + W[16U + i + 12U] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = h[i]; + b = P[24U + i + 0U]; + c = P[24U + i + 4U]; + d = P[24U + i + 8U]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[32U + i] = abx ^ bc ^ d; + W[32U + i + 4U] = bcx ^ a ^ cd; + W[32U + i + 8U] = cdx ^ ab ^ d; + W[32U + i + 12U] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[36U + i]; + b = P[36U + i + 4U]; + c = P[36U + i + 8U]; + d = h[i + 12U]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[48U + i] = abx ^ bc ^ d; + W[48U + i + 4U] = bcx ^ a ^ cd; + W[48U + i + 8U] = cdx ^ ab ^ d; + W[48U + i + 12U] = abx ^ bcx ^ cdx ^ ab ^ c; + } + + for (int k = 1; k < 9; k++){ + echo_round_sp(sharedMemory, W, k0); + } + + // Big Sub Words + uint32_t y[4]; + aes_round_32(sharedMemory, W[0], W[1], W[2], W[3], k0, y[0], y[1], y[2], y[3]); + aes_round_32(sharedMemory, y[0], y[1], y[2], y[3], W[0], W[1], W[2], W[3]); + aes_round_32(sharedMemory, W[4], W[5], W[6], W[7], k0, y[0], y[1], y[2], y[3]); + aes_round_32(sharedMemory, y[0], y[1], y[2], y[3], W[4], W[5], W[6], W[7]); + aes_round_32(sharedMemory, W[8], W[9], W[10], W[11], k0, y[0], y[1], y[2], y[3]); + aes_round_32(sharedMemory, y[0], y[1], y[2], y[3], W[8], W[9], W[10], W[11]); + aes_round_32(sharedMemory, W[20], W[21], W[22], W[23], k0, y[0], y[1], y[2], y[3]); + aes_round_32(sharedMemory, y[0], y[1], y[2], y[3], W[20], W[21], W[22], W[23]); + aes_round_32(sharedMemory, W[28], W[29], W[30], W[31], k0, y[0], y[1], y[2], y[3]); + aes_round_32(sharedMemory, y[0], y[1], y[2], y[3], W[28], W[29], W[30], W[31]); + aes_round_32(sharedMemory, W[32], W[33], W[34], W[35], k0, y[0], y[1], y[2], y[3]); + aes_round_32(sharedMemory, y[0], y[1], y[2], y[3], W[32], W[33], W[34], W[35]); + aes_round_32(sharedMemory, W[40], W[41], W[42], W[43], k0, y[0], y[1], y[2], y[3]); + aes_round_32(sharedMemory, y[0], y[1], y[2], y[3], W[40], W[41], W[42], W[43]); + aes_round_32(sharedMemory, W[52], W[53], W[54], W[55], k0, y[0], y[1], y[2], y[3]); + aes_round_32(sharedMemory, y[0], y[1], y[2], y[3], W[52], W[53], W[54], W[55]); + aes_round_32(sharedMemory, W[60], W[61], W[62], W[63], k0, y[0], y[1], y[2], y[3]); + aes_round_32(sharedMemory, y[0], y[1], y[2], y[3], W[60], W[61], W[62], 
W[63]); + + uint32_t bc = W[22] ^ W[42]; + uint32_t t2 = (bc & 0x80808080); + W[6] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + bc = W[23] ^ W[43]; + t2 = (bc & 0x80808080); + W[7] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + bc = W[10] ^ W[54]; + t2 = (bc & 0x80808080); + W[38] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + bc = W[11] ^ W[55]; + t2 = (bc & 0x80808080); + W[39] = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + + uint64_t check = backup ^ *(uint64_t*)&W[2] ^ *(uint64_t*)&W[6] ^ *(uint64_t*)&W[10] ^ *(uint64_t*)&W[30] ^ *(uint64_t*)&W[34] ^ *(uint64_t*)&W[38] ^ *(uint64_t*)&W[42] ^ *(uint64_t*)&W[62]; + uint32_t nonce = thread + startnonce; + + if (check <= target) { - uint32_t *Hash = &g_hash[thread * 16]; - Compression1(Hash, thread, g_fft4, g_state); - Compression2(thread, g_fft4, g_state); + uint32_t tmp = atomicExch(&resNonce[0], nonce); + if (tmp != UINT32_MAX) + resNonce[1] = tmp; } + // } } -__global__ __launch_bounds__(TPB, 2) -void x11_simd512_gpu_final_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) +__global__ //__launch_bounds__(128, 5) +static void x16_simd512_gpu_compress_64_whirlpool512(uint32_t *g_hash, const uint4 *const __restrict__ g_fft4) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thr_offset = thread << 6; // thr_id * 128 (two elements each) + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + }; + + __shared__ uint2 sharedMemory[256][16]; + + if (threadIdx.x < 128) { - uint32_t *Hash = &g_hash[thread * 16]; - Final(Hash, thread, g_fft4, g_state); + const uint2 tmp = b0[threadIdx.x]; + const uint2 tmp2 = b0[threadIdx.x+128]; + sharedMemory[threadIdx.x][0] = tmp; + sharedMemory[threadIdx.x][1] = tmp; + sharedMemory[threadIdx.x][2] = tmp; + sharedMemory[threadIdx.x][3] = tmp; + sharedMemory[threadIdx.x][4] = tmp; + sharedMemory[threadIdx.x][5] = tmp; + sharedMemory[threadIdx.x][6] = tmp; + sharedMemory[threadIdx.x][7] = tmp; + sharedMemory[threadIdx.x][8] = tmp; + sharedMemory[threadIdx.x][9] = tmp; + sharedMemory[threadIdx.x][10] = tmp; + sharedMemory[threadIdx.x][11] = tmp; + sharedMemory[threadIdx.x][12] = tmp; + sharedMemory[threadIdx.x][13] = tmp; + sharedMemory[threadIdx.x][14] = tmp; + sharedMemory[threadIdx.x][15] = tmp; + + sharedMemory[threadIdx.x + 128][0] = tmp2; + sharedMemory[threadIdx.x + 128][1] = tmp2; + sharedMemory[threadIdx.x + 128][2] = tmp2; + sharedMemory[threadIdx.x + 128][3] = tmp2; + sharedMemory[threadIdx.x + 128][4] = tmp2; + sharedMemory[threadIdx.x + 128][5] = tmp2; + sharedMemory[threadIdx.x + 128][6] = tmp2; + sharedMemory[threadIdx.x + 128][7] = tmp2; + sharedMemory[threadIdx.x + 128][8] = tmp2; + sharedMemory[threadIdx.x + 128][9] = tmp2; + sharedMemory[threadIdx.x + 128][10] = tmp2; + sharedMemory[threadIdx.x + 128][11] = tmp2; + sharedMemory[threadIdx.x + 128][12] = tmp2; + sharedMemory[threadIdx.x + 128][13] = tmp2; + sharedMemory[threadIdx.x +
128][14] = tmp2; + sharedMemory[threadIdx.x + 128][15] = tmp2; } + + + uint32_t k0; + // uint32_t h[16]; + + // if (thread < threads){ + + uint32_t *Hash = &g_hash[thread << 4]; + + uint32_t A[32]; + + *(uint2x4*)&A[0] = *(uint2x4*)&c_IV_512[0] ^ __ldg4((uint2x4*)&Hash[0]); + *(uint2x4*)&A[8] = *(uint2x4*)&c_IV_512[8] ^ __ldg4((uint2x4*)&Hash[8]); + *(uint2x4*)&A[16] = *(uint2x4*)&c_IV_512[16]; + *(uint2x4*)&A[24] = *(uint2x4*)&c_IV_512[24]; + + __syncthreads(); + + SIMD_Compress(A, thr_offset, g_fft4); + + uint2 hash[8], n[8], h[8]; + uint2 tmp[8] = { + { 0xC0EE0B30, 0x672990AF }, { 0x28282828, 0x28282828 }, { 0x28282828, 0x28282828 }, { 0x28282828, 0x28282828 }, + { 0x28282828, 0x28282828 }, { 0x28282828, 0x28282828 }, { 0x28282828, 0x28282828 }, { 0x28282828, 0x28282828 } + }; + + *(uint2x4*)&hash[0] = *((uint2x4*)&A[0]); //__ldg4((uint2x4*)&g_hash[(thread << 3) + 0]); + *(uint2x4*)&hash[4] = *((uint2x4*)&A[8]); //__ldg4((uint2x4*)&g_hash[(thread << 3) + 4]); + + __syncthreads(); + + const uint32_t index = 15; //sharedindex; + +#pragma unroll 8 + for (int i = 0; i<8; i++) + n[i] = hash[i]; + + tmp[0] ^= d_ROUND_ELT(index, sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1); + tmp[1] ^= d_ROUND_ELT(index, sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[2] ^= d_ROUND_ELT(index, sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[3] ^= d_ROUND_ELT(index, sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[4] ^= d_ROUND_ELT(index, sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[5] ^= d_ROUND_ELT(index, sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[6] ^= d_ROUND_ELT(index, sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7); + tmp[7] ^= d_ROUND_ELT(index, sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0); + for (int i = 1; i <10; i++) + { + TRANSFER(n, tmp); + tmp[0] = d_ROUND_ELT1(index, sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1, precomputed_round_key_64[(i - 1) * 8 + 0]); + tmp[1] = d_ROUND_ELT1(index, sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2, precomputed_round_key_64[(i - 1) * 8 + 1]); + tmp[2] = d_ROUND_ELT1(index, sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3, precomputed_round_key_64[(i - 1) * 8 + 2]); + tmp[3] = d_ROUND_ELT1(index, sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4, precomputed_round_key_64[(i - 1) * 8 + 3]); + tmp[4] = d_ROUND_ELT1(index, sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5, precomputed_round_key_64[(i - 1) * 8 + 4]); + tmp[5] = d_ROUND_ELT1(index, sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6, precomputed_round_key_64[(i - 1) * 8 + 5]); + tmp[6] = d_ROUND_ELT1(index, sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7, precomputed_round_key_64[(i - 1) * 8 + 6]); + tmp[7] = d_ROUND_ELT1(index, sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0, precomputed_round_key_64[(i - 1) * 8 + 7]); + } + + TRANSFER(h, tmp); +#pragma unroll 8 + for (int i = 0; i<8; i++) + hash[i] = h[i] = h[i] ^ hash[i]; + +#pragma unroll 6 + for (int i = 1; i<7; i++) + n[i] = vectorize(0); + + n[0] = vectorize(0x80); + n[7] = vectorize(0x2000000000000); + +#pragma unroll 8 + for (int i = 0; i < 8; i++) { + n[i] = n[i] ^ h[i]; + } + +// #pragma unroll 2 + for (int i = 0; i < 10; i++) + { + tmp[0] = InitVector_RC[i]; + tmp[0] ^= d_ROUND_ELT(index, sharedMemory, h, 0, 7, 6, 5, 4, 3, 2, 1); + tmp[1] = d_ROUND_ELT(index, sharedMemory, h, 1, 0, 7, 6, 5, 4, 3, 2); + tmp[2] = d_ROUND_ELT(index, sharedMemory, h, 2, 1, 0, 7, 6, 5, 4, 3); + tmp[3] = d_ROUND_ELT(index, sharedMemory, h, 3, 2, 1, 0, 7, 6, 5, 4); + tmp[4] = d_ROUND_ELT(index, sharedMemory, h, 4, 3, 2, 1, 0, 7, 6, 5); + tmp[5] = d_ROUND_ELT(index, sharedMemory, h, 5, 4, 3, 2, 1, 0, 7, 6); + tmp[6] = d_ROUND_ELT(index, 
sharedMemory, h, 6, 5, 4, 3, 2, 1, 0, 7); + tmp[7] = d_ROUND_ELT(index, sharedMemory, h, 7, 6, 5, 4, 3, 2, 1, 0); + TRANSFER(h, tmp); + tmp[0] = d_ROUND_ELT1(index, sharedMemory, n, 0, 7, 6, 5, 4, 3, 2, 1, tmp[0]); + tmp[1] = d_ROUND_ELT1(index, sharedMemory, n, 1, 0, 7, 6, 5, 4, 3, 2, tmp[1]); + tmp[2] = d_ROUND_ELT1(index, sharedMemory, n, 2, 1, 0, 7, 6, 5, 4, 3, tmp[2]); + tmp[3] = d_ROUND_ELT1(index, sharedMemory, n, 3, 2, 1, 0, 7, 6, 5, 4, tmp[3]); + tmp[4] = d_ROUND_ELT1(index, sharedMemory, n, 4, 3, 2, 1, 0, 7, 6, 5, tmp[4]); + tmp[5] = d_ROUND_ELT1(index, sharedMemory, n, 5, 4, 3, 2, 1, 0, 7, 6, tmp[5]); + tmp[6] = d_ROUND_ELT1(index, sharedMemory, n, 6, 5, 4, 3, 2, 1, 0, 7, tmp[6]); + tmp[7] = d_ROUND_ELT1(index, sharedMemory, n, 7, 6, 5, 4, 3, 2, 1, 0, tmp[7]); + TRANSFER(n, tmp); + } + + hash[0] = xor3x(hash[0], n[0], vectorize(0x80)); + hash[1] = hash[1] ^ n[1]; + hash[2] = hash[2] ^ n[2]; + hash[3] = hash[3] ^ n[3]; + hash[4] = hash[4] ^ n[4]; + hash[5] = hash[5] ^ n[5]; + hash[6] = hash[6] ^ n[6]; + hash[7] = xor3x(hash[7], n[7], vectorize(0x2000000000000)); + + *(uint2x4*)&Hash[0] = *(uint2x4*)&hash[0]; + *(uint2x4*)&Hash[8] = *(uint2x4*)&hash[4]; } -#else -__global__ void x11_simd512_gpu_expand_64(uint32_t threads, uint32_t *g_hash, uint4 *g_temp4) {} -__global__ void x11_simd512_gpu_compress1_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} -__global__ void x11_simd512_gpu_compress2_64(uint32_t threads, uint4 *g_fft4, uint32_t *g_state) {} -__global__ void x11_simd512_gpu_compress_64_maxwell(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} -__global__ void x11_simd512_gpu_final_64(uint32_t threads, uint32_t *g_hash, uint4 *g_fft4, uint32_t *g_state) {} -#endif /* SM3+ */ __host__ -int x11_simd512_cpu_init(int thr_id, uint32_t threads) +void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { int dev_id = device_map[thr_id]; - cuda_get_arch(thr_id); - if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { - x11_simd512_cpu_init_sm2(thr_id); - return 0; - } - CUDA_CALL_OR_RET_X(cudaMalloc(&d_temp4[thr_id], 64*sizeof(uint4)*threads), (int) err); /* todo: prevent -i 21 */ - CUDA_CALL_OR_RET_X(cudaMalloc(&d_state[thr_id], 32*sizeof(int)*threads), (int) err); + uint32_t tpb = TPB52_1; + if (device_sm[dev_id] <= 500) tpb = TPB50_1; + const dim3 grid1((8*threads + tpb - 1) / tpb); + const dim3 block1(tpb); + +// tpb = TPB52_2; +// if (device_sm[dev_id] <= 500) tpb = TPB50_2; +// const dim3 grid2((threads + tpb - 1) / tpb); +// const dim3 block2(tpb); + + x11_simd512_gpu_expand_64 <<< grid1, block1 >>> (threads, d_hash, d_temp4[thr_id]); + x11_simd512_gpu_compress_64 <<< grid1, block1 >>> (threads, d_hash, d_temp4[thr_id]); +}
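+ /* Launch geometry note: grid1 is sized from 8 * threads because x11_simd512_gpu_expand_64 spreads each 64-byte hash over eight cooperating threads; the fused x16 compress kernels further down instead handle one full hash per thread (Hash = &g_hash[thread << 4]) and are launched on grid2, which covers only `threads` work items. Both stages exchange the FFT state through d_temp4. */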
-#ifndef DEVICE_DIRECT_CONSTANTS - cudaMemcpyToSymbol(c_perm, h_perm, sizeof(h_perm), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_IV_512, h_IV_512, sizeof(h_IV_512), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_FFT128_8_16_Twiddle, h_FFT128_8_16_Twiddle, sizeof(h_FFT128_8_16_Twiddle), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_FFT256_2_128_Twiddle, h_FFT256_2_128_Twiddle, sizeof(h_FFT256_2_128_Twiddle), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(d_cw0, h_cw0, sizeof(h_cw0), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(d_cw1, h_cw1, sizeof(h_cw1), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(d_cw2, h_cw2, sizeof(h_cw2), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(d_cw3, h_cw3, sizeof(h_cw3), 0, cudaMemcpyHostToDevice); -#endif +__host__ +void x11_simd512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startnonce, uint32_t *d_hash, uint32_t *d_resNonce, uint64_t target) +{ + int dev_id = device_map[thr_id]; - // Texture for 128-bit accesses - cudaChannelFormatDesc channelDesc128 = cudaCreateChannelDesc<uint4>(); - texRef1D_128.normalized = 0; - texRef1D_128.filterMode = cudaFilterModePoint; - texRef1D_128.addressMode[0] = cudaAddressModeClamp; + uint32_t tpb = 32; + if (device_sm[dev_id] <= 500) tpb = TPB50_1; + const dim3 grid1((8 * threads + tpb - 1) / tpb); + const dim3 block1(tpb); - CUDA_CALL_OR_RET_X(cudaBindTexture(NULL, &texRef1D_128, d_temp4[thr_id], &channelDesc128, 64*sizeof(uint4)*threads), (int) err); + // tpb = TPB52_2; + // if (device_sm[dev_id] <= 500) tpb = TPB50_2; + // const dim3 grid2((threads + tpb - 1) / tpb); + // const dim3 block2(tpb); - return 0; + x11_simd512_gpu_expand_64 << < grid1, block1 >> > (threads, d_hash, d_temp4[thr_id]); + x11_simd512_gpu_compress_64_pascal_final << < grid1, block1 >> > (threads, startnonce, d_hash, d_temp4[thr_id], d_resNonce, target); } + __host__ -void x11_simd512_cpu_free(int thr_id) +void x16_simd_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) { + int dev_id = device_map[thr_id]; - if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300) { - cudaFree(d_temp4[thr_id]); - cudaFree(d_state[thr_id]); - } + + uint32_t tpb = 128; + const dim3 grid1((8 * threads + tpb - 1) / tpb); + const dim3 block1(tpb); + + tpb = 256; + const dim3 grid2((threads + tpb - 1) / tpb); + const dim3 block2(tpb); + + x11_simd512_gpu_expand_64 << < grid1, block1 >> > (threads, d_hash, d_temp4[thr_id]); + x16_simd512_gpu_compress_64_echo512 << < grid2, block2 >> > (d_hash, d_temp4[thr_id]); } __host__ -void x11_simd512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x16_simd_echo512_cpu_hash_64_final(int thr_id, uint32_t threads, uint32_t startnonce, uint32_t *d_hash, uint32_t *d_resNonce, const uint64_t target) { - const uint32_t threadsperblock = TPB; + int dev_id = device_map[thr_id]; - dim3 block(threadsperblock); - dim3 grid((threads + threadsperblock-1) / threadsperblock); - dim3 gridX8(grid.x * 8); + uint32_t tpb = 128; + const dim3 grid1((8 * threads + tpb - 1) / tpb); + const dim3 block1(tpb); - if (d_nonceVector != NULL || device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) { - x11_simd512_cpu_hash_64_sm2(thr_id, threads, startNounce, d_nonceVector, d_hash, order); - return; - } + tpb = 256; + const dim3 grid2((threads + tpb - 1) / tpb); + const dim3 block2(tpb); - x11_simd512_gpu_expand_64 <<< gridX8, block >>> (threads, d_hash, d_temp4[thr_id]); + x11_simd512_gpu_expand_64 << < grid1, block1 >> > (threads, d_hash, d_temp4[thr_id]); + x16_simd512_gpu_compress_64_maxwell_echo512_final << < grid2, block2 >> > (d_hash, startnonce, d_temp4[thr_id], d_resNonce, target); +} - if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) { - x11_simd512_gpu_compress_64_maxwell <<< grid, block >>> (threads, d_hash, d_temp4[thr_id], d_state[thr_id]); - } else { - x11_simd512_gpu_compress1_64 <<< grid, block >>> (threads, d_hash, d_temp4[thr_id], d_state[thr_id]); - x11_simd512_gpu_compress2_64 <<< grid, block >>> (threads, d_temp4[thr_id], d_state[thr_id]); - } - x11_simd512_gpu_final_64 <<< grid, block >>> (threads, d_hash, d_temp4[thr_id], d_state[thr_id]);
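+ /* The x16_simd_*_cpu_hash_64 wrappers that follow chain the shared expand kernel with a fused "SIMD512 compress + next algorithm" kernel, so the SIMD512 result feeds straight into the next hash inside one launch instead of a separate kernel re-reading it from global memory; the *_final variants additionally compare a 64-bit word of the candidate hash against the target and record up to two nonces in d_resNonce via atomicExch. */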
+__host__ +void x16_simd_whirlpool512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) +{ + int dev_id = device_map[thr_id]; + + uint32_t tpb = TPB52_1; + if (device_sm[dev_id] <= 500) tpb = TPB52_1; + const dim3 grid1((8 * threads + tpb - 1) / tpb); + const dim3 block1(tpb); + + tpb = 128; + if (device_sm[dev_id] <= 500) tpb = 128; + const dim3 grid2((threads + tpb - 1) / tpb); + const dim3 block2(tpb); + + x11_simd512_gpu_expand_64 << < grid1, block1 >> > (threads, d_hash, d_temp4[thr_id]); + x16_simd512_gpu_compress_64_whirlpool512 << < grid2, block2 >> > (d_hash, d_temp4[thr_id]); +} + +__host__ +void x16_simd_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){ + + int dev_id = device_map[thr_id]; + + uint32_t tpb = TPB52_1; + if (device_sm[dev_id] <= 500) tpb = TPB52_1; + const dim3 grid1((8 * threads + tpb - 1) / tpb); + const dim3 block1(tpb); + + tpb = 128; + if (device_sm[dev_id] <= 500) tpb = 128; + const dim3 grid2((threads + tpb - 1) / tpb); + const dim3 block2(tpb); + + x11_simd512_gpu_expand_64 << < grid1, block1 >> > (threads, d_hash, d_temp4[thr_id]); + x16_simd512_gpu_compress_64_hamsi512 << < grid2, block2 >> > (d_hash, d_temp4[thr_id]); +} + +__host__ +void x16_simd_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){ + + int dev_id = device_map[thr_id]; + + uint32_t tpb = TPB52_1; + if (device_sm[dev_id] <= 500) tpb = TPB52_1; + const dim3 grid1((8 * threads + tpb - 1) / tpb); + const dim3 block1(tpb); + + tpb = 128; + if (device_sm[dev_id] <= 500) tpb = 128; + const dim3 grid2((threads + tpb - 1) / tpb); + const dim3 block2(tpb); - //MyStreamSynchronize(NULL, order, thr_id); + x11_simd512_gpu_expand_64 << < grid1, block1 >> > (threads, d_hash, d_temp4[thr_id]); + x16_simd512_gpu_compress_64_fugue512 << < grid2, block2 >> > (d_hash, d_temp4[thr_id]); } diff --git a/x11/cuda_x11_simd512_func.cuh b/x11/cuda_x11_simd512_func.cuh index f61eaa4f59..85375e577d 100644 --- a/x11/cuda_x11_simd512_func.cuh +++ b/x11/cuda_x11_simd512_func.cuh @@ -1,1396 +1,656 @@ -#define SIMD_FUNCTIONS_CUH +static __constant__ const uint32_t c_perm[8][8] = { + { 2, 3, 6, 7, 0, 1, 4, 5 }, { 6, 7, 2, 3, 4, 5, 0, 1 }, { 7, 6, 5, 4, 3, 2, 1, 0 }, { 1, 0, 3, 2, 5, 4, 7, 6 }, + { 0, 1, 4, 5, 6, 7, 2, 3 }, { 6, 7, 2, 3, 0, 1, 4, 5 }, { 6, 7, 0, 1, 4, 5, 2, 3 }, { 4, 5, 2, 3, 6, 7, 0, 1 } +}; -__device__ __forceinline__ void STEP8_IF_0(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for(int j=0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; -#pragma unroll 8 - for(int j=0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_1(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[6]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[7]; - temp = D[2] + w[2]
+ IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[4]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[5]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[2]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[3]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[0]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[1]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_2(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[2]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[3]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[0]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[1]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[6]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[7]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[4]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[5]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_3(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[3]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[2]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[1]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[0]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[7]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[6]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[5]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[4]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_4(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[5]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[4]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[7]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[6]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[1]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[0]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[3]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[2]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_5(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const 
uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[7]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[6]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[5]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[4]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[3]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[2]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[1]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[0]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_6(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[4]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[5]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[6]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[7]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[0]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[1]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[2]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[3]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_7(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_8(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[6]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[7]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[4]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[5]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[2]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[3]; - temp = D[6] + 
w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[0]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[1]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_9(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[2]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[3]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[0]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[1]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[6]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[7]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[4]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[5]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_10(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ +static __constant__ const uint32_t c_IV_512[32] = { + 0x0ba16b95, 0x72f999ad, 0x9fecc2ae, 0xba3264fc, 0x5e894929, 0x8e9f30e5, 0x2f1daa37, 0xf0f2c558, + 0xac506643, 0xa90635a5, 0xe25b878b, 0xaab7878f, 0x88817f7a, 0x0a02892b, 0x559a7550, 0x598f657e, + 0x7eef60a1, 0x6b70e3e8, 0x9c1714d1, 0xb958e2a8, 0xab02675e, 0xed1c014f, 0xcd8d65bb, 0xfdb7a257, + 0x09254899, 0xd699c7bc, 0x9019b6dc, 0x2b9022e4, 0x8fa14956, 0x21bf9bd3, 0xb94d0943, 0x6ffddc22 +}; - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[3]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[2]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[1]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[0]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[7]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[6]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[5]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[4]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_11(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[5]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[4]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[7]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[6]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[1]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[0]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[3]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[2]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } 
-} -__device__ __forceinline__ void STEP8_MAJ_12(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[7]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[6]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[5]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[4]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[3]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[2]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[1]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[0]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_13(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[4]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[5]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[6]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[7]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[0]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[1]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[2]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[3]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_14(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_15(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[6]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[7]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[4]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[5]; - temp = D[4] + w[4] + MAJ(A[4], 
B[4], C[4]); - D[4] = ROTL32(temp, s) + R[2]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[3]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[0]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[1]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_16(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[2]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[3]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[0]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[1]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[6]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[7]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[4]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[5]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_17(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[3]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[2]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[1]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[0]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[7]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[6]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[5]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[4]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_18(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[5]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[4]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[7]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[6]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[1]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[0]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[3]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[2]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_19(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + 
IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[7]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[6]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[5]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[4]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[3]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[2]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[1]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[0]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_20(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[4]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[5]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[6]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[7]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[0]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[1]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[2]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[3]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_21(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_22(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[6]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[7]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[4]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[5]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[2]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[3]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[0]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[1]; -#pragma unroll 8 - for 
(int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_23(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[2]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[3]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[0]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[1]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[6]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[7]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[4]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[5]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_24(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[3]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[2]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[1]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[0]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[7]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[6]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[5]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[4]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_25(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[5]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[4]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[7]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[6]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[1]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[0]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[3]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[2]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_26(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[7]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[6]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[5]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[4]; - temp = D[4] 
+ w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[3]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[2]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[1]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[0]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_27(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[4]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[5]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[6]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[7]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[0]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[1]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[2]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[3]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_28(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_29(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[6]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[7]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[4]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[5]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[2]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[3]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[0]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[1]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_MAJ_30(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) +static __constant__ const int16_t c_FFT128_8_16_Twiddle[128] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 60, 2, 120, 4, -17, 8, -34, 16, -68, 32, 121, 64, -15, 128, -30, + 1, 46, 60, -67, 2, 92, 120, 123, 4, -73, -17, -11, 8, 111, -34, -22, 1, -67, 120, -73, 8, -22, -68, -70, 64, 81, -30, -46, -2, -123, 17, -111, + 1, -118, 46, -31, 60, 116, -67, -61, 2, 21, 92, -62, 120, -25, 123, -122, 1, 116, 92, -122, -17, 84, -22, 18, 32, 114, 117, -49, -30, 118, 67, 62, + 1, -31, -67, 21, 120, -122, -73, -50, 8, 9, -22, -89, -68, 52, -70, 114, 1, -61, 123, -50, -34, 18, -70, -99, 128, -98, 67, 25, 17, -9, 35, -79 +}; + +static __constant__ const int16_t c_FFT256_2_128_Twiddle[128] = { + 1, 41, -118, 45, 46, 87, -31, 14, 60, -110, 116, -127, -67, 80, -61, 69, 2, 82, 21, 90, 92, -83, -62, 28, 120, 37, -25, 3, 123, -97, -122, -119, + 4, -93, 42, -77, -73, 91, -124, 56, -17, 74, -50, 6, -11, 63, 13, 19, 8, 71, 84, 103, 111, -75, 9, 112, -34, -109, -100, 12, -22, 126, 26, 38, + 16, -115, -89, -51, -35, 107, 18, -33, -68, 39, 57, 24, -44, -5, 52, 76, 32, 27, 79, -102, -70, -43, 36, -66, 121, 78, 114, 48, -88, -10, 104, -105, + 64, 54, -99, 53, 117, -86, 72, 125, -15, -101, -29, 96, 81, -20, -49, 47, 128, 108, 59, 106, -23, 85, -113, -7, -30, 55, -58, -65, -95, -40, -98, 94 +}; + +__device__ __forceinline__ +static uint32_t IF(uint32_t x, uint32_t y, uint32_t z) { - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[2]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[3]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[0]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[1]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[6]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[7]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[4]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[5]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + uint32_t result; + asm("lop3.b32 %0, %1, %2, %3, 0xCA;" : "=r"(result) : "r"(x), "r"(y), "r"(z)); // x=F0, y=CC, z=AA // 0xCA = ((CC⊻AA)∧F0)⊻AA + return result; +#else + return (((y ^ z) & x) ^ z); +#endif } -__device__ __forceinline__ void STEP8_MAJ_31(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + MAJ(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[3]; - temp = D[1] + w[1] + MAJ(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[2]; - temp = D[2] + w[2] + MAJ(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[1]; - temp = D[3] + w[3] + MAJ(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[0]; - temp = D[4] + w[4] + MAJ(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[7]; - temp = D[5] + w[5] + MAJ(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[6]; - temp = D[6] + w[6] + MAJ(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[5]; - temp = D[7] + w[7] + MAJ(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[4]; -#pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } + + +__device__ __forceinline__ +static uint32_t MAJ(const uint32_t x, const uint32_t y, const uint32_t z){ + +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + uint32_t result; + asm("lop3.b32 %0, %1, %2, %3, 0xE8;" : "=r"(result) : 
"r"(x), "r"(y), "r"(z)); // x=AA, y=CC, z=F0 // 0xCA = ((CC⊻AA)∧F0)⊻AA + return result; +#else + return ((z &y) | ((z | y) & x)); +#endif } -__device__ __forceinline__ void STEP8_IF_32(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) + +#define p8_xor(x) ( ((x)%7) == 0 ? 1 : \ + ((x)%7) == 1 ? 6 : \ + ((x)%7) == 2 ? 2 : \ + ((x)%7) == 3 ? 3 : \ + ((x)%7) == 4 ? 5 : \ + ((x)%7) == 5 ? 7 : 4 ) + +__device__ __forceinline__ +static void STEP8_IF(const uint32_t *const __restrict__ w, const uint32_t i, const uint32_t r, const uint32_t s, uint32_t *const __restrict__ A, const uint32_t *const __restrict__ B, const uint32_t *const __restrict__ C, uint32_t *const __restrict__ D) { - uint32_t temp; uint32_t R[8]; + #pragma unroll 8 - for (int j = 0; j<8; j++) { + for (int j = 0; j<8; j++) R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[5]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[4]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[7]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[6]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[1]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[0]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[3]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[2]; + + uint32_t W[8]; + *(uint2x4*)&W[0] = *(uint2x4*)&w[0]; #pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_33(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; + for (int j = 0; j<8; j++) + D[j] += W[j] + IF(A[j], B[j], C[j]); #pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[7]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[6]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[5]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[4]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[3]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[2]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[1]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[0]; + for (int j = 0; j<8; j++) + D[j] = R[j^p8_xor(i)] + ROTL32(D[j], s); #pragma unroll 8 - for (int j = 0; j<8; j++) { + for (int j = 0; j<8; j++) A[j] = R[j]; - } } -__device__ __forceinline__ void STEP8_IF_34(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) + +__device__ __forceinline__ +static void STEP8_MAJ(const uint32_t *const __restrict__ w, const uint32_t i, const uint32_t r, const uint32_t s, uint32_t *const __restrict__ A, const uint32_t *const __restrict__ B, const uint32_t *const __restrict__ C, uint32_t *const __restrict__ D) { - uint32_t temp; uint32_t R[8]; + + uint32_t W[8]; + *(uint2x4*)&W[0] = *(uint2x4*)&w[0]; + #pragma unroll 8 - for (int j = 0; j<8; j++) { + for (int j = 0; j<8; j++) R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[4]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + 
R[5]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[6]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[7]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[0]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[1]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[2]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[3]; + #pragma unroll 8 - for (int j = 0; j<8; j++) { - A[j] = R[j]; - } -} -__device__ __forceinline__ void STEP8_IF_35(const uint32_t *w, const int r, const int s, uint32_t * A, const uint32_t * B, const uint32_t * C, uint32_t * D) -{ - uint32_t temp; - uint32_t R[8]; + for (int j = 0; j<8; j++) + D[j] += W[j] + MAJ(A[j], B[j], C[j]); #pragma unroll 8 - for (int j = 0; j<8; j++) { - R[j] = ROTL32(A[j], r); - } - temp = D[0] + w[0] + IF(A[0], B[0], C[0]); - D[0] = ROTL32(temp, s) + R[1]; - temp = D[1] + w[1] + IF(A[1], B[1], C[1]); - D[1] = ROTL32(temp, s) + R[0]; - temp = D[2] + w[2] + IF(A[2], B[2], C[2]); - D[2] = ROTL32(temp, s) + R[3]; - temp = D[3] + w[3] + IF(A[3], B[3], C[3]); - D[3] = ROTL32(temp, s) + R[2]; - temp = D[4] + w[4] + IF(A[4], B[4], C[4]); - D[4] = ROTL32(temp, s) + R[5]; - temp = D[5] + w[5] + IF(A[5], B[5], C[5]); - D[5] = ROTL32(temp, s) + R[4]; - temp = D[6] + w[6] + IF(A[6], B[6], C[6]); - D[6] = ROTL32(temp, s) + R[7]; - temp = D[7] + w[7] + IF(A[7], B[7], C[7]); - D[7] = ROTL32(temp, s) + R[6]; + for (int j = 0; j<8; j++) + D[j] = R[j^p8_xor(i)] + ROTL32(D[j], s); #pragma unroll 8 - for (int j = 0; j<8; j++) { + for (int j = 0; j<8; j++) A[j] = R[j]; - } } -#ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ uint32_t d_cw0[8][8] = { -#else -static __constant__ uint32_t d_cw0[8][8]; -static const uint32_t h_cw0[8][8] = { -#endif - 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, - 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, - 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, - 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, - 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, - 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, - 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, - 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3 +static __constant__ uint32_t d_cw[4][8][8] = { + 0x531B1720, 0xAC2CDE09, 0x0B902D87, 0x2369B1F4, 0x2931AA01, 0x02E4B082, 0xC914C914, 0xC1DAE1A6, 0xF18C2B5C, 0x08AC306B, 0x27BFC914, 0xCEDC548D, 0xC630C4BE, 0xF18C4335, 0xF0D3427C, 0xBE3DA380, + 0x143C02E4, 0xA948C630, 0xA4F2DE09, 0xA71D2085, 0xA439BD84, 0x109FCD6A, 0xEEA8EF61, 0xA5AB1CE8, 0x0B90D4A4, 0x3D6D039D, 0x25944D53, 0xBAA0E034, 0x5BC71E5A, 0xB1F4F2FE, 0x12CADE09, 0x548D41C3, + 0x3CB4F80D, 0x36ECEBC4, 0xA66443EE, 0x43351ABD, 0xC7A20C49, 0xEB0BB366, 0xF5293F98, 0x49B6DE09, 0x531B29EA, 0x02E402E4, 0xDB25C405, 0x53D4E543, 0x0AD71720, 0xE1A61A04, 0xB87534C1, 0x3EDF43EE, + 0x213E50F0, 0x39173EDF, 0xA9485B0E, 0xEEA82EF9, 0x14F55771, 0xFAF15546, 0x3D6DD9B3, 0xAB73B92E, 0x582A48FD, 0xEEA81892, 0x4F7EAA01, 0xAF10A88F, 0x11581720, 0x34C124DB, 0xD1C0AB73, 0x1E5AF0D3, + 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, 
0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, + 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, + 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, + 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80, + 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, + 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, + 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, + 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE, + 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, + 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, + 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, + 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D + }; -__device__ __forceinline__ void Round8_0_final(uint32_t *A, int r, int s, int t, int u) -{ - STEP8_IF_0(d_cw0[0], r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF_1(d_cw0[1], s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF_2(d_cw0[2], t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF_3(d_cw0[3], u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ_4(d_cw0[4], r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ_5(d_cw0[5], s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ_6(d_cw0[6], t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ_7(d_cw0[7], u, r, &A[8], &A[16], &A[24], A); +__device__ __forceinline__ +static void Round8_0_final(uint32_t *const __restrict__ A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ + + STEP8_IF(d_cw[0][0], 0, r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF(d_cw[0][1], 1, s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF(d_cw[0][2], 2, t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF(d_cw[0][3], 3, u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ(d_cw[0][4], 4, r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ(d_cw[0][5], 5, s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ(d_cw[0][6], 6, t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ(d_cw[0][7], 7, u, r, &A[8], &A[16], &A[24], A); } -#ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ uint32_t d_cw1[8][8] = { -#else -static __constant__ uint32_t d_cw1[8][8]; -static 
const uint32_t h_cw1[8][8] = { -#endif - 0xC34C07F3, 0xC914143C, 0x599CBC12, 0xBCCBE543, 0x385EF3B7, 0x14F54C9A, 0x0AD7C068, 0xB64A21F7, - 0xDEC2AF10, 0xC6E9C121, 0x56B8A4F2, 0x1158D107, 0xEB0BA88F, 0x050FAABA, 0xC293264D, 0x548D46D2, - 0xACE5E8E0, 0x53D421F7, 0xF470D279, 0xDC974E0C, 0xD6CF55FF, 0xFD1C4F7E, 0x36EC36EC, 0x3E261E5A, - 0xEBC4FD1C, 0x56B839D0, 0x5B0E21F7, 0x58E3DF7B, 0x5BC7427C, 0xEF613296, 0x1158109F, 0x5A55E318, - 0xA7D6B703, 0x1158E76E, 0xB08255FF, 0x50F05771, 0xEEA8E8E0, 0xCB3FDB25, 0x2E40548D, 0xE1A60F2D, - 0xACE5D616, 0xFD1CFD1C, 0x24DB3BFB, 0xAC2C1ABD, 0xF529E8E0, 0x1E5AE5FC, 0x478BCB3F, 0xC121BC12, - 0xF4702B5C, 0xC293FC63, 0xDA6CB2AD, 0x45601FCC, 0xA439E1A6, 0x4E0C0D02, 0xED3621F7, 0xAB73BE3D, - 0x0E74D4A4, 0xF754CF95, 0xD84136EC, 0x3124AB73, 0x39D03B42, 0x0E74BCCB, 0x0F2DBD84, 0x41C35C80 -}; +__device__ __forceinline__ +static void Round8_1_final(uint32_t *const __restrict__ A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ -__device__ __forceinline__ void Round8_1_final(uint32_t *A, int r, int s, int t, int u) -{ - STEP8_IF_8(d_cw1[0], r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF_9(d_cw1[1], s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF_10(d_cw1[2], t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF_11(d_cw1[3], u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ_12(d_cw1[4], r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ_13(d_cw1[5], s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ_14(d_cw1[6], t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ_15(d_cw1[7], u, r, &A[8], &A[16], &A[24], A); + STEP8_IF(d_cw[1][0], 8, r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF(d_cw[1][1], 9, s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF(d_cw[1][2], 10, t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF(d_cw[1][3], 11, u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ(d_cw[1][4], 12, r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ(d_cw[1][5], 13, s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ(d_cw[1][6], 14, t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ(d_cw[1][7], 15, u, r, &A[8], &A[16], &A[24], A); } -#ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ uint32_t d_cw2[8][8] = { -#else -static __constant__ uint32_t d_cw2[8][8]; -static const uint32_t h_cw2[8][8] = { -#endif - 0xA4135BED, 0xE10E1EF2, 0x6C4F93B1, 0x6E2191DF, 0xE2E01D20, 0xD1952E6B, 0x6A7D9583, 0x131DECE3, - 0x369CC964, 0xFB73048D, 0x9E9D6163, 0x280CD7F4, 0xD9C6263A, 0x1062EF9E, 0x2AC7D539, 0xAD2D52D3, - 0x0A03F5FD, 0x197CE684, 0xAA72558E, 0xDE5321AD, 0xF0870F79, 0x607A9F86, 0xAFE85018, 0x2AC7D539, - 0xE2E01D20, 0x2AC7D539, 0xC6A93957, 0x624C9DB4, 0x6C4F93B1, 0x641E9BE2, 0x452CBAD4, 0x263AD9C6, - 0xC964369C, 0xC3053CFB, 0x452CBAD4, 0x95836A7D, 0x4AA2B55E, 0xAB5B54A5, 0xAC4453BC, 0x74808B80, - 0xCB3634CA, 0xFC5C03A4, 0x4B8BB475, 0x21ADDE53, 0xE2E01D20, 0xDF3C20C4, 0xBD8F4271, 0xAA72558E, - 0xFC5C03A4, 0x48D0B730, 0x2AC7D539, 0xD70B28F5, 0x53BCAC44, 0x3FB6C04A, 0x14EFEB11, 0xDB982468, - 0x9A1065F0, 0xB0D14F2F, 0x8D5272AE, 0xC4D73B29, 0x91DF6E21, 0x949A6B66, 0x303DCFC3, 0x5932A6CE -}; +__device__ __forceinline__ +static void Round8_2_final(uint32_t *const __restrict__ A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ -__device__ __forceinline__ void Round8_2_final(uint32_t *A, int r, int s, int t, int u) -{ - STEP8_IF_16(d_cw2[0], r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF_17(d_cw2[1], s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF_18(d_cw2[2], t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF_19(d_cw2[3], u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ_20(d_cw2[4], r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ_21(d_cw2[5], s, t, 
&A[24], A, &A[8], &A[16]); - STEP8_MAJ_22(d_cw2[6], t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ_23(d_cw2[7], u, r, &A[8], &A[16], &A[24], A); + STEP8_IF(d_cw[2][0], 16, r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF(d_cw[2][1], 17, s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF(d_cw[2][2], 18, t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF(d_cw[2][3], 19, u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ(d_cw[2][4], 20, r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ(d_cw[2][5], 21, s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ(d_cw[2][6], 22, t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ(d_cw[2][7], 23, u, r, &A[8], &A[16], &A[24], A); } -#ifdef DEVICE_DIRECT_CONSTANTS -static __constant__ uint32_t d_cw3[8][8] = { -#else -static __constant__ uint32_t d_cw3[8][8]; -static const uint32_t h_cw3[8][8] = { -#endif - 0x1234EDCC, 0xF5140AEC, 0xCDF1320F, 0x3DE4C21C, 0x48D0B730, 0x1234EDCC, 0x131DECE3, 0x52D3AD2D, - 0xE684197C, 0x6D3892C8, 0x72AE8D52, 0x6FF3900D, 0x73978C69, 0xEB1114EF, 0x15D8EA28, 0x71C58E3B, - 0x90F66F0A, 0x15D8EA28, 0x9BE2641E, 0x65F09A10, 0xEA2815D8, 0xBD8F4271, 0x3A40C5C0, 0xD9C6263A, - 0xB38C4C74, 0xBAD4452C, 0x70DC8F24, 0xAB5B54A5, 0x46FEB902, 0x1A65E59B, 0x0DA7F259, 0xA32A5CD6, - 0xD62229DE, 0xB81947E7, 0x6D3892C8, 0x15D8EA28, 0xE59B1A65, 0x065FF9A1, 0xB2A34D5D, 0x6A7D9583, - 0x975568AB, 0xFC5C03A4, 0x2E6BD195, 0x966C6994, 0xF2590DA7, 0x263AD9C6, 0x5A1BA5E5, 0xB0D14F2F, - 0x975568AB, 0x6994966C, 0xF1700E90, 0xD3672C99, 0xCC1F33E1, 0xFC5C03A4, 0x452CBAD4, 0x4E46B1BA, - 0xF1700E90, 0xB2A34D5D, 0xD0AC2F54, 0x5760A8A0, 0x8C697397, 0x624C9DB4, 0xE85617AA, 0x95836A7D -}; +__device__ __forceinline__ +static void Round8_3_final(uint32_t*const __restrict__ A, const uint32_t r, const uint32_t s, const uint32_t t, const uint32_t u){ -__device__ __forceinline__ void Round8_3_final(uint32_t *A, int r, int s, int t, int u) -{ - STEP8_IF_24(d_cw3[0], r, s, A, &A[8], &A[16], &A[24]); - STEP8_IF_25(d_cw3[1], s, t, &A[24], A, &A[8], &A[16]); - STEP8_IF_26(d_cw3[2], t, u, &A[16], &A[24], A, &A[8]); - STEP8_IF_27(d_cw3[3], u, r, &A[8], &A[16], &A[24], A); - STEP8_MAJ_28(d_cw3[4], r, s, A, &A[8], &A[16], &A[24]); - STEP8_MAJ_29(d_cw3[5], s, t, &A[24], A, &A[8], &A[16]); - STEP8_MAJ_30(d_cw3[6], t, u, &A[16], &A[24], A, &A[8]); - STEP8_MAJ_31(d_cw3[7], u, r, &A[8], &A[16], &A[24], A); + STEP8_IF(d_cw[3][0], 24, r, s, A, &A[8], &A[16], &A[24]); + STEP8_IF(d_cw[3][1], 25, s, t, &A[24], A, &A[8], &A[16]); + STEP8_IF(d_cw[3][2], 26, t, u, &A[16], &A[24], A, &A[8]); + STEP8_IF(d_cw[3][3], 27, u, r, &A[8], &A[16], &A[24], A); + STEP8_MAJ(d_cw[3][4], 28, r, s, A, &A[8], &A[16], &A[24]); + STEP8_MAJ(d_cw[3][5], 29, s, t, &A[24], A, &A[8], &A[16]); + STEP8_MAJ(d_cw[3][6], 30, t, u, &A[16], &A[24], A, &A[8]); + STEP8_MAJ(d_cw[3][7], 31, u, r, &A[8], &A[16], &A[24], A); } -#if __CUDA_ARCH__ < 350 -#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) -#else -//#define expanded_vector(x) tex1Dfetch(texRef1D_128, (x)) -#define expanded_vector(x) __ldg(&g_fft4[x]) -#endif +//#define expanded_vector(x) __ldg(&g_fft4[x]) +// 16-byte vector load through the read-only (non-coherent) data cache +static __device__ __forceinline__ void expanded_vector(uint32_t* w, const uint4* ptr){ + asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(w[0]), "=r"(w[1]), "=r"(w[2]), "=r"(w[3]) : __LDG_PTR(ptr)); +}
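The inline PTX above is, as far as I can tell, the same vec4 load through the read-only cache that __ldg() on a uint4 compiles to on sm_35 and newer. A sketch of that equivalent formulation (hypothetical helper name, not used by the patch):

// Equivalent (sketch): read-only-cache vec4 load expressed with __ldg().
static __device__ __forceinline__ void expanded_vector_ldg(uint32_t* w, const uint4* ptr)
{
	const uint4 v = __ldg(ptr); // compiles to ld.global.nc.v4.u32 on sm_35+
	w[0] = v.x; w[1] = v.y; w[2] = v.z; w[3] = v.w;
}

+ +__device__ __forceinline__ +static void Round8(uint32_t*const __restrict__ A, const uint32_t thr_offset, const uint4 *const __restrict__ g_fft4) { uint32_t w[8]; - 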
uint4 hv1, hv2; - - int tmp = 0 + thr_offset; - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_0(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_1(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_2(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_3(w, u, r, &A[8], &A[16], &A[24], A); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_4(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_5(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_6(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_7(w, u, r, &A[8], &A[16], &A[24], A); + uint32_t tmp = thr_offset; + uint32_t r = 3, s = 23, t = 17, u = 27; -} -__device__ __forceinline__ void Round8_1(uint32_t *A, const int thr_offset, - int r, int s, int t, int u, uint4 *g_fft4) { - uint32_t w[8]; - uint4 hv1, hv2; - - int tmp = 16 + thr_offset; - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_8(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_9(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_10(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_11(w, u, r, &A[8], &A[16], &A[24], A); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_12(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_13(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = 
hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_14(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_15(w, u, r, &A[8], &A[16], &A[24], A); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 0, r, s, A, &A[8], &A[16], &A[24]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 1, s, t, &A[24], A, &A[8], &A[16]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 2, t, u, &A[16], &A[24], A, &A[8]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 3, u, r, &A[8], &A[16], &A[24], A); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 4, r, s, A, &A[8], &A[16], &A[24]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 5, s, t, &A[24], A, &A[8], &A[16]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 6, t, u, &A[16], &A[24], A, &A[8]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 7, u, r, &A[8], &A[16], &A[24], A); + r = 28; s = 19; t = 22; u = 7; + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 8, r, s, A, &A[8], &A[16], &A[24]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 9, s, t, &A[24], A, &A[8], &A[16]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 10, t, u, &A[16], &A[24], A, &A[8]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 11, u, r, &A[8], &A[16], &A[24], A); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 12, r, s, A, &A[8], &A[16], &A[24]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 13, s, t, &A[24], A, &A[8], &A[16]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 14, t, u, &A[16], &A[24], A, &A[8]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 15, u, r, &A[8], &A[16], &A[24], A); -} -__device__ __forceinline__ void Round8_2(uint32_t *A, const int thr_offset, - int r, int s, int t, int u, uint4 *g_fft4) { - uint32_t w[8]; - uint4 hv1, hv2; - - int tmp = 32 + thr_offset; - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_16(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_17(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_18(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); 
w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_19(w, u, r, &A[8], &A[16], &A[24], A); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_20(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_21(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_22(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_23(w, u, r, &A[8], &A[16], &A[24], A); + r = 29; s = 9; t = 15; u = 5; + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 16, r, s, A, &A[8], &A[16], &A[24]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 17, s, t, &A[24], A, &A[8], &A[16]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 18, t, u, &A[16], &A[24], A, &A[8]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 19, u, r, &A[8], &A[16], &A[24], A); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 20, r, s, A, &A[8], &A[16], &A[24]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 21, s, t, &A[24], A, &A[8], &A[16]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 22, t, u, &A[16], &A[24], A, &A[8]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 23, u, r, &A[8], &A[16], &A[24], A); -} -__device__ __forceinline__ void Round8_3(uint32_t *A, const int thr_offset, - int r, int s, int t, int u, uint4 *g_fft4) { - uint32_t w[8]; - uint4 hv1, hv2; - - int tmp = 48 + thr_offset; - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_24(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_25(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_26(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_IF_27(w, u, r, &A[8], &A[16], &A[24], A); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_28(w, r, s, A, &A[8], &A[16], &A[24]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = 
expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_29(w, s, t, &A[24], A, &A[8], &A[16]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_30(w, t, u, &A[16], &A[24], A, &A[8]); - hv1 = expanded_vector(tmp++); w[0] = hv1.x; w[1] = hv1.y; w[2] = hv1.z; w[3] = hv1.w; - hv2 = expanded_vector(tmp++); w[4] = hv2.x; w[5] = hv2.y; w[6] = hv2.z; w[7] = hv2.w; - STEP8_MAJ_31(w, u, r, &A[8], &A[16], &A[24], A); + r = 4; s = 13; t = 10; u = 25; + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 24, r, s, A, &A[8], &A[16], &A[24]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 25, s, t, &A[24], A, &A[8], &A[16]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 26, t, u, &A[16], &A[24], A, &A[8]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_IF(w, 27, u, r, &A[8], &A[16], &A[24], A); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 28, r, s, A, &A[8], &A[16], &A[24]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 29, s, t, &A[24], A, &A[8], &A[16]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 30, t, u, &A[16], &A[24], A, &A[8]); + expanded_vector(&w[0], &g_fft4[tmp++]); + expanded_vector(&w[4], &g_fft4[tmp++]); + STEP8_MAJ(w, 31, u, r, &A[8], &A[16], &A[24], A); } -__device__ __forceinline__ void SIMD_Compress1(uint32_t *A, const int thr_id, const uint32_t *M, uint4 *g_fft4) { - int i; - const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) -#pragma unroll 8 - for(i=0; i<8; i++) { - A[i] ^= M[i]; - (&A[8])[i] ^= M[8+i]; - } - Round8_0(A, thr_offset, 3, 23, 17, 27, g_fft4); - Round8_1(A, thr_offset, 28, 19, 22, 7, g_fft4); -} +/********************* Message expansion ************************/ -__device__ __forceinline__ void Compression1(const uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) { - uint32_t A[32]; - int i; -#pragma unroll 32 - for (i=0; i < 32; i++) A[i] = c_IV_512[i]; - uint32_t buffer[16]; -#pragma unroll 16 - for (i=0; i < 16; i++) buffer[i] = hashval[i]; - SIMD_Compress1(A, texture_id, buffer, g_fft4); - uint32_t *state = (uint32_t*)&g_state[blockIdx.x * (blockDim.x*32)]; -#pragma unroll 32 - for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; +/* +* Reduce modulo 257; result is in [-127; 383] +* REDUCE(x) := (x&255) - (x>>8) +*/ +#define REDUCE(x) \ + (((x)&255) - ((x)>>8)) + +/* +* Reduce from [-127; 383] to [-128; 128] +* EXTRA_REDUCE_S(x) := x<=128 ? x : x-257 +*/ +#define EXTRA_REDUCE_S(x) \ + ((x)<=128 ? (x) : (x)-257) + +/* +* Reduce modulo 257; result is in [-128; 128] +*/ +#define REDUCE_FULL_S(x) \ + EXTRA_REDUCE_S(REDUCE(x)) + +// Parallelization: +// +// FFT_8 is run twice, 8-way in parallel (in FFT_64), +// and once 16-way in parallel (in FFT_128_full) +// +// STEP8_IF and STEP8_MAJ each contain 2x 8-way parallel operations
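The REDUCE/EXTRA_REDUCE_S macros above exploit 256 ≡ -1 (mod 257): since x = (x>>8)*256 + (x&255), REDUCE(x) = (x&255) - (x>>8) stays congruent to x modulo 257. A host-side sanity check over the signed 16-bit range that the expansion's intermediate products stay within (illustrative only, not part of the patch; assumes arithmetic right shift of negative ints, as the kernel code itself does):

#include <assert.h>

#define REDUCE(x)         (((x)&255) - ((x)>>8))    /* same definitions as above */
#define EXTRA_REDUCE_S(x) ((x)<=128 ? (x) : (x)-257)

int main(void)
{
	for (int x = -32768; x < 32768; x++) {
		int r = EXTRA_REDUCE_S(REDUCE(x));
		assert(-128 <= r && r <= 128); /* REDUCE_FULL_S output range */
		assert((x - r) % 257 == 0);    /* still congruent mod 257 */
	}
	return 0;
}

+/** +* FFT_8 using w=4 as 8th root of unity +* Unrolled decimation in frequency (DIF) radix-2 NTT. +* Output data is in revbin_permuted order. 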
+*/ +__device__ __forceinline__ +static void FFT_8(int *y, const uint8_t stripe){ + +#define BUTTERFLY(i,j,n) \ +do { \ + int u= y[stripe*i]; \ + int v= y[stripe*j]; \ + y[stripe*i] = u+v; \ + y[stripe*j] = (u-v) << (n<<1); \ +} while(0) + + BUTTERFLY(0, 4, 0); + BUTTERFLY(1, 5, 1); + BUTTERFLY(2, 6, 2); + BUTTERFLY(3, 7, 3); + + y[stripe * 6] = REDUCE(y[stripe * 6]); + y[stripe * 7] = REDUCE(y[stripe * 7]); + + BUTTERFLY(0, 2, 0); + BUTTERFLY(4, 6, 0); + BUTTERFLY(1, 3, 2); + BUTTERFLY(5, 7, 2); + + y[stripe * 7] = REDUCE(y[stripe * 7]); + + BUTTERFLY(0, 1, 0); + BUTTERFLY(2, 3, 0); + BUTTERFLY(4, 5, 0); + BUTTERFLY(6, 7, 0); + + y[0] = REDUCE(y[0]); + y[stripe] = REDUCE(y[stripe]); + y[stripe << 1] = REDUCE(y[stripe << 1]); + y[stripe * 3] = REDUCE(y[stripe * 3]); + y[stripe << 2] = REDUCE(y[stripe << 2]); + y[stripe * 5] = REDUCE(y[stripe * 5]); + y[stripe * 6] = REDUCE(y[stripe * 6]); + y[stripe * 7] = REDUCE(y[stripe * 7]); + + y[0] = EXTRA_REDUCE_S(y[0]); + y[stripe] = EXTRA_REDUCE_S(y[stripe]); + y[stripe << 1] = EXTRA_REDUCE_S(y[stripe << 1]); + y[stripe * 3] = EXTRA_REDUCE_S(y[stripe * 3]); + y[stripe << 2] = EXTRA_REDUCE_S(y[stripe << 2]); + y[stripe * 5] = EXTRA_REDUCE_S(y[stripe * 5]); + y[stripe * 6] = EXTRA_REDUCE_S(y[stripe * 6]); + y[stripe * 7] = EXTRA_REDUCE_S(y[stripe * 7]); + +#undef BUTTERFLY } -__device__ __forceinline__ void SIMD_Compress2(uint32_t *A, const int thr_id, uint4 *g_fft4) { - uint32_t IV[4][8]; - int i; - const int thr_offset = thr_id << 6; // thr_id * 128 (je zwei elemente) -#pragma unroll 8 - for(i=0; i<8; i++) { - IV[0][i] = c_IV_512[i]; - IV[1][i] = c_IV_512[8+i]; - IV[2][i] = c_IV_512[16+i]; - IV[3][i] = c_IV_512[24+i]; +/** +* FFT_16 using w=2 as 16th root of unity +* Unrolled decimation in frequency (DIF) radix-2 NTT. +* Output data is in revbin_permuted order. +*/ +__device__ __forceinline__ +static void FFT_16(int *const __restrict__ y){ + +#define DO_REDUCE_FULL_S(i) \ + do { \ + y[i] = REDUCE(y[i]); \ + y[i] = EXTRA_REDUCE_S(y[i]); \ + } while(0) + + int u, v; + + const uint8_t thr = threadIdx.x & 7; + + u = y[0]; // 0..7 + v = y[1]; // 8..15 + y[0] = u + v; + y[1] = (u - v) << (thr); + + if ((thr) >= 3) y[1] = REDUCE(y[1]); // 11...15 + + u = __shfl(y[0], (threadIdx.x & 3), 8); // 0,1,2,3 0,1,2,3 + v = __shfl(y[0], 4 + (threadIdx.x & 3), 8); // 4,5,6,7 4,5,6,7 + y[0] = ((thr) < 4) ? (u + v) : ((u - v) << ((threadIdx.x & 3) << 1)); + + u = __shfl(y[1], (threadIdx.x & 3), 8); // 8,9,10,11 8,9,10,11 + v = __shfl(y[1], 4 + (threadIdx.x & 3), 8); // 12,13,14,15 12,13,14,15 + y[1] = ((thr) < 4) ? 
(u + v) : ((u - v) << ((threadIdx.x & 3) << 1)); + + if ((threadIdx.x & 1) && (thr >= 4)) { + y[0] = REDUCE(y[0]); // 5, 7 + y[1] = REDUCE(y[1]); // 13, 15 } - Round8_2(A, thr_offset, 29, 9, 15, 5, g_fft4); - Round8_3(A, thr_offset, 4, 13, 10, 25, g_fft4); - STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); - STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); - STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); - STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); -} -__device__ __forceinline__ void Compression2(const int texture_id, uint4 *g_fft4, uint32_t *g_state) { - uint32_t A[32]; - int i; - uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; -#pragma unroll 32 - for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; - SIMD_Compress2(A, texture_id, g_fft4); -#pragma unroll 32 - for (i=0; i < 32; i++) state[threadIdx.x+blockDim.x*i] = A[i]; + u = __shfl(y[0], (threadIdx.x & 5), 8); // 0,1,0,1 4,5,4,5 + v = __shfl(y[0], 2 + (threadIdx.x & 5), 8); // 2,3,2,3 6,7,6,7 + y[0] = ((threadIdx.x & 3) < 2) ? (u + v) : ((u - v) << ((threadIdx.x & 1) << 2)); + + u = __shfl(y[1], (threadIdx.x & 5), 8); // 8,9,8,9 12,13,12,13 + v = __shfl(y[1], 2 + (threadIdx.x & 5), 8); // 10,11,10,11 14,15,14,15 + y[1] = ((threadIdx.x & 3) < 2) ? (u + v) : ((u - v) << ((threadIdx.x & 1) << 2)); + + u = __shfl(y[0], (threadIdx.x & 6), 8); // 0,0,2,2 4,4,6,6 + v = __shfl(y[0], 1 + (threadIdx.x & 6), 8); // 1,1,3,3 5,5,7,7 + y[0] = ((threadIdx.x & 1) < 1) ? (u + v) : (u - v); + + u = __shfl(y[1], (threadIdx.x & 6), 8); // 8,8,10,10 12,12,14,14 + v = __shfl(y[1], 1 + (threadIdx.x & 6), 8); // 9,9,11,11 13,13,15,15 + y[1] = ((threadIdx.x & 1) < 1) ? (u + v) : (u - v); + + DO_REDUCE_FULL_S(0); // 0...7 + DO_REDUCE_FULL_S(1); // 8...15 + +#undef DO_REDUCE_FULL_S } -__device__ __forceinline__ void SIMD_Compress_Final(uint32_t *A, const uint32_t *M) { - uint32_t IV[4][8]; - int i; -#pragma unroll 8 - for(i=0; i<8; i++) { - IV[0][i] = A[i]; - IV[1][i] = (&A[8])[i]; - IV[2][i] = (&A[16])[i]; - IV[3][i] = (&A[24])[i]; - } +/***************************************************/ +#if __CUDA_ARCH__ > 500 +__global__ __launch_bounds__(TPB52_1, 9) +#else +__global__ __launch_bounds__(TPB50_1, 9) +#endif +static void x11_simd512_gpu_expand_64(uint32_t threads, const uint32_t* __restrict__ g_hash, uint4 * __restrict__ g_temp4) +{ + const uint32_t threadBloc = (blockDim.x * blockIdx.x + threadIdx.x) >> 3; + const uint8_t thr = (threadIdx.x & 7); + /* Message Expansion using Number Theoretical Transform similar to FFT */ + int expanded[32]; + + uint4 vec0; + int P, Q, P1, Q1, P2, Q2; + + const bool even = (threadIdx.x & 1) == 0; + const bool hi = (thr) >= 4; + const bool lo = (thr)<4; + const bool sel = ((threadIdx.x + 2) & 7) >= 4; // 2,3,4,5 + + if (threadBloc < threads){ + + const uint32_t hashPosition = threadBloc << 4; + + const uint32_t *inpHash = &g_hash[hashPosition]; + + const uint32_t data0 = __ldg(&inpHash[thr]); + const uint32_t data1 = __ldg(&inpHash[thr + 8]); + + // buffer for the expanded message + uint4 *temp4 = &g_temp4[hashPosition << 2]; + +#pragma unroll 4 + for (uint32_t i = 0; i < 4; i++) { + expanded[i] = bfe(__byte_perm(__shfl(data0, i << 1, 8), __shfl(data0, (i << 1) + 1, 8), thr), 0, 8); + } +#pragma unroll 4 + for (uint32_t i = 0; i < 4; i++) { + expanded[4 + i] = bfe(__byte_perm(__shfl(data1, i << 1, 8), __shfl(data1, (i << 1) + 1, 8), thr), 0, 8); + } +#pragma unroll 8 + for (uint32_t i = 8; i < 16; i++) { + expanded[i] = 0; + }
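In scalar terms, the three loops above deal the 64 input bytes out in stride-8 slices (thread t of each 8-thread group receives bytes t, 8+t, 16+t, ...) and zero-pad the upper half before the NTT. A plain-C reference of that byte spread (my reading of the __shfl/__byte_perm/bfe combination; illustrative only, not part of the patch):

#include <stdint.h>

/* Reference for the gather above (assumption: this is what the
 * __shfl/__byte_perm/bfe sequence computes per 8-thread group). */
static void expand_bytes_reference(const uint8_t msg[64], int thr, int expanded[16])
{
	for (int i = 0; i < 8; i++)
		expanded[i] = msg[8 * i + thr]; /* stride-8 slice of the 64-byte block */
	for (int i = 8; i < 16; i++)
		expanded[i] = 0;                /* zero padding prior to the NTT */
}

+ /* + * FFT_256 using w=41 as 256th root of unity. 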
Decimation in frequency (DIF) NTT. Output data is in revbin_permuted order. In place. + */ +#pragma unroll 8 + for (uint32_t i = 0; i<8; i++) + expanded[16 + i] = REDUCE(expanded[i] * c_FFT256_2_128_Twiddle[8 * i + (thr)]); + #pragma unroll 8 - for(i=0; i<8; i++) { - A[i] ^= M[i]; - (&A[8])[i] ^= M[8+i]; - } - Round8_0_final(A, 3, 23, 17, 27); - Round8_1_final(A, 28, 19, 22, 7); - Round8_2_final(A, 29, 9, 15, 5); - Round8_3_final(A, 4, 13, 10, 25); - STEP8_IF_32(IV[0], 4, 13, A, &A[8], &A[16], &A[24]); - STEP8_IF_33(IV[1], 13, 10, &A[24], A, &A[8], &A[16]); - STEP8_IF_34(IV[2], 10, 25, &A[16], &A[24], A, &A[8]); - STEP8_IF_35(IV[3], 25, 4, &A[8], &A[16], &A[24], A); -} + for (uint32_t i = 24; i < 32; i++) { + expanded[i] = 0; + } + /* handle X^255 with an additional butterfly */ + if (thr == 7){ + expanded[15] = 1; + expanded[31] = REDUCE((-1) * c_FFT256_2_128_Twiddle[127]); + } + + // FFT_128_full(expanded); + FFT_8(expanded, 2); // eight parallel FFT8's + FFT_8(&expanded[16], 2); // eight parallel FFT8's + FFT_8(&expanded[1], 2); // eight parallel FFT8's + FFT_8(&expanded[17], 2); // eight parallel FFT8's -__device__ __forceinline__ void Final(uint32_t *hashval, const int texture_id, uint4 *g_fft4, uint32_t *g_state) { - uint32_t A[32]; - int i; - uint32_t *state = &g_state[blockIdx.x * (blockDim.x*32)]; -#pragma unroll 32 - for (i=0; i < 32; i++) A[i] = state[threadIdx.x+blockDim.x*i]; - uint32_t buffer[16]; - buffer[0] = 512; -#pragma unroll 15 - for (i=1; i < 16; i++) buffer[i] = 0; - SIMD_Compress_Final(A, buffer); #pragma unroll 16 - for (i=0; i < 16; i++) - hashval[i] = A[i]; + for (uint32_t i = 0; i<16; i++){ + expanded[i] = REDUCE(expanded[i] * c_FFT128_8_16_Twiddle[i * 8 + (thr)]); + expanded[i + 16] = REDUCE(expanded[i + 16] * c_FFT128_8_16_Twiddle[i * 8 + (thr)]); + } + +#pragma unroll 8 + for (uint32_t i = 0; i<8; i++){ + FFT_16(expanded + (i << 1)); // eight sequential FFT16's, each one executed in parallel by 8 threads + FFT_16(expanded + 16 + (i << 1)); // eight sequential FFT16's, each one executed in parallel by 8 threads + } + + // store w matrices in global memory + P1 = expanded[0]; P2 = __shfl(expanded[2], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; + Q1 = expanded[16]; Q2 = __shfl(expanded[18], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; + vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); + P1 = expanded[8]; P2 = __shfl(expanded[10], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; + Q1 = expanded[24]; Q2 = __shfl(expanded[26], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; + vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); + P1 = expanded[4]; P2 = __shfl(expanded[6], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; + Q1 = expanded[20]; Q2 = __shfl(expanded[22], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; + vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); + P1 = expanded[12]; P2 = __shfl(expanded[14], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; + Q1 = expanded[28]; Q2 = __shfl(expanded[30], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; + vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[0][thr], 8); + temp4[thr] = vec0; + + P1 = expanded[1]; P2 = __shfl(expanded[3], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; + Q1 = expanded[17]; Q2 = __shfl(expanded[19], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; + vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); + P1 = expanded[9]; P2 = __shfl(expanded[11], (threadIdx.x - 1) & 7, 8); P = even ? 
P1 : P2; + Q1 = expanded[25]; Q2 = __shfl(expanded[27], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; + vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); + P1 = expanded[5]; P2 = __shfl(expanded[7], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; + Q1 = expanded[21]; Q2 = __shfl(expanded[23], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; + vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); + P1 = expanded[13]; P2 = __shfl(expanded[15], (threadIdx.x - 1) & 7, 8); P = even ? P1 : P2; + Q1 = expanded[29]; Q2 = __shfl(expanded[31], (threadIdx.x - 1) & 7, 8); Q = even ? Q1 : Q2; + vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[1][thr], 8); + temp4[8 + (thr)] = vec0; + + P1 = hi ? expanded[1] : expanded[0]; P2 = __shfl(hi ? expanded[3] : expanded[2], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; + Q1 = hi ? expanded[17] : expanded[16]; Q2 = __shfl(hi ? expanded[19] : expanded[18], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; + vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); + P1 = hi ? expanded[9] : expanded[8]; P2 = __shfl(hi ? expanded[11] : expanded[10], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; + Q1 = hi ? expanded[25] : expanded[24]; Q2 = __shfl(hi ? expanded[27] : expanded[26], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; + vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); + P1 = hi ? expanded[5] : expanded[4]; P2 = __shfl(hi ? expanded[7] : expanded[6], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; + Q1 = hi ? expanded[21] : expanded[20]; Q2 = __shfl(hi ? expanded[23] : expanded[22], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; + vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); + P1 = hi ? expanded[13] : expanded[12]; P2 = __shfl(hi ? expanded[15] : expanded[14], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; + Q1 = hi ? expanded[29] : expanded[28]; Q2 = __shfl(hi ? expanded[31] : expanded[30], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; + vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[2][thr], 8); + temp4[16 + (thr)] = vec0; + + P1 = lo ? expanded[1] : expanded[0]; P2 = __shfl(lo ? expanded[3] : expanded[2], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; + Q1 = lo ? expanded[17] : expanded[16]; Q2 = __shfl(lo ? expanded[19] : expanded[18], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; + vec0.x = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); + P1 = lo ? expanded[9] : expanded[8]; P2 = __shfl(lo ? expanded[11] : expanded[10], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; + Q1 = lo ? expanded[25] : expanded[24]; Q2 = __shfl(lo ? expanded[27] : expanded[26], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; + vec0.y = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); + P1 = lo ? expanded[5] : expanded[4]; P2 = __shfl(lo ? expanded[7] : expanded[6], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; + Q1 = lo ? expanded[21] : expanded[20]; Q2 = __shfl(lo ? expanded[23] : expanded[22], (threadIdx.x + 1) & 7, 8); Q = !even ? Q1 : Q2; + vec0.z = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); + P1 = lo ? expanded[13] : expanded[12]; P2 = __shfl(lo ? expanded[15] : expanded[14], (threadIdx.x + 1) & 7, 8); P = !even ? P1 : P2; + Q1 = lo ? expanded[29] : expanded[28]; Q2 = __shfl(lo ? expanded[31] : expanded[30], (threadIdx.x + 1) & 7, 8); Q = !even ? 
Q1 : Q2; + vec0.w = __shfl(__byte_perm(185 * P, 185 * Q, 0x5410), c_perm[3][thr], 8); + temp4[24 + (thr)] = vec0; + + P1 = sel ? expanded[0] : expanded[1]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + Q2 = sel ? expanded[2] : expanded[3]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); + P1 = sel ? expanded[8] : expanded[9]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + Q2 = sel ? expanded[10] : expanded[11]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); + P1 = sel ? expanded[4] : expanded[5]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + Q2 = sel ? expanded[6] : expanded[7]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); + P1 = sel ? expanded[12] : expanded[13]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + Q2 = sel ? expanded[14] : expanded[15]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[4][thr], 8); + + temp4[32 + thr] = vec0; + + P1 = sel ? expanded[1] : expanded[0]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + Q2 = sel ? expanded[3] : expanded[2]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); + P1 = sel ? expanded[9] : expanded[8]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + Q2 = sel ? expanded[11] : expanded[10]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); + P1 = sel ? expanded[5] : expanded[4]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + Q2 = sel ? expanded[7] : expanded[6]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); + P1 = sel ? expanded[13] : expanded[12]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + Q2 = sel ? expanded[15] : expanded[14]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[5][thr], 8); + + temp4[40 + thr] = vec0; + + uint32_t t; + t = __shfl(expanded[17], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[16]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + t = __shfl(expanded[19], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[18]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); + t = __shfl(expanded[25], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[24]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + t = __shfl(expanded[27], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[26]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); + t = __shfl(expanded[21], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[20]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + t = __shfl(expanded[23], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[22]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? 
Q1 : Q2; + vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); + t = __shfl(expanded[29], (threadIdx.x + 4) & 7, 8); P1 = sel ? t : expanded[28]; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + t = __shfl(expanded[31], (threadIdx.x + 4) & 7, 8); Q2 = sel ? t : expanded[30]; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[6][thr], 8); + + temp4[48 + thr] = vec0; + + t = __shfl(expanded[16], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[17] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + t = __shfl(expanded[18], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[19] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.x = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); + t = __shfl(expanded[24], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[25] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + t = __shfl(expanded[26], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[27] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.y = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); + t = __shfl(expanded[20], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[21] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + t = __shfl(expanded[22], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[23] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? Q1 : Q2; + vec0.z = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); + t = __shfl(expanded[28], (threadIdx.x + 4) & 7, 8); P1 = sel ? expanded[29] : t; Q1 = __shfl(P1, (threadIdx.x ^ 1) & 7, 8); + t = __shfl(expanded[30], (threadIdx.x + 4) & 7, 8); Q2 = sel ? expanded[31] : t; P2 = __shfl(Q2, (threadIdx.x ^ 1) & 7, 8); + P = even ? P1 : P2; Q = even ? 
Q1 : Q2; + vec0.w = __shfl(__byte_perm(233 * P, 233 * Q, 0x5410), c_perm[7][thr], 8); + + temp4[56 + thr] = vec0; + } } diff --git a/x11/phi.cu b/x11/phi.cu index ab1f30833c..3a64fe2865 100644 --- a/x11/phi.cu +++ b/x11/phi.cu @@ -115,7 +115,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u quark_skein512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput); - x11_cubehash512_cpu_init(thr_id, throughput); +// x11_cubehash512_cpu_init(thr_id, throughput); x13_fugue512_cpu_init(thr_id, throughput); if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput); @@ -143,7 +143,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); if (use_compat_kernels[thr_id]) { streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); diff --git a/x11/sib.cu b/x11/sib.cu index c437523d03..0665a7da39 100644 --- a/x11/sib.cu +++ b/x11/sib.cu @@ -169,7 +169,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u TRACE("skein :"); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("jh512 :"); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; TRACE("keccak :"); if (use_compat_kernels[thr_id]) streebog_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); diff --git a/x11/timetravel.cu b/x11/timetravel.cu index 93c3fd19a8..2ace3ecaf1 100644 --- a/x11/timetravel.cu +++ b/x11/timetravel.cu @@ -329,7 +329,7 @@ extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_n quark_jh512_cpu_init(thr_id, throughput); qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes) x11_luffa512_cpu_init(thr_id, throughput); - x11_cubehash512_cpu_init(thr_id, throughput); +// x11_cubehash512_cpu_init(thr_id, throughput); #if HASH_FUNC_COUNT > 8 x11_shavite512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput); @@ -460,7 +460,7 @@ extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_n TRACE("jh512 :"); break; case KECCAK: - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; TRACE("keccak :"); break; case LUFFA: @@ -468,7 +468,7 @@ extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_n TRACE("luffa :"); break; case CUBEHASH: - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; TRACE("cube :"); break; #if HASH_FUNC_COUNT > 8 diff --git a/x11/x11.cu b/x11/x11.cu index a7f1b601fa..20c0a96a65 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -156,7 +156,7 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u TRACE("skein :"); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); TRACE("jh512 :"); - quark_keccak512_cpu_hash_64(thr_id, 
throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; TRACE("keccak :"); x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); TRACE("luffa+c:"); diff --git a/x11/x11evo.cu b/x11/x11evo.cu index 53799f9ba6..c4b4b3c996 100644 --- a/x11/x11evo.cu +++ b/x11/x11evo.cu @@ -266,7 +266,7 @@ extern "C" int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce quark_keccak512_cpu_init(thr_id, throughput); quark_jh512_cpu_init(thr_id, throughput); x11_luffa512_cpu_init(thr_id, throughput); - x11_cubehash512_cpu_init(thr_id, throughput); +// x11_cubehash512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput); if (x11_simd512_cpu_init(thr_id, throughput) != 0) { @@ -322,7 +322,7 @@ extern "C" int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce TRACE("jh512 :"); break; case KECCAK: - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; TRACE("keccak :"); break; case LUFFA: @@ -330,7 +330,7 @@ extern "C" int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce TRACE("luffa :"); break; case CUBEHASH: - x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; TRACE("cube :"); break; case SHAVITE: diff --git a/x13/cuda_x13_hamsi512.cu b/x13/cuda_x13_hamsi512.cu index 69070fb362..3c21f7f1e5 100644 --- a/x13/cuda_x13_hamsi512.cu +++ b/x13/cuda_x13_hamsi512.cu @@ -1,6 +1,6 @@ /* - * Quick Hamsi-512 for X13 - * by tsiv - 2014 + * Quick Hamsi-512 for X13 by tsiv - 2014 + * + Hamsi-512 80 by tpruvot - 2018 */ #include @@ -16,31 +16,17 @@ static __constant__ uint32_t d_alpha_f[32]; static __constant__ uint32_t d_T512[64][16]; static const uint32_t alpha_n[] = { - SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), - SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), - SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), - SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00), - SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), - SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0) + 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00, + 0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa, + 0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, + 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0 }; static const uint32_t alpha_f[] = { - SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), - SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), - SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c), - SPH_C32(0xcaf9f9c0), 
SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9), - SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c), - SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) + 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9, + 0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0, + 0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, + 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c }; #define hamsi_s00 m0 @@ -200,390 +186,134 @@ static const uint32_t alpha_f[] = { static const uint32_t T512[64][16] = { - { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), - SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), - SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), - SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), - SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), - SPH_C32(0x9e69af68) }, - { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), - SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), - SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), - SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), - SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), - SPH_C32(0x0c26f262) }, - { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), - SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), - SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), - SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), - SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), - SPH_C32(0xdc24e61f) }, - { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), - SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), - SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), - SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), - SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), - SPH_C32(0x3daac2da) }, - { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), - SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), - SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), - SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), - SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), - SPH_C32(0x78cace29) }, - { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), - SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), - SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), - SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), - SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247), - SPH_C32(0x2dd1f9ab) }, - { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), - SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), - SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), - SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), - SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), - SPH_C32(0xbf2c0be2) }, - { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), - SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), - SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), - SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), - SPH_C32(0x644c37cd), 
SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), - SPH_C32(0x32219526) }, - { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), - SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), - SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), - SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), - SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), - SPH_C32(0xac8e6c88) }, - { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), - SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), - SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), - SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), - SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), - SPH_C32(0x7b1bd6b9) }, - { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), - SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), - SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), - SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), - SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), - SPH_C32(0xf746c320) }, - { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), - SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), - SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), - SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), - SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), - SPH_C32(0x69505b3a) }, - { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), - SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), - SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), - SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), - SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), - SPH_C32(0x8a341574) }, - { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), - SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), - SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), - SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), - SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), - SPH_C32(0x450360bf) }, - { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), - SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), - SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), - SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), - SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), - SPH_C32(0xf3d45758) }, - { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), - SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), - SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), - SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), - SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), - SPH_C32(0x925c44e9) }, - { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), - SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), - SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), - SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), - SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), - SPH_C32(0xa123ff9f) }, - { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), - SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), - SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), - SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), - SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), 
SPH_C32(0xdb73e86e), - SPH_C32(0x1568ff0f) }, - { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), - SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), - SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), - SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), - SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), - SPH_C32(0xc5c1eb3e) }, - { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), - SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), - SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), - SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), - SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), - SPH_C32(0x1af21fe1) }, - { SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), - SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), - SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), - SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), - SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), - SPH_C32(0x857f3c2b) }, - { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), - SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), - SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), - SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), - SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), - SPH_C32(0x2ba05a55) }, - { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), - SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), - SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), - SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), - SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), - SPH_C32(0xfeabf254) }, - { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), - SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), - SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), - SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), - SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), - SPH_C32(0xfe1cdc7f) }, - { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), - SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), - SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), - SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), - SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), - SPH_C32(0xb0a51834) }, - { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), - SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), - SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), - SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), - SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), - SPH_C32(0xa6b8c28d) }, - { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), - SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), - SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), - SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), - SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), - SPH_C32(0x3a4e99d7) }, - { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), - SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), - SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), - SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), - SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), - 
SPH_C32(0xe1844257) }, - { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), - SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), - SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), - SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), - SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), - SPH_C32(0x2c3b504e) }, - { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), - SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), - SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), - SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), - SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), - SPH_C32(0x524a0d59) }, - { SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), - SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), - SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), - SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), - SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), - SPH_C32(0x378dd173) }, - { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), - SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), - SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), - SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), - SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), - SPH_C32(0x8b6c72bd) }, - { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), - SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), - SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), - SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), - SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), - SPH_C32(0x8e67b7fa) }, - { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), - SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), - SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), - SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), - SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), - SPH_C32(0x443d3004) }, - { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), - SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), - SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), - SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), - SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), - SPH_C32(0xf4f6ea7b) }, - { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), - SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), - SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), - SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), - SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), - SPH_C32(0x979961d0) }, - { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), - SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), - SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), - SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), - SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), - SPH_C32(0x98aa496e) }, - { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), - SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), - SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), - SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), - SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), - SPH_C32(0x094e3198) }, - { 
SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), - SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), - SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), - SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), - SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), - SPH_C32(0xe86cba2e) }, - { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), - SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), - SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), - SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000), - SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), - SPH_C32(0x4b7eec55) }, - { SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001), - SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), - SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), - SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), - SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), - SPH_C32(0x1e7536a6) }, - { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), - SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), - SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), - SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), - SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), - SPH_C32(0x24314f17) }, - { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), - SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), - SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), - SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), - SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), - SPH_C32(0x9075b1ce) }, - { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), - SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), - SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), - SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), - SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), - SPH_C32(0x9b6ef888) }, - { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), - SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), - SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), - SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), - SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), - SPH_C32(0xd8b61463) }, - { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), - SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), - SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), - SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), - SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), - SPH_C32(0x3ea660f7) }, - { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e), - SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), - SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), - SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), - SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), - SPH_C32(0x7f975691) }, - { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), - SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), - SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), - SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), - SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), - SPH_C32(0x2c94459e) }, - { SPH_C32(0xe6280000), 
SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), - SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), - SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), - SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), - SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), - SPH_C32(0x56a7b19f) }, - { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), - SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), - SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), - SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), - SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), - SPH_C32(0x81fdf908) }, - { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), - SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), - SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), - SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), - SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), - SPH_C32(0x5bd61539) }, - { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), - SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), - SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), - SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), - SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), - SPH_C32(0x15b961e7) }, - { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), - SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), - SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), - SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), - SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), - SPH_C32(0x2a2c18f0) }, - { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), - SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), - SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), - SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), - SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), - SPH_C32(0x551e3d6e) }, - { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), - SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), - SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), - SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), - SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), - SPH_C32(0x33c5244f) }, - { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), - SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), - SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), - SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), - SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), - SPH_C32(0x8a58e6a4) }, - { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), - SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), - SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), - SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), - SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), - SPH_C32(0xda878000) }, - { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), - SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), - SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000), - SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), - SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), - SPH_C32(0x3c5dfffe) }, - { SPH_C32(0x01930000), SPH_C32(0xe7820000), 
SPH_C32(0xedfb0000), - SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), - SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), - SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), - SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), - SPH_C32(0x7b1675d7) }, - { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), - SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), - SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), - SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), - SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), - SPH_C32(0x2879ebac) }, - { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), - SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e), - SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), - SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), - SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), - SPH_C32(0xbe0a679e) }, - { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), - SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), - SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), - SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), - SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), - SPH_C32(0x30aebcf7) }, - { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), - SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), - SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), - SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), - SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), - SPH_C32(0xc7ff60f0) }, - { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), - SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), - SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), - SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), - SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), - SPH_C32(0xe7e00a94) } +{ 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, // 0 + 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 }, +{ 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68, + 0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 }, +{ 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5, + 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f }, +{ 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f, + 0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da }, +{ 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782, + 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 }, +{ 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29, + 0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab }, +{ 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4, + 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 }, +{ 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 
0xbf2c0be2, // 7 + 0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 }, +{ 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31, + 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 }, +{ 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88, + 0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 }, +{ 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a, + 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 }, +{ 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320, + 0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a }, +{ 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb, + 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 }, +{ 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574, + 0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf }, +{ 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, // 14 + 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 }, +{ 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758, + 0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 }, +{ 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090, + 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f }, +{ 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f, + 0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f }, +{ 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df, + 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e }, +{ 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e, + 0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 }, +{ 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e, + 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b }, +{ 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, // 21 + 0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 }, +{ 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b, + 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 }, +{ 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254, + 0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f }, +{ 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9, + 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 }, +{ 0x30b70000, 0xe5d00000, 0xf4f46000, 
0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834, + 0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d }, +{ 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80, + 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 }, +{ 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7, + 0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 }, +{ 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, // 28 + 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e }, +{ 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e, + 0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 }, +{ 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce, + 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 }, +{ 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173, + 0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd }, +{ 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe, + 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa }, +{ 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa, + 0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 }, +{ 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab, + 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b }, +{ 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, // 35 + 0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 }, +{ 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6, + 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e }, +{ 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e, + 0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 }, +{ 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b, + 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e }, +{ 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e, + 0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 }, +{ 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1, + 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 }, +{ 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6, + 0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 }, +{ 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, // 42 + 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 
0x9075b1ce }, +{ 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce, + 0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 }, +{ 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494, + 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 }, +{ 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463, + 0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 }, +{ 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f, + 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 }, +{ 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691, + 0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e }, +{ 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897, + 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f }, +{ 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, // 49 + 0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 }, +{ 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de, + 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 }, +{ 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539, + 0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 }, +{ 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e, + 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 }, +{ 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0, + 0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e }, +{ 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb, + 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f }, +{ 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f, + 0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4 }, +{ 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, // 56 + 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 }, +{ 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000, + 0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe }, +{ 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b, + 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 }, +{ 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7, + 0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac }, +{ 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69, + 0x00440000, 0x7f480000, 0xda7c0000, 
0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e }, +{ 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e, + 0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 }, +{ 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64, + 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 }, +{ 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0, + 0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 } }; __global__ @@ -598,12 +328,12 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; unsigned char *h1 = (unsigned char *)Hash; - uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265); - uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c); - uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48); - uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d); - uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF; + uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265; + uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c; + uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48; + uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d; uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF; uint32_t *tp, db, dm; for(int i = 0; i < 64; i += 8) { @@ -637,16 +367,16 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * T_BIG; } + // precomputed for 64 bytes blocks ? 
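+ // yes: the 0x80 padding block has a single bit set, so its expansion is just T512 row 7, hence the 7*16 = 112 offset below; + // the 512-bit length block likewise reduces to a single row (49), i.e. the 784 offset used before the final rounds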
tp = &d_T512[0][0] + 112; - - m0 = *(tp+ 0); m1 = *(tp+ 1); - m2 = *(tp+ 2); m3 = *(tp+ 3); - m4 = *(tp+ 4); m5 = *(tp+ 5); - m6 = *(tp+ 6); m7 = *(tp+ 7); - m8 = *(tp+ 8); m9 = *(tp+ 9); - mA = *(tp+10); mB = *(tp+11); - mC = *(tp+12); mD = *(tp+13); - mE = *(tp+14); mF = *(tp+15); + m0 = tp[ 0]; m1 = tp[ 1]; + m2 = tp[ 2]; m3 = tp[ 3]; + m4 = tp[ 4]; m5 = tp[ 5]; + m6 = tp[ 6]; m7 = tp[ 7]; + m8 = tp[ 8]; m9 = tp[ 9]; + mA = tp[10]; mB = tp[11]; + mC = tp[12]; mD = tp[13]; + mE = tp[14]; mF = tp[15]; for( int r = 0; r < 6; r += 2 ) { ROUND_BIG(r, d_alpha_n); @@ -655,15 +385,14 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * T_BIG; tp = &d_T512[0][0] + 784; - - m0 = *(tp+ 0); m1 = *(tp+ 1); - m2 = *(tp+ 2); m3 = *(tp+ 3); - m4 = *(tp+ 4); m5 = *(tp+ 5); - m6 = *(tp+ 6); m7 = *(tp+ 7); - m8 = *(tp+ 8); m9 = *(tp+ 9); - mA = *(tp+10); mB = *(tp+11); - mC = *(tp+12); mD = *(tp+13); - mE = *(tp+14); mF = *(tp+15); + m0 = tp[ 0]; m1 = tp[ 1]; + m2 = tp[ 2]; m3 = tp[ 3]; + m4 = tp[ 4]; m5 = tp[ 5]; + m6 = tp[ 6]; m7 = tp[ 7]; + m8 = tp[ 8]; m9 = tp[ 9]; + mA = tp[10]; mB = tp[11]; + mC = tp[12]; mD = tp[13]; + mE = tp[14]; mF = tp[15]; for( int r = 0; r < 12; r += 2 ) { ROUND_BIG(r, d_alpha_f); @@ -696,3 +425,127 @@ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce x13_hamsi512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); //MyStreamSynchronize(NULL, order, thr_id); } + +__constant__ static uint64_t c_PaddedMessage80[10]; + +__host__ +void x16_hamsi512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +__global__ +void x16_hamsi512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + unsigned char h1[80]; + #pragma unroll + for (int i = 0; i < 10; i++) + ((uint2*)h1)[i] = ((uint2*)c_PaddedMessage80)[i]; + //((uint64_t*)h1)[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(startNonce + thread)); + ((uint32_t*)h1)[19] = cuda_swab32(startNonce + thread); + + uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265; + uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c; + uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48; + uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d; + uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF; + uint32_t *tp, db, dm; + + for(int i = 0; i < 80; i += 8) + { + m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; + m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; + tp = &d_T512[0][0]; + + #pragma unroll + for (int u = 0; u < 8; u++) { + db = h1[i + u]; + #pragma unroll 2 + for (int v = 0; v < 8; v++, db >>= 1) { + dm = -(uint32_t)(db & 1); + m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1]; + m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3]; + m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5]; + m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7]; + m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9]; + mA ^= dm & tp[10]; mB ^= dm & tp[11]; + mC ^= dm & tp[12]; mD ^= dm & tp[13]; + mE ^= dm & tp[14]; mF ^= dm & tp[15]; + tp += 16; + } + } + + #pragma unroll + for (int r = 0; r < 6; r++) { + ROUND_BIG(r, d_alpha_n); + } + T_BIG; + } + + #define INPUT_BIG { \ + m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6
= 0; m7 = 0; \ + m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; \ + tp = &d_T512[0][0]; \ + for (int u = 0; u < 8; u++) { \ + db = endtag[u]; \ + for (int v = 0; v < 8; v++, db >>= 1) { \ + dm = -(uint32_t)(db & 1); \ + m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1]; \ + m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3]; \ + m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5]; \ + m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7]; \ + m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9]; \ + mA ^= dm & tp[10]; mB ^= dm & tp[11]; \ + mC ^= dm & tp[12]; mD ^= dm & tp[13]; \ + mE ^= dm & tp[14]; mF ^= dm & tp[15]; \ + tp += 16; \ + } \ + } \ + } + + // close + uint8_t endtag[8] = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + INPUT_BIG; + + #pragma unroll + for (int r = 0; r < 6; r++) { + ROUND_BIG(r, d_alpha_n); + } + T_BIG; + + endtag[0] = endtag[1] = 0x00; + endtag[6] = 0x02; + endtag[7] = 0x80; // big-endian 640-bit message length (0x280) + INPUT_BIG; + + // PF_BIG + #pragma unroll + for(int r = 0; r < 12; r++) { + ROUND_BIG(r, d_alpha_f); + } + T_BIG; + + uint64_t hashPosition = thread; + uint32_t *Hash = (uint32_t*)&g_hash[hashPosition << 3]; + #pragma unroll 16 + for(int i = 0; i < 16; i++) + Hash[i] = cuda_swab32(h[i]); + + #undef INPUT_BIG + } +} + +__host__ +void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + x16_hamsi512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash); +} diff --git a/x13/hsr.cu b/x13/hsr.cu index e86444628d..8e3e78c247 100644 --- a/x13/hsr.cu +++ b/x13/hsr.cu @@ -182,7 +182,7 @@ extern "C" int scanhash_hsr(int thr_id, struct work* work, uint32_t max_nonce, u quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x13/x13.cu b/x13/x13.cu index 0f5d88c394..46a4d4df86 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -172,7 +172,7 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x15/cuda_x15_whirlpool_sm3.cu b/x15/cuda_x15_whirlpool_sm3.cu index e2df3dc19e..3110a694ed 100644 --- a/x15/cuda_x15_whirlpool_sm3.cu +++ b/x15/cuda_x15_whirlpool_sm3.cu @@ -1998,7 +1998,7 @@
const int i0, const int i1, const int i2, const int i3, const int i4, const int __global__ -void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, int swab) +void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab) { __shared__ uint64_t sharedMemory[2048]; @@ -2014,7 +2014,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; #endif } - __threadfence_block(); // ensure shared mem is ready + //__threadfence_block(); // ensure shared mem is ready + __syncthreads(); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -2028,7 +2029,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp uint64_t state[8]; #pragma unroll 8 for (int i=0; i < 8; i++) { - state[i] = c_PaddedMessage80[i]; + //state[i] = c_PaddedMessage80[i]; + AS_UINT2(&state[i]) = AS_UINT2(&c_PaddedMessage80[i]); } #else #pragma unroll 8 @@ -2050,6 +2052,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp state[i] = xor1(n[i],c_PaddedMessage80[i]); } #endif + /// round 2 /////// ////////////////////////////////// n[0] = c_PaddedMessage80[8]; //read data @@ -2331,7 +2334,7 @@ extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t } __host__ -void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) +void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash) { dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 block(threadsperblock); @@ -2339,7 +2342,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce if (threads < 256) applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!"); - oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash, 1); + oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_outputHash, 1); } extern void whirl_midstate(void *state, const void *input); @@ -2363,3 +2366,54 @@ void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget) cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice); } + +// ------------------------------------------------------------------------------------------------ + +__host__ +void x16_whirlpool512_init(int thr_id, uint32_t threads) +{ + cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice); +#if USE_ALL_TABLES + cudaMemcpyToSymbol(mixTob1Tox, plain_T1, (256 * 8), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(mixTob2Tox, plain_T2, (256 * 8), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(mixTob3Tox, plain_T3, (256 * 8), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(mixTob4Tox, plain_T4, (256 * 8), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(mixTob5Tox, plain_T5, (256 * 8), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(mixTob6Tox, plain_T6, (256 * 8), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(mixTob7Tox, plain_T7, (256 * 8), 0, cudaMemcpyHostToDevice); +#endif +} + +extern void whirlpool_midstate(void *state, const void *input); + +__host__ +void x16_whirlpool512_setBlock_80(void *pdata) +{ + unsigned char PaddedMessage[128]; + + memcpy(PaddedMessage, pdata, 80); +
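// zero the 48 remaining bytes and set the end-of-message marker below; with HOST_MIDSTATE the first 64 bytes are then replaced by the precomputed (nonce-independent) midstate +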
memset(PaddedMessage + 80, 0, 48); + PaddedMessage[80] = 0x80; /* ending */ + +#if HOST_MIDSTATE + // compute constant first block + unsigned char midstate[64] = { 0 }; + whirlpool_midstate(midstate, pdata); + memcpy(PaddedMessage, midstate, 64); +#endif + + cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice); +} + +__host__ +void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_outputHash) +{ + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + if (threads < 256) + applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!"); + + oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_outputHash, 1); +} diff --git a/x15/x14.cu b/x15/x14.cu index 4232c6906f..f67b6eda58 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -185,7 +185,7 @@ extern "C" int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x15/x15.cu b/x15/x15.cu index cdfbd81f00..723e078335 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -193,7 +193,7 @@ extern "C" int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++); x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x16/cuda_x16.h b/x16/cuda_x16.h new file mode 100644 index 0000000000..48857f3283 --- /dev/null +++ b/x16/cuda_x16.h @@ -0,0 +1,85 @@ +#include "x11/cuda_x11.h" + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); +extern void
x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + +extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash); + +extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, const int outlen); + +void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); + +extern void tiger192_cpu_hash_64(int thr_id, int threads, int zero_pad_64, uint32_t *d_hash); + +// ---- optimised but non compatible kernels + +void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + +// ---- 80 bytes kernels + +void quark_bmw512_cpu_setBlock_80(void *pdata); +void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order); + +void groestl512_setBlock_80(int thr_id, uint32_t *endiandata); +void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void skein512_cpu_setBlock_80(void *pdata); +void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap); + +void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); +void qubit_luffa512_cpu_setBlock_80(void *pdata); +void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order); + +void jh512_setBlock_80(int thr_id, uint32_t *endiandata); +void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void keccak512_setBlock_80(int thr_id, uint32_t *endiandata); +void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); +void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_shavite512_setBlock_80(void *pdata); +void x16_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order); + + +void x16_shabal512_setBlock_80(void *pdata); +void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_simd512_setBlock_80(void *pdata); +void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_echo512_cuda_init(int thr_id, const uint32_t threads); +void x16_echo512_setBlock_80(void *pdata); +void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_hamsi512_setBlock_80(void *pdata); +void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_fugue512_cpu_init(int thr_id, uint32_t threads); +void x16_fugue512_cpu_free(int thr_id); +void x16_fugue512_setBlock_80(void *pdata); +void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_whirlpool512_init(int thr_id, uint32_t threads); +void x16_whirlpool512_setBlock_80(void* endiandata); +void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t 
startNonce, uint32_t *d_hash); + +void x16_sha512_setBlock_80(void *pdata); +void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void tiger192_setBlock_80(void *pdata); +void tiger192_cpu_hash_80(int thr_id, int threads, uint32_t startNonce, uint32_t *d_hash); diff --git a/x16/cuda_x16_echo512.cu b/x16/cuda_x16_echo512.cu new file mode 100644 index 0000000000..5e6013d2ab --- /dev/null +++ b/x16/cuda_x16_echo512.cu @@ -0,0 +1,214 @@ +/** + * echo512-80 cuda kernel for X16R algorithm + * + * tpruvot 2018 - GPL code + */ + +#include +#include + +#include "cuda_helper.h" + +extern __device__ __device_builtin__ void __threadfence_block(void); + +#include "../x11/cuda_x11_aes.cuh" + +__device__ __forceinline__ void AES_2ROUND(const uint32_t* __restrict__ sharedMemory, + uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, + uint32_t &k0) +{ + uint32_t y0, y1, y2, y3; + + aes_round(sharedMemory, + x0, x1, x2, x3, + k0, + y0, y1, y2, y3); + + aes_round(sharedMemory, + y0, y1, y2, y3, + x0, x1, x2, x3); + + k0++; +} + +__device__ +static void echo_round(uint32_t* const sharedMemory, uint32_t *W, uint32_t &k0) +{ + // Big Sub Words + #pragma unroll 16 + for (int idx = 0; idx < 16; idx++) { + AES_2ROUND(sharedMemory, W[(idx << 2) + 0], W[(idx << 2) + 1], W[(idx << 2) + 2], W[(idx << 2) + 3], k0); + } + + // Shift Rows + #pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t t[4]; + /// 1, 5, 9, 13 + t[0] = W[i + 4]; + t[1] = W[i + 8]; + t[2] = W[i + 24]; + t[3] = W[i + 60]; + + W[i + 4] = W[i + 20]; + W[i + 8] = W[i + 40]; + W[i + 24] = W[i + 56]; + W[i + 60] = W[i + 44]; + + W[i + 20] = W[i + 36]; + W[i + 40] = t[1]; + W[i + 56] = t[2]; + W[i + 44] = W[i + 28]; + + W[i + 28] = W[i + 12]; + W[i + 12] = t[3]; + W[i + 36] = W[i + 52]; + W[i + 52] = t[0]; + } + + // Mix Columns + #pragma unroll 4 + for (int i = 0; i < 4; i++) + { + #pragma unroll 4 + for (int idx = 0; idx < 64; idx += 16) + { + uint32_t a[4]; + a[0] = W[idx + i]; + a[1] = W[idx + i + 4]; + a[2] = W[idx + i + 8]; + a[3] = W[idx + i + 12]; + + uint32_t ab = a[0] ^ a[1]; + uint32_t bc = a[1] ^ a[2]; + uint32_t cd = a[2] ^ a[3]; + + uint32_t t, t2, t3; + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[idx + i] = bc ^ a[3] ^ abx; + W[idx + i + 4] = a[0] ^ cd ^ bcx; + W[idx + i + 8] = ab ^ a[3] ^ cdx; + W[idx + i + 12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx); + } + } +} + +__device__ __forceinline__ +void cuda_echo_round_80(uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ data, const uint32_t nonce, uint32_t *hash) +{ + uint32_t h[29]; // <= 127 bytes input + + #pragma unroll 8 + for (int i = 0; i < 18; i += 2) + AS_UINT2(&h[i]) = AS_UINT2(&data[i]); + h[18] = data[18]; + h[19] = cuda_swab32(nonce); + h[20] = 0x80; + h[21] = h[22] = h[23] = h[24] = h[25] = h[26] = 0; + //((uint8_t*)h)[80] = 0x80; + //((uint8_t*)h)[128-17] = 0x02; + //((uint8_t*)h)[128-16] = 0x80; + //((uint8_t*)h)[128-15] = 0x02; + h[27] = 0x2000000; + h[28] = 0x280; + //h[29] = h[30] = h[31] = 0; + + uint32_t k0 = 640; // bitlen + uint32_t W[64]; + + #pragma unroll 8 + for (int i = 0; i < 32; i+=4) { + W[i] = 512; // L + W[i+1] = 0; // H + W[i+2] = 0; // X + W[i+3] = 0; + } + + uint32_t Z[16]; + #pragma unroll + for (int i = 0; i<16; i++) Z[i] = W[i]; + #pragma unroll + for (int i = 32; i<61; i++) W[i] = 
h[i - 32]; + #pragma unroll + for (int i = 61; i<64; i++) W[i] = 0; + + for (int i = 0; i < 10; i++) + echo_round(sharedMemory, W, k0); + + #pragma unroll 16 + for (int i = 0; i < 16; i++) { + Z[i] ^= h[i] ^ W[i] ^ W[i + 32]; + } + + #pragma unroll 8 + for (int i = 0; i < 16; i += 2) + AS_UINT2(&hash[i]) = AS_UINT2(&Z[i]); +} + +__device__ __forceinline__ +void echo_gpu_init(uint32_t *const __restrict__ sharedMemory) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 128) { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; + sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; + sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; + + sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; + } +} + +__host__ +void x16_echo512_cuda_init(int thr_id, const uint32_t threads) +{ + aes_cpu_init(thr_id); +} + +__constant__ static uint32_t c_PaddedMessage80[20]; + +__host__ +void x16_echo512_setBlock_80(void *endiandata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +__global__ __launch_bounds__(128, 7) /* will force 72 registers */ +void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash) +{ + __shared__ uint32_t sharedMemory[1024]; + + echo_gpu_init(sharedMemory); + __threadfence_block(); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t hashPosition = thread; + uint32_t *pHash = (uint32_t*)&g_hash[hashPosition<<3]; + + cuda_echo_round_80(sharedMemory, c_PaddedMessage80, startNonce + thread, pHash); + } +} + +__host__ +void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x16_echo512_gpu_hash_80<<<grid, block>>>(threads, startNonce, (uint64_t*)d_hash); +} diff --git a/x16/cuda_x16_echo512_64.cu b/x16/cuda_x16_echo512_64.cu new file mode 100644 index 0000000000..3a0f268725 --- /dev/null +++ b/x16/cuda_x16_echo512_64.cu @@ -0,0 +1,260 @@ +/** + * Echo512-64 kernel for maxwell, based on alexis work + */ + +#include +#include +#include + +#define INTENSIVE_GMF +#include "tribus/cuda_echo512_aes.cuh" + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, b) x +#define atomicExch(p,y) (*p) = y +#endif + +__device__ +static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0) +{ + // Big Sub Words + #pragma unroll 16 + for (int idx = 0; idx < 16; idx++) + AES_2ROUND(sharedMemory,W[(idx<<2) + 0], W[(idx<<2) + 1], W[(idx<<2) + 2], W[(idx<<2) + 3], k0); + + // Shift Rows + #pragma unroll 4 + for (int i = 0; i < 4; i++){ + uint32_t t[4]; + /// 1, 5, 9, 13 + t[0] = W[i+ 4]; + t[1] = W[i+ 8]; + t[2] = W[i+24]; + t[3] = W[i+60]; + W[i + 4] = W[i + 20]; + W[i + 8] = W[i + 40]; + W[i +24] = W[i + 56]; + W[i +60] = W[i + 44]; + + W[i +20] = W[i +36]; + W[i +40] = t[1]; + W[i +56] = t[2]; + W[i +44] = W[i +28]; + + W[i +28] = W[i +12]; + W[i +12] = t[3]; + W[i +36] = W[i +52]; + W[i +52] = t[0]; + } + // Mix Columns + #pragma unroll 4 + for (int i = 0; i < 4; i++){ // loop over 2*uint32_t at a time + #pragma unroll 4 + for (int idx = 0;
idx < 64; idx += 16){ // loop over the elements + uint32_t a[4]; + a[0] = W[idx + i]; + a[1] = W[idx + i + 4]; + a[2] = W[idx + i + 8]; + a[3] = W[idx + i +12]; + + uint32_t ab = a[0] ^ a[1]; + uint32_t bc = a[1] ^ a[2]; + uint32_t cd = a[2] ^ a[3]; + + uint32_t t, t2, t3; + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[idx + i] = bc ^ a[3] ^ abx; + W[idx + i + 4] = a[0] ^ cd ^ bcx; + W[idx + i + 8] = ab ^ a[3] ^ cdx; + W[idx + i +12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx); + } + } +} + +__global__ __launch_bounds__(128, 5) /* will force 80 registers */ +static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t* g_hash, uint32_t* const d_filter, const uint32_t filter_val) +{ + __shared__ uint32_t sharedMemory[4][256]; + + aes_gpu_init128(sharedMemory); + __syncthreads(); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + uint32_t k0; + uint32_t h[16]; + uint32_t hash[16]; + if (thread < threads) + { + // phi2 filter (2 hash chain branches) + if (d_filter && d_filter[thread] != filter_val) return; + + uint32_t *Hash = &g_hash[thread<<4]; + + *(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]); + *(uint2x4*)&h[ 8] = __ldg4((uint2x4*)&Hash[ 8]); + + *(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0]; + *(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8]; + + const uint32_t P[48] = { + 0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //8-12 + 0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //21-25 + 0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + //34-38 + 0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, + 0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968, + 0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af + //58-61 + }; + + k0 = 520; + + #pragma unroll 4 + for (uint32_t idx = 0; idx < 16; idx += 4) { + AES_2ROUND(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0); + } + k0 += 4; + + uint32_t W[64]; + + #pragma unroll 4 + for (uint32_t i = 0; i < 4; i++) + { + uint32_t a = P[i]; + uint32_t b = P[i + 4]; + uint32_t c = h[i + 8]; + uint32_t d = P[i + 8]; + + uint32_t ab = a ^ b; + uint32_t bc = b ^ c; + uint32_t cd = c ^ d; + + + uint32_t t = (ab & 0x80808080); + uint32_t t2 = (bc & 0x80808080); + uint32_t t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[i] = abx ^ bc ^ d; + W[i + 4] = bcx ^ a ^ cd; + W[i + 8] = cdx ^ ab ^ d; + W[i +12] = abx ^ bcx ^ cdx ^ ab ^ c; + + a = P[i +12]; + b = h[i + 4]; + c = P[i +16]; + d = P[i +20]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[16 + i] = bc ^ d ^ abx; + W[16 + i + 4] = a ^ cd ^ bcx; + W[16 + i + 8] = d ^ ab ^ cdx; + W[16 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; + + a = h[i]; + b = P[24 + i + 0]; + c = P[24 + i + 4]; + d = P[24 + i + 8]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); +
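// note: the (t >> 7) * 27U ^ ((x ^ t) << 1) lines used throughout this kernel are a branchless AES xtime on four packed GF(2^8) bytes at once: t/t2/t3 hold the high bit of each byte, each set bit selects a reduction by the AES polynomial 0x11B (low byte 0x1B = 27), and the remaining bits shift left. +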
t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[32 + i] = bc ^ d ^ abx; + W[32 + i + 4] = a ^ cd ^ bcx; + W[32 + i + 8] = d ^ ab ^ cdx; + W[32 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; + + a = P[36 + i ]; + b = P[36 + i + 4]; + c = P[36 + i + 8]; + d = h[i + 12]; + + ab = a ^ b; + bc = b ^ c; + cd = c ^ d; + + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + abx = (t >> 7) * 27U ^ ((ab^t) << 1); + bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[48 + i] = bc ^ d ^ abx; + W[48 + i + 4] = a ^ cd ^ bcx; + W[48 + i + 8] = d ^ ab ^ cdx; + W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx; + } + + for (int k = 1; k < 10; k++) + echo_round_alexis(sharedMemory,W,k0); + + #pragma unroll 4 + for (int i = 0; i < 16; i += 4) + { + W[i] ^= W[32 + i] ^ 512; + W[i + 1] ^= W[32 + i + 1]; + W[i + 2] ^= W[32 + i + 2]; + W[i + 3] ^= W[32 + i + 3]; + } + *(uint2x4*)&Hash[ 0] = *(uint2x4*)&hash[ 0] ^ *(uint2x4*)&W[ 0]; + *(uint2x4*)&Hash[ 8] = *(uint2x4*)&hash[ 8] ^ *(uint2x4*)&W[ 8]; + } +} + +__host__ +void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, d_hash, NULL, 0); +} + +__host__ +void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter) +{ + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, g_hash, d_filter, 0); +} \ No newline at end of file diff --git a/x16/cuda_x16_fugue512.cu b/x16/cuda_x16_fugue512.cu new file mode 100644 index 0000000000..7f3438c2bd --- /dev/null +++ b/x16/cuda_x16_fugue512.cu @@ -0,0 +1,467 @@ + +#include <stdio.h> +#include <memory.h> + +#define TPB 256 + +/* + * fugue512-80 x16r kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2018 tpruvot + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * + * ===========================(LICENSE END)============================= + */ + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, m) (x|y) +#define tex1Dfetch(t, n) (n) +#define __CUDACC__ +#include +#endif + +// store allocated textures device addresses +static unsigned int* d_textures[MAX_GPUS][1]; + +#define mixtab0(x) mixtabs[(x)] +#define mixtab1(x) mixtabs[(x)+256] +#define mixtab2(x) mixtabs[(x)+512] +#define mixtab3(x) mixtabs[(x)+768] + +static texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex; + +static const uint32_t mixtab0[] = { + 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39, + 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3, + 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed, + 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d, + 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d, + 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54, + 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e, + 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf, + 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6, + 0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126, + 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77, + 0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11, + 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622, + 0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596, + 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9, + 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865, + 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c, + 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7, + 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516, + 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741, + 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b, + 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff, + 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292, + 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820, + 0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435, + 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e, + 0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38, + 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e, + 0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166, + 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51, + 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513,
0x686803bb, + 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258 +}; + +#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ + x22 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ +} + +#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ +} + +#define SMIX(x0, x1, x2, x3) { \ + uint32_t tmp; \ + uint32_t r0 = 0; \ + uint32_t r1 = 0; \ + uint32_t r2 = 0; \ + uint32_t r3 = 0; \ + uint32_t c0 = mixtab0(x0 >> 24); \ + tmp = mixtab1((x0 >> 16) & 0xFF); \ + c0 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x0 >> 8) & 0xFF); \ + c0 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x0 & 0xFF); \ + c0 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x1 >> 24); \ + uint32_t c1 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x1 >> 16) & 0xFF); \ + c1 ^= tmp; \ + tmp = mixtab2((x1 >> 8) & 0xFF); \ + c1 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x1 & 0xFF); \ + c1 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x2 >> 24); \ + uint32_t c2 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x2 >> 16) & 0xFF); \ + c2 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x2 >> 8) & 0xFF); \ + c2 ^= tmp; \ + tmp = mixtab3(x2 & 0xFF); \ + c2 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x3 >> 24); \ + uint32_t c3 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x3 >> 16) & 0xFF); \ + c3 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x3 >> 8) & 0xFF); \ + c3 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x3 & 0xFF); \ + c3 ^= tmp; \ + x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \ + | ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \ + x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) \ + | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \ + x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \ + | ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \ + x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) \ + | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); \ +} + +#define SUB_ROR3 { \ + B33 = S33, B34 = S34, B35 = S35; \ + S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \ + S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \ + S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \ + S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \ +} + +#define SUB_ROR8 { \ + B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ + S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \ + S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \ + S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \ + S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \ +} + +#define SUB_ROR9 { \ + B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ + S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \ + S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \ + S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = 
S01; S09 = S00; \ + S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \ +} + +#define SUB_ROR9_3 { \ + SUB_ROR3; SUB_ROR3; SUB_ROR3; \ +} + +#define SUB_ROR12 { /* to fix */ \ + B24 = S00; B25 = S01; B26 = S02; B27 = S03; B28 = S04; B29 = S05; B30 = S06; B31 = S07; B32 = S08; B33 = S09; B34 = S10; B35 = S11; \ + S00 = S12; S01 = S13; S02 = S14; S03 = S15; S04 = S16; S05 = S17; S06 = S18; S07 = S19; S08 = S20; S09 = S21; S10 = S22; S11 = S23; \ + S12 = S24; S13 = S25; S14 = S26; S15 = S27; S16 = S28; S17 = S29; S18 = S30; S19 = S31; S20 = S32; S21 = S33; S22 = S34; S23 = S35; \ + S24 = B24; S25 = B25; S26 = B26; S27 = B27; S28 = B28; S29 = B29; S30 = B30; S31 = B31; S32 = B32; S33 = B33; S34 = B34; S35 = B35; \ +} + +#define FUGUE512_3(x, y, z) { \ + TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ + SMIX(S33, S34, S35, S00); \ + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ + SMIX(S30, S31, S32, S33); \ + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \ + SMIX(S27, S28, S29, S30); \ + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ + SMIX(S24, S25, S26, S27); \ + \ + TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \ + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \ + SMIX(S21, S22, S23, S24); \ + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \ + SMIX(S18, S19, S20, S21); \ + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \ + SMIX(S15, S16, S17, S18); \ + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \ + SMIX(S12, S13, S14, S15); \ + \ + TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \ + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \ + SMIX(S09, S10, S11, S12); \ + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \ + SMIX(S06, S07, S08, S09); \ + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \ + SMIX(S03, S04, S05, S06); \ + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \ + SMIX(S00, S01, S02, S03); \ +} + +#define FUGUE512_F(w, x, y, z) { \ + TIX4(w, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ + SMIX(S33, S34, S35, S00); \ + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ + SMIX(S30, S31, S32, S33); \ + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \ + SMIX(S27, S28, S29, S30); \ + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ + SMIX(S24, S25, S26, S27); \ + \ + TIX4(x, S24, S25, S28, S31, S32, S10, S12, S15, S18); \ + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \ + SMIX(S21, S22, S23, S24); \ + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \ + SMIX(S18, S19, S20, S21); \ + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \ + SMIX(S15, S16, S17, S18); \ + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \ + SMIX(S12, S13, S14, S15); \ + \ + TIX4(y, S12, S13, S16, S19, S20, S34, S00, S03, S06); \ + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \ + SMIX(S09, S10, S11, S12); \ + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \ + SMIX(S06, S07, S08, S09); \ + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \ + SMIX(S03, S04, S05, S06); \ + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \ + SMIX(S00, S01, S02, S03); \ + \ + TIX4(z, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ + SMIX(S33, S34, S35, S00); \ + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ + SMIX(S30, S31, S32, S33); \ + CMIX36(S27, S28, 
S29, S31, S32, S33, S09, S10, S11); \ + SMIX(S27, S28, S29, S30); \ + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ + SMIX(S24, S25, S26, S27); \ +} + +#undef ROL8 +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ +uint32_t ROL8(const uint32_t a) { + return __byte_perm(a, 0, 0x2103); +} +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a) { + return __byte_perm(a, 0, 0x0321); +} +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a) { + return __byte_perm(a, 0, 0x1032); +} +#else +#define ROL8(u) ROTL32(u, 8) +#define ROR8(u) ROTR32(u, 8) +#define ROL16(u) ROTL32(u,16) +#endif + +//#define AS_UINT4(addr) *((uint4*)(addr)) + +__constant__ static uint64_t c_PaddedMessage80[10]; + +__host__ +void x16_fugue512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +/***************************************************/ + +__global__ +__launch_bounds__(TPB) +void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +{ + __shared__ uint32_t mixtabs[1024]; + + // load shared mem (with 256 threads) + const uint32_t thr = threadIdx.x & 0xFF; + const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr); + mixtabs[thr] = tmp; + mixtabs[thr+256] = ROR8(tmp); + mixtabs[thr+512] = ROL16(tmp); + mixtabs[thr+768] = ROL8(tmp); +#if TPB <= 256 + if (blockDim.x < 256) { + const uint32_t thr = (threadIdx.x + 0x80) & 0xFF; + const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr); + mixtabs[thr] = tmp; + mixtabs[thr + 256] = ROR8(tmp); + mixtabs[thr + 512] = ROL16(tmp); + mixtabs[thr + 768] = ROL8(tmp); + } +#endif + + __syncthreads(); + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t Data[20]; + + #pragma unroll + for(int i = 0; i < 10; i++) + AS_UINT2(&Data[i * 2]) = AS_UINT2(&c_PaddedMessage80[i]); + Data[19] = (startNonce + thread); + + uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11; + uint32_t S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23; + uint32_t S24, S25, S26, S27, S28, S29, S30, S31, S32, S33, S34, S35; + //uint32_t B24, B25, B26, + uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35; + //const uint64_t bc = 640 bits to hash + //const uint32_t bclo = (uint32_t)(bc); + //const uint32_t bchi = (uint32_t)(bc >> 32); + + S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0; + S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; + S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027; + S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1; + S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f; + S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567; + + FUGUE512_3((Data[ 0]), (Data[ 1]), (Data[ 2])); + FUGUE512_3((Data[ 3]), (Data[ 4]), (Data[ 5])); + FUGUE512_3((Data[ 6]), (Data[ 7]), (Data[ 8])); + FUGUE512_3((Data[ 9]), (Data[10]), (Data[11])); + FUGUE512_3((Data[12]), (Data[13]), (Data[14])); + FUGUE512_3((Data[15]), (Data[16]), (Data[17])); + FUGUE512_F((Data[18]), (Data[19]), 0/*bchi*/, (80*8)/*bclo*/); + + // rotate right state by 3 dwords (S00 = S33, S03 = S00) + SUB_ROR3; + SUB_ROR9; + + #pragma unroll 32 + for (int i = 0; i < 32; i++) { + SUB_ROR3; + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + } + #pragma unroll 13 + for (int i = 0; i < 13; i++) { + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + 
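// (each of the four sub-rounds in this loop XORs S00 into a slightly shifted set of ring positions, then rotates the 36-word ring and runs SMIX; this matches the closing rounds of the sph_fugue512 reference) +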
S10 ^= S00; + S18 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S28 ^= S00; + SUB_ROR8; + SMIX(S00, S01, S02, S03); + } + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + + Data[ 0] = cuda_swab32(S01); + Data[ 1] = cuda_swab32(S02); + Data[ 2] = cuda_swab32(S03); + Data[ 3] = cuda_swab32(S04); + Data[ 4] = cuda_swab32(S09); + Data[ 5] = cuda_swab32(S10); + Data[ 6] = cuda_swab32(S11); + Data[ 7] = cuda_swab32(S12); + Data[ 8] = cuda_swab32(S18); + Data[ 9] = cuda_swab32(S19); + Data[10] = cuda_swab32(S20); + Data[11] = cuda_swab32(S21); + Data[12] = cuda_swab32(S27); + Data[13] = cuda_swab32(S28); + Data[14] = cuda_swab32(S29); + Data[15] = cuda_swab32(S30); + + const size_t hashPosition = thread; + uint64_t* pHash = &g_hash[hashPosition << 3]; + #pragma unroll 4 + for(int i = 0; i < 4; i++) + AS_UINT4(&pHash[i * 2]) = AS_UINT4(&Data[i * 4]); + } +} + +#define texDef(id, texname, texmem, texsource, texsize) { \ + unsigned int *texmem; \ + cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} + +__host__ +void x16_fugue512_cpu_init(int thr_id, uint32_t threads) +{ + texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256); +} + +__host__ +void x16_fugue512_cpu_free(int thr_id) +{ + cudaFree(d_textures[thr_id][0]); +} + +__host__ +void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = TPB; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x16_fugue512_gpu_hash_80 <<<grid, block>>> (threads, startNonce, (uint64_t*)d_hash); +} diff --git a/x16/cuda_x16_shabal512.cu b/x16/cuda_x16_shabal512.cu new file mode 100644 index 0000000000..ca00b50737 --- /dev/null +++ b/x16/cuda_x16_shabal512.cu @@ -0,0 +1,350 @@ +/* +* Shabal-512 for X16R +* tpruvot 2018, based on alexis x14 and xevan kernel code +*/ + +#include <stdio.h> +#include <stdint.h> +#include <memory.h> + +typedef uint32_t sph_u32; + +#define C32(x) (x) +#define T32(x) (x) + +#define INPUT_BLOCK_ADD do { \ + B0 = T32(B0 + M0); \ + B1 = T32(B1 + M1); \ + B2 = T32(B2 + M2); \ + B3 = T32(B3 + M3); \ + B4 = T32(B4 + M4); \ + B5 = T32(B5 + M5); \ + B6 = T32(B6 + M6); \ + B7 = T32(B7 + M7); \ + B8 = T32(B8 + M8); \ + B9 = T32(B9 + M9); \ + BA = T32(BA + MA); \ + BB = T32(BB + MB); \ + BC = T32(BC + MC); \ + BD = T32(BD + MD); \ + BE = T32(BE + ME); \ + BF = T32(BF + MF); \ + } while (0) + +#define INPUT_BLOCK_SUB do { \ + C0 = T32(C0 - M0); \ + C1 = T32(C1 - M1); \ + C2 = T32(C2 - M2); \ + C3 = T32(C3 - M3); \ + C4 = T32(C4 - M4); \ + C5 = T32(C5 - M5); \ + C6 = T32(C6 - M6); \ + C7 = T32(C7 - M7); \ + C8 = T32(C8 - M8); \ + C9 = T32(C9 - M9); \ + CA = T32(CA - MA); \ + CB = T32(CB - MB); \ + CC = T32(CC - MC); \ + CD = T32(CD - MD); \ + CE = T32(CE - ME); \ + CF = T32(CF - MF); \ + } while (0) + +#define XOR_W do { \ + A00 ^= Wlow; \ + A01 ^= Whigh; \ + } while (0) + +#define SWAP(v1, v2) do { \ + sph_u32 tmp = (v1); \ + (v1) = (v2); \ + (v2) = tmp; \ + } while (0) + +#define SWAP_BC do { \ + SWAP(B0, C0); \ + SWAP(B1, C1); \ + SWAP(B2,
C2); \ + SWAP(B3, C3); \ + SWAP(B4, C4); \ + SWAP(B5, C5); \ + SWAP(B6, C6); \ + SWAP(B7, C7); \ + SWAP(B8, C8); \ + SWAP(B9, C9); \ + SWAP(BA, CA); \ + SWAP(BB, CB); \ + SWAP(BC, CC); \ + SWAP(BD, CD); \ + SWAP(BE, CE); \ + SWAP(BF, CF); \ + } while (0) + +#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ + xa0 = T32((xa0 \ + ^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \ + ^ xc) * 3U) \ + ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ + xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \ + } while (0) + +#define PERM_STEP_0 do { \ + PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_1 do { \ + PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_2 do { \ + PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define APPLY_P do { \ + B0 = T32(B0 << 17) | (B0 >> 15); \ + B1 = T32(B1 << 17) | (B1 >> 15); \ + B2 = T32(B2 << 17) | (B2 >> 15); \ + B3 = T32(B3 << 17) | (B3 >> 15); \ + B4 = T32(B4 << 17) | (B4 >> 15); \ + B5 = T32(B5 << 17) | (B5 >> 15); \ + B6 = T32(B6 << 17) | (B6 >> 15); \ + B7 = T32(B7 << 17) | (B7 >> 15); \ + B8 = T32(B8 << 17) | (B8 >> 15); \ + B9 = T32(B9 << 17) | (B9 >> 15); \ + BA = T32(BA << 17) | (BA >> 15); \ + BB = T32(BB << 17) | (BB >> 15); \ + BC = T32(BC << 17) | (BC >> 15); \ + BD = T32(BD << 17) | (BD >> 15); \ + BE = T32(BE << 17) | (BE >> 15); \ + BF = T32(BF << 17) | (BF >> 15); \ + 
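/* the B lanes above are rotated left by 17; the three PERM_STEP sweeps below perform 48 PERM_ELT updates cycling through A00..A0B, then A absorbs 36 C words, as in the sph_shabal reference */ \ +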
PERM_STEP_0; \ + PERM_STEP_1; \ + PERM_STEP_2; \ + A0B = T32(A0B + C6); \ + A0A = T32(A0A + C5); \ + A09 = T32(A09 + C4); \ + A08 = T32(A08 + C3); \ + A07 = T32(A07 + C2); \ + A06 = T32(A06 + C1); \ + A05 = T32(A05 + C0); \ + A04 = T32(A04 + CF); \ + A03 = T32(A03 + CE); \ + A02 = T32(A02 + CD); \ + A01 = T32(A01 + CC); \ + A00 = T32(A00 + CB); \ + A0B = T32(A0B + CA); \ + A0A = T32(A0A + C9); \ + A09 = T32(A09 + C8); \ + A08 = T32(A08 + C7); \ + A07 = T32(A07 + C6); \ + A06 = T32(A06 + C5); \ + A05 = T32(A05 + C4); \ + A04 = T32(A04 + C3); \ + A03 = T32(A03 + C2); \ + A02 = T32(A02 + C1); \ + A01 = T32(A01 + C0); \ + A00 = T32(A00 + CF); \ + A0B = T32(A0B + CE); \ + A0A = T32(A0A + CD); \ + A09 = T32(A09 + CC); \ + A08 = T32(A08 + CB); \ + A07 = T32(A07 + CA); \ + A06 = T32(A06 + C9); \ + A05 = T32(A05 + C8); \ + A04 = T32(A04 + C7); \ + A03 = T32(A03 + C6); \ + A02 = T32(A02 + C5); \ + A01 = T32(A01 + C4); \ + A00 = T32(A00 + C3); \ + } while (0) + +#define INCR_W do { \ + if ((Wlow = T32(Wlow + 1)) == 0) \ + Whigh = T32(Whigh + 1); \ + } while (0) + +__constant__ static const sph_u32 A_init_512[] = { + C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), + C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), + C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F) +}; + +__constant__ static const sph_u32 B_init_512[] = { + C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640), + C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08), + C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E), + C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B) +}; + +__constant__ static const sph_u32 C_init_512[] = { + C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359), + C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780), + C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), + C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) +}; + +__constant__ static uint32_t c_PaddedMessage80[20]; + +__host__ +void x16_shabal512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +#define TPB_SHABAL 256 + +__global__ __launch_bounds__(TPB_SHABAL, 2) +void x16_shabal512_gpu_hash_80(uint32_t threads, const uint32_t startNonce, uint32_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + uint32_t B[] = { + 0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08, + 0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B + }; + uint32_t M[16]; + + if (thread < threads) + { + // todo: try __ldc + *(uint2x4*)&M[0] = *(uint2x4*)&c_PaddedMessage80[0]; + *(uint2x4*)&M[8] = *(uint2x4*)&c_PaddedMessage80[8]; + + sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[ 2], A03 = A_init_512[ 3]; + sph_u32 A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[ 6], A07 = A_init_512[ 7]; + sph_u32 A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11]; + + sph_u32 B0 = B_init_512[ 0], B1 = B_init_512[ 1], B2 = B_init_512[ 2], B3 = B_init_512 [3]; + sph_u32 B4 = B_init_512[ 4], B5 = B_init_512[ 5], B6 = B_init_512[ 6], B7 = B_init_512[ 7]; + sph_u32 B8 = B_init_512[ 8], B9 = B_init_512[ 9], BA = B_init_512[10], BB = B_init_512[11]; + sph_u32 BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15]; + + sph_u32 C0 = 
C_init_512[ 0], C1 = C_init_512[ 1], C2 = C_init_512[ 2], C3 = C_init_512[ 3]; + sph_u32 C4 = C_init_512[ 4], C5 = C_init_512[ 5], C6 = C_init_512[ 6], C7 = C_init_512[ 7]; + sph_u32 C8 = C_init_512[ 8], C9 = C_init_512[ 9], CA = C_init_512[10], CB = C_init_512[11]; + sph_u32 CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15]; + + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; + sph_u32 Wlow = 1, Whigh = 0; + + M0 = M[ 0]; + M1 = M[ 1]; + M2 = M[ 2]; + M3 = M[ 3]; + M4 = M[ 4]; + M5 = M[ 5]; + M6 = M[ 6]; + M7 = M[ 7]; + M8 = M[ 8]; + M9 = M[ 9]; + MA = M[10]; + MB = M[11]; + MC = M[12]; + MD = M[13]; + ME = M[14]; + MF = M[15]; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + INPUT_BLOCK_SUB; + SWAP_BC; + INCR_W; + + M0 = c_PaddedMessage80[16]; + M1 = c_PaddedMessage80[17]; + M2 = c_PaddedMessage80[18]; + M3 = cuda_swab32(startNonce + thread); + M4 = 0x80; + M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + + for (unsigned i = 0; i < 3; i++) { + SWAP_BC; + XOR_W; + APPLY_P; + } + + B[ 0] = B0; + B[ 1] = B1; + B[ 2] = B2; + B[ 3] = B3; + B[ 4] = B4; + B[ 5] = B5; + B[ 6] = B6; + B[ 7] = B7; + B[ 8] = B8; + B[ 9] = B9; + B[10] = BA; + B[11] = BB; + B[12] = BC; + B[13] = BD; + B[14] = BE; + B[15] = BF; + + // output + uint64_t hashPosition = thread; + uint32_t *Hash = &g_hash[hashPosition << 4]; + *(uint2x4*)&Hash[0] = *(uint2x4*)&B[0]; + *(uint2x4*)&Hash[8] = *(uint2x4*)&B[8]; + } +} + +__host__ +void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = TPB_SHABAL; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + x16_shabal512_gpu_hash_80 <<<grid, block>>>(threads, startNonce, d_hash); +} diff --git a/x16/cuda_x16_shavite512.cu b/x16/cuda_x16_shavite512.cu new file mode 100644 index 0000000000..765b1392cc --- /dev/null +++ b/x16/cuda_x16_shavite512.cu @@ -0,0 +1,3047 @@ +#include <memory.h> // memcpy() + +#include "cuda_helper_alexis.h" + + + +extern __device__ __device_builtin__ void __threadfence_block(void); + +#define AESx(x) (x ##UL) /* SPH_C32(x) */ + +#define TPB 128 +__device__ __align__(64) uint32_t d_AES0[256] = { + AESx(0xA56363C6), AESx(0x847C7CF8), AESx(0x997777EE), AESx(0x8D7B7BF6), + AESx(0x0DF2F2FF), AESx(0xBD6B6BD6), AESx(0xB16F6FDE), AESx(0x54C5C591), + AESx(0x50303060), AESx(0x03010102), AESx(0xA96767CE), AESx(0x7D2B2B56), + AESx(0x19FEFEE7), AESx(0x62D7D7B5), AESx(0xE6ABAB4D), AESx(0x9A7676EC), + AESx(0x45CACA8F), AESx(0x9D82821F), AESx(0x40C9C989), AESx(0x877D7DFA), + AESx(0x15FAFAEF), AESx(0xEB5959B2), AESx(0xC947478E), AESx(0x0BF0F0FB), + AESx(0xECADAD41), AESx(0x67D4D4B3), AESx(0xFDA2A25F), AESx(0xEAAFAF45), + AESx(0xBF9C9C23), AESx(0xF7A4A453), AESx(0x967272E4), AESx(0x5BC0C09B), + AESx(0xC2B7B775), AESx(0x1CFDFDE1), AESx(0xAE93933D), AESx(0x6A26264C), + AESx(0x5A36366C), AESx(0x413F3F7E), AESx(0x02F7F7F5), AESx(0x4FCCCC83), + AESx(0x5C343468), AESx(0xF4A5A551), AESx(0x34E5E5D1), AESx(0x08F1F1F9), + AESx(0x937171E2), AESx(0x73D8D8AB), AESx(0x53313162), AESx(0x3F15152A), + AESx(0x0C040408), AESx(0x52C7C795), AESx(0x65232346), AESx(0x5EC3C39D), + AESx(0x28181830), AESx(0xA1969637), AESx(0x0F05050A), AESx(0xB59A9A2F), + AESx(0x0907070E), AESx(0x36121224), AESx(0x9B80801B), AESx(0x3DE2E2DF), + AESx(0x26EBEBCD), AESx(0x6927274E), AESx(0xCDB2B27F), AESx(0x9F7575EA), + AESx(0x1B090912), AESx(0x9E83831D), AESx(0x742C2C58), AESx(0x2E1A1A34), +
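// d_AES0..d_AES3 are the usual AES T-tables: each entry packs the MixColumns image of one S-box output, and d_AES1..d_AES3 repeat d_AES0 with the bytes rotated by 8, 16 and 24 bits so that each byte lane of the state has its own lookup table. +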
AESx(0x2D1B1B36), AESx(0xB26E6EDC), AESx(0xEE5A5AB4), AESx(0xFBA0A05B), + AESx(0xF65252A4), AESx(0x4D3B3B76), AESx(0x61D6D6B7), AESx(0xCEB3B37D), + AESx(0x7B292952), AESx(0x3EE3E3DD), AESx(0x712F2F5E), AESx(0x97848413), + AESx(0xF55353A6), AESx(0x68D1D1B9), AESx(0x00000000), AESx(0x2CEDEDC1), + AESx(0x60202040), AESx(0x1FFCFCE3), AESx(0xC8B1B179), AESx(0xED5B5BB6), + AESx(0xBE6A6AD4), AESx(0x46CBCB8D), AESx(0xD9BEBE67), AESx(0x4B393972), + AESx(0xDE4A4A94), AESx(0xD44C4C98), AESx(0xE85858B0), AESx(0x4ACFCF85), + AESx(0x6BD0D0BB), AESx(0x2AEFEFC5), AESx(0xE5AAAA4F), AESx(0x16FBFBED), + AESx(0xC5434386), AESx(0xD74D4D9A), AESx(0x55333366), AESx(0x94858511), + AESx(0xCF45458A), AESx(0x10F9F9E9), AESx(0x06020204), AESx(0x817F7FFE), + AESx(0xF05050A0), AESx(0x443C3C78), AESx(0xBA9F9F25), AESx(0xE3A8A84B), + AESx(0xF35151A2), AESx(0xFEA3A35D), AESx(0xC0404080), AESx(0x8A8F8F05), + AESx(0xAD92923F), AESx(0xBC9D9D21), AESx(0x48383870), AESx(0x04F5F5F1), + AESx(0xDFBCBC63), AESx(0xC1B6B677), AESx(0x75DADAAF), AESx(0x63212142), + AESx(0x30101020), AESx(0x1AFFFFE5), AESx(0x0EF3F3FD), AESx(0x6DD2D2BF), + AESx(0x4CCDCD81), AESx(0x140C0C18), AESx(0x35131326), AESx(0x2FECECC3), + AESx(0xE15F5FBE), AESx(0xA2979735), AESx(0xCC444488), AESx(0x3917172E), + AESx(0x57C4C493), AESx(0xF2A7A755), AESx(0x827E7EFC), AESx(0x473D3D7A), + AESx(0xAC6464C8), AESx(0xE75D5DBA), AESx(0x2B191932), AESx(0x957373E6), + AESx(0xA06060C0), AESx(0x98818119), AESx(0xD14F4F9E), AESx(0x7FDCDCA3), + AESx(0x66222244), AESx(0x7E2A2A54), AESx(0xAB90903B), AESx(0x8388880B), + AESx(0xCA46468C), AESx(0x29EEEEC7), AESx(0xD3B8B86B), AESx(0x3C141428), + AESx(0x79DEDEA7), AESx(0xE25E5EBC), AESx(0x1D0B0B16), AESx(0x76DBDBAD), + AESx(0x3BE0E0DB), AESx(0x56323264), AESx(0x4E3A3A74), AESx(0x1E0A0A14), + AESx(0xDB494992), AESx(0x0A06060C), AESx(0x6C242448), AESx(0xE45C5CB8), + AESx(0x5DC2C29F), AESx(0x6ED3D3BD), AESx(0xEFACAC43), AESx(0xA66262C4), + AESx(0xA8919139), AESx(0xA4959531), AESx(0x37E4E4D3), AESx(0x8B7979F2), + AESx(0x32E7E7D5), AESx(0x43C8C88B), AESx(0x5937376E), AESx(0xB76D6DDA), + AESx(0x8C8D8D01), AESx(0x64D5D5B1), AESx(0xD24E4E9C), AESx(0xE0A9A949), + AESx(0xB46C6CD8), AESx(0xFA5656AC), AESx(0x07F4F4F3), AESx(0x25EAEACF), + AESx(0xAF6565CA), AESx(0x8E7A7AF4), AESx(0xE9AEAE47), AESx(0x18080810), + AESx(0xD5BABA6F), AESx(0x887878F0), AESx(0x6F25254A), AESx(0x722E2E5C), + AESx(0x241C1C38), AESx(0xF1A6A657), AESx(0xC7B4B473), AESx(0x51C6C697), + AESx(0x23E8E8CB), AESx(0x7CDDDDA1), AESx(0x9C7474E8), AESx(0x211F1F3E), + AESx(0xDD4B4B96), AESx(0xDCBDBD61), AESx(0x868B8B0D), AESx(0x858A8A0F), + AESx(0x907070E0), AESx(0x423E3E7C), AESx(0xC4B5B571), AESx(0xAA6666CC), + AESx(0xD8484890), AESx(0x05030306), AESx(0x01F6F6F7), AESx(0x120E0E1C), + AESx(0xA36161C2), AESx(0x5F35356A), AESx(0xF95757AE), AESx(0xD0B9B969), + AESx(0x91868617), AESx(0x58C1C199), AESx(0x271D1D3A), AESx(0xB99E9E27), + AESx(0x38E1E1D9), AESx(0x13F8F8EB), AESx(0xB398982B), AESx(0x33111122), + AESx(0xBB6969D2), AESx(0x70D9D9A9), AESx(0x898E8E07), AESx(0xA7949433), + AESx(0xB69B9B2D), AESx(0x221E1E3C), AESx(0x92878715), AESx(0x20E9E9C9), + AESx(0x49CECE87), AESx(0xFF5555AA), AESx(0x78282850), AESx(0x7ADFDFA5), + AESx(0x8F8C8C03), AESx(0xF8A1A159), AESx(0x80898909), AESx(0x170D0D1A), + AESx(0xDABFBF65), AESx(0x31E6E6D7), AESx(0xC6424284), AESx(0xB86868D0), + AESx(0xC3414182), AESx(0xB0999929), AESx(0x772D2D5A), AESx(0x110F0F1E), + AESx(0xCBB0B07B), AESx(0xFC5454A8), AESx(0xD6BBBB6D), AESx(0x3A16162C) +}; + +__constant__ __align__(64) uint32_t d_AES1[256] = { + AESx(0x6363C6A5), 
AESx(0x7C7CF884), AESx(0x7777EE99), AESx(0x7B7BF68D), + AESx(0xF2F2FF0D), AESx(0x6B6BD6BD), AESx(0x6F6FDEB1), AESx(0xC5C59154), + AESx(0x30306050), AESx(0x01010203), AESx(0x6767CEA9), AESx(0x2B2B567D), + AESx(0xFEFEE719), AESx(0xD7D7B562), AESx(0xABAB4DE6), AESx(0x7676EC9A), + AESx(0xCACA8F45), AESx(0x82821F9D), AESx(0xC9C98940), AESx(0x7D7DFA87), + AESx(0xFAFAEF15), AESx(0x5959B2EB), AESx(0x47478EC9), AESx(0xF0F0FB0B), + AESx(0xADAD41EC), AESx(0xD4D4B367), AESx(0xA2A25FFD), AESx(0xAFAF45EA), + AESx(0x9C9C23BF), AESx(0xA4A453F7), AESx(0x7272E496), AESx(0xC0C09B5B), + AESx(0xB7B775C2), AESx(0xFDFDE11C), AESx(0x93933DAE), AESx(0x26264C6A), + AESx(0x36366C5A), AESx(0x3F3F7E41), AESx(0xF7F7F502), AESx(0xCCCC834F), + AESx(0x3434685C), AESx(0xA5A551F4), AESx(0xE5E5D134), AESx(0xF1F1F908), + AESx(0x7171E293), AESx(0xD8D8AB73), AESx(0x31316253), AESx(0x15152A3F), + AESx(0x0404080C), AESx(0xC7C79552), AESx(0x23234665), AESx(0xC3C39D5E), + AESx(0x18183028), AESx(0x969637A1), AESx(0x05050A0F), AESx(0x9A9A2FB5), + AESx(0x07070E09), AESx(0x12122436), AESx(0x80801B9B), AESx(0xE2E2DF3D), + AESx(0xEBEBCD26), AESx(0x27274E69), AESx(0xB2B27FCD), AESx(0x7575EA9F), + AESx(0x0909121B), AESx(0x83831D9E), AESx(0x2C2C5874), AESx(0x1A1A342E), + AESx(0x1B1B362D), AESx(0x6E6EDCB2), AESx(0x5A5AB4EE), AESx(0xA0A05BFB), + AESx(0x5252A4F6), AESx(0x3B3B764D), AESx(0xD6D6B761), AESx(0xB3B37DCE), + AESx(0x2929527B), AESx(0xE3E3DD3E), AESx(0x2F2F5E71), AESx(0x84841397), + AESx(0x5353A6F5), AESx(0xD1D1B968), AESx(0x00000000), AESx(0xEDEDC12C), + AESx(0x20204060), AESx(0xFCFCE31F), AESx(0xB1B179C8), AESx(0x5B5BB6ED), + AESx(0x6A6AD4BE), AESx(0xCBCB8D46), AESx(0xBEBE67D9), AESx(0x3939724B), + AESx(0x4A4A94DE), AESx(0x4C4C98D4), AESx(0x5858B0E8), AESx(0xCFCF854A), + AESx(0xD0D0BB6B), AESx(0xEFEFC52A), AESx(0xAAAA4FE5), AESx(0xFBFBED16), + AESx(0x434386C5), AESx(0x4D4D9AD7), AESx(0x33336655), AESx(0x85851194), + AESx(0x45458ACF), AESx(0xF9F9E910), AESx(0x02020406), AESx(0x7F7FFE81), + AESx(0x5050A0F0), AESx(0x3C3C7844), AESx(0x9F9F25BA), AESx(0xA8A84BE3), + AESx(0x5151A2F3), AESx(0xA3A35DFE), AESx(0x404080C0), AESx(0x8F8F058A), + AESx(0x92923FAD), AESx(0x9D9D21BC), AESx(0x38387048), AESx(0xF5F5F104), + AESx(0xBCBC63DF), AESx(0xB6B677C1), AESx(0xDADAAF75), AESx(0x21214263), + AESx(0x10102030), AESx(0xFFFFE51A), AESx(0xF3F3FD0E), AESx(0xD2D2BF6D), + AESx(0xCDCD814C), AESx(0x0C0C1814), AESx(0x13132635), AESx(0xECECC32F), + AESx(0x5F5FBEE1), AESx(0x979735A2), AESx(0x444488CC), AESx(0x17172E39), + AESx(0xC4C49357), AESx(0xA7A755F2), AESx(0x7E7EFC82), AESx(0x3D3D7A47), + AESx(0x6464C8AC), AESx(0x5D5DBAE7), AESx(0x1919322B), AESx(0x7373E695), + AESx(0x6060C0A0), AESx(0x81811998), AESx(0x4F4F9ED1), AESx(0xDCDCA37F), + AESx(0x22224466), AESx(0x2A2A547E), AESx(0x90903BAB), AESx(0x88880B83), + AESx(0x46468CCA), AESx(0xEEEEC729), AESx(0xB8B86BD3), AESx(0x1414283C), + AESx(0xDEDEA779), AESx(0x5E5EBCE2), AESx(0x0B0B161D), AESx(0xDBDBAD76), + AESx(0xE0E0DB3B), AESx(0x32326456), AESx(0x3A3A744E), AESx(0x0A0A141E), + AESx(0x494992DB), AESx(0x06060C0A), AESx(0x2424486C), AESx(0x5C5CB8E4), + AESx(0xC2C29F5D), AESx(0xD3D3BD6E), AESx(0xACAC43EF), AESx(0x6262C4A6), + AESx(0x919139A8), AESx(0x959531A4), AESx(0xE4E4D337), AESx(0x7979F28B), + AESx(0xE7E7D532), AESx(0xC8C88B43), AESx(0x37376E59), AESx(0x6D6DDAB7), + AESx(0x8D8D018C), AESx(0xD5D5B164), AESx(0x4E4E9CD2), AESx(0xA9A949E0), + AESx(0x6C6CD8B4), AESx(0x5656ACFA), AESx(0xF4F4F307), AESx(0xEAEACF25), + AESx(0x6565CAAF), AESx(0x7A7AF48E), AESx(0xAEAE47E9), AESx(0x08081018), + AESx(0xBABA6FD5), 
AESx(0x7878F088), AESx(0x25254A6F), AESx(0x2E2E5C72), + AESx(0x1C1C3824), AESx(0xA6A657F1), AESx(0xB4B473C7), AESx(0xC6C69751), + AESx(0xE8E8CB23), AESx(0xDDDDA17C), AESx(0x7474E89C), AESx(0x1F1F3E21), + AESx(0x4B4B96DD), AESx(0xBDBD61DC), AESx(0x8B8B0D86), AESx(0x8A8A0F85), + AESx(0x7070E090), AESx(0x3E3E7C42), AESx(0xB5B571C4), AESx(0x6666CCAA), + AESx(0x484890D8), AESx(0x03030605), AESx(0xF6F6F701), AESx(0x0E0E1C12), + AESx(0x6161C2A3), AESx(0x35356A5F), AESx(0x5757AEF9), AESx(0xB9B969D0), + AESx(0x86861791), AESx(0xC1C19958), AESx(0x1D1D3A27), AESx(0x9E9E27B9), + AESx(0xE1E1D938), AESx(0xF8F8EB13), AESx(0x98982BB3), AESx(0x11112233), + AESx(0x6969D2BB), AESx(0xD9D9A970), AESx(0x8E8E0789), AESx(0x949433A7), + AESx(0x9B9B2DB6), AESx(0x1E1E3C22), AESx(0x87871592), AESx(0xE9E9C920), + AESx(0xCECE8749), AESx(0x5555AAFF), AESx(0x28285078), AESx(0xDFDFA57A), + AESx(0x8C8C038F), AESx(0xA1A159F8), AESx(0x89890980), AESx(0x0D0D1A17), + AESx(0xBFBF65DA), AESx(0xE6E6D731), AESx(0x424284C6), AESx(0x6868D0B8), + AESx(0x414182C3), AESx(0x999929B0), AESx(0x2D2D5A77), AESx(0x0F0F1E11), + AESx(0xB0B07BCB), AESx(0x5454A8FC), AESx(0xBBBB6DD6), AESx(0x16162C3A) +}; + +__constant__ __align__(64) uint32_t d_AES2[256] = { + AESx(0x63C6A563), AESx(0x7CF8847C), AESx(0x77EE9977), AESx(0x7BF68D7B), + AESx(0xF2FF0DF2), AESx(0x6BD6BD6B), AESx(0x6FDEB16F), AESx(0xC59154C5), + AESx(0x30605030), AESx(0x01020301), AESx(0x67CEA967), AESx(0x2B567D2B), + AESx(0xFEE719FE), AESx(0xD7B562D7), AESx(0xAB4DE6AB), AESx(0x76EC9A76), + AESx(0xCA8F45CA), AESx(0x821F9D82), AESx(0xC98940C9), AESx(0x7DFA877D), + AESx(0xFAEF15FA), AESx(0x59B2EB59), AESx(0x478EC947), AESx(0xF0FB0BF0), + AESx(0xAD41ECAD), AESx(0xD4B367D4), AESx(0xA25FFDA2), AESx(0xAF45EAAF), + AESx(0x9C23BF9C), AESx(0xA453F7A4), AESx(0x72E49672), AESx(0xC09B5BC0), + AESx(0xB775C2B7), AESx(0xFDE11CFD), AESx(0x933DAE93), AESx(0x264C6A26), + AESx(0x366C5A36), AESx(0x3F7E413F), AESx(0xF7F502F7), AESx(0xCC834FCC), + AESx(0x34685C34), AESx(0xA551F4A5), AESx(0xE5D134E5), AESx(0xF1F908F1), + AESx(0x71E29371), AESx(0xD8AB73D8), AESx(0x31625331), AESx(0x152A3F15), + AESx(0x04080C04), AESx(0xC79552C7), AESx(0x23466523), AESx(0xC39D5EC3), + AESx(0x18302818), AESx(0x9637A196), AESx(0x050A0F05), AESx(0x9A2FB59A), + AESx(0x070E0907), AESx(0x12243612), AESx(0x801B9B80), AESx(0xE2DF3DE2), + AESx(0xEBCD26EB), AESx(0x274E6927), AESx(0xB27FCDB2), AESx(0x75EA9F75), + AESx(0x09121B09), AESx(0x831D9E83), AESx(0x2C58742C), AESx(0x1A342E1A), + AESx(0x1B362D1B), AESx(0x6EDCB26E), AESx(0x5AB4EE5A), AESx(0xA05BFBA0), + AESx(0x52A4F652), AESx(0x3B764D3B), AESx(0xD6B761D6), AESx(0xB37DCEB3), + AESx(0x29527B29), AESx(0xE3DD3EE3), AESx(0x2F5E712F), AESx(0x84139784), + AESx(0x53A6F553), AESx(0xD1B968D1), AESx(0x00000000), AESx(0xEDC12CED), + AESx(0x20406020), AESx(0xFCE31FFC), AESx(0xB179C8B1), AESx(0x5BB6ED5B), + AESx(0x6AD4BE6A), AESx(0xCB8D46CB), AESx(0xBE67D9BE), AESx(0x39724B39), + AESx(0x4A94DE4A), AESx(0x4C98D44C), AESx(0x58B0E858), AESx(0xCF854ACF), + AESx(0xD0BB6BD0), AESx(0xEFC52AEF), AESx(0xAA4FE5AA), AESx(0xFBED16FB), + AESx(0x4386C543), AESx(0x4D9AD74D), AESx(0x33665533), AESx(0x85119485), + AESx(0x458ACF45), AESx(0xF9E910F9), AESx(0x02040602), AESx(0x7FFE817F), + AESx(0x50A0F050), AESx(0x3C78443C), AESx(0x9F25BA9F), AESx(0xA84BE3A8), + AESx(0x51A2F351), AESx(0xA35DFEA3), AESx(0x4080C040), AESx(0x8F058A8F), + AESx(0x923FAD92), AESx(0x9D21BC9D), AESx(0x38704838), AESx(0xF5F104F5), + AESx(0xBC63DFBC), AESx(0xB677C1B6), AESx(0xDAAF75DA), AESx(0x21426321), + AESx(0x10203010), AESx(0xFFE51AFF), 
AESx(0xF3FD0EF3), AESx(0xD2BF6DD2), + AESx(0xCD814CCD), AESx(0x0C18140C), AESx(0x13263513), AESx(0xECC32FEC), + AESx(0x5FBEE15F), AESx(0x9735A297), AESx(0x4488CC44), AESx(0x172E3917), + AESx(0xC49357C4), AESx(0xA755F2A7), AESx(0x7EFC827E), AESx(0x3D7A473D), + AESx(0x64C8AC64), AESx(0x5DBAE75D), AESx(0x19322B19), AESx(0x73E69573), + AESx(0x60C0A060), AESx(0x81199881), AESx(0x4F9ED14F), AESx(0xDCA37FDC), + AESx(0x22446622), AESx(0x2A547E2A), AESx(0x903BAB90), AESx(0x880B8388), + AESx(0x468CCA46), AESx(0xEEC729EE), AESx(0xB86BD3B8), AESx(0x14283C14), + AESx(0xDEA779DE), AESx(0x5EBCE25E), AESx(0x0B161D0B), AESx(0xDBAD76DB), + AESx(0xE0DB3BE0), AESx(0x32645632), AESx(0x3A744E3A), AESx(0x0A141E0A), + AESx(0x4992DB49), AESx(0x060C0A06), AESx(0x24486C24), AESx(0x5CB8E45C), + AESx(0xC29F5DC2), AESx(0xD3BD6ED3), AESx(0xAC43EFAC), AESx(0x62C4A662), + AESx(0x9139A891), AESx(0x9531A495), AESx(0xE4D337E4), AESx(0x79F28B79), + AESx(0xE7D532E7), AESx(0xC88B43C8), AESx(0x376E5937), AESx(0x6DDAB76D), + AESx(0x8D018C8D), AESx(0xD5B164D5), AESx(0x4E9CD24E), AESx(0xA949E0A9), + AESx(0x6CD8B46C), AESx(0x56ACFA56), AESx(0xF4F307F4), AESx(0xEACF25EA), + AESx(0x65CAAF65), AESx(0x7AF48E7A), AESx(0xAE47E9AE), AESx(0x08101808), + AESx(0xBA6FD5BA), AESx(0x78F08878), AESx(0x254A6F25), AESx(0x2E5C722E), + AESx(0x1C38241C), AESx(0xA657F1A6), AESx(0xB473C7B4), AESx(0xC69751C6), + AESx(0xE8CB23E8), AESx(0xDDA17CDD), AESx(0x74E89C74), AESx(0x1F3E211F), + AESx(0x4B96DD4B), AESx(0xBD61DCBD), AESx(0x8B0D868B), AESx(0x8A0F858A), + AESx(0x70E09070), AESx(0x3E7C423E), AESx(0xB571C4B5), AESx(0x66CCAA66), + AESx(0x4890D848), AESx(0x03060503), AESx(0xF6F701F6), AESx(0x0E1C120E), + AESx(0x61C2A361), AESx(0x356A5F35), AESx(0x57AEF957), AESx(0xB969D0B9), + AESx(0x86179186), AESx(0xC19958C1), AESx(0x1D3A271D), AESx(0x9E27B99E), + AESx(0xE1D938E1), AESx(0xF8EB13F8), AESx(0x982BB398), AESx(0x11223311), + AESx(0x69D2BB69), AESx(0xD9A970D9), AESx(0x8E07898E), AESx(0x9433A794), + AESx(0x9B2DB69B), AESx(0x1E3C221E), AESx(0x87159287), AESx(0xE9C920E9), + AESx(0xCE8749CE), AESx(0x55AAFF55), AESx(0x28507828), AESx(0xDFA57ADF), + AESx(0x8C038F8C), AESx(0xA159F8A1), AESx(0x89098089), AESx(0x0D1A170D), + AESx(0xBF65DABF), AESx(0xE6D731E6), AESx(0x4284C642), AESx(0x68D0B868), + AESx(0x4182C341), AESx(0x9929B099), AESx(0x2D5A772D), AESx(0x0F1E110F), + AESx(0xB07BCBB0), AESx(0x54A8FC54), AESx(0xBB6DD6BB), AESx(0x162C3A16) +}; + +__device__ __align__(64) uint32_t d_AES3[256] = { + AESx(0xC6A56363), AESx(0xF8847C7C), AESx(0xEE997777), AESx(0xF68D7B7B), + AESx(0xFF0DF2F2), AESx(0xD6BD6B6B), AESx(0xDEB16F6F), AESx(0x9154C5C5), + AESx(0x60503030), AESx(0x02030101), AESx(0xCEA96767), AESx(0x567D2B2B), + AESx(0xE719FEFE), AESx(0xB562D7D7), AESx(0x4DE6ABAB), AESx(0xEC9A7676), + AESx(0x8F45CACA), AESx(0x1F9D8282), AESx(0x8940C9C9), AESx(0xFA877D7D), + AESx(0xEF15FAFA), AESx(0xB2EB5959), AESx(0x8EC94747), AESx(0xFB0BF0F0), + AESx(0x41ECADAD), AESx(0xB367D4D4), AESx(0x5FFDA2A2), AESx(0x45EAAFAF), + AESx(0x23BF9C9C), AESx(0x53F7A4A4), AESx(0xE4967272), AESx(0x9B5BC0C0), + AESx(0x75C2B7B7), AESx(0xE11CFDFD), AESx(0x3DAE9393), AESx(0x4C6A2626), + AESx(0x6C5A3636), AESx(0x7E413F3F), AESx(0xF502F7F7), AESx(0x834FCCCC), + AESx(0x685C3434), AESx(0x51F4A5A5), AESx(0xD134E5E5), AESx(0xF908F1F1), + AESx(0xE2937171), AESx(0xAB73D8D8), AESx(0x62533131), AESx(0x2A3F1515), + AESx(0x080C0404), AESx(0x9552C7C7), AESx(0x46652323), AESx(0x9D5EC3C3), + AESx(0x30281818), AESx(0x37A19696), AESx(0x0A0F0505), AESx(0x2FB59A9A), + AESx(0x0E090707), AESx(0x24361212), AESx(0x1B9B8080), 
AESx(0xDF3DE2E2), + AESx(0xCD26EBEB), AESx(0x4E692727), AESx(0x7FCDB2B2), AESx(0xEA9F7575), + AESx(0x121B0909), AESx(0x1D9E8383), AESx(0x58742C2C), AESx(0x342E1A1A), + AESx(0x362D1B1B), AESx(0xDCB26E6E), AESx(0xB4EE5A5A), AESx(0x5BFBA0A0), + AESx(0xA4F65252), AESx(0x764D3B3B), AESx(0xB761D6D6), AESx(0x7DCEB3B3), + AESx(0x527B2929), AESx(0xDD3EE3E3), AESx(0x5E712F2F), AESx(0x13978484), + AESx(0xA6F55353), AESx(0xB968D1D1), AESx(0x00000000), AESx(0xC12CEDED), + AESx(0x40602020), AESx(0xE31FFCFC), AESx(0x79C8B1B1), AESx(0xB6ED5B5B), + AESx(0xD4BE6A6A), AESx(0x8D46CBCB), AESx(0x67D9BEBE), AESx(0x724B3939), + AESx(0x94DE4A4A), AESx(0x98D44C4C), AESx(0xB0E85858), AESx(0x854ACFCF), + AESx(0xBB6BD0D0), AESx(0xC52AEFEF), AESx(0x4FE5AAAA), AESx(0xED16FBFB), + AESx(0x86C54343), AESx(0x9AD74D4D), AESx(0x66553333), AESx(0x11948585), + AESx(0x8ACF4545), AESx(0xE910F9F9), AESx(0x04060202), AESx(0xFE817F7F), + AESx(0xA0F05050), AESx(0x78443C3C), AESx(0x25BA9F9F), AESx(0x4BE3A8A8), + AESx(0xA2F35151), AESx(0x5DFEA3A3), AESx(0x80C04040), AESx(0x058A8F8F), + AESx(0x3FAD9292), AESx(0x21BC9D9D), AESx(0x70483838), AESx(0xF104F5F5), + AESx(0x63DFBCBC), AESx(0x77C1B6B6), AESx(0xAF75DADA), AESx(0x42632121), + AESx(0x20301010), AESx(0xE51AFFFF), AESx(0xFD0EF3F3), AESx(0xBF6DD2D2), + AESx(0x814CCDCD), AESx(0x18140C0C), AESx(0x26351313), AESx(0xC32FECEC), + AESx(0xBEE15F5F), AESx(0x35A29797), AESx(0x88CC4444), AESx(0x2E391717), + AESx(0x9357C4C4), AESx(0x55F2A7A7), AESx(0xFC827E7E), AESx(0x7A473D3D), + AESx(0xC8AC6464), AESx(0xBAE75D5D), AESx(0x322B1919), AESx(0xE6957373), + AESx(0xC0A06060), AESx(0x19988181), AESx(0x9ED14F4F), AESx(0xA37FDCDC), + AESx(0x44662222), AESx(0x547E2A2A), AESx(0x3BAB9090), AESx(0x0B838888), + AESx(0x8CCA4646), AESx(0xC729EEEE), AESx(0x6BD3B8B8), AESx(0x283C1414), + AESx(0xA779DEDE), AESx(0xBCE25E5E), AESx(0x161D0B0B), AESx(0xAD76DBDB), + AESx(0xDB3BE0E0), AESx(0x64563232), AESx(0x744E3A3A), AESx(0x141E0A0A), + AESx(0x92DB4949), AESx(0x0C0A0606), AESx(0x486C2424), AESx(0xB8E45C5C), + AESx(0x9F5DC2C2), AESx(0xBD6ED3D3), AESx(0x43EFACAC), AESx(0xC4A66262), + AESx(0x39A89191), AESx(0x31A49595), AESx(0xD337E4E4), AESx(0xF28B7979), + AESx(0xD532E7E7), AESx(0x8B43C8C8), AESx(0x6E593737), AESx(0xDAB76D6D), + AESx(0x018C8D8D), AESx(0xB164D5D5), AESx(0x9CD24E4E), AESx(0x49E0A9A9), + AESx(0xD8B46C6C), AESx(0xACFA5656), AESx(0xF307F4F4), AESx(0xCF25EAEA), + AESx(0xCAAF6565), AESx(0xF48E7A7A), AESx(0x47E9AEAE), AESx(0x10180808), + AESx(0x6FD5BABA), AESx(0xF0887878), AESx(0x4A6F2525), AESx(0x5C722E2E), + AESx(0x38241C1C), AESx(0x57F1A6A6), AESx(0x73C7B4B4), AESx(0x9751C6C6), + AESx(0xCB23E8E8), AESx(0xA17CDDDD), AESx(0xE89C7474), AESx(0x3E211F1F), + AESx(0x96DD4B4B), AESx(0x61DCBDBD), AESx(0x0D868B8B), AESx(0x0F858A8A), + AESx(0xE0907070), AESx(0x7C423E3E), AESx(0x71C4B5B5), AESx(0xCCAA6666), + AESx(0x90D84848), AESx(0x06050303), AESx(0xF701F6F6), AESx(0x1C120E0E), + AESx(0xC2A36161), AESx(0x6A5F3535), AESx(0xAEF95757), AESx(0x69D0B9B9), + AESx(0x17918686), AESx(0x9958C1C1), AESx(0x3A271D1D), AESx(0x27B99E9E), + AESx(0xD938E1E1), AESx(0xEB13F8F8), AESx(0x2BB39898), AESx(0x22331111), + AESx(0xD2BB6969), AESx(0xA970D9D9), AESx(0x07898E8E), AESx(0x33A79494), + AESx(0x2DB69B9B), AESx(0x3C221E1E), AESx(0x15928787), AESx(0xC920E9E9), + AESx(0x8749CECE), AESx(0xAAFF5555), AESx(0x50782828), AESx(0xA57ADFDF), + AESx(0x038F8C8C), AESx(0x59F8A1A1), AESx(0x09808989), AESx(0x1A170D0D), + AESx(0x65DABFBF), AESx(0xD731E6E6), AESx(0x84C64242), AESx(0xD0B86868), + AESx(0x82C34141), AESx(0x29B09999), AESx(0x5A772D2D), 
AESx(0x1E110F0F), + AESx(0x7BCBB0B0), AESx(0xA8FC5454), AESx(0x6DD6BBBB), AESx(0x2C3A1616) +}; + + +__constant__ uint32_t c_PaddedMessage80[20]; // padded message (80 bytes + padding) + +#define xor4_32(a,b,c,d) ((a ^ b) ^ (c ^ d)); + +__device__ +static void aes_round( +const uint32_t *sharedMemory, +uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, uint32_t k0, +uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + __ldg(&d_AES0[x0 & 0xff]), //sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + __ldg(&d_AES0[x1 & 0xff]), //sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + __ldg(&d_AES0[x2 & 0xff]), //sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)])); // ^k2 + + y0 ^= k0; + + y3 = xor4_32( + __ldg(&d_AES0[x3 & 0xff]), //sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 +} + +__device__ +static void aes_round( +const uint32_t *sharedMemory, +uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, +uint32_t &y0, uint32_t &y1, uint32_t &y2, uint32_t &y3) +{ + y0 = xor4_32( + __ldg(&d_AES0[x0 & 0xff]),//sharedMemory[__byte_perm(x0, 0, 0x4440)], + sharedMemory[__byte_perm(x1, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x2, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x3, 0, 0x4443) + 768]); + + y1 = xor4_32( + __ldg(&d_AES0[x1 & 0xff]),//sharedMemory[__byte_perm(x1, 0, 0x4440)], + sharedMemory[__byte_perm(x2, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x3, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x0, 0, 0x4443) + 768]); + + y2 = xor4_32( + __ldg(&d_AES0[x2 & 0xff]), //sharedMemory[__byte_perm(x2, 0, 0x4440)], + sharedMemory[__byte_perm(x3, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x0, 0, 0x4442) + 512], + __ldg(&d_AES3[__byte_perm(x1, 0, 0x4443)])); // ^k2 + + y3 = xor4_32( + __ldg(&d_AES0[x3 & 0xff]), //sharedMemory[__byte_perm(x3, 0, 0x4440)], + sharedMemory[__byte_perm(x0, 0, 0x4441) + 256], + sharedMemory[__byte_perm(x1, 0, 0x4442) + 512], + sharedMemory[__byte_perm(x2, 0, 0x4443) + 768]); // ^k3 +} + + + +__device__ __forceinline__ +static void AES_ROUND_NOKEY( + const uint32_t* __restrict__ sharedMemory, + uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3) +{ + uint32_t y0, y1, y2, y3; + aes_round(sharedMemory, + x0, x1, x2, x3, + y0, y1, y2, y3); + + x0 = y0; + x1 = y1; + x2 = y2; + x3 = y3; +} + +__device__ __forceinline__ +static void KEY_EXPAND_ELT( + const uint32_t* __restrict__ sharedMemory, + uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3) +{ + uint32_t y0, y1, y2, y3; + aes_round(sharedMemory, + k0, k1, k2, k3, + y0, y1, y2, y3); + + k0 = y1; + k1 = y2; + k2 = y3; + k3 = y0; +} + +__device__ __forceinline__ +static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, const uint32_t count) +{ + uint32_t p0, p1, p2, p3, p4, p5, p6, p7; + uint32_t p8, p9, pA, pB, pC, pD, pE, pF; + uint32_t x0, x1, x2, x3; + uint32_t rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; + uint32_t rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, 
rk0F; + uint32_t rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; + uint32_t rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; + const uint32_t counter = count; + + p0 = state[0x0]; + p1 = state[0x1]; + p2 = state[0x2]; + p3 = state[0x3]; + p4 = state[0x4]; + p5 = state[0x5]; + p6 = state[0x6]; + p7 = state[0x7]; + p8 = state[0x8]; + p9 = state[0x9]; + pA = state[0xA]; + pB = state[0xB]; + pC = state[0xC]; + pD = state[0xD]; + pE = state[0xE]; + pF = state[0xF]; + + /* round 0 */ + rk00 = msg[0]; + x0 = p4 ^ msg[0]; + rk01 = msg[1]; + x1 = p5 ^ msg[1]; + rk02 = msg[2]; + x2 = p6 ^ msg[2]; + rk03 = msg[3]; + x3 = p7 ^ msg[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 = msg[4]; + x0 ^= msg[4]; + rk05 = msg[5]; + x1 ^= msg[5]; + rk06 = msg[6]; + x2 ^= msg[6]; + rk07 = msg[7]; + x3 ^= msg[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 = msg[8]; + x0 ^= msg[8]; + rk09 = msg[9]; + x1 ^= msg[9]; + rk0A = msg[10]; + x2 ^= msg[10]; + rk0B = msg[11]; + x3 ^= msg[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C = msg[12]; + x0 ^= msg[12]; + rk0D = msg[13]; + x1 ^= msg[13]; + rk0E = msg[14]; + x2 ^= msg[14]; + rk0F = msg[15]; + x3 ^= msg[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; +/* if (count == 512) + { + rk10 = 0x80U; + x0 = pC ^ 0x80U; + rk11 = 0; + x1 = pD; + rk12 = 0; + x2 = pE; + rk13 = 0; + x3 = pF; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 = 0; + rk15 = 0; + rk16 = 0; + rk17 = 0; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 = 0; + rk19 = 0; + rk1A = 0; + rk1B = 0x02000000U; + x3 ^= 0x02000000U; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C = 0; + rk1D = 0; + rk1E = 0; + rk1F = 0x02000000; + x3 ^= 0x02000000; + } + else + {*/ + rk10 = msg[16]; + x0 = pC ^ msg[16]; + rk11 = msg[17]; + x1 = pD ^ msg[17]; + rk12 = msg[18]; + x2 = pE ^ msg[18]; + + rk13 = msg[19]; + x3 = pF ^ msg[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 = msg[20]; + x0 ^= msg[20]; + rk15 = msg[21]; + x1 ^= msg[21]; + rk16 = msg[22]; + x2 ^= msg[22]; + rk17 = msg[23]; + x3 ^= msg[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 = msg[24]; + x0 ^= msg[24]; + rk19 = msg[25]; + x1 ^= msg[25]; + rk1A = msg[26]; + x2 ^= msg[26]; + rk1B = msg[27]; + x3 ^= msg[27]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C = msg[28]; + x0 ^= msg[28]; + rk1D = msg[29]; + x1 ^= msg[29]; + rk1E = msg[30]; + x2 ^= msg[30]; + rk1F = msg[31]; + x3 ^= msg[31]; +// } + + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + // 1 + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + rk00 ^= counter; + rk03 ^= 0xFFFFFFFF; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + 
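// the 32 rk words are seeded from the 128-byte message block and refreshed in place: KEY_EXPAND_ELT pushes a quartet through one AES round plus a word rotation, while the plain ^= chains mix in older rk words (the SHAvite-3 message expansion). +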
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk00 ^= rk19; + x0 = pC ^ rk00; + rk01 ^= rk1A; + x1 = pD ^ rk01; + rk02 ^= rk1B; + x2 = pE ^ rk02; + rk03 ^= rk1C; + x3 = pF ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk10 ^= rk09; + x0 = p4 ^ rk10; + rk11 ^= rk0A; + x1 = p5 ^ rk11; + rk12 ^= rk0B; + x2 = p6 ^ rk12; + rk13 ^= rk0C; + x3 = p7 ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p8 ^ rk00; + x1 = p9 ^ rk01; + x2 = pA ^ rk02; + x3 = pB ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + 
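+	/*
+	 * KEY_EXPAND_ELT (defined above) is the nonlinear half of the SHAvite-512
+	 * key schedule: one keyless AES round over four key words, its outputs
+	 * rotated one position (k0 <- y1, k1 <- y2, k2 <- y3, k3 <- y0). The
+	 * "round 4, 8, 12" blocks below use the linear half instead, chaining
+	 * round keys by plain XOR (rk00 ^= rk19, ...) with no AES step.
+	 */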
KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p0 ^ rk10; + x1 = p1 ^ rk11; + x2 = p2 ^ rk12; + x3 = p3 ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + + /* round 4, 8, 12 */ + rk00 ^= rk19; + x0 = p4 ^ rk00; + rk01 ^= rk1A; + x1 = p5 ^ rk01; + rk02 ^= rk1B; + x2 = p6 ^ rk02; + rk03 ^= rk1C; + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 ^= rk09; + x0 = pC ^ rk10; + rk11 ^= rk0A; + x1 = pD ^ rk11; + rk12 ^= rk0B; + x2 = pE ^ rk12; + rk13 ^= rk0C; + x3 = pF ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + // 2 + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + rk07 ^= SPH_T32(~counter); + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= 
rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk00 ^= rk19; + x0 = pC ^ rk00; + rk01 ^= rk1A; + x1 = pD ^ rk01; + rk02 ^= rk1B; + x2 = pE ^ rk02; + rk03 ^= rk1C; + x3 = pF ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk10 ^= rk09; + x0 = p4 ^ rk10; + rk11 ^= rk0A; + x1 = p5 ^ rk11; + rk12 ^= rk0B; + x2 = p6 ^ rk12; + rk13 ^= rk0C; + x3 = p7 ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p8 ^ rk00; + x1 = p9 ^ rk01; + x2 = pA ^ rk02; + x3 = pB ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p0 ^ rk10; + x1 = p1 ^ rk11; + x2 = p2 ^ rk12; + x3 = p3 ^ rk13; + 
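+	/*
+	 * Table layout behind aes_round: sharedMemory[256..1023] holds the three
+	 * rotated AES T-tables (AES1 at +256, AES2 at +512, AES3 at +768, filled
+	 * by shavite_gpu_init below), while the unrotated AES0 lookups go through
+	 * __ldg() to the tables in global memory, presumably to split the
+	 * traffic between shared-memory banks and the read-only cache.
+	 */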
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + + /* round 4, 8, 12 */ + rk00 ^= rk19; + x0 = p4 ^ rk00; + rk01 ^= rk1A; + x1 = p5 ^ rk01; + rk02 ^= rk1B; + x2 = p6 ^ rk02; + rk03 ^= rk1C; + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 ^= rk09; + x0 = pC ^ rk10; + rk11 ^= rk0A; + x1 = pD ^ rk11; + rk12 ^= rk0B; + x2 = pE ^ rk12; + rk13 ^= rk0C; + x3 = pF ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + // 3 + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= 
rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + rk1E ^= counter; + rk1F ^= 0xFFFFFFFF; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk00 ^= rk19; + x0 = pC ^ rk00; + rk01 ^= rk1A; + x1 = pD ^ rk01; + rk02 ^= rk1B; + x2 = pE ^ rk02; + rk03 ^= rk1C; + x3 = pF ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk10 ^= rk09; + x0 = p4 ^ rk10; + rk11 ^= rk0A; + x1 = p5 ^ rk11; + rk12 ^= rk0B; + x2 = p6 ^ rk12; + rk13 ^= rk0C; + x3 = p7 ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p8 ^ rk00; + x1 = p9 ^ rk01; + x2 = pA ^ rk02; + x3 = pB ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p0 ^ rk10; + x1 = p1 ^ rk11; + x2 = p2 ^ rk12; + x3 = p3 ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= 
rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk00 ^= rk19; + x0 = p4 ^ rk00; + rk01 ^= rk1A; + x1 = p5 ^ rk01; + rk02 ^= rk1B; + x2 = p6 ^ rk02; + rk03 ^= rk1C; + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 ^= rk09; + x0 = pC ^ rk10; + rk11 ^= rk0A; + x1 = pD ^ rk11; + rk12 ^= rk0B; + x2 = pE ^ rk12; + rk13 ^= rk0C; + x3 = pF ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + /* round 13 */ + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + 
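+	/*
+	 * Round 13 folds the bit-length counter into the key schedule one last
+	 * time just below (rk19 ^= rk15 ^ counter, rk1B ^= rk17 ^ 0xFFFFFFFF),
+	 * matching the earlier injections in round 1 (rk00 ^= counter,
+	 * rk03 ^= 0xFFFFFFFF), round 2 (rk07 ^= ~counter) and round 3
+	 * (rk1E ^= counter, rk1F ^= 0xFFFFFFFF), as in the sph_shavite reference.
+	 */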
KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15 ^ counter; + rk1A ^= rk16; + rk1B ^= rk17 ^ 0xFFFFFFFF; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + state[0x0] ^= p8; + state[0x1] ^= p9; + state[0x2] ^= pA; + state[0x3] ^= pB; + state[0x4] ^= pC; + state[0x5] ^= pD; + state[0x6] ^= pE; + state[0x7] ^= pF; + state[0x8] ^= p0; + state[0x9] ^= p1; + state[0xA] ^= p2; + state[0xB] ^= p3; + state[0xC] ^= p4; + state[0xD] ^= p5; + state[0xE] ^= p6; + state[0xF] ^= p7; +} + + +__device__ __forceinline__ +static void c512_80(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, const uint32_t count) +{ + uint32_t p0, p1, p2, p3, p4, p5, p6, p7; + uint32_t p8, p9, pA, pB, pC, pD, pE, pF; + uint32_t x0, x1, x2, x3; + uint32_t rk00, rk01, rk02, rk03, rk04, rk05, rk06, rk07; + uint32_t rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F; + uint32_t rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17; + uint32_t rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F; + const uint32_t counter = count; + + p0 = state[0x0]; + p1 = state[0x1]; + p2 = state[0x2]; + p3 = state[0x3]; + p4 = state[0x4]; + p5 = state[0x5]; + p6 = state[0x6]; + p7 = state[0x7]; + p8 = state[0x8]; + p9 = state[0x9]; + pA = state[0xA]; + pB = state[0xB]; + pC = state[0xC]; + pD = state[0xD]; + pE = state[0xE]; + pF = state[0xF]; + + /* round 0 */ + rk00 = msg[0]; + x0 = p4 ^ msg[0]; + rk01 = msg[1]; + x1 = p5 ^ msg[1]; + rk02 = msg[2]; + x2 = p6 ^ msg[2]; + rk03 = msg[3]; + x3 = p7 ^ msg[3]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 = msg[4]; + x0 ^= msg[4]; + rk05 = msg[5]; + x1 ^= msg[5]; + rk06 = msg[6]; + x2 ^= msg[6]; + rk07 = msg[7]; + x3 ^= msg[7]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 = msg[8]; + x0 ^= msg[8]; + rk09 = msg[9]; + x1 ^= msg[9]; + rk0A = msg[10]; + x2 ^= msg[10]; + rk0B = msg[11]; + x3 ^= msg[11]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C = msg[12]; + x0 ^= msg[12]; + rk0D = msg[13]; + x1 ^= msg[13]; + rk0E = msg[14]; + x2 ^= msg[14]; + rk0F = msg[15]; + x3 ^= msg[15]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + /* if (count == 512) + { + rk10 = 0x80U; + x0 = pC ^ 0x80U; + rk11 = 0; + x1 = pD; + rk12 = 0; + x2 = pE; + rk13 = 0; + x3 = pF; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 = 0; + rk15 = 0; + rk16 = 0; + rk17 = 0; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 = 0; + rk19 = 0; + rk1A = 0; + rk1B = 0x02000000U; + x3 ^= 0x02000000U; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C = 0; + rk1D = 0; + rk1E = 0; + rk1F = 0x02000000; + x3 ^= 0x02000000; + } + else + {*/ + rk10 = msg[16]; + x0 = pC ^ msg[16]; + rk11 = msg[17]; + x1 = pD ^ msg[17]; + rk12 = msg[18]; + x2 = pE ^ msg[18]; + + rk13 = msg[19]; + x3 = pF ^ msg[19]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 = msg[20]; + x0 ^= msg[20]; + rk15 = msg[21]; + x1 ^= msg[21]; + rk16 = msg[22]; + x2 ^= msg[22]; + rk17 = msg[23]; + x3 ^= msg[23]; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 = msg[24]; + x0 ^= msg[24]; + rk19 = msg[25]; + x1 ^= msg[25]; + rk1A = msg[26]; + x2 ^= msg[26]; + rk1B = msg[27]; + x3 ^= msg[27]; + 
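+	/*
+	 * c512_80 mirrors c512 above. The "if (count == 512)" branch commented
+	 * out earlier is apparently dead in this path: the 80-byte kernel hashes
+	 * a single padded 640-bit block (count = 640), so the padding-only
+	 * second half that the branch would handle never occurs.
+	 */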
AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C = msg[28]; + x0 ^= msg[28]; + rk1D = msg[29]; + x1 ^= msg[29]; + rk1E = msg[30]; + x2 ^= msg[30]; + rk1F = msg[31]; + x3 ^= msg[31]; + // } + + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + // 1 + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + rk00 ^= counter; + rk03 ^= 0xFFFFFFFF; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk00 ^= rk19; + x0 = pC ^ rk00; + rk01 ^= rk1A; + x1 = pD ^ rk01; + rk02 ^= rk1B; + x2 = pE ^ rk02; + rk03 ^= rk1C; + x3 = pF ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk10 ^= rk09; + x0 = p4 ^ rk10; + rk11 ^= rk0A; + x1 = p5 ^ rk11; + rk12 ^= rk0B; + x2 = p6 ^ rk12; + rk13 ^= rk0C; + x3 = p7 ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + 
x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p8 ^ rk00; + x1 = p9 ^ rk01; + x2 = pA ^ rk02; + x3 = pB ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p0 ^ rk10; + x1 = p1 ^ rk11; + x2 = p2 ^ rk12; + x3 = p3 ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + + /* round 4, 8, 12 */ + rk00 ^= rk19; + x0 = p4 ^ rk00; + rk01 ^= rk1A; + x1 = p5 ^ rk01; + rk02 ^= rk1B; + x2 = p6 ^ rk02; + rk03 ^= rk1C; + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 ^= rk09; + x0 = pC ^ rk10; + rk11 ^= rk0A; + x1 = pD ^ rk11; + rk12 ^= rk0B; + x2 = pE ^ rk12; + rk13 ^= rk0C; + x3 = pF ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= 
rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + // 2 + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + rk07 ^= SPH_T32(~counter); + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk00 ^= rk19; + x0 = pC ^ rk00; + rk01 ^= rk1A; + x1 = pD ^ rk01; + rk02 ^= rk1B; + x2 = pE ^ rk02; + rk03 ^= rk1C; + x3 = pF ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk10 ^= rk09; + x0 = p4 ^ rk10; + rk11 ^= rk0A; + x1 = p5 ^ rk11; + rk12 ^= rk0B; + x2 = p6 ^ rk12; + rk13 ^= rk0C; + x3 = p7 ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + + 
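+	/*
+	 * Overall round structure (shared with c512 above): round 0 absorbs the
+	 * message block, rounds 1-12 repeat the same four-step pattern three
+	 * times (hence the "round 3, 7, 11" / "round 4, 8, 12" comments), and
+	 * round 13 closes before the final feedforward, which XORs the p-state
+	 * back into state[] with the two 256-bit halves swapped.
+	 */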
/* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p8 ^ rk00; + x1 = p9 ^ rk01; + x2 = pA ^ rk02; + x3 = pB ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p0 ^ rk10; + x1 = p1 ^ rk11; + x2 = p2 ^ rk12; + x3 = p3 ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + + /* round 4, 8, 12 */ + rk00 ^= rk19; + x0 = p4 ^ rk00; + rk01 ^= rk1A; + x1 = p5 ^ rk01; + rk02 ^= rk1B; + x2 = p6 ^ rk02; + rk03 ^= rk1C; + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 ^= rk09; + x0 = pC ^ rk10; + rk11 ^= rk0A; + x1 = pD ^ rk11; + rk12 ^= rk0B; + x2 = pE ^ rk12; + rk13 ^= rk0C; + x3 = pF ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + // 3 + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + 
rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + rk1E ^= counter; + rk1F ^= 0xFFFFFFFF; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + + rk00 ^= rk19; + x0 = pC ^ rk00; + rk01 ^= rk1A; + x1 = pD ^ rk01; + rk02 ^= rk1B; + x2 = pE ^ rk02; + rk03 ^= rk1C; + x3 = pF ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + rk10 ^= rk09; + x0 = p4 ^ rk10; + rk11 ^= rk0A; + x1 = p5 ^ rk11; + rk12 ^= rk0B; + x2 = p6 ^ rk12; + rk13 ^= rk0C; + x3 = p7 ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + + /* round 3, 7, 11 */ + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p8 ^ rk00; + x1 = p9 ^ rk01; + x2 = pA ^ 
rk02; + x3 = pB ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p0 ^ rk10; + x1 = p1 ^ rk11; + x2 = p2 ^ rk12; + x3 = p3 ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15; + rk1A ^= rk16; + rk1B ^= rk17; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + /* round 4, 8, 12 */ + rk00 ^= rk19; + x0 = p4 ^ rk00; + rk01 ^= rk1A; + x1 = p5 ^ rk01; + rk02 ^= rk1B; + x2 = p6 ^ rk02; + rk03 ^= rk1C; + x3 = p7 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk04 ^= rk1D; + x0 ^= rk04; + rk05 ^= rk1E; + x1 ^= rk05; + rk06 ^= rk1F; + x2 ^= rk06; + rk07 ^= rk00; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk08 ^= rk01; + x0 ^= rk08; + rk09 ^= rk02; + x1 ^= rk09; + rk0A ^= rk03; + x2 ^= rk0A; + rk0B ^= rk04; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk0C ^= rk05; + x0 ^= rk0C; + rk0D ^= rk06; + x1 ^= rk0D; + rk0E ^= rk07; + x2 ^= rk0E; + rk0F ^= rk08; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p0 ^= x0; + p1 ^= x1; + p2 ^= x2; + p3 ^= x3; + rk10 ^= rk09; + x0 = pC ^ rk10; + rk11 ^= rk0A; + x1 = pD ^ rk11; + rk12 ^= rk0B; + x2 = pE ^ rk12; + rk13 ^= rk0C; + x3 = pF ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk14 ^= rk0D; + x0 ^= rk14; + rk15 ^= rk0E; + x1 ^= rk15; + rk16 ^= rk0F; + x2 ^= rk16; + rk17 ^= rk10; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk18 ^= rk11; + x0 ^= rk18; + rk19 ^= rk12; + x1 ^= rk19; + rk1A ^= rk13; + x2 ^= rk1A; + rk1B ^= rk14; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + rk1C ^= rk15; + x0 ^= rk1C; + rk1D ^= rk16; + x1 ^= rk1D; + rk1E ^= rk17; + x2 ^= rk1E; + rk1F ^= rk18; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p8 ^= x0; + p9 ^= x1; + pA ^= x2; + pB ^= x3; + + /* round 13 */ + KEY_EXPAND_ELT(sharedMemory, rk00, rk01, rk02, rk03); + rk00 ^= rk1C; + rk01 ^= rk1D; + rk02 ^= rk1E; + rk03 ^= rk1F; + x0 = p0 ^ rk00; + x1 = p1 ^ rk01; + x2 = p2 ^ rk02; + x3 = p3 ^ rk03; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, 
rk04, rk05, rk06, rk07); + rk04 ^= rk00; + rk05 ^= rk01; + rk06 ^= rk02; + rk07 ^= rk03; + x0 ^= rk04; + x1 ^= rk05; + x2 ^= rk06; + x3 ^= rk07; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk08, rk09, rk0A, rk0B); + rk08 ^= rk04; + rk09 ^= rk05; + rk0A ^= rk06; + rk0B ^= rk07; + x0 ^= rk08; + x1 ^= rk09; + x2 ^= rk0A; + x3 ^= rk0B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk0C, rk0D, rk0E, rk0F); + rk0C ^= rk08; + rk0D ^= rk09; + rk0E ^= rk0A; + rk0F ^= rk0B; + x0 ^= rk0C; + x1 ^= rk0D; + x2 ^= rk0E; + x3 ^= rk0F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + pC ^= x0; + pD ^= x1; + pE ^= x2; + pF ^= x3; + KEY_EXPAND_ELT(sharedMemory, rk10, rk11, rk12, rk13); + rk10 ^= rk0C; + rk11 ^= rk0D; + rk12 ^= rk0E; + rk13 ^= rk0F; + x0 = p8 ^ rk10; + x1 = p9 ^ rk11; + x2 = pA ^ rk12; + x3 = pB ^ rk13; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk14, rk15, rk16, rk17); + rk14 ^= rk10; + rk15 ^= rk11; + rk16 ^= rk12; + rk17 ^= rk13; + x0 ^= rk14; + x1 ^= rk15; + x2 ^= rk16; + x3 ^= rk17; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk18, rk19, rk1A, rk1B); + rk18 ^= rk14; + rk19 ^= rk15 ^ counter; + rk1A ^= rk16; + rk1B ^= rk17 ^ 0xFFFFFFFF; + x0 ^= rk18; + x1 ^= rk19; + x2 ^= rk1A; + x3 ^= rk1B; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + KEY_EXPAND_ELT(sharedMemory, rk1C, rk1D, rk1E, rk1F); + rk1C ^= rk18; + rk1D ^= rk19; + rk1E ^= rk1A; + rk1F ^= rk1B; + x0 ^= rk1C; + x1 ^= rk1D; + x2 ^= rk1E; + x3 ^= rk1F; + AES_ROUND_NOKEY(sharedMemory, x0, x1, x2, x3); + p4 ^= x0; + p5 ^= x1; + p6 ^= x2; + p7 ^= x3; + state[0x0] ^= p8; + state[0x1] ^= p9; + state[0x2] ^= pA; + state[0x3] ^= pB; + state[0x4] ^= pC; + state[0x5] ^= pD; + state[0x6] ^= pE; + state[0x7] ^= pF; + state[0x8] ^= p0; + state[0x9] ^= p1; + state[0xA] ^= p2; + state[0xB] ^= p3; + state[0xC] ^= p4; + state[0xD] ^= p5; + state[0xE] ^= p6; + state[0xF] ^= p7; +} + + +__device__ __forceinline__ +void shavite_gpu_init(uint32_t *sharedMemory) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 128) + { + uint2 temp = __ldg(&((uint2*)&d_AES1)[threadIdx.x]); + +// sharedMemory[(threadIdx.x << 1) + 0] = temp.x; +// sharedMemory[(threadIdx.x << 1) + 1] = temp.y; + sharedMemory[256+(threadIdx.x << 1) + 0] = (temp.x); + sharedMemory[256 + (threadIdx.x << 1) + 1] = (temp.y); + sharedMemory[512 + (threadIdx.x << 1) + 0] = ROL8(temp.x); + sharedMemory[512 + (threadIdx.x << 1) + 1] = ROL8(temp.y); + sharedMemory[768 + (threadIdx.x << 1) + 0] = ROL16(temp.x); + sharedMemory[768 + (threadIdx.x << 1) + 1] = ROL16(temp.y); + } +} + +__global__ __launch_bounds__(TPB, 5) +void x16_shavite512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash) +{ + __shared__ uint32_t sharedMemory[1024]; + + shavite_gpu_init(sharedMemory); + __threadfence_block(); + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nounce = startNounce + thread; + + // initial state + uint32_t state[16] = { + SPH_C32(0x72FCCDD8), SPH_C32(0x79CA4727), SPH_C32(0x128A077B), SPH_C32(0x40D55AEC), + SPH_C32(0xD1901A06), SPH_C32(0x430AE307), SPH_C32(0xB29F5CD1), SPH_C32(0xDF07FBFC), + SPH_C32(0x8E45D73D), SPH_C32(0x681AB538), SPH_C32(0xBDE86578), SPH_C32(0xDD577E47), + SPH_C32(0xE275EADE), SPH_C32(0x502D9FCD), SPH_C32(0xB9357178), SPH_C32(0x022A4B9A) + }; + + uint32_t msg[32] = { 0 }; + + #pragma unroll 32 + for(int i=0;i<20;i++) + { + msg[i] = 
c_PaddedMessage80[i]; + } + + #pragma unroll 16 + for (int i = 20; i<32; i++) + { + msg[i] = 0; + } + + msg[19] = cuda_swab32(nounce); + msg[20] = 0x80; + msg[27] = 0x2800000; + msg[31] = 0x2000000; + + c512(sharedMemory, state, msg, 640); + + uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; + + #pragma unroll 16 + for(int i=0;i<16;i++) + outHash[i] = state[i]; + + } //thread < threads +} + + +__host__ +void x16_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +{ + const uint32_t threadsperblock = TPB; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x16_shavite512_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash); +} + +__host__ +void x16_shavite512_cpu_init(int thr_id, uint32_t threads) +{ +// aes_cpu_init(thr_id); +} + +__host__ +void x16_shavite512_setBlock_80(void *pdata) +{ + unsigned char PaddedMessage[128]; + memcpy(PaddedMessage, pdata, 80); + cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 20*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); +} diff --git a/x16/cuda_x16_simd512_80.cu b/x16/cuda_x16_simd512_80.cu new file mode 100644 index 0000000000..142180a39b --- /dev/null +++ b/x16/cuda_x16_simd512_80.cu @@ -0,0 +1,1836 @@ +/** + * SIMD512 CUDA IMPLEMENTATION based on sph simd code + * tpruvot 2018 (with the help of kernelx xevan code) + */ + +#include <stdio.h> +#include <stdint.h> +#include <memory.h> + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#define __byte_perm(x, y, m) (x|y) +#endif + +#define TPB50_1 128 +#define TPB50_2 128 +#define TPB52_1 128 +#define TPB52_2 128 + +#define sph_u32 uint32_t +#define sph_s32 int32_t +typedef uint32_t u32; +typedef int32_t s32; + +#define C32 SPH_C32 +#define T32 SPH_T32 +#define ROL32 ROTL32 +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +/* + * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive.
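+ * In other words, alpha_tab[i] = 41^i mod 257. Since alpha_tab[128] = 256,
+ * i.e. 41^128 = -1 (mod 257), 41 has multiplicative order 256 modulo the
+ * Fermat prime 257, and the table enumerates all 256th roots of unity used
+ * by the number-theoretic FFT below. A host-side sanity check (sketch):
+ *
+ *   for (int i = 1, x = 1; i < 256; i++) {
+ *       x = (x * 41) % 257;        // next power of 41 mod 257
+ *       assert(x == alpha_tab[i]); // matches the table entry
+ *   }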
+ */ +__constant__ static const s32 alpha_tab[] = { + 1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130, 190, 80, 196, 69, + 2, 82, 21, 90, 92, 174, 195, 28, 120, 37, 232, 3, 123, 160, 135, 138, + 4, 164, 42, 180, 184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19, + 8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12, 235, 126, 26, 38, + 16, 142, 168, 206, 222, 107, 18, 224, 189, 39, 57, 24, 213, 252, 52, 76, + 32, 27, 79, 155, 187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152, + 64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96, 81, 237, 208, 47, + 128, 108, 59, 106, 234, 85, 144, 250, 227, 55, 199, 192, 162, 217, 159, 94, + 256, 216, 118, 212, 211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188, + 255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254, 134, 97, 122, 119, + 253, 93, 215, 77, 73, 166, 124, 201, 17, 183, 50, 251, 11, 194, 244, 238, + 249, 186, 173, 154, 146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219, + 241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233, 44, 5, 205, 181, + 225, 230, 178, 102, 70, 43, 221, 66, 136, 179, 143, 209, 88, 10, 153, 105, + 193, 203, 99, 204, 140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210, + 129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65, 95, 40, 98, 163 +}; + +/* + * Ranges: + * REDS1: from -32768..98302 to -383..383 + * REDS2: from -2^31..2^31-1 to -32768..98302 + */ +#define REDS1(x) (((x) & 0x00FF) - ((x) >> 8)) +#define REDS2(x) (((x) & 0xFFFF) + ((x) >> 16)) + +/* + * If, upon entry, the values of q[] are all in the -N..N range (where + * N >= 98302) then the new values of q[] are in the -2N..2N range. + * + * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608. + */ +#define FFT_LOOP_16_8(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 16]; \ + q[(rb)] = m + n; \ + q[(rb) + 16] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 16]; \ + t = REDS2(n * alpha_tab[0 + 1 * 8]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 16] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 16]; \ + t = REDS2(n * alpha_tab[0 + 2 * 8]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 16] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 16]; \ + t = REDS2(n * alpha_tab[0 + 3 * 8]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 16] = m - t; \ + \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 16]; \ + t = REDS2(n * alpha_tab[32 + 0 * 8]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 16] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 16]; \ + t = REDS2(n * alpha_tab[32 + 1 * 8]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 16] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 16]; \ + t = REDS2(n * alpha_tab[32 + 2 * 8]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 16] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 16]; \ + t = REDS2(n * alpha_tab[32 + 3 * 8]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 16] = m - t; \ + \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 + 0 + 16]; \ + t = REDS2(n * alpha_tab[64 + 0 * 8]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 16] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 16]; \ + t = REDS2(n * alpha_tab[64 + 1 * 8]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 16] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 16]; \ + t = REDS2(n * alpha_tab[64 + 2 * 8]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 16] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 16]; \ + t = REDS2(n * alpha_tab[64 + 3 * 8]); \ + q[(rb) + 8 + 
3] = m + t; \ + q[(rb) + 8 + 3 + 16] = m - t; \ + \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 16]; \ + t = REDS2(n * alpha_tab[96 + 0 * 8]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 16] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 16]; \ + t = REDS2(n * alpha_tab[96 + 1 * 8]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 16] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 16]; \ + t = REDS2(n * alpha_tab[96 + 2 * 8]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 16] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 16]; \ + t = REDS2(n * alpha_tab[96 + 3 * 8]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 16] = m - t; \ + } while (0) + +#define FFT_LOOP_32_4(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 32]; \ + q[(rb)] = m + n; \ + q[(rb) + 32] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 32]; \ + t = REDS2(n * alpha_tab[0 + 1 * 4]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 32] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 32]; \ + t = REDS2(n * alpha_tab[0 + 2 * 4]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 32] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 32]; \ + t = REDS2(n * alpha_tab[0 + 3 * 4]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 32]; \ + t = REDS2(n * alpha_tab[16 + 0 * 4]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 32] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 32]; \ + t = REDS2(n * alpha_tab[16 + 1 * 4]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 32] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 32]; \ + t = REDS2(n * alpha_tab[16 + 2 * 4]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 32] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 32]; \ + t = REDS2(n * alpha_tab[16 + 3 * 4]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 + 0 + 32]; \ + t = REDS2(n * alpha_tab[32 + 0 * 4]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 32] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 32]; \ + t = REDS2(n * alpha_tab[32 + 1 * 4]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 32] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 32]; \ + t = REDS2(n * alpha_tab[32 + 2 * 4]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 32] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 32]; \ + t = REDS2(n * alpha_tab[32 + 3 * 4]); \ + q[(rb) + 8 + 3] = m + t; \ + q[(rb) + 8 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 32]; \ + t = REDS2(n * alpha_tab[48 + 0 * 4]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 32] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 32]; \ + t = REDS2(n * alpha_tab[48 + 1 * 4]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 32] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 32]; \ + t = REDS2(n * alpha_tab[48 + 2 * 4]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 32] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 32]; \ + t = REDS2(n * alpha_tab[48 + 3 * 4]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 16 + 0]; \ + n = q[(rb) + 16 + 0 + 32]; \ + t = REDS2(n * alpha_tab[64 + 0 * 4]); \ + q[(rb) + 16 + 0] = m + t; \ + q[(rb) + 16 + 0 + 32] = m - t; \ + m = q[(rb) + 16 + 1]; \ + n = q[(rb) + 16 + 1 + 32]; \ + t = REDS2(n * alpha_tab[64 + 1 * 4]); \ 
+ q[(rb) + 16 + 1] = m + t; \ + q[(rb) + 16 + 1 + 32] = m - t; \ + m = q[(rb) + 16 + 2]; \ + n = q[(rb) + 16 + 2 + 32]; \ + t = REDS2(n * alpha_tab[64 + 2 * 4]); \ + q[(rb) + 16 + 2] = m + t; \ + q[(rb) + 16 + 2 + 32] = m - t; \ + m = q[(rb) + 16 + 3]; \ + n = q[(rb) + 16 + 3 + 32]; \ + t = REDS2(n * alpha_tab[64 + 3 * 4]); \ + q[(rb) + 16 + 3] = m + t; \ + q[(rb) + 16 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 20 + 0]; \ + n = q[(rb) + 20 + 0 + 32]; \ + t = REDS2(n * alpha_tab[80 + 0 * 4]); \ + q[(rb) + 20 + 0] = m + t; \ + q[(rb) + 20 + 0 + 32] = m - t; \ + m = q[(rb) + 20 + 1]; \ + n = q[(rb) + 20 + 1 + 32]; \ + t = REDS2(n * alpha_tab[80 + 1 * 4]); \ + q[(rb) + 20 + 1] = m + t; \ + q[(rb) + 20 + 1 + 32] = m - t; \ + m = q[(rb) + 20 + 2]; \ + n = q[(rb) + 20 + 2 + 32]; \ + t = REDS2(n * alpha_tab[80 + 2 * 4]); \ + q[(rb) + 20 + 2] = m + t; \ + q[(rb) + 20 + 2 + 32] = m - t; \ + m = q[(rb) + 20 + 3]; \ + n = q[(rb) + 20 + 3 + 32]; \ + t = REDS2(n * alpha_tab[80 + 3 * 4]); \ + q[(rb) + 20 + 3] = m + t; \ + q[(rb) + 20 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 24 + 0]; \ + n = q[(rb) + 24 + 0 + 32]; \ + t = REDS2(n * alpha_tab[96 + 0 * 4]); \ + q[(rb) + 24 + 0] = m + t; \ + q[(rb) + 24 + 0 + 32] = m - t; \ + m = q[(rb) + 24 + 1]; \ + n = q[(rb) + 24 + 1 + 32]; \ + t = REDS2(n * alpha_tab[96 + 1 * 4]); \ + q[(rb) + 24 + 1] = m + t; \ + q[(rb) + 24 + 1 + 32] = m - t; \ + m = q[(rb) + 24 + 2]; \ + n = q[(rb) + 24 + 2 + 32]; \ + t = REDS2(n * alpha_tab[96 + 2 * 4]); \ + q[(rb) + 24 + 2] = m + t; \ + q[(rb) + 24 + 2 + 32] = m - t; \ + m = q[(rb) + 24 + 3]; \ + n = q[(rb) + 24 + 3 + 32]; \ + t = REDS2(n * alpha_tab[96 + 3 * 4]); \ + q[(rb) + 24 + 3] = m + t; \ + q[(rb) + 24 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 28 + 0]; \ + n = q[(rb) + 28 + 0 + 32]; \ + t = REDS2(n * alpha_tab[112 + 0 * 4]); \ + q[(rb) + 28 + 0] = m + t; \ + q[(rb) + 28 + 0 + 32] = m - t; \ + m = q[(rb) + 28 + 1]; \ + n = q[(rb) + 28 + 1 + 32]; \ + t = REDS2(n * alpha_tab[112 + 1 * 4]); \ + q[(rb) + 28 + 1] = m + t; \ + q[(rb) + 28 + 1 + 32] = m - t; \ + m = q[(rb) + 28 + 2]; \ + n = q[(rb) + 28 + 2 + 32]; \ + t = REDS2(n * alpha_tab[112 + 2 * 4]); \ + q[(rb) + 28 + 2] = m + t; \ + q[(rb) + 28 + 2 + 32] = m - t; \ + m = q[(rb) + 28 + 3]; \ + n = q[(rb) + 28 + 3 + 32]; \ + t = REDS2(n * alpha_tab[112 + 3 * 4]); \ + q[(rb) + 28 + 3] = m + t; \ + q[(rb) + 28 + 3 + 32] = m - t; \ + } while (0) + +#define FFT_LOOP_64_2(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 64]; \ + q[(rb)] = m + n; \ + q[(rb) + 64] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 64]; \ + t = REDS2(n * alpha_tab[0 + 1 * 2]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 64] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 64]; \ + t = REDS2(n * alpha_tab[0 + 2 * 2]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 64] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 64]; \ + t = REDS2(n * alpha_tab[0 + 3 * 2]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 64]; \ + t = REDS2(n * alpha_tab[8 + 0 * 2]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 64] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 64]; \ + t = REDS2(n * alpha_tab[8 + 1 * 2]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 64] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 64]; \ + t = REDS2(n * alpha_tab[8 + 2 * 2]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 64] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 64]; \ + t = 
REDS2(n * alpha_tab[8 + 3 * 2]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 + 0 + 64]; \ + t = REDS2(n * alpha_tab[16 + 0 * 2]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 64] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 64]; \ + t = REDS2(n * alpha_tab[16 + 1 * 2]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 64] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 64]; \ + t = REDS2(n * alpha_tab[16 + 2 * 2]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 64] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 64]; \ + t = REDS2(n * alpha_tab[16 + 3 * 2]); \ + q[(rb) + 8 + 3] = m + t; \ + q[(rb) + 8 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 64]; \ + t = REDS2(n * alpha_tab[24 + 0 * 2]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 64] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 64]; \ + t = REDS2(n * alpha_tab[24 + 1 * 2]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 64] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 64]; \ + t = REDS2(n * alpha_tab[24 + 2 * 2]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 64] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 64]; \ + t = REDS2(n * alpha_tab[24 + 3 * 2]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 16 + 0]; \ + n = q[(rb) + 16 + 0 + 64]; \ + t = REDS2(n * alpha_tab[32 + 0 * 2]); \ + q[(rb) + 16 + 0] = m + t; \ + q[(rb) + 16 + 0 + 64] = m - t; \ + m = q[(rb) + 16 + 1]; \ + n = q[(rb) + 16 + 1 + 64]; \ + t = REDS2(n * alpha_tab[32 + 1 * 2]); \ + q[(rb) + 16 + 1] = m + t; \ + q[(rb) + 16 + 1 + 64] = m - t; \ + m = q[(rb) + 16 + 2]; \ + n = q[(rb) + 16 + 2 + 64]; \ + t = REDS2(n * alpha_tab[32 + 2 * 2]); \ + q[(rb) + 16 + 2] = m + t; \ + q[(rb) + 16 + 2 + 64] = m - t; \ + m = q[(rb) + 16 + 3]; \ + n = q[(rb) + 16 + 3 + 64]; \ + t = REDS2(n * alpha_tab[32 + 3 * 2]); \ + q[(rb) + 16 + 3] = m + t; \ + q[(rb) + 16 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 20 + 0]; \ + n = q[(rb) + 20 + 0 + 64]; \ + t = REDS2(n * alpha_tab[40 + 0 * 2]); \ + q[(rb) + 20 + 0] = m + t; \ + q[(rb) + 20 + 0 + 64] = m - t; \ + m = q[(rb) + 20 + 1]; \ + n = q[(rb) + 20 + 1 + 64]; \ + t = REDS2(n * alpha_tab[40 + 1 * 2]); \ + q[(rb) + 20 + 1] = m + t; \ + q[(rb) + 20 + 1 + 64] = m - t; \ + m = q[(rb) + 20 + 2]; \ + n = q[(rb) + 20 + 2 + 64]; \ + t = REDS2(n * alpha_tab[40 + 2 * 2]); \ + q[(rb) + 20 + 2] = m + t; \ + q[(rb) + 20 + 2 + 64] = m - t; \ + m = q[(rb) + 20 + 3]; \ + n = q[(rb) + 20 + 3 + 64]; \ + t = REDS2(n * alpha_tab[40 + 3 * 2]); \ + q[(rb) + 20 + 3] = m + t; \ + q[(rb) + 20 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 24 + 0]; \ + n = q[(rb) + 24 + 0 + 64]; \ + t = REDS2(n * alpha_tab[48 + 0 * 2]); \ + q[(rb) + 24 + 0] = m + t; \ + q[(rb) + 24 + 0 + 64] = m - t; \ + m = q[(rb) + 24 + 1]; \ + n = q[(rb) + 24 + 1 + 64]; \ + t = REDS2(n * alpha_tab[48 + 1 * 2]); \ + q[(rb) + 24 + 1] = m + t; \ + q[(rb) + 24 + 1 + 64] = m - t; \ + m = q[(rb) + 24 + 2]; \ + n = q[(rb) + 24 + 2 + 64]; \ + t = REDS2(n * alpha_tab[48 + 2 * 2]); \ + q[(rb) + 24 + 2] = m + t; \ + q[(rb) + 24 + 2 + 64] = m - t; \ + m = q[(rb) + 24 + 3]; \ + n = q[(rb) + 24 + 3 + 64]; \ + t = REDS2(n * alpha_tab[48 + 3 * 2]); \ + q[(rb) + 24 + 3] = m + t; \ + q[(rb) + 24 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 28 + 0]; \ + n = q[(rb) + 28 + 0 + 64]; \ + t = REDS2(n * alpha_tab[56 + 0 * 2]); \ + q[(rb) + 28 + 0] = m + t; \ + q[(rb) + 28 + 0 + 64] = m - t; \ + m 
= q[(rb) + 28 + 1]; \ + n = q[(rb) + 28 + 1 + 64]; \ + t = REDS2(n * alpha_tab[56 + 1 * 2]); \ + q[(rb) + 28 + 1] = m + t; \ + q[(rb) + 28 + 1 + 64] = m - t; \ + m = q[(rb) + 28 + 2]; \ + n = q[(rb) + 28 + 2 + 64]; \ + t = REDS2(n * alpha_tab[56 + 2 * 2]); \ + q[(rb) + 28 + 2] = m + t; \ + q[(rb) + 28 + 2 + 64] = m - t; \ + m = q[(rb) + 28 + 3]; \ + n = q[(rb) + 28 + 3 + 64]; \ + t = REDS2(n * alpha_tab[56 + 3 * 2]); \ + q[(rb) + 28 + 3] = m + t; \ + q[(rb) + 28 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 32 + 0]; \ + n = q[(rb) + 32 + 0 + 64]; \ + t = REDS2(n * alpha_tab[64 + 0 * 2]); \ + q[(rb) + 32 + 0] = m + t; \ + q[(rb) + 32 + 0 + 64] = m - t; \ + m = q[(rb) + 32 + 1]; \ + n = q[(rb) + 32 + 1 + 64]; \ + t = REDS2(n * alpha_tab[64 + 1 * 2]); \ + q[(rb) + 32 + 1] = m + t; \ + q[(rb) + 32 + 1 + 64] = m - t; \ + m = q[(rb) + 32 + 2]; \ + n = q[(rb) + 32 + 2 + 64]; \ + t = REDS2(n * alpha_tab[64 + 2 * 2]); \ + q[(rb) + 32 + 2] = m + t; \ + q[(rb) + 32 + 2 + 64] = m - t; \ + m = q[(rb) + 32 + 3]; \ + n = q[(rb) + 32 + 3 + 64]; \ + t = REDS2(n * alpha_tab[64 + 3 * 2]); \ + q[(rb) + 32 + 3] = m + t; \ + q[(rb) + 32 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 36 + 0]; \ + n = q[(rb) + 36 + 0 + 64]; \ + t = REDS2(n * alpha_tab[72 + 0 * 2]); \ + q[(rb) + 36 + 0] = m + t; \ + q[(rb) + 36 + 0 + 64] = m - t; \ + m = q[(rb) + 36 + 1]; \ + n = q[(rb) + 36 + 1 + 64]; \ + t = REDS2(n * alpha_tab[72 + 1 * 2]); \ + q[(rb) + 36 + 1] = m + t; \ + q[(rb) + 36 + 1 + 64] = m - t; \ + m = q[(rb) + 36 + 2]; \ + n = q[(rb) + 36 + 2 + 64]; \ + t = REDS2(n * alpha_tab[72 + 2 * 2]); \ + q[(rb) + 36 + 2] = m + t; \ + q[(rb) + 36 + 2 + 64] = m - t; \ + m = q[(rb) + 36 + 3]; \ + n = q[(rb) + 36 + 3 + 64]; \ + t = REDS2(n * alpha_tab[72 + 3 * 2]); \ + q[(rb) + 36 + 3] = m + t; \ + q[(rb) + 36 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 40 + 0]; \ + n = q[(rb) + 40 + 0 + 64]; \ + t = REDS2(n * alpha_tab[80 + 0 * 2]); \ + q[(rb) + 40 + 0] = m + t; \ + q[(rb) + 40 + 0 + 64] = m - t; \ + m = q[(rb) + 40 + 1]; \ + n = q[(rb) + 40 + 1 + 64]; \ + t = REDS2(n * alpha_tab[80 + 1 * 2]); \ + q[(rb) + 40 + 1] = m + t; \ + q[(rb) + 40 + 1 + 64] = m - t; \ + m = q[(rb) + 40 + 2]; \ + n = q[(rb) + 40 + 2 + 64]; \ + t = REDS2(n * alpha_tab[80 + 2 * 2]); \ + q[(rb) + 40 + 2] = m + t; \ + q[(rb) + 40 + 2 + 64] = m - t; \ + m = q[(rb) + 40 + 3]; \ + n = q[(rb) + 40 + 3 + 64]; \ + t = REDS2(n * alpha_tab[80 + 3 * 2]); \ + q[(rb) + 40 + 3] = m + t; \ + q[(rb) + 40 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 44 + 0]; \ + n = q[(rb) + 44 + 0 + 64]; \ + t = REDS2(n * alpha_tab[88 + 0 * 2]); \ + q[(rb) + 44 + 0] = m + t; \ + q[(rb) + 44 + 0 + 64] = m - t; \ + m = q[(rb) + 44 + 1]; \ + n = q[(rb) + 44 + 1 + 64]; \ + t = REDS2(n * alpha_tab[88 + 1 * 2]); \ + q[(rb) + 44 + 1] = m + t; \ + q[(rb) + 44 + 1 + 64] = m - t; \ + m = q[(rb) + 44 + 2]; \ + n = q[(rb) + 44 + 2 + 64]; \ + t = REDS2(n * alpha_tab[88 + 2 * 2]); \ + q[(rb) + 44 + 2] = m + t; \ + q[(rb) + 44 + 2 + 64] = m - t; \ + m = q[(rb) + 44 + 3]; \ + n = q[(rb) + 44 + 3 + 64]; \ + t = REDS2(n * alpha_tab[88 + 3 * 2]); \ + q[(rb) + 44 + 3] = m + t; \ + q[(rb) + 44 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 48 + 0]; \ + n = q[(rb) + 48 + 0 + 64]; \ + t = REDS2(n * alpha_tab[96 + 0 * 2]); \ + q[(rb) + 48 + 0] = m + t; \ + q[(rb) + 48 + 0 + 64] = m - t; \ + m = q[(rb) + 48 + 1]; \ + n = q[(rb) + 48 + 1 + 64]; \ + t = REDS2(n * alpha_tab[96 + 1 * 2]); \ + q[(rb) + 48 + 1] = m + t; \ + q[(rb) + 48 + 1 + 64] = m - t; \ + m = q[(rb) + 48 + 2]; \ + n = q[(rb) + 48 + 2 + 64]; \ + t = REDS2(n * alpha_tab[96 + 2 * 
2]); \ + q[(rb) + 48 + 2] = m + t; \ + q[(rb) + 48 + 2 + 64] = m - t; \ + m = q[(rb) + 48 + 3]; \ + n = q[(rb) + 48 + 3 + 64]; \ + t = REDS2(n * alpha_tab[96 + 3 * 2]); \ + q[(rb) + 48 + 3] = m + t; \ + q[(rb) + 48 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 52 + 0]; \ + n = q[(rb) + 52 + 0 + 64]; \ + t = REDS2(n * alpha_tab[104 + 0 * 2]); \ + q[(rb) + 52 + 0] = m + t; \ + q[(rb) + 52 + 0 + 64] = m - t; \ + m = q[(rb) + 52 + 1]; \ + n = q[(rb) + 52 + 1 + 64]; \ + t = REDS2(n * alpha_tab[104 + 1 * 2]); \ + q[(rb) + 52 + 1] = m + t; \ + q[(rb) + 52 + 1 + 64] = m - t; \ + m = q[(rb) + 52 + 2]; \ + n = q[(rb) + 52 + 2 + 64]; \ + t = REDS2(n * alpha_tab[104 + 2 * 2]); \ + q[(rb) + 52 + 2] = m + t; \ + q[(rb) + 52 + 2 + 64] = m - t; \ + m = q[(rb) + 52 + 3]; \ + n = q[(rb) + 52 + 3 + 64]; \ + t = REDS2(n * alpha_tab[104 + 3 * 2]); \ + q[(rb) + 52 + 3] = m + t; \ + q[(rb) + 52 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 56 + 0]; \ + n = q[(rb) + 56 + 0 + 64]; \ + t = REDS2(n * alpha_tab[112 + 0 * 2]); \ + q[(rb) + 56 + 0] = m + t; \ + q[(rb) + 56 + 0 + 64] = m - t; \ + m = q[(rb) + 56 + 1]; \ + n = q[(rb) + 56 + 1 + 64]; \ + t = REDS2(n * alpha_tab[112 + 1 * 2]); \ + q[(rb) + 56 + 1] = m + t; \ + q[(rb) + 56 + 1 + 64] = m - t; \ + m = q[(rb) + 56 + 2]; \ + n = q[(rb) + 56 + 2 + 64]; \ + t = REDS2(n * alpha_tab[112 + 2 * 2]); \ + q[(rb) + 56 + 2] = m + t; \ + q[(rb) + 56 + 2 + 64] = m - t; \ + m = q[(rb) + 56 + 3]; \ + n = q[(rb) + 56 + 3 + 64]; \ + t = REDS2(n * alpha_tab[112 + 3 * 2]); \ + q[(rb) + 56 + 3] = m + t; \ + q[(rb) + 56 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 60 + 0]; \ + n = q[(rb) + 60 + 0 + 64]; \ + t = REDS2(n * alpha_tab[120 + 0 * 2]); \ + q[(rb) + 60 + 0] = m + t; \ + q[(rb) + 60 + 0 + 64] = m - t; \ + m = q[(rb) + 60 + 1]; \ + n = q[(rb) + 60 + 1 + 64]; \ + t = REDS2(n * alpha_tab[120 + 1 * 2]); \ + q[(rb) + 60 + 1] = m + t; \ + q[(rb) + 60 + 1 + 64] = m - t; \ + m = q[(rb) + 60 + 2]; \ + n = q[(rb) + 60 + 2 + 64]; \ + t = REDS2(n * alpha_tab[120 + 2 * 2]); \ + q[(rb) + 60 + 2] = m + t; \ + q[(rb) + 60 + 2 + 64] = m - t; \ + m = q[(rb) + 60 + 3]; \ + n = q[(rb) + 60 + 3 + 64]; \ + t = REDS2(n * alpha_tab[120 + 3 * 2]); \ + q[(rb) + 60 + 3] = m + t; \ + q[(rb) + 60 + 3 + 64] = m - t; \ + } while (0) + +#define FFT_LOOP_128_1(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 128]; \ + q[(rb)] = m + n; \ + q[(rb) + 128] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 128]; \ + t = REDS2(n * alpha_tab[0 + 1 * 1]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 128] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 128]; \ + t = REDS2(n * alpha_tab[0 + 2 * 1]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 128] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 128]; \ + t = REDS2(n * alpha_tab[0 + 3 * 1]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 128] = m - t; \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 128]; \ + t = REDS2(n * alpha_tab[4 + 0 * 1]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 128] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 128]; \ + t = REDS2(n * alpha_tab[4 + 1 * 1]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 128] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 128]; \ + t = REDS2(n * alpha_tab[4 + 2 * 1]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 128] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 128]; \ + t = REDS2(n * alpha_tab[4 + 3 * 1]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 128] = m - t; \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 
+ 0 + 128]; \ + t = REDS2(n * alpha_tab[8 + 0 * 1]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 128] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 128]; \ + t = REDS2(n * alpha_tab[8 + 1 * 1]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 128] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 128]; \ + t = REDS2(n * alpha_tab[8 + 2 * 1]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 128] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 128]; \ + t = REDS2(n * alpha_tab[8 + 3 * 1]); \ + q[(rb) + 8 + 3] = m + t; \ + q[(rb) + 8 + 3 + 128] = m - t; \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 128]; \ + t = REDS2(n * alpha_tab[12 + 0 * 1]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 128] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 128]; \ + t = REDS2(n * alpha_tab[12 + 1 * 1]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 128] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 128]; \ + t = REDS2(n * alpha_tab[12 + 2 * 1]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 128] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 128]; \ + t = REDS2(n * alpha_tab[12 + 3 * 1]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 128] = m - t; \ + m = q[(rb) + 16 + 0]; \ + n = q[(rb) + 16 + 0 + 128]; \ + t = REDS2(n * alpha_tab[16 + 0 * 1]); \ + q[(rb) + 16 + 0] = m + t; \ + q[(rb) + 16 + 0 + 128] = m - t; \ + m = q[(rb) + 16 + 1]; \ + n = q[(rb) + 16 + 1 + 128]; \ + t = REDS2(n * alpha_tab[16 + 1 * 1]); \ + q[(rb) + 16 + 1] = m + t; \ + q[(rb) + 16 + 1 + 128] = m - t; \ + m = q[(rb) + 16 + 2]; \ + n = q[(rb) + 16 + 2 + 128]; \ + t = REDS2(n * alpha_tab[16 + 2 * 1]); \ + q[(rb) + 16 + 2] = m + t; \ + q[(rb) + 16 + 2 + 128] = m - t; \ + m = q[(rb) + 16 + 3]; \ + n = q[(rb) + 16 + 3 + 128]; \ + t = REDS2(n * alpha_tab[16 + 3 * 1]); \ + q[(rb) + 16 + 3] = m + t; \ + q[(rb) + 16 + 3 + 128] = m - t; \ + m = q[(rb) + 20 + 0]; \ + n = q[(rb) + 20 + 0 + 128]; \ + t = REDS2(n * alpha_tab[20 + 0 * 1]); \ + q[(rb) + 20 + 0] = m + t; \ + q[(rb) + 20 + 0 + 128] = m - t; \ + m = q[(rb) + 20 + 1]; \ + n = q[(rb) + 20 + 1 + 128]; \ + t = REDS2(n * alpha_tab[20 + 1 * 1]); \ + q[(rb) + 20 + 1] = m + t; \ + q[(rb) + 20 + 1 + 128] = m - t; \ + m = q[(rb) + 20 + 2]; \ + n = q[(rb) + 20 + 2 + 128]; \ + t = REDS2(n * alpha_tab[20 + 2 * 1]); \ + q[(rb) + 20 + 2] = m + t; \ + q[(rb) + 20 + 2 + 128] = m - t; \ + m = q[(rb) + 20 + 3]; \ + n = q[(rb) + 20 + 3 + 128]; \ + t = REDS2(n * alpha_tab[20 + 3 * 1]); \ + q[(rb) + 20 + 3] = m + t; \ + q[(rb) + 20 + 3 + 128] = m - t; \ + m = q[(rb) + 24 + 0]; \ + n = q[(rb) + 24 + 0 + 128]; \ + t = REDS2(n * alpha_tab[24 + 0 * 1]); \ + q[(rb) + 24 + 0] = m + t; \ + q[(rb) + 24 + 0 + 128] = m - t; \ + m = q[(rb) + 24 + 1]; \ + n = q[(rb) + 24 + 1 + 128]; \ + t = REDS2(n * alpha_tab[24 + 1 * 1]); \ + q[(rb) + 24 + 1] = m + t; \ + q[(rb) + 24 + 1 + 128] = m - t; \ + m = q[(rb) + 24 + 2]; \ + n = q[(rb) + 24 + 2 + 128]; \ + t = REDS2(n * alpha_tab[24 + 2 * 1]); \ + q[(rb) + 24 + 2] = m + t; \ + q[(rb) + 24 + 2 + 128] = m - t; \ + m = q[(rb) + 24 + 3]; \ + n = q[(rb) + 24 + 3 + 128]; \ + t = REDS2(n * alpha_tab[24 + 3 * 1]); \ + q[(rb) + 24 + 3] = m + t; \ + q[(rb) + 24 + 3 + 128] = m - t; \ + m = q[(rb) + 28 + 0]; \ + n = q[(rb) + 28 + 0 + 128]; \ + t = REDS2(n * alpha_tab[28 + 0 * 1]); \ + q[(rb) + 28 + 0] = m + t; \ + q[(rb) + 28 + 0 + 128] = m - t; \ + m = q[(rb) + 28 + 1]; \ + n = q[(rb) + 28 + 1 + 128]; \ + t = REDS2(n * alpha_tab[28 + 1 * 1]); \ + q[(rb) + 28 + 1] = m + t; \ 
+ q[(rb) + 28 + 1 + 128] = m - t; \ + m = q[(rb) + 28 + 2]; \ + n = q[(rb) + 28 + 2 + 128]; \ + t = REDS2(n * alpha_tab[28 + 2 * 1]); \ + q[(rb) + 28 + 2] = m + t; \ + q[(rb) + 28 + 2 + 128] = m - t; \ + m = q[(rb) + 28 + 3]; \ + n = q[(rb) + 28 + 3 + 128]; \ + t = REDS2(n * alpha_tab[28 + 3 * 1]); \ + q[(rb) + 28 + 3] = m + t; \ + q[(rb) + 28 + 3 + 128] = m - t; \ + m = q[(rb) + 32 + 0]; \ + n = q[(rb) + 32 + 0 + 128]; \ + t = REDS2(n * alpha_tab[32 + 0 * 1]); \ + q[(rb) + 32 + 0] = m + t; \ + q[(rb) + 32 + 0 + 128] = m - t; \ + m = q[(rb) + 32 + 1]; \ + n = q[(rb) + 32 + 1 + 128]; \ + t = REDS2(n * alpha_tab[32 + 1 * 1]); \ + q[(rb) + 32 + 1] = m + t; \ + q[(rb) + 32 + 1 + 128] = m - t; \ + m = q[(rb) + 32 + 2]; \ + n = q[(rb) + 32 + 2 + 128]; \ + t = REDS2(n * alpha_tab[32 + 2 * 1]); \ + q[(rb) + 32 + 2] = m + t; \ + q[(rb) + 32 + 2 + 128] = m - t; \ + m = q[(rb) + 32 + 3]; \ + n = q[(rb) + 32 + 3 + 128]; \ + t = REDS2(n * alpha_tab[32 + 3 * 1]); \ + q[(rb) + 32 + 3] = m + t; \ + q[(rb) + 32 + 3 + 128] = m - t; \ + m = q[(rb) + 36 + 0]; \ + n = q[(rb) + 36 + 0 + 128]; \ + t = REDS2(n * alpha_tab[36 + 0 * 1]); \ + q[(rb) + 36 + 0] = m + t; \ + q[(rb) + 36 + 0 + 128] = m - t; \ + m = q[(rb) + 36 + 1]; \ + n = q[(rb) + 36 + 1 + 128]; \ + t = REDS2(n * alpha_tab[36 + 1 * 1]); \ + q[(rb) + 36 + 1] = m + t; \ + q[(rb) + 36 + 1 + 128] = m - t; \ + m = q[(rb) + 36 + 2]; \ + n = q[(rb) + 36 + 2 + 128]; \ + t = REDS2(n * alpha_tab[36 + 2 * 1]); \ + q[(rb) + 36 + 2] = m + t; \ + q[(rb) + 36 + 2 + 128] = m - t; \ + m = q[(rb) + 36 + 3]; \ + n = q[(rb) + 36 + 3 + 128]; \ + t = REDS2(n * alpha_tab[36 + 3 * 1]); \ + q[(rb) + 36 + 3] = m + t; \ + q[(rb) + 36 + 3 + 128] = m - t; \ + m = q[(rb) + 40 + 0]; \ + n = q[(rb) + 40 + 0 + 128]; \ + t = REDS2(n * alpha_tab[40 + 0 * 1]); \ + q[(rb) + 40 + 0] = m + t; \ + q[(rb) + 40 + 0 + 128] = m - t; \ + m = q[(rb) + 40 + 1]; \ + n = q[(rb) + 40 + 1 + 128]; \ + t = REDS2(n * alpha_tab[40 + 1 * 1]); \ + q[(rb) + 40 + 1] = m + t; \ + q[(rb) + 40 + 1 + 128] = m - t; \ + m = q[(rb) + 40 + 2]; \ + n = q[(rb) + 40 + 2 + 128]; \ + t = REDS2(n * alpha_tab[40 + 2 * 1]); \ + q[(rb) + 40 + 2] = m + t; \ + q[(rb) + 40 + 2 + 128] = m - t; \ + m = q[(rb) + 40 + 3]; \ + n = q[(rb) + 40 + 3 + 128]; \ + t = REDS2(n * alpha_tab[40 + 3 * 1]); \ + q[(rb) + 40 + 3] = m + t; \ + q[(rb) + 40 + 3 + 128] = m - t; \ + m = q[(rb) + 44 + 0]; \ + n = q[(rb) + 44 + 0 + 128]; \ + t = REDS2(n * alpha_tab[44 + 0 * 1]); \ + q[(rb) + 44 + 0] = m + t; \ + q[(rb) + 44 + 0 + 128] = m - t; \ + m = q[(rb) + 44 + 1]; \ + n = q[(rb) + 44 + 1 + 128]; \ + t = REDS2(n * alpha_tab[44 + 1 * 1]); \ + q[(rb) + 44 + 1] = m + t; \ + q[(rb) + 44 + 1 + 128] = m - t; \ + m = q[(rb) + 44 + 2]; \ + n = q[(rb) + 44 + 2 + 128]; \ + t = REDS2(n * alpha_tab[44 + 2 * 1]); \ + q[(rb) + 44 + 2] = m + t; \ + q[(rb) + 44 + 2 + 128] = m - t; \ + m = q[(rb) + 44 + 3]; \ + n = q[(rb) + 44 + 3 + 128]; \ + t = REDS2(n * alpha_tab[44 + 3 * 1]); \ + q[(rb) + 44 + 3] = m + t; \ + q[(rb) + 44 + 3 + 128] = m - t; \ + m = q[(rb) + 48 + 0]; \ + n = q[(rb) + 48 + 0 + 128]; \ + t = REDS2(n * alpha_tab[48 + 0 * 1]); \ + q[(rb) + 48 + 0] = m + t; \ + q[(rb) + 48 + 0 + 128] = m - t; \ + m = q[(rb) + 48 + 1]; \ + n = q[(rb) + 48 + 1 + 128]; \ + t = REDS2(n * alpha_tab[48 + 1 * 1]); \ + q[(rb) + 48 + 1] = m + t; \ + q[(rb) + 48 + 1 + 128] = m - t; \ + m = q[(rb) + 48 + 2]; \ + n = q[(rb) + 48 + 2 + 128]; \ + t = REDS2(n * alpha_tab[48 + 2 * 1]); \ + q[(rb) + 48 + 2] = m + t; \ + q[(rb) + 48 + 2 + 128] = m - t; \ + m = q[(rb) + 48 + 3]; \ + n = 
q[(rb) + 48 + 3 + 128]; \ + t = REDS2(n * alpha_tab[48 + 3 * 1]); \ + q[(rb) + 48 + 3] = m + t; \ + q[(rb) + 48 + 3 + 128] = m - t; \ + m = q[(rb) + 52 + 0]; \ + n = q[(rb) + 52 + 0 + 128]; \ + t = REDS2(n * alpha_tab[52 + 0 * 1]); \ + q[(rb) + 52 + 0] = m + t; \ + q[(rb) + 52 + 0 + 128] = m - t; \ + m = q[(rb) + 52 + 1]; \ + n = q[(rb) + 52 + 1 + 128]; \ + t = REDS2(n * alpha_tab[52 + 1 * 1]); \ + q[(rb) + 52 + 1] = m + t; \ + q[(rb) + 52 + 1 + 128] = m - t; \ + m = q[(rb) + 52 + 2]; \ + n = q[(rb) + 52 + 2 + 128]; \ + t = REDS2(n * alpha_tab[52 + 2 * 1]); \ + q[(rb) + 52 + 2] = m + t; \ + q[(rb) + 52 + 2 + 128] = m - t; \ + m = q[(rb) + 52 + 3]; \ + n = q[(rb) + 52 + 3 + 128]; \ + t = REDS2(n * alpha_tab[52 + 3 * 1]); \ + q[(rb) + 52 + 3] = m + t; \ + q[(rb) + 52 + 3 + 128] = m - t; \ + m = q[(rb) + 56 + 0]; \ + n = q[(rb) + 56 + 0 + 128]; \ + t = REDS2(n * alpha_tab[56 + 0 * 1]); \ + q[(rb) + 56 + 0] = m + t; \ + q[(rb) + 56 + 0 + 128] = m - t; \ + m = q[(rb) + 56 + 1]; \ + n = q[(rb) + 56 + 1 + 128]; \ + t = REDS2(n * alpha_tab[56 + 1 * 1]); \ + q[(rb) + 56 + 1] = m + t; \ + q[(rb) + 56 + 1 + 128] = m - t; \ + m = q[(rb) + 56 + 2]; \ + n = q[(rb) + 56 + 2 + 128]; \ + t = REDS2(n * alpha_tab[56 + 2 * 1]); \ + q[(rb) + 56 + 2] = m + t; \ + q[(rb) + 56 + 2 + 128] = m - t; \ + m = q[(rb) + 56 + 3]; \ + n = q[(rb) + 56 + 3 + 128]; \ + t = REDS2(n * alpha_tab[56 + 3 * 1]); \ + q[(rb) + 56 + 3] = m + t; \ + q[(rb) + 56 + 3 + 128] = m - t; \ + m = q[(rb) + 60 + 0]; \ + n = q[(rb) + 60 + 0 + 128]; \ + t = REDS2(n * alpha_tab[60 + 0 * 1]); \ + q[(rb) + 60 + 0] = m + t; \ + q[(rb) + 60 + 0 + 128] = m - t; \ + m = q[(rb) + 60 + 1]; \ + n = q[(rb) + 60 + 1 + 128]; \ + t = REDS2(n * alpha_tab[60 + 1 * 1]); \ + q[(rb) + 60 + 1] = m + t; \ + q[(rb) + 60 + 1 + 128] = m - t; \ + m = q[(rb) + 60 + 2]; \ + n = q[(rb) + 60 + 2 + 128]; \ + t = REDS2(n * alpha_tab[60 + 2 * 1]); \ + q[(rb) + 60 + 2] = m + t; \ + q[(rb) + 60 + 2 + 128] = m - t; \ + m = q[(rb) + 60 + 3]; \ + n = q[(rb) + 60 + 3 + 128]; \ + t = REDS2(n * alpha_tab[60 + 3 * 1]); \ + q[(rb) + 60 + 3] = m + t; \ + q[(rb) + 60 + 3 + 128] = m - t; \ + m = q[(rb) + 64 + 0]; \ + n = q[(rb) + 64 + 0 + 128]; \ + t = REDS2(n * alpha_tab[64 + 0 * 1]); \ + q[(rb) + 64 + 0] = m + t; \ + q[(rb) + 64 + 0 + 128] = m - t; \ + m = q[(rb) + 64 + 1]; \ + n = q[(rb) + 64 + 1 + 128]; \ + t = REDS2(n * alpha_tab[64 + 1 * 1]); \ + q[(rb) + 64 + 1] = m + t; \ + q[(rb) + 64 + 1 + 128] = m - t; \ + m = q[(rb) + 64 + 2]; \ + n = q[(rb) + 64 + 2 + 128]; \ + t = REDS2(n * alpha_tab[64 + 2 * 1]); \ + q[(rb) + 64 + 2] = m + t; \ + q[(rb) + 64 + 2 + 128] = m - t; \ + m = q[(rb) + 64 + 3]; \ + n = q[(rb) + 64 + 3 + 128]; \ + t = REDS2(n * alpha_tab[64 + 3 * 1]); \ + q[(rb) + 64 + 3] = m + t; \ + q[(rb) + 64 + 3 + 128] = m - t; \ + m = q[(rb) + 68 + 0]; \ + n = q[(rb) + 68 + 0 + 128]; \ + t = REDS2(n * alpha_tab[68 + 0 * 1]); \ + q[(rb) + 68 + 0] = m + t; \ + q[(rb) + 68 + 0 + 128] = m - t; \ + m = q[(rb) + 68 + 1]; \ + n = q[(rb) + 68 + 1 + 128]; \ + t = REDS2(n * alpha_tab[68 + 1 * 1]); \ + q[(rb) + 68 + 1] = m + t; \ + q[(rb) + 68 + 1 + 128] = m - t; \ + m = q[(rb) + 68 + 2]; \ + n = q[(rb) + 68 + 2 + 128]; \ + t = REDS2(n * alpha_tab[68 + 2 * 1]); \ + q[(rb) + 68 + 2] = m + t; \ + q[(rb) + 68 + 2 + 128] = m - t; \ + m = q[(rb) + 68 + 3]; \ + n = q[(rb) + 68 + 3 + 128]; \ + t = REDS2(n * alpha_tab[68 + 3 * 1]); \ + q[(rb) + 68 + 3] = m + t; \ + q[(rb) + 68 + 3 + 128] = m - t; \ + m = q[(rb) + 72 + 0]; \ + n = q[(rb) + 72 + 0 + 128]; \ + t = REDS2(n * alpha_tab[72 + 0 * 1]); \ 
+ q[(rb) + 72 + 0] = m + t; \ + q[(rb) + 72 + 0 + 128] = m - t; \ + m = q[(rb) + 72 + 1]; \ + n = q[(rb) + 72 + 1 + 128]; \ + t = REDS2(n * alpha_tab[72 + 1 * 1]); \ + q[(rb) + 72 + 1] = m + t; \ + q[(rb) + 72 + 1 + 128] = m - t; \ + m = q[(rb) + 72 + 2]; \ + n = q[(rb) + 72 + 2 + 128]; \ + t = REDS2(n * alpha_tab[72 + 2 * 1]); \ + q[(rb) + 72 + 2] = m + t; \ + q[(rb) + 72 + 2 + 128] = m - t; \ + m = q[(rb) + 72 + 3]; \ + n = q[(rb) + 72 + 3 + 128]; \ + t = REDS2(n * alpha_tab[72 + 3 * 1]); \ + q[(rb) + 72 + 3] = m + t; \ + q[(rb) + 72 + 3 + 128] = m - t; \ + m = q[(rb) + 76 + 0]; \ + n = q[(rb) + 76 + 0 + 128]; \ + t = REDS2(n * alpha_tab[76 + 0 * 1]); \ + q[(rb) + 76 + 0] = m + t; \ + q[(rb) + 76 + 0 + 128] = m - t; \ + m = q[(rb) + 76 + 1]; \ + n = q[(rb) + 76 + 1 + 128]; \ + t = REDS2(n * alpha_tab[76 + 1 * 1]); \ + q[(rb) + 76 + 1] = m + t; \ + q[(rb) + 76 + 1 + 128] = m - t; \ + m = q[(rb) + 76 + 2]; \ + n = q[(rb) + 76 + 2 + 128]; \ + t = REDS2(n * alpha_tab[76 + 2 * 1]); \ + q[(rb) + 76 + 2] = m + t; \ + q[(rb) + 76 + 2 + 128] = m - t; \ + m = q[(rb) + 76 + 3]; \ + n = q[(rb) + 76 + 3 + 128]; \ + t = REDS2(n * alpha_tab[76 + 3 * 1]); \ + q[(rb) + 76 + 3] = m + t; \ + q[(rb) + 76 + 3 + 128] = m - t; \ + m = q[(rb) + 80 + 0]; \ + n = q[(rb) + 80 + 0 + 128]; \ + t = REDS2(n * alpha_tab[80 + 0 * 1]); \ + q[(rb) + 80 + 0] = m + t; \ + q[(rb) + 80 + 0 + 128] = m - t; \ + m = q[(rb) + 80 + 1]; \ + n = q[(rb) + 80 + 1 + 128]; \ + t = REDS2(n * alpha_tab[80 + 1 * 1]); \ + q[(rb) + 80 + 1] = m + t; \ + q[(rb) + 80 + 1 + 128] = m - t; \ + m = q[(rb) + 80 + 2]; \ + n = q[(rb) + 80 + 2 + 128]; \ + t = REDS2(n * alpha_tab[80 + 2 * 1]); \ + q[(rb) + 80 + 2] = m + t; \ + q[(rb) + 80 + 2 + 128] = m - t; \ + m = q[(rb) + 80 + 3]; \ + n = q[(rb) + 80 + 3 + 128]; \ + t = REDS2(n * alpha_tab[80 + 3 * 1]); \ + q[(rb) + 80 + 3] = m + t; \ + q[(rb) + 80 + 3 + 128] = m - t; \ + m = q[(rb) + 84 + 0]; \ + n = q[(rb) + 84 + 0 + 128]; \ + t = REDS2(n * alpha_tab[84 + 0 * 1]); \ + q[(rb) + 84 + 0] = m + t; \ + q[(rb) + 84 + 0 + 128] = m - t; \ + m = q[(rb) + 84 + 1]; \ + n = q[(rb) + 84 + 1 + 128]; \ + t = REDS2(n * alpha_tab[84 + 1 * 1]); \ + q[(rb) + 84 + 1] = m + t; \ + q[(rb) + 84 + 1 + 128] = m - t; \ + m = q[(rb) + 84 + 2]; \ + n = q[(rb) + 84 + 2 + 128]; \ + t = REDS2(n * alpha_tab[84 + 2 * 1]); \ + q[(rb) + 84 + 2] = m + t; \ + q[(rb) + 84 + 2 + 128] = m - t; \ + m = q[(rb) + 84 + 3]; \ + n = q[(rb) + 84 + 3 + 128]; \ + t = REDS2(n * alpha_tab[84 + 3 * 1]); \ + q[(rb) + 84 + 3] = m + t; \ + q[(rb) + 84 + 3 + 128] = m - t; \ + m = q[(rb) + 88 + 0]; \ + n = q[(rb) + 88 + 0 + 128]; \ + t = REDS2(n * alpha_tab[88 + 0 * 1]); \ + q[(rb) + 88 + 0] = m + t; \ + q[(rb) + 88 + 0 + 128] = m - t; \ + m = q[(rb) + 88 + 1]; \ + n = q[(rb) + 88 + 1 + 128]; \ + t = REDS2(n * alpha_tab[88 + 1 * 1]); \ + q[(rb) + 88 + 1] = m + t; \ + q[(rb) + 88 + 1 + 128] = m - t; \ + m = q[(rb) + 88 + 2]; \ + n = q[(rb) + 88 + 2 + 128]; \ + t = REDS2(n * alpha_tab[88 + 2 * 1]); \ + q[(rb) + 88 + 2] = m + t; \ + q[(rb) + 88 + 2 + 128] = m - t; \ + m = q[(rb) + 88 + 3]; \ + n = q[(rb) + 88 + 3 + 128]; \ + t = REDS2(n * alpha_tab[88 + 3 * 1]); \ + q[(rb) + 88 + 3] = m + t; \ + q[(rb) + 88 + 3 + 128] = m - t; \ + m = q[(rb) + 92 + 0]; \ + n = q[(rb) + 92 + 0 + 128]; \ + t = REDS2(n * alpha_tab[92 + 0 * 1]); \ + q[(rb) + 92 + 0] = m + t; \ + q[(rb) + 92 + 0 + 128] = m - t; \ + m = q[(rb) + 92 + 1]; \ + n = q[(rb) + 92 + 1 + 128]; \ + t = REDS2(n * alpha_tab[92 + 1 * 1]); \ + q[(rb) + 92 + 1] = m + t; \ + q[(rb) + 92 + 1 + 128] = m - t; \ + 
m = q[(rb) + 92 + 2]; \ + n = q[(rb) + 92 + 2 + 128]; \ + t = REDS2(n * alpha_tab[92 + 2 * 1]); \ + q[(rb) + 92 + 2] = m + t; \ + q[(rb) + 92 + 2 + 128] = m - t; \ + m = q[(rb) + 92 + 3]; \ + n = q[(rb) + 92 + 3 + 128]; \ + t = REDS2(n * alpha_tab[92 + 3 * 1]); \ + q[(rb) + 92 + 3] = m + t; \ + q[(rb) + 92 + 3 + 128] = m - t; \ + m = q[(rb) + 96 + 0]; \ + n = q[(rb) + 96 + 0 + 128]; \ + t = REDS2(n * alpha_tab[96 + 0 * 1]); \ + q[(rb) + 96 + 0] = m + t; \ + q[(rb) + 96 + 0 + 128] = m - t; \ + m = q[(rb) + 96 + 1]; \ + n = q[(rb) + 96 + 1 + 128]; \ + t = REDS2(n * alpha_tab[96 + 1 * 1]); \ + q[(rb) + 96 + 1] = m + t; \ + q[(rb) + 96 + 1 + 128] = m - t; \ + m = q[(rb) + 96 + 2]; \ + n = q[(rb) + 96 + 2 + 128]; \ + t = REDS2(n * alpha_tab[96 + 2 * 1]); \ + q[(rb) + 96 + 2] = m + t; \ + q[(rb) + 96 + 2 + 128] = m - t; \ + m = q[(rb) + 96 + 3]; \ + n = q[(rb) + 96 + 3 + 128]; \ + t = REDS2(n * alpha_tab[96 + 3 * 1]); \ + q[(rb) + 96 + 3] = m + t; \ + q[(rb) + 96 + 3 + 128] = m - t; \ + m = q[(rb) + 100 + 0]; \ + n = q[(rb) + 100 + 0 + 128]; \ + t = REDS2(n * alpha_tab[100 + 0 * 1]); \ + q[(rb) + 100 + 0] = m + t; \ + q[(rb) + 100 + 0 + 128] = m - t; \ + m = q[(rb) + 100 + 1]; \ + n = q[(rb) + 100 + 1 + 128]; \ + t = REDS2(n * alpha_tab[100 + 1 * 1]); \ + q[(rb) + 100 + 1] = m + t; \ + q[(rb) + 100 + 1 + 128] = m - t; \ + m = q[(rb) + 100 + 2]; \ + n = q[(rb) + 100 + 2 + 128]; \ + t = REDS2(n * alpha_tab[100 + 2 * 1]); \ + q[(rb) + 100 + 2] = m + t; \ + q[(rb) + 100 + 2 + 128] = m - t; \ + m = q[(rb) + 100 + 3]; \ + n = q[(rb) + 100 + 3 + 128]; \ + t = REDS2(n * alpha_tab[100 + 3 * 1]); \ + q[(rb) + 100 + 3] = m + t; \ + q[(rb) + 100 + 3 + 128] = m - t; \ + m = q[(rb) + 104 + 0]; \ + n = q[(rb) + 104 + 0 + 128]; \ + t = REDS2(n * alpha_tab[104 + 0 * 1]); \ + q[(rb) + 104 + 0] = m + t; \ + q[(rb) + 104 + 0 + 128] = m - t; \ + m = q[(rb) + 104 + 1]; \ + n = q[(rb) + 104 + 1 + 128]; \ + t = REDS2(n * alpha_tab[104 + 1 * 1]); \ + q[(rb) + 104 + 1] = m + t; \ + q[(rb) + 104 + 1 + 128] = m - t; \ + m = q[(rb) + 104 + 2]; \ + n = q[(rb) + 104 + 2 + 128]; \ + t = REDS2(n * alpha_tab[104 + 2 * 1]); \ + q[(rb) + 104 + 2] = m + t; \ + q[(rb) + 104 + 2 + 128] = m - t; \ + m = q[(rb) + 104 + 3]; \ + n = q[(rb) + 104 + 3 + 128]; \ + t = REDS2(n * alpha_tab[104 + 3 * 1]); \ + q[(rb) + 104 + 3] = m + t; \ + q[(rb) + 104 + 3 + 128] = m - t; \ + m = q[(rb) + 108 + 0]; \ + n = q[(rb) + 108 + 0 + 128]; \ + t = REDS2(n * alpha_tab[108 + 0 * 1]); \ + q[(rb) + 108 + 0] = m + t; \ + q[(rb) + 108 + 0 + 128] = m - t; \ + m = q[(rb) + 108 + 1]; \ + n = q[(rb) + 108 + 1 + 128]; \ + t = REDS2(n * alpha_tab[108 + 1 * 1]); \ + q[(rb) + 108 + 1] = m + t; \ + q[(rb) + 108 + 1 + 128] = m - t; \ + m = q[(rb) + 108 + 2]; \ + n = q[(rb) + 108 + 2 + 128]; \ + t = REDS2(n * alpha_tab[108 + 2 * 1]); \ + q[(rb) + 108 + 2] = m + t; \ + q[(rb) + 108 + 2 + 128] = m - t; \ + m = q[(rb) + 108 + 3]; \ + n = q[(rb) + 108 + 3 + 128]; \ + t = REDS2(n * alpha_tab[108 + 3 * 1]); \ + q[(rb) + 108 + 3] = m + t; \ + q[(rb) + 108 + 3 + 128] = m - t; \ + m = q[(rb) + 112 + 0]; \ + n = q[(rb) + 112 + 0 + 128]; \ + t = REDS2(n * alpha_tab[112 + 0 * 1]); \ + q[(rb) + 112 + 0] = m + t; \ + q[(rb) + 112 + 0 + 128] = m - t; \ + m = q[(rb) + 112 + 1]; \ + n = q[(rb) + 112 + 1 + 128]; \ + t = REDS2(n * alpha_tab[112 + 1 * 1]); \ + q[(rb) + 112 + 1] = m + t; \ + q[(rb) + 112 + 1 + 128] = m - t; \ + m = q[(rb) + 112 + 2]; \ + n = q[(rb) + 112 + 2 + 128]; \ + t = REDS2(n * alpha_tab[112 + 2 * 1]); \ + q[(rb) + 112 + 2] = m + t; \ + q[(rb) + 112 + 2 + 128] = m - 
t; \ + m = q[(rb) + 112 + 3]; \ + n = q[(rb) + 112 + 3 + 128]; \ + t = REDS2(n * alpha_tab[112 + 3 * 1]); \ + q[(rb) + 112 + 3] = m + t; \ + q[(rb) + 112 + 3 + 128] = m - t; \ + m = q[(rb) + 116 + 0]; \ + n = q[(rb) + 116 + 0 + 128]; \ + t = REDS2(n * alpha_tab[116 + 0 * 1]); \ + q[(rb) + 116 + 0] = m + t; \ + q[(rb) + 116 + 0 + 128] = m - t; \ + m = q[(rb) + 116 + 1]; \ + n = q[(rb) + 116 + 1 + 128]; \ + t = REDS2(n * alpha_tab[116 + 1 * 1]); \ + q[(rb) + 116 + 1] = m + t; \ + q[(rb) + 116 + 1 + 128] = m - t; \ + m = q[(rb) + 116 + 2]; \ + n = q[(rb) + 116 + 2 + 128]; \ + t = REDS2(n * alpha_tab[116 + 2 * 1]); \ + q[(rb) + 116 + 2] = m + t; \ + q[(rb) + 116 + 2 + 128] = m - t; \ + m = q[(rb) + 116 + 3]; \ + n = q[(rb) + 116 + 3 + 128]; \ + t = REDS2(n * alpha_tab[116 + 3 * 1]); \ + q[(rb) + 116 + 3] = m + t; \ + q[(rb) + 116 + 3 + 128] = m - t; \ + m = q[(rb) + 120 + 0]; \ + n = q[(rb) + 120 + 0 + 128]; \ + t = REDS2(n * alpha_tab[120 + 0 * 1]); \ + q[(rb) + 120 + 0] = m + t; \ + q[(rb) + 120 + 0 + 128] = m - t; \ + m = q[(rb) + 120 + 1]; \ + n = q[(rb) + 120 + 1 + 128]; \ + t = REDS2(n * alpha_tab[120 + 1 * 1]); \ + q[(rb) + 120 + 1] = m + t; \ + q[(rb) + 120 + 1 + 128] = m - t; \ + m = q[(rb) + 120 + 2]; \ + n = q[(rb) + 120 + 2 + 128]; \ + t = REDS2(n * alpha_tab[120 + 2 * 1]); \ + q[(rb) + 120 + 2] = m + t; \ + q[(rb) + 120 + 2 + 128] = m - t; \ + m = q[(rb) + 120 + 3]; \ + n = q[(rb) + 120 + 3 + 128]; \ + t = REDS2(n * alpha_tab[120 + 3 * 1]); \ + q[(rb) + 120 + 3] = m + t; \ + q[(rb) + 120 + 3 + 128] = m - t; \ + m = q[(rb) + 124 + 0]; \ + n = q[(rb) + 124 + 0 + 128]; \ + t = REDS2(n * alpha_tab[124 + 0 * 1]); \ + q[(rb) + 124 + 0] = m + t; \ + q[(rb) + 124 + 0 + 128] = m - t; \ + m = q[(rb) + 124 + 1]; \ + n = q[(rb) + 124 + 1 + 128]; \ + t = REDS2(n * alpha_tab[124 + 1 * 1]); \ + q[(rb) + 124 + 1] = m + t; \ + q[(rb) + 124 + 1 + 128] = m - t; \ + m = q[(rb) + 124 + 2]; \ + n = q[(rb) + 124 + 2 + 128]; \ + t = REDS2(n * alpha_tab[124 + 2 * 1]); \ + q[(rb) + 124 + 2] = m + t; \ + q[(rb) + 124 + 2 + 128] = m - t; \ + m = q[(rb) + 124 + 3]; \ + n = q[(rb) + 124 + 3 + 128]; \ + t = REDS2(n * alpha_tab[124 + 3 * 1]); \ + q[(rb) + 124 + 3] = m + t; \ + q[(rb) + 124 + 3 + 128] = m - t; \ + } while (0) + +/* + * Output ranges: + * d0: min= 0 max= 1020 + * d1: min= -67 max= 4587 + * d2: min=-4335 max= 4335 + * d3: min=-4147 max= 507 + * d4: min= -510 max= 510 + * d5: min= -252 max= 4402 + * d6: min=-4335 max= 4335 + * d7: min=-4332 max= 322 + */ +#define FFT8(xb, xs, d) do { \ + s32 x0 = x[(xb)]; \ + s32 x1 = x[(xb) + (xs)]; \ + s32 x2 = x[(xb) + 2 * (xs)]; \ + s32 x3 = x[(xb) + 3 * (xs)]; \ + s32 a0 = x0 + x2; \ + s32 a1 = x0 + (x2 << 4); \ + s32 a2 = x0 - x2; \ + s32 a3 = x0 - (x2 << 4); \ + s32 b0 = x1 + x3; \ + s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \ + s32 b2 = (x1 << 4) - (x3 << 4); \ + s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \ + d ## 0 = a0 + b0; \ + d ## 1 = a1 + b1; \ + d ## 2 = a2 + b2; \ + d ## 3 = a3 + b3; \ + d ## 4 = a0 - b0; \ + d ## 5 = a1 - b1; \ + d ## 6 = a2 - b2; \ + d ## 7 = a3 - b3; \ + } while (0) + +/* + * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced + * to some shifting. 
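+ * (With alpha = 2, alpha^i * x is just (x << i): the FFT16 butterflies
+ * below therefore use plain shifts, with no alpha_tab lookup or REDS2
+ * reduction needed.)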
+ * + * Output: within -591471..591723 + */ +#define FFT16(xb, xs, rb) do { \ + s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \ + s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \ + FFT8(xb, (xs) << 1, d1_); \ + FFT8((xb) + (xs), (xs) << 1, d2_); \ + q[(rb) + 0] = d1_0 + d2_0; \ + q[(rb) + 1] = d1_1 + (d2_1 << 1); \ + q[(rb) + 2] = d1_2 + (d2_2 << 2); \ + q[(rb) + 3] = d1_3 + (d2_3 << 3); \ + q[(rb) + 4] = d1_4 + (d2_4 << 4); \ + q[(rb) + 5] = d1_5 + (d2_5 << 5); \ + q[(rb) + 6] = d1_6 + (d2_6 << 6); \ + q[(rb) + 7] = d1_7 + (d2_7 << 7); \ + q[(rb) + 8] = d1_0 - d2_0; \ + q[(rb) + 9] = d1_1 - (d2_1 << 1); \ + q[(rb) + 10] = d1_2 - (d2_2 << 2); \ + q[(rb) + 11] = d1_3 - (d2_3 << 3); \ + q[(rb) + 12] = d1_4 - (d2_4 << 4); \ + q[(rb) + 13] = d1_5 - (d2_5 << 5); \ + q[(rb) + 14] = d1_6 - (d2_6 << 6); \ + q[(rb) + 15] = d1_7 - (d2_7 << 7); \ + } while (0) + +/* + * Output range: |q| <= 1183446 + */ +#define FFT32(xb, xs, rb, id) do { \ + FFT16(xb, (xs) << 1, rb); \ + FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \ + FFT_LOOP_16_8(rb); \ + } while (0) + +/* + * Output range: |q| <= 2366892 + */ +#define FFT64(xb, xs, rb) do { \ + FFT32(xb, (xs) << 1, (rb), label_a); \ + FFT32((xb) + (xs), (xs) << 1, (rb) + 32, label_b); \ + FFT_LOOP_32_4(rb); \ + } while (0) + +/* + * Output range: |q| <= 9467568 + */ +#define FFT256(xb, xs, rb, id) do { \ + FFT64((xb) + ((xs) * 0), (xs) << 2, (rb + 0)); \ + FFT64((xb) + ((xs) * 2), (xs) << 2, (rb + 64)); \ + FFT_LOOP_64_2(rb); \ + FFT64((xb) + ((xs) * 1), (xs) << 2, (rb + 128)); \ + FFT64((xb) + ((xs) * 3), (xs) << 2, (rb + 192)); \ + FFT_LOOP_64_2((rb) + 128); \ + FFT_LOOP_128_1(rb); \ + } while (0) + +/* + * beta^(255*i) mod 257 + */ +__constant__ static const unsigned short yoff_b_n[] = { + 1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172, + 23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101, + 15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10, + 88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230, + 225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150, + 35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109, + 34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194, + 11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93, + 253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83, + 165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110, + 197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217, + 162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108, + 128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171, + 117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78, + 121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252, + 213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142, + 16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182, + 111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74, + 240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160, + 123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82, + 2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87, + 46, 45, 139, 41 +}; + +#define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) \ + + ((u32)((h) * (mm)) << 16)) + +#define W_BIG(sb, o1, o2, mm) \ + (INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 6 + o1], 
q[16 * (sb) + 2 * 6 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm) + +#define WB_0_0 W_BIG( 4, 0, 1, 185) +#define WB_0_1 W_BIG( 6, 0, 1, 185) +#define WB_0_2 W_BIG( 0, 0, 1, 185) +#define WB_0_3 W_BIG( 2, 0, 1, 185) +#define WB_0_4 W_BIG( 7, 0, 1, 185) +#define WB_0_5 W_BIG( 5, 0, 1, 185) +#define WB_0_6 W_BIG( 3, 0, 1, 185) +#define WB_0_7 W_BIG( 1, 0, 1, 185) +#define WB_1_0 W_BIG(15, 0, 1, 185) +#define WB_1_1 W_BIG(11, 0, 1, 185) +#define WB_1_2 W_BIG(12, 0, 1, 185) +#define WB_1_3 W_BIG( 8, 0, 1, 185) +#define WB_1_4 W_BIG( 9, 0, 1, 185) +#define WB_1_5 W_BIG(13, 0, 1, 185) +#define WB_1_6 W_BIG(10, 0, 1, 185) +#define WB_1_7 W_BIG(14, 0, 1, 185) +#define WB_2_0 W_BIG(17, -256, -128, 233) +#define WB_2_1 W_BIG(18, -256, -128, 233) +#define WB_2_2 W_BIG(23, -256, -128, 233) +#define WB_2_3 W_BIG(20, -256, -128, 233) +#define WB_2_4 W_BIG(22, -256, -128, 233) +#define WB_2_5 W_BIG(21, -256, -128, 233) +#define WB_2_6 W_BIG(16, -256, -128, 233) +#define WB_2_7 W_BIG(19, -256, -128, 233) +#define WB_3_0 W_BIG(30, -383, -255, 233) +#define WB_3_1 W_BIG(24, -383, -255, 233) +#define WB_3_2 W_BIG(25, -383, -255, 233) +#define WB_3_3 W_BIG(31, -383, -255, 233) +#define WB_3_4 W_BIG(27, -383, -255, 233) +#define WB_3_5 W_BIG(29, -383, -255, 233) +#define WB_3_6 W_BIG(28, -383, -255, 233) +#define WB_3_7 W_BIG(26, -383, -255, 233) + +#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) + +#define PP4_0_0 1 +#define PP4_0_1 0 +#define PP4_0_2 3 +#define PP4_0_3 2 +#define PP4_1_0 2 +#define PP4_1_1 3 +#define PP4_1_2 0 +#define PP4_1_3 1 +#define PP4_2_0 3 +#define PP4_2_1 2 +#define PP4_2_2 1 +#define PP4_2_3 0 + +#define PP8_0_0 1 +#define PP8_0_1 0 +#define PP8_0_2 3 +#define PP8_0_3 2 +#define PP8_0_4 5 +#define PP8_0_5 4 +#define PP8_0_6 7 +#define PP8_0_7 6 + +#define PP8_1_0 6 +#define PP8_1_1 7 +#define PP8_1_2 4 +#define PP8_1_3 5 +#define PP8_1_4 2 +#define PP8_1_5 3 +#define PP8_1_6 0 +#define PP8_1_7 1 + +#define PP8_2_0 2 +#define PP8_2_1 3 +#define PP8_2_2 0 +#define PP8_2_3 1 +#define PP8_2_4 6 +#define PP8_2_5 7 +#define PP8_2_6 4 +#define PP8_2_7 5 + +#define PP8_3_0 3 +#define PP8_3_1 2 +#define PP8_3_2 1 +#define PP8_3_3 0 +#define PP8_3_4 7 +#define PP8_3_5 6 +#define PP8_3_6 5 +#define PP8_3_7 4 + +#define PP8_4_0 5 +#define PP8_4_1 4 +#define PP8_4_2 7 +#define PP8_4_3 6 +#define PP8_4_4 1 +#define PP8_4_5 0 +#define PP8_4_6 3 +#define PP8_4_7 2 + +#define PP8_5_0 7 +#define PP8_5_1 6 +#define PP8_5_2 5 +#define PP8_5_3 4 +#define PP8_5_4 3 +#define PP8_5_5 2 +#define PP8_5_6 1 +#define PP8_5_7 0 + +#define PP8_6_0 4 +#define PP8_6_1 5 +#define PP8_6_2 6 +#define PP8_6_3 7 +#define PP8_6_4 0 +#define PP8_6_5 1 +#define PP8_6_6 2 +#define PP8_6_7 3 + +#define STEP_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA ## n; \ + } while (0) + +#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \ + u32 tA0 = ROL32(A0, r); \ + u32 tA1 = ROL32(A1, r); \ + u32 tA2 = ROL32(A2, r); \ + u32 tA3 = ROL32(A3, r); \ + u32 tA4 = ROL32(A4, r); \ + u32 tA5 = ROL32(A5, r); \ + u32 tA6 = ROL32(A6, r); \ + u32 tA7 = ROL32(A7, r); \ + STEP_ELT(0, w0, fun, s, pp8b); \ + STEP_ELT(1, w1, fun, s, pp8b); \ + STEP_ELT(2, w2, fun, s, pp8b); \ + STEP_ELT(3, w3, fun, s, pp8b); \ + STEP_ELT(4, w4, fun, s, pp8b); \ + STEP_ELT(5, w5, fun, s, pp8b); \ + 
STEP_ELT(6, w6, fun, s, pp8b); \ + STEP_ELT(7, w7, fun, s, pp8b); \ + } while (0) + +#define SIMD_M3_0_0 0_ +#define SIMD_M3_1_0 1_ +#define SIMD_M3_2_0 2_ +#define SIMD_M3_3_0 0_ +#define SIMD_M3_4_0 1_ +#define SIMD_M3_5_0 2_ +#define SIMD_M3_6_0 0_ +#define SIMD_M3_7_0 1_ + +#define SIMD_M3_0_1 1_ +#define SIMD_M3_1_1 2_ +#define SIMD_M3_2_1 0_ +#define SIMD_M3_3_1 1_ +#define SIMD_M3_4_1 2_ +#define SIMD_M3_5_1 0_ +#define SIMD_M3_6_1 1_ +#define SIMD_M3_7_1 2_ + +#define SIMD_M3_0_2 2_ +#define SIMD_M3_1_2 0_ +#define SIMD_M3_2_2 1_ +#define SIMD_M3_3_2 2_ +#define SIMD_M3_4_2 0_ +#define SIMD_M3_5_2 1_ +#define SIMD_M3_6_2 2_ +#define SIMD_M3_7_2 0_ + +#define M7_0_0 0_ +#define M7_1_0 1_ +#define M7_2_0 2_ +#define M7_3_0 3_ +#define M7_4_0 4_ +#define M7_5_0 5_ +#define M7_6_0 6_ +#define M7_7_0 0_ + +#define M7_0_1 1_ +#define M7_1_1 2_ +#define M7_2_1 3_ +#define M7_3_1 4_ +#define M7_4_1 5_ +#define M7_5_1 6_ +#define M7_6_1 0_ +#define M7_7_1 1_ + +#define M7_0_2 2_ +#define M7_1_2 3_ +#define M7_2_2 4_ +#define M7_3_2 5_ +#define M7_4_2 6_ +#define M7_5_2 0_ +#define M7_6_2 1_ +#define M7_7_2 2_ + +#define M7_0_3 3_ +#define M7_1_3 4_ +#define M7_2_3 5_ +#define M7_3_3 6_ +#define M7_4_3 0_ +#define M7_5_3 1_ +#define M7_6_3 2_ +#define M7_7_3 3_ + +#define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b) + +#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \ + STEP_BIG_(WB_ ## ri ## 0, \ + IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 1, \ + IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 2, \ + IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 3, \ + IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 4, \ + MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 5, \ + MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 6, \ + MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 7, \ + MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \ + } while (0) + +//__constant__ static const s32 SIMD_Q_64[] = { +// 4, 28, -80, -120, -47, -126, 45, -123, -92, -127, -70, 23, -23, -24, 40, -125, 101, 122, 34, -24, -119, 110, -121, -112, 32, 24, 51, 73, -117, -64, -21, 42, -60, 16, 5, 85, 107, 52, -44, -96, 42, 127, -18, -108, -47, 26, 91, 117, 112, 46, 87, 79, 126, -120, 65, -24, 121, 29, 118, -7, -53, 85, -98, -117, 32, 115, -47, -116, 63, 16, -108, 49, -119, 57, -110, 4, -76, -76, -42, -86, 58, 115, 4, 4, -83, -51, -37, 116, 32, 15, 36, -42, 73, -99, 94, 87, 60, -20, 67, 12, -76, 55, 117, -68, -82, -80, 93, -20, 92, -21, -128, -91, -11, 84, -28, 76, 94, -124, 37, 93, 17, -78, -106, -29, 88, -15, -47, 102, -4, -28, 80, 120, 47, 126, -45, 123, 92, 127, 70, -23, 23, 24, -40, 125, -101, -122, -34, 24, 119, -110, 121, 112, -32, -24, -51, -73, 117, 64, 21, -42, 60, -16, -5, -85, -107, -52, 44, 96, -42, -127, 18, 108, 47, -26, -91, -117, -112, -46, -87, -79, -126, 120, -65, 24, -121, -29, -118, 7, 53, -85, 98, 117, -32, -115, 47, 116, -63, -16, 108, -49, 119, -57, 110, -4, 76, 76, 42, 86, -58, -115, -4, -4, 83, 51, 37, -116, -32, -15, -36, 42, -73, 99, -94, -87, -60, 20, -67, -12, 76, -55, -117, 68, 82, 80, -93, 20, -92, 21, 128, 91, 11, -84, 28, -76, -94, 124, -37, -93, -17, 78, 106, 29, -88, 15, 47, -102 +//}; +__constant__ static const s32 SIMD_Q_80[] = { + -125, -101, 48, 8, 81, 2, -84, 5, 36, 1, 58, -106, 105, 104, -89, 3, -28, -7, -95, 104, 9, -19, 7, 16, -97, -105, -78, -56, 11, 64, 107, -87, 68, -113, -124, -44, -22, -77, 84, 32, -87, -2, 110, 20, 81, -103, -38, -12, -17, -83, 
-42, -50, -3, 8, -64, 104, -8, -100, -11, 121, 75, -44, 30, 11, -97, -14, 81, 12, -66, -113, 20, -80, 9, -72, 18, -125, 52, 52, 86, 42, -71, -14, -125, -125, 45, 77, 91, -13, -97, -114, -93, 86, -56, 29, -35, -42, -69, 108, -62, -117, 52, -74, -12, 60, 46, 48, -36, 108, -37, 107, 0, 37, 117, -45, 100, -53, -35, 4, -92, -36, -112, 50, 22, 99, -41, 113, 81, -27, 124, 100, -49, -9, -82, -3, 83, -6, -37, -2, -59, 105, -106, -105, 88, -4, 27, 6, 94, -105, -10, 18, -8, -17, 96, 104, 77, 55, -12, -65, -108, 86, -69, 112, 123, 43, 21, 76, -85, -33, 86, 1, -111, -21, -82, 102, 37, 11, 16, 82, 41, 49, 2, -9, 63, -105, 7, 99, 10, -122, -76, 43, -31, -12, 96, 13, -82, -13, 65, 112, -21, 79, -10, 71, -19, 124, -53, -53, -87, -43, 70, 13, 124, 124, -46, -78, -92, 12, 96, 113, 92, -87, 55, -30, 34, 41, 68, -109, 61, 116, -53, 73, 11, -61, -47, -49, 35, -109, 36, -108, -1, -38, -118, 44, -101, 52, 34, -5, 91, 35, 111, -51, -23, -100, 40, -114, -82, 26 +}; + +__constant__ static uint32_t c_PaddedMessage80[20]; + +__host__ +void x16_simd512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +#define TPB_SIMD 128 +__global__ +__launch_bounds__(TPB_SIMD,1) +static void x16_simd512_gpu_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_outputhash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t A[20]; + #pragma unroll 10 + for (int i=0; i < 20; i += 2) + AS_UINT2(&A[i]) = AS_UINT2(&c_PaddedMessage80[i]); + A[19] = cuda_swab32(startNonce + thread); + + // simd + unsigned char x[128]; + #pragma unroll + for (int i = 0; i < 20; i += 2) + AS_UINT2(&x[i*4]) = AS_UINT2(&A[i]); + #pragma unroll + for(int i = 80; i < 128; i+=4) AS_U32(&x[i]) = 0; + + // SIMD_IV512 + u32 A0 = 0x0BA16B95, A1 = 0x72F999AD, A2 = 0x9FECC2AE, A3 = 0xBA3264FC, A4 = 0x5E894929, A5 = 0x8E9F30E5, A6 = 0x2F1DAA37, A7 = 0xF0F2C558; + u32 B0 = 0xAC506643, B1 = 0xA90635A5, B2 = 0xE25B878B, B3 = 0xAAB7878F, B4 = 0x88817F7A, B5 = 0x0A02892B, B6 = 0x559A7550, B7 = 0x598F657E; + u32 C0 = 0x7EEF60A1, C1 = 0x6B70E3E8, C2 = 0x9C1714D1, C3 = 0xB958E2A8, C4 = 0xAB02675E, C5 = 0xED1C014F, C6 = 0xCD8D65BB, C7 = 0xFDB7A257; + u32 D0 = 0x09254899, D1 = 0xD699C7BC, D2 = 0x9019B6DC, D3 = 0x2B9022E4, D4 = 0x8FA14956, D5 = 0x21BF9BD3, D6 = 0xB94D0943, D7 = 0x6FFDDC22; + + s32 q[256]; + FFT256(0, 1, 0, ll1); + + #pragma unroll + for (int i = 0; i < 256; i ++) { + s32 tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
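+			/* normalize to a balanced representative mod 257 (roughly [-128, 128]) */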
tq : tq - 257);
+		}
+
+		A0 ^= A[ 0];
+		A1 ^= A[ 1];
+		A2 ^= A[ 2];
+		A3 ^= A[ 3];
+		A4 ^= A[ 4];
+		A5 ^= A[ 5];
+		A6 ^= A[ 6];
+		A7 ^= A[ 7];
+		B0 ^= A[ 8];
+		B1 ^= A[ 9];
+		B2 ^= A[10];
+		B3 ^= A[11];
+		B4 ^= A[12];
+		B5 ^= A[13];
+		B6 ^= A[14];
+		B7 ^= A[15];
+		C0 ^= A[16];
+		C1 ^= A[17];
+		C2 ^= A[18];
+		C3 ^= A[19];
+
+		ONE_ROUND_BIG(0_, 0,  3, 23, 17, 27);
+		ONE_ROUND_BIG(1_, 1, 28, 19, 22,  7);
+		ONE_ROUND_BIG(2_, 2, 29,  9, 15,  5);
+		ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);
+
+		// feed-forward: the four final steps mix in the previous chaining
+		// value, which for this first block is the SIMD-512 IV
+		STEP_BIG(
+			C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
+			C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
+			IF, 4, 13, PP8_4_);
+
+		STEP_BIG(
+			C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
+			C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
+			IF, 13, 10, PP8_5_);
+
+		STEP_BIG(
+			C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
+			C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
+			IF, 10, 25, PP8_6_);
+
+		STEP_BIG(
+			C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
+			C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
+			IF, 25, 4, PP8_0_);
+
+		// Second round
+
+		u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7;
+		u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7;
+		u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7;
+		u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7;
+
+		// the padding block of an 80-byte message is fixed, so its
+		// expansion is the precomputed constant table SIMD_Q_80
+		#define q SIMD_Q_80
+
+		A0 ^= 0x280; // bitlen (0x280 = 640 bits = 80 bytes)
+
+		ONE_ROUND_BIG(0_, 0,  3, 23, 17, 27);
+		ONE_ROUND_BIG(1_, 1, 28, 19, 22,  7);
+		ONE_ROUND_BIG(2_, 2, 29,  9, 15,  5);
+		ONE_ROUND_BIG(3_, 3,  4, 13, 10, 25);
+
+		STEP_BIG(
+			COPY_A0, COPY_A1, COPY_A2, COPY_A3,
+			COPY_A4, COPY_A5, COPY_A6, COPY_A7,
+			IF, 4, 13, PP8_4_);
+
+		STEP_BIG(
+			COPY_B0, COPY_B1, COPY_B2, COPY_B3,
+			COPY_B4, COPY_B5, COPY_B6, COPY_B7,
+			IF, 13, 10, PP8_5_);
+
+		STEP_BIG(
+			COPY_C0, COPY_C1, COPY_C2, COPY_C3,
+			COPY_C4, COPY_C5, COPY_C6, COPY_C7,
+			IF, 10, 25, PP8_6_);
+
+		STEP_BIG(
+			COPY_D0, COPY_D1, COPY_D2, COPY_D3,
+			COPY_D4, COPY_D5, COPY_D6, COPY_D7,
+			IF, 25, 4, PP8_0_);
+
+		#undef q
+
+		A[ 0] = A0;
+		A[ 1] = A1;
+		A[ 2] = A2;
+		A[ 3] = A3;
+		A[ 4] = A4;
+		A[ 5] = A5;
+		A[ 6] = A6;
+		A[ 7] = A7;
+		A[ 8] = B0;
+		A[ 9] = B1;
+		A[10] = B2;
+		A[11] = B3;
+		A[12] = B4;
+		A[13] = B5;
+		A[14] = B6;
+		A[15] = B7;
+
+		const uint64_t hashPosition = thread;
+		uint32_t *Hash = (uint32_t*)(&g_outputhash[(size_t)8 * hashPosition]);
+		#pragma unroll
+		for (int i=0; i < 16; i += 2)
+			*(uint2*)&Hash[i] = *(uint2*)&A[i];
+	}
+}
+
+/***************************************************/
+
+__host__
+void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
+{
+	const uint32_t tpb = 128;
+	const dim3 grid((threads + tpb - 1) / tpb);
+	const dim3 block(tpb);
+	x16_simd512_gpu_80 <<<grid, block>>> (threads, startNonce, (uint64_t*) d_hash);
+}
diff --git a/x16/x16r.cu b/x16/x16r.cu
new file mode 100644
index 0000000000..0bf4cc5833
--- /dev/null
+++ b/x16/x16r.cu
@@ -0,0 +1,1144 @@
+/**
+ * X16R algorithm (X16 with Randomized chain order)
+ *
+ * tpruvot 2018 - GPL code
+ */
+
+#include <stdio.h>
+#include <memory.h>
+#include <unistd.h>
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_shabal.h"
+#include "sph/sph_whirlpool.h"
+#include "sph/sph_sha2.h"
+}
+
+#include "miner.h"
+#include "cuda_helper.h"
+#include "cuda_x16.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+
+enum Algo {
+	BLAKE = 0,
+	BMW,
+	GROESTL,
+	JH,
+	KECCAK,
+	SKEIN,
+	LUFFA,
+	CUBEHASH,
+	SHAVITE,
+	SIMD,
+	ECHO,
+	HAMSI,
+	FUGUE,
+	SHABAL,
+	WHIRLPOOL,
+	SHA512,
+	HASH_FUNC_COUNT
+};
+
+static const char* algo_strings[] = {
+	"blake",
+	"bmw512",
+	"groestl",
+	"jh512",
+	"keccak",
+	"skein",
+	"luffa",
+	"cube",
+	"shavite",
+	"simd",
+	"echo",
+	"hamsi",
+	"fugue",
+	"shabal",
+	"whirlpool",
+	"sha512",
+	NULL
+};
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
+
+static void getAlgoString(const uint32_t* prevblock, char *output)
+{
+	char *sptr = output;
+	uint8_t* data = (uint8_t*)prevblock;
+
+	// ex: bytes 01 23 45 67 89 ab cd ef produce the order "EFCDAB8967452301"
+	for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) {
+		uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
+		uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
+		if (algoDigit >= 10)
+			sprintf(sptr, "%c", 'A' + (algoDigit - 10));
+		else
+			sprintf(sptr, "%u", (uint32_t) algoDigit);
+		sptr++;
+	}
+	*sptr = '\0';
+}
+
+// X16R CPU Hash (Validation)
+extern "C" void x16r_hash(void *output, const void *input)
+{
+	unsigned char _ALIGN(64) hash[128];
+
+	sph_blake512_context ctx_blake;
+	sph_bmw512_context ctx_bmw;
+	sph_groestl512_context ctx_groestl;
+	sph_jh512_context ctx_jh;
+	sph_keccak512_context ctx_keccak;
+	sph_skein512_context ctx_skein;
+	sph_luffa512_context ctx_luffa;
+	sph_cubehash512_context ctx_cubehash;
+	sph_shavite512_context ctx_shavite;
+	sph_simd512_context ctx_simd;
+	sph_echo512_context ctx_echo;
+	sph_hamsi512_context ctx_hamsi;
+	sph_fugue512_context ctx_fugue;
+	sph_shabal512_context ctx_shabal;
+	sph_whirlpool_context ctx_whirlpool;
+	sph_sha512_context ctx_sha512;
+
+	void *in = (void*) input;
+	int size = 80;
+
+	uint32_t *in32 = (uint32_t*) input;
+	getAlgoString(&in32[1], hashOrder);
+
+	for (int i = 0; i < 16; i++)
+	{
+		const char elem = hashOrder[i];
+		const uint8_t algo = elem >= 'A' ?
elem - 'A' + 10 : elem - '0'; + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, in, size); + sph_luffa512_close(&ctx_luffa, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, in, size); + sph_cubehash512_close(&ctx_cubehash, hash); + break; + case SHAVITE: + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, in, size); + sph_shavite512_close(&ctx_shavite, hash); + break; + case SIMD: + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, in, size); + sph_simd512_close(&ctx_simd, hash); + break; + case ECHO: + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, in, size); + sph_echo512_close(&ctx_echo, hash); + break; + case HAMSI: + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, in, size); + sph_hamsi512_close(&ctx_hamsi, hash); + break; + case FUGUE: + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, in, size); + sph_fugue512_close(&ctx_fugue, hash); + break; + case SHABAL: + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, in, size); + sph_shabal512_close(&ctx_shabal, hash); + break; + case WHIRLPOOL: + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, in, size); + sph_whirlpool_close(&ctx_whirlpool, hash); + break; + case SHA512: + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512,(const void*) in, size); + sph_sha512_close(&ctx_sha512,(void*) hash); + break; + } + in = (void*) hash; + size = 64; + } + memcpy(output, hash, 32); +} + +void whirlpool_midstate(void *state, const void *input) +{ + sph_whirlpool_context ctx; + + sph_whirlpool_init(&ctx); + sph_whirlpool(&ctx, input, 64); + + memcpy(state, ctx.state, 64); +} + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +//#define _DEBUG +#define _DEBUG_PREFIX "x16r-" +#include "cuda_debug.cuh" + +//static int algo80_tests[HASH_FUNC_COUNT] = { 0 }; +//static int algo64_tests[HASH_FUNC_COUNT] = { 0 }; +static int algo80_fails[HASH_FUNC_COUNT] = { 0 }; + +extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
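+	/* 2^20 default threads on sm > 5.0 GPUs outside Windows, else 2^19 */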
20 : 19; + if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); // 64 + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); // 64 + x16_echo512_cuda_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x16_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x16_whirlpool512_init(thr_id, throughput); + x17_sha512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + if (opt_benchmark) { + ((uint32_t*)ptarget)[7] = 0x003f; + //((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64 + ((uint8_t*)pdata)[8] = 0xAA; // hashOrder[0] = 'A'; for echo 80 + 64 + //((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64 + } + uint32_t _ALIGN(64) endiandata[20]; + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + uint32_t ntime = swab32(pdata[17]); + if (s_ntime != ntime) { + getAlgoString(&endiandata[1], hashOrder); + s_ntime = ntime; + if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime); + } + + cuda_check_cpu_setTarget(ptarget); + + char elem = hashOrder[0]; + const uint8_t algo80 = elem >= 'A' ? 
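/*
 * Sizing note for the cudaMalloc above: d_hash holds one 64-byte digest per CUDA
 * thread, so the buffer is 64 * throughput bytes. At the default intensity of 20:
 *
 *   uint32_t throughput = 1U << 20;                 // 1,048,576 threads
 *   size_t   bytes      = (size_t)64 * throughput;  // 64 MiB inter-round buffer
 *
 * Each of the 15 follow-up rounds reads and rewrites this buffer in place, which is
 * why only the first algorithm in the order ever needs the 80-byte header upload.
 */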
elem - 'A' + 10 : elem - '0'; + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + case SHAVITE: + x16_shavite512_setBlock_80((void*)endiandata); + break; + case SIMD: + x16_simd512_setBlock_80((void*)endiandata); + break; + case ECHO: + x16_echo512_setBlock_80((void*)endiandata); + break; + case HAMSI: + x16_hamsi512_setBlock_80((void*)endiandata); + break; + case FUGUE: + x16_fugue512_setBlock_80((void*)pdata); + break; + case SHABAL: + x16_shabal512_setBlock_80((void*)endiandata); + break; + case WHIRLPOOL: + x16_whirlpool512_setBlock_80((void*)endiandata); + break; + case SHA512: + x16_sha512_setBlock_80(endiandata); + break; + default: { + return -1; + } + } + + int warn = 0; + + do { + int order = 0; + + // Hash with CUDA + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake80:"); + break; + case BMW: + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw80 :"); + break; + case GROESTL: + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("grstl80:"); + break; + case JH: + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("jh51280:"); + break; + case KECCAK: + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("kecck80:"); + break; + case SKEIN: + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + TRACE("skein80:"); + break; + case LUFFA: + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("luffa80:"); + break; + case CUBEHASH: + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("cube 80:"); + break; + case SHAVITE: + x16_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("simd512:"); + break; + case ECHO: + x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("echo :"); + break; + case HAMSI: + x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("hamsi :"); + break; + case FUGUE: + x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("fugue :"); + break; + case SHABAL: + x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("shabal :"); + break; + case WHIRLPOOL: + x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("whirl :"); + break; + case SHA512: + x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + + for (int i = 1; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? 
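/*
 * The 64-byte rounds below all share one call shape, so this switch could in
 * principle be a lookup table. A hypothetical refactor (the wrapper names are
 * invented here and match no function in this tree), just to show the pattern:
 *
 *   typedef void (*hash64_fn)(int thr_id, uint32_t threads, uint32_t *d_hash);
 *   static const hash64_fn chain64[HASH_FUNC_COUNT] = {
 *       blake64, bmw64, groestl64, jh64, keccak64, skein64, luffa64, cube64,
 *       shavite64, simd64, echo64, hamsi64, fugue64, shabal64, whirl64, sha512_64,
 *   };
 *   chain64[algo64](thr_id, throughput, d_hash[thr_id]);
 *
 * The switch form is kept in this code because the real kernels have slightly
 * different signatures (some take a start nonce and an order index, some do not).
 */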
elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; + TRACE("keccak :"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + TRACE("cube :"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); + break; + case ECHO: + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } + TRACE("echo :"); + break; + case HAMSI: + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("hamsi :"); + break; + case FUGUE: + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("fugue :"); + break; + case SHABAL: + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case WHIRLPOOL: + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case SHA512: + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); +#ifdef _DEBUG + uint32_t _ALIGN(64) dhash[8]; + be32enc(&endiandata[19], pdata[19]); + x16r_hash(dhash, endiandata); + applog_hash(dhash); + return -1; +#endif + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x16r_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x16r_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } +#if 0 + gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]); + + algo80_tests[algo80] += work->valid_nonces; + char oks64[128] = { 0 }; + char oks80[128] = { 0 }; + char fails[128] = { 0 }; + for (int a = 0; a < HASH_FUNC_COUNT; a++) { + const 
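/*
 * This #if 0 block is dormant instrumentation: it tallies, per algorithm, how often
 * a share was found (K80/K64 lines) versus how often the 80-byte kernel produced a
 * result that failed CPU validation (F80 line), so a buggy first-round kernel can be
 * identified by which algorithm accumulates failures. To enable it, uncomment the
 * two counters near the top of the file and flip the guard:
 *
 *   static int algo80_tests[HASH_FUNC_COUNT] = { 0 };
 *   static int algo64_tests[HASH_FUNC_COUNT] = { 0 };
 *   // ... and change "#if 0" to "#if 1" here
 */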
char elem = hashOrder[a]; + const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + if (a > 0) algo64_tests[algo64] += work->valid_nonces; + sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99); + sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99); + sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99); + } + applog(LOG_INFO, "K64: %s", oks64); + applog(LOG_INFO, "K80: %s", oks80); + applog(LOG_ERR, "F80: %s", fails); +#endif + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + // x11+ coins could do some random error, but not on retry + gpu_increment_reject(thr_id); + algo80_fails[algo80]++; + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s", + work->nonces[0], algo_strings[algo80], hashOrder); + warn = 0; + } + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_x16r(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ? + x15_whirlpool_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} + +static void x16rt_getTimeHash( const uint32_t timeStamp, void* timeHash ) +{ + int32_t maskedTime = timeStamp & 0xffffff80; + sha256d( (unsigned char*)timeHash, (const unsigned char*)( &maskedTime ), + sizeof( maskedTime ) ); +} + +static void x16rt_getAlgoString( const uint32_t *timeHash, char *output) +{ + char *sptr = output; + uint8_t* data = (uint8_t*)timeHash; + + for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) { + uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed + uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4; + + if (algoDigit >= 10) + sprintf(sptr, "%c", 'A' + (algoDigit - 10)); + else + sprintf(sptr, "%u", (uint32_t) algoDigit); + sptr++; + } + *sptr = '\0'; +} + +extern "C" void x16rt_hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[128]; + uint32_t _ALIGN(64) timeHash[8]; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + sph_sha512_context ctx_sha512; + + void *in = (void*) input; + int size = 80; + + uint32_t *in32 = (uint32_t*) input; + x16rt_getTimeHash( in32[17], &timeHash ); + x16rt_getAlgoString( &timeHash[0], hashOrder ); + char elem = hashOrder[0]; + uint8_t algo = elem >= 'A' ? 
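/*
 * x16rt differs from x16r only in where the order digits come from: instead of the
 * previous block hash, they are taken from sha256d() of the timestamp with its low
 * 7 bits cleared, so one order stays valid for a 128-second window. A worked
 * example of the masking (hypothetical ntime value):
 *
 *   uint32_t ntime  = 0x5d0001f3;
 *   uint32_t masked = ntime & 0xffffff80;   // 0x5d000180
 *   // every ntime in [0x5d000180, 0x5d0001ff] maps to the same timeHash,
 *   // hence the scanhash cache key below is the masked value, not ntime itself
 *   uint32_t th[8];
 *   sha256d((unsigned char*)th, (const unsigned char*)&masked, sizeof(masked));
 *   x16rt_getAlgoString(th, hashOrder);
 */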
elem - 'A' + 10 : elem - '0'; + + for (int i = 0; i < 16; i++) + { + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, in, size); + sph_luffa512_close(&ctx_luffa, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, in, size); + sph_cubehash512_close(&ctx_cubehash, hash); + break; + case SHAVITE: + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, in, size); + sph_shavite512_close(&ctx_shavite, hash); + break; + case SIMD: + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, in, size); + sph_simd512_close(&ctx_simd, hash); + break; + case ECHO: + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, in, size); + sph_echo512_close(&ctx_echo, hash); + break; + case HAMSI: + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, in, size); + sph_hamsi512_close(&ctx_hamsi, hash); + break; + case FUGUE: + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, in, size); + sph_fugue512_close(&ctx_fugue, hash); + break; + case SHABAL: + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, in, size); + sph_shabal512_close(&ctx_shabal, hash); + break; + case WHIRLPOOL: + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, in, size); + sph_whirlpool_close(&ctx_whirlpool, hash); + break; + case SHA512: + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512,(const void*) in, size); + sph_sha512_close(&ctx_sha512,(void*) hash); + break; + } + in = (void*) hash; + size = 64; + + if (i!=15) + { + elem = hashOrder[i+1]; + algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + } + + } + memcpy(output, hash, 32); +} + +extern "C" int scanhash_x16rt(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
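/*
 * Both the order string and its cache key are __thread variables (declared near the
 * top of the file): every mining worker thread may be on different work, possibly
 * at a different ntime, so the cached order must be per-thread rather than global.
 * The pattern in isolation:
 *
 *   static __thread uint32_t s_ntime = UINT32_MAX;   // UINT32_MAX = "no order yet"
 *   static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
 *
 *   if (s_ntime != masked_ntime) {          // recompute only when the window moves
 *       x16rt_getTimeHash(masked_ntime, timeHash);
 *       x16rt_getAlgoString(timeHash, hashOrder);
 *       s_ntime = masked_ntime;
 *   }
 */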
20 : 19;
+	if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20;
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+	if (opt_benchmark)
+	{
+		((uint32_t*)ptarget)[7] = 0x0ff;
+		((uint32_t*)pdata)[1] = 0x88888888;
+		((uint32_t*)pdata)[2] = 0x88888888;
+	}
+
+	uint32_t _ALIGN(64) endiandata[20];
+	uint32_t _ALIGN(64) timeHash[8];
+
+	for (int k = 0; k < 19; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	uint32_t masked_ntime = endiandata[17] & 0xffffff80;
+	if ( s_ntime != masked_ntime )
+	{
+		x16rt_getTimeHash( masked_ntime, &timeHash );
+		x16rt_getAlgoString( &timeHash[0], hashOrder );
+		s_ntime = masked_ntime;
+		if ( !(thr_id || opt_quiet) )
+		{
+			applog( LOG_INFO, "hash order %s (%08x)", hashOrder,
+				endiandata[17] );
+			cudaDeviceSynchronize();
+		}
+	}
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		cuda_get_arch(thr_id);
+		use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
+		if (use_compat_kernels[thr_id])
+			x11_echo512_cpu_init(thr_id, throughput);
+
+		quark_blake512_cpu_init(thr_id, throughput);
+		quark_bmw512_cpu_init(thr_id, throughput);
+		quark_groestl512_cpu_init(thr_id, throughput);
+		quark_skein512_cpu_init(thr_id, throughput);
+		quark_jh512_cpu_init(thr_id, throughput);
+		quark_keccak512_cpu_init(thr_id, throughput);
+		qubit_luffa512_cpu_init(thr_id, throughput); // 80
+		x11_luffa512_cpu_init(thr_id, throughput); // 64
+		x11_shavite512_cpu_init(thr_id, throughput);
+		x11_simd512_cpu_init(thr_id, throughput);
+		x16_echo512_cuda_init(thr_id, throughput);
+		x13_hamsi512_cpu_init(thr_id, throughput);
+		x13_fugue512_cpu_init(thr_id, throughput);
+		x16_fugue512_cpu_init(thr_id, throughput);
+		x14_shabal512_cpu_init(thr_id, throughput);
+		x15_whirlpool_cpu_init(thr_id, throughput, 0);
+		x16_whirlpool512_init(thr_id, throughput);
+		x17_sha512_cpu_init(thr_id, throughput);
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+
+		cuda_check_cpu_init(thr_id, throughput);
+
+		init[thr_id] = true;
+	}
+
+	cuda_check_cpu_setTarget(ptarget);
+
+	char elem = hashOrder[0];
+	const uint8_t algo80 = elem >= 'A' ?
elem - 'A' + 10 : elem - '0'; + + switch (algo80) + { + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + case SHAVITE: + x16_shavite512_setBlock_80((void*)endiandata); + break; + case SIMD: + x16_simd512_setBlock_80((void*)endiandata); + break; + case ECHO: + x16_echo512_setBlock_80((void*)endiandata); + break; + case HAMSI: + x16_hamsi512_setBlock_80((uint64_t*)endiandata); + break; + case FUGUE: + x16_fugue512_setBlock_80((void*)pdata); + break; + case SHABAL: + x16_shabal512_setBlock_80((void*)endiandata); + break; + case WHIRLPOOL: + x16_whirlpool512_setBlock_80((void*)endiandata); + break; + case SHA512: + x16_sha512_setBlock_80(endiandata); + break; + default: { + if (!thr_id) + applog(LOG_WARNING, "kernel %s %c unimplemented, order %s", algo_strings[algo80], elem, hashOrder); + sleep(5); + return -1; + } + } + + + int warn = 0; + + do { + int order = 0; + + uint32_t start = pdata[19]; + //uint32_t foundNonce; + + // Hash with CUDA + switch (algo80) + { + case BLAKE: + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + case BMW: + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + break; + case GROESTL: + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + case JH: + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + case KECCAK: + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + case SKEIN: + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + break; + case LUFFA: + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + break; + case CUBEHASH: + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + case SHAVITE: + x16_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + break; + case SIMD: + x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + case ECHO: + x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + case HAMSI: + x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + case FUGUE: + x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + case SHABAL: + x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + case WHIRLPOOL: + x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + case SHA512: + x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + } + uint8_t algo; + + + for (int i = 1; i < 16; i++) + { + elem = hashOrder[i]; + uint8_t algo64 = elem >= 'A' ? 
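/*
 * The *_setBlock_80 / *_hash_80 pairs dispatched above share one shape: setBlock
 * copies the 80-byte header into device __constant__ memory once per work item, and
 * the kernel then gives every thread its own nonce. A generic skeleton (a sketch of
 * the common pattern, not any specific kernel in this tree):
 *
 *   __constant__ uint64_t c_header80[10];   // 80-byte header, uploaded by setBlock
 *
 *   __global__ void algo_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash)
 *   {
 *       const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
 *       if (thread < threads) {
 *           const uint32_t nonce = startNonce + thread;   // header word 19, byte 76
 *           // ... hash c_header80 with word 19 replaced by the nonce ...
 *           // ... write the 64-byte digest to &g_hash[thread * 8] ...
 *       }
 *   }
 */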
elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + break; + case SHAVITE: + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case SIMD: + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case ECHO: + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } + break; + case HAMSI: + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case FUGUE: + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case SHABAL: + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case WHIRLPOOL: + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + break; + case SHA512: + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); +#ifdef _DEBUG + uint32_t _ALIGN(64) dhash[8]; + be32enc(&endiandata[19], pdata[19]); + x16rt_hash(dhash, endiandata); + applog_hash(dhash); + return -1; +#endif + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x16rt_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x16rt_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } +#if 0 + gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]); + + algo80_tests[algo80] += work->valid_nonces; + char oks64[128] = { 0 }; + char oks80[128] = { 0 }; + char fails[128] = { 0 }; + for (int a = 0; a < HASH_FUNC_COUNT; a++) { + const char elem = hashOrder[a]; + const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + if (a > 0) algo64_tests[algo64] += work->valid_nonces; + sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99); + sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? 
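/*
 * Validation failures are handled with a one-shot retry (the warn flag used below):
 * the first time a GPU candidate fails CPU validation, the scan silently resumes
 * just past the bad nonce, and only a second consecutive failure is logged. In
 * outline:
 *
 *   if (!warn) {                          // first failure: assume a transient error
 *       warn++;
 *       pdata[19] = work->nonces[0] + 1;  // skip the bad nonce and retry
 *       continue;
 *   } else {                              // second failure in a row: report it
 *       gpulog(LOG_WARNING, ...);
 *       warn = 0;
 *   }
 */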
algo80_tests[a] : 99); + sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99); + } + applog(LOG_INFO, "K64: %s", oks64); + applog(LOG_INFO, "K80: %s", oks80); + applog(LOG_ERR, "F80: %s", fails); +#endif + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + // x11+ coins could do some random error, but not on retry + gpu_increment_reject(thr_id); + algo80_fails[algo80]++; + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s", + work->nonces[0], algo_strings[algo80], hashOrder); + warn = 0; + } + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_x16rt(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ? + x15_whirlpool_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} \ No newline at end of file diff --git a/x16/x16rv2.cu b/x16/x16rv2.cu new file mode 100644 index 0000000000..956cf559cf --- /dev/null +++ b/x16/x16rv2.cu @@ -0,0 +1,635 @@ +/** + * X16Rv2 algorithm (X16 with Randomized chain order) + * + * tpruvot 2018 - GPL code + * penfold 2019 - add tiger192 before keccak, luffa and sha512 + */ + +#include +#include +#include + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" + +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_sha2.h" + +#include "sph/sph_tiger.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x16.h" + +static uint32_t *d_hash[MAX_GPUS]; + +enum Algo { + BLAKE = 0, + BMW, + GROESTL, + JH, + KECCAK, + SKEIN, + LUFFA, + CUBEHASH, + SHAVITE, + SIMD, + ECHO, + HAMSI, + FUGUE, + SHABAL, + WHIRLPOOL, + SHA512, + HASH_FUNC_COUNT +}; + +static const char* algo_strings[] = { + "blake", + "bmw512", + "groestl", + "jh512", + "keccak", + "skein", + "luffa", + "cube", + "shavite", + "simd", + "echo", + "hamsi", + "fugue", + "shabal", + "whirlpool", + "sha512", + NULL +}; + +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 }; + +static void getAlgoString(const uint32_t* prevblock, char *output) +{ + char *sptr = output; + uint8_t* data = (uint8_t*)prevblock; + + for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) { + uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed + uint8_t algoDigit = (j & 1) ? 
data[b] & 0xF : data[b] >> 4; + if (algoDigit >= 10) + sprintf(sptr, "%c", 'A' + (algoDigit - 10)); + else + sprintf(sptr, "%u", (uint32_t) algoDigit); + sptr++; + } + *sptr = '\0'; +} + +// X16Rv2 CPU Hash (Validation) +extern "C" void x16rv2_hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[128]; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + sph_sha512_context ctx_sha512; + sph_tiger_context ctx_tiger; + + void *in = (void*) input; + int size = 80; + + uint32_t *in32 = (uint32_t*) input; + getAlgoString(&in32[1], hashOrder); + + for (int i = 0; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_tiger_init(&ctx_tiger); + sph_tiger(&ctx_tiger, in, size); + sph_tiger_close(&ctx_tiger, hash); + memset(hash + 24, 0, 40); + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, hash, 64); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_tiger_init(&ctx_tiger); + sph_tiger(&ctx_tiger, in, size); + sph_tiger_close(&ctx_tiger, hash); + memset(hash + 24, 0, 40); + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, hash, 64); + sph_luffa512_close(&ctx_luffa, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, in, size); + sph_cubehash512_close(&ctx_cubehash, hash); + break; + case SHAVITE: + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, in, size); + sph_shavite512_close(&ctx_shavite, hash); + break; + case SIMD: + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, in, size); + sph_simd512_close(&ctx_simd, hash); + break; + case ECHO: + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, in, size); + sph_echo512_close(&ctx_echo, hash); + break; + case HAMSI: + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, in, size); + sph_hamsi512_close(&ctx_hamsi, hash); + break; + case FUGUE: + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, in, size); + sph_fugue512_close(&ctx_fugue, hash); + break; + case SHABAL: + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, in, size); + sph_shabal512_close(&ctx_shabal, hash); + break; + case WHIRLPOOL: + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, in, size); + sph_whirlpool_close(&ctx_whirlpool, hash); + break; + case SHA512: + sph_tiger_init(&ctx_tiger); + 
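/*
 * The defining change in x16rv2: whenever the next algorithm in the order is KECCAK,
 * LUFFA or SHA512, the input is first reduced with Tiger-192 and the 24-byte digest
 * is zero-padded to a full 64-byte block before the main algorithm runs. The pattern,
 * as visible in the cases around this point:
 *
 *   sph_tiger_init(&ctx_tiger);
 *   sph_tiger(&ctx_tiger, in, size);          // size is 80 in round 0, 64 afterwards
 *   sph_tiger_close(&ctx_tiger, hash);        // 24-byte Tiger-192 digest
 *   memset(hash + 24, 0, 40);                 // pad 24 -> 64 bytes with zeros
 *   sph_keccak512_init(&ctx_keccak);
 *   sph_keccak512(&ctx_keccak, hash, 64);     // main algo always sees 64 bytes
 */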
sph_tiger(&ctx_tiger, in, size); + sph_tiger_close(&ctx_tiger, hash); + memset(hash + 24, 0, 40); + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512,(const void*) hash, 64); + sph_sha512_close(&ctx_sha512,(void*) hash); + break; + } + in = (void*) hash; + size = 64; + } + memcpy(output, hash, 32); +} + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +//#define _DEBUG +#define _DEBUG_PREFIX "x16rv2-" +#include "cuda_debug.cuh" + +//static int algo80_tests[HASH_FUNC_COUNT] = { 0 }; +//static int algo64_tests[HASH_FUNC_COUNT] = { 0 }; +static int algo80_fails[HASH_FUNC_COUNT] = { 0 }; + +extern "C" int scanhash_x16rv2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19; + if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); // 64 + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); // 64 + x16_echo512_cuda_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x16_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x16_whirlpool512_init(thr_id, throughput); + x17_sha512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + if (opt_benchmark) { + ((uint32_t*)ptarget)[7] = 0x003f; + //((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64 + ((uint8_t*)pdata)[8] = 0xAA; // hashOrder[0] = 'A'; for echo 80 + 64 + //((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64 + } + uint32_t _ALIGN(64) endiandata[20]; + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + uint32_t ntime = swab32(pdata[17]); + if (s_ntime != ntime) { + getAlgoString(&endiandata[1], hashOrder); + s_ntime = ntime; + if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime); + } + + cuda_check_cpu_setTarget(ptarget); + + char elem = hashOrder[0]; + const uint8_t 
algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + tiger192_setBlock_80(endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case LUFFA: + tiger192_setBlock_80(endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + case SHAVITE: + x16_shavite512_setBlock_80((void*)endiandata); + break; + case SIMD: + x16_simd512_setBlock_80((void*)endiandata); + break; + case ECHO: + x16_echo512_setBlock_80((void*)endiandata); + break; + case HAMSI: + x16_hamsi512_setBlock_80((void*)endiandata); + break; + case FUGUE: + x16_fugue512_setBlock_80((void*)pdata); + break; + case SHABAL: + x16_shabal512_setBlock_80((void*)endiandata); + break; + case WHIRLPOOL: + x16_whirlpool512_setBlock_80((void*)endiandata); + break; + case SHA512: + tiger192_setBlock_80(endiandata); + break; + default: { + return -1; + } + } + + int warn = 0; + + do { + int order = 0; + + // Hash with CUDA + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake80:"); + break; + case BMW: + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw80 :"); + break; + case GROESTL: + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("grstl80:"); + break; + case JH: + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("jh51280:"); + break; + case KECCAK: + tiger192_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; + TRACE("kecck80:"); + break; + case SKEIN: + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + TRACE("skein80:"); + break; + case LUFFA: + tiger192_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa80:"); + break; + case CUBEHASH: + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("cube 80:"); + break; + case SHAVITE: + x16_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("simd512:"); + break; + case ECHO: + x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("echo :"); + break; + case HAMSI: + x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("hamsi :"); + break; + case FUGUE: + x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("fugue :"); + break; + case SHABAL: + x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("shabal :"); + break; + case WHIRLPOOL: + x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("whirl :"); + break; + case SHA512: + tiger192_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + 
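/*
 * On the GPU, the Tiger-prefixed first round is split across two launches: the
 * tiger192_cpu_hash_80 call writes the zero-padded 24-byte digests into d_hash, and
 * the ordinary 64-byte kernel of the main algorithm then consumes that buffer, e.g.:
 *
 *   tiger192_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
 *   quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]);
 *
 * This is why KECCAK, LUFFA and SHA512 reuse their x11/x17 64-byte kernels here
 * instead of dedicated 80-byte ones.
 */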
} + + for (int i = 1; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + tiger192_cpu_hash_64(thr_id, throughput, 1, d_hash[thr_id]); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; + TRACE("keccak :"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case LUFFA: + tiger192_cpu_hash_64(thr_id, throughput, 1, d_hash[thr_id]); + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + TRACE("cube :"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); + break; + case ECHO: + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } + TRACE("echo :"); + break; + case HAMSI: + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("hamsi :"); + break; + case FUGUE: + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("fugue :"); + break; + case SHABAL: + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case WHIRLPOOL: + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case SHA512: + tiger192_cpu_hash_64(thr_id, throughput, 1, d_hash[thr_id]); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); +#ifdef _DEBUG + uint32_t _ALIGN(64) dhash[8]; + be32enc(&endiandata[19], pdata[19]); + x16rv2_hash(dhash, endiandata); + applog_hash(dhash); + return -1; +#endif + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x16rv2_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x16rv2_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + 
pdata[19] = work->nonces[0] + 1; // cursor + } +#if 0 + gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]); + + algo80_tests[algo80] += work->valid_nonces; + char oks64[128] = { 0 }; + char oks80[128] = { 0 }; + char fails[128] = { 0 }; + for (int a = 0; a < HASH_FUNC_COUNT; a++) { + const char elem = hashOrder[a]; + const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + if (a > 0) algo64_tests[algo64] += work->valid_nonces; + sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99); + sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99); + sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99); + } + applog(LOG_INFO, "K64: %s", oks64); + applog(LOG_INFO, "K80: %s", oks80); + applog(LOG_ERR, "F80: %s", fails); +#endif + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + // x11+ coins could do some random error, but not on retry + gpu_increment_reject(thr_id); + algo80_fails[algo80]++; + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s", + work->nonces[0], algo_strings[algo80], hashOrder); + warn = 0; + } + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_x16rv2(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ? 
+ x15_whirlpool_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} \ No newline at end of file diff --git a/x16/x16s.cu b/x16/x16s.cu new file mode 100644 index 0000000000..0dffa3bec6 --- /dev/null +++ b/x16/x16s.cu @@ -0,0 +1,602 @@ +/** + * X16S algorithm (X16 with Shuffled chain order) + * + * tpruvot 2018 - GPL code + */ + +#include +#include +#include + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" + +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_sha2.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x16.h" + +static uint32_t *d_hash[MAX_GPUS]; + +enum Algo { + BLAKE = 0, + BMW, + GROESTL, + JH, + KECCAK, + SKEIN, + LUFFA, + CUBEHASH, + SHAVITE, + SIMD, + ECHO, + HAMSI, + FUGUE, + SHABAL, + WHIRLPOOL, + SHA512, + HASH_FUNC_COUNT +}; + +static const char* algo_strings[] = { + "blake", + "bmw512", + "groestl", + "jh512", + "keccak", + "skein", + "luffa", + "cube", + "shavite", + "simd", + "echo", + "hamsi", + "fugue", + "shabal", + "whirlpool", + "sha512", + NULL +}; + +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 }; + +static void getAlgoString(const uint32_t* prevblock, char *output) +{ + uint8_t* data = (uint8_t*)prevblock; + + strcpy(output, "0123456789ABCDEF"); + + for (uint8_t i = 0; i < HASH_FUNC_COUNT; i++) { + uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed + uint8_t algoDigit = (i & 1) ? data[b] & 0xF : data[b] >> 4; + int offset = (int) algoDigit; + char oldVal = output[offset]; + for(int j=offset; j-->0;) + output[j+1] = output[j]; + output[0] = oldVal; + } +} + +// X16S CPU Hash (Validation) +extern "C" void x16s_hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[128]; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + sph_sha512_context ctx_sha512; + + void *in = (void*) input; + int size = 80; + + uint32_t *in32 = (uint32_t*) input; + getAlgoString(&in32[1], hashOrder); + + for (int i = 0; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? 
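/*
 * x16s replaces x16r's raw digit mapping with a shuffle: getAlgoString() above
 * starts from "0123456789ABCDEF" and, for each of the 16 nibbles, moves the
 * character at position algoDigit to the front. The result is always a permutation,
 * so every algorithm runs exactly once per hash. First steps for nibbles F, 0, 3:
 *
 *   "0123456789ABCDEF"
 *   nibble F: move output[15] ('F') to front -> "F0123456789ABCDE"
 *   nibble 0: move output[0]  ('F') to front -> "F0123456789ABCDE" (unchanged)
 *   nibble 3: move output[3]  ('2') to front -> "2F013456789ABCDE"
 */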
elem - 'A' + 10 : elem - '0'; + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, in, size); + sph_luffa512_close(&ctx_luffa, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, in, size); + sph_cubehash512_close(&ctx_cubehash, hash); + break; + case SHAVITE: + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, in, size); + sph_shavite512_close(&ctx_shavite, hash); + break; + case SIMD: + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, in, size); + sph_simd512_close(&ctx_simd, hash); + break; + case ECHO: + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, in, size); + sph_echo512_close(&ctx_echo, hash); + break; + case HAMSI: + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, in, size); + sph_hamsi512_close(&ctx_hamsi, hash); + break; + case FUGUE: + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, in, size); + sph_fugue512_close(&ctx_fugue, hash); + break; + case SHABAL: + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, in, size); + sph_shabal512_close(&ctx_shabal, hash); + break; + case WHIRLPOOL: + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, in, size); + sph_whirlpool_close(&ctx_whirlpool, hash); + break; + case SHA512: + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512,(const void*) in, size); + sph_sha512_close(&ctx_sha512,(void*) hash); + break; + } + in = (void*) hash; + size = 64; + } + memcpy(output, hash, 32); +} + +#if 0 /* in x16r */ +void whirlpool_midstate(void *state, const void *input) +{ + sph_whirlpool_context ctx; + + sph_whirlpool_init(&ctx); + sph_whirlpool(&ctx, input, 64); + + memcpy(state, ctx.state, 64); +} +#endif + +static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; + +//#define _DEBUG +#define _DEBUG_PREFIX "x16s-" +#include "cuda_debug.cuh" + +extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
20 : 19; + if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (!init[thr_id]) + { + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); // 64 + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); // 64 + x16_echo512_cuda_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x16_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x16_whirlpool512_init(thr_id, throughput); + x17_sha512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + if (opt_benchmark) { + ((uint32_t*)ptarget)[7] = 0x003f; + //((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0x0A; // hashOrder[0] = 'A'; for echo 64 + //((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64 + } + uint32_t _ALIGN(64) endiandata[20]; + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + uint32_t ntime = swab32(pdata[17]); + if (s_ntime != ntime) { + getAlgoString(&endiandata[1], hashOrder); + s_ntime = ntime; + if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime); + } + + cuda_check_cpu_setTarget(ptarget); + + char elem = hashOrder[0]; + const uint8_t algo80 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + case SHAVITE: + x16_shavite512_setBlock_80((void*)endiandata); + break; + case SIMD: + x16_simd512_setBlock_80((void*)endiandata); + break; + case ECHO: + x16_echo512_setBlock_80((void*)endiandata); + break; + case HAMSI: + x16_hamsi512_setBlock_80((void*)endiandata); + break; + case FUGUE: + x16_fugue512_setBlock_80((void*)pdata); + break; + case SHABAL: + x16_shabal512_setBlock_80((void*)endiandata); + break; + case WHIRLPOOL: + x16_whirlpool512_setBlock_80((void*)endiandata); + break; + case SHA512: + x16_sha512_setBlock_80(endiandata); + break; + default: { + return -1; + } + } + + int warn = 0; + + do { + int order = 0; + + // Hash with CUDA + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake80:"); + break; + case BMW: + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw80 :"); + break; + case GROESTL: + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("grstl80:"); + break; + case JH: + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("jh51280:"); + break; + case KECCAK: + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("kecck80:"); + break; + case SKEIN: + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + TRACE("skein80:"); + break; + case LUFFA: + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("luffa80:"); + break; + case CUBEHASH: + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("cube 80:"); + break; + case SHAVITE: + x16_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("simd512:"); + break; + case ECHO: + x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("echo :"); + break; + case HAMSI: + x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("hamsi :"); + break; + case FUGUE: + x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("fugue :"); + break; + case SHABAL: + x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("shabal :"); + break; + case WHIRLPOOL: + x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("whirl :"); + break; + case SHA512: + x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + + for (int i = 1; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; + TRACE("keccak :"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + TRACE("cube :"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); + break; + case ECHO: + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } + TRACE("echo :"); + break; + case HAMSI: + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("hamsi :"); + break; + case FUGUE: + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("fugue :"); + break; + case SHABAL: + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case WHIRLPOOL: + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case SHA512: + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); +#ifdef _DEBUG + uint32_t _ALIGN(64) dhash[8]; + be32enc(&endiandata[19], pdata[19]); + x16s_hash(dhash, endiandata); + applog_hash(dhash); + return -1; +#endif + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x16s_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x16s_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } + //gpulog(LOG_INFO, thr_id, "hash found with %s 80 (%s)!", algo_strings[algo80], hashOrder); + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + // x11+ coins could do some random error, but not on retry + gpu_increment_reject(thr_id); + if (!warn) { + 
warn++;
+				pdata[19] = work->nonces[0] + 1;
+				continue;
+			} else {
+				if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s",
+					work->nonces[0], algo_strings[algo80], hashOrder);
+				warn = 0;
+			}
+		}
+	}
+
+	if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+		pdata[19] = max_nonce;
+		break;
+	}
+
+	pdata[19] += throughput;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+	return 0;
+}
+
+// cleanup
+extern "C" void free_x16s(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaThreadSynchronize();
+
+	cudaFree(d_hash[thr_id]);
+
+	quark_blake512_cpu_free(thr_id);
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+	x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ?
+	x15_whirlpool_cpu_free(thr_id);
+
+	cuda_check_cpu_free(thr_id);
+
+	cudaDeviceSynchronize();
+	init[thr_id] = false;
+}
diff --git a/x17/cuda_x17_haval256.cu b/x17/cuda_x17_haval256.cu
index 74b311e236..a8db34d631 100644
--- a/x17/cuda_x17_haval256.cu
+++ b/x17/cuda_x17_haval256.cu
@@ -326,10 +326,10 @@ void x17_haval256_gpu_hash_64(const uint32_t threads, uint64_t *g_hash, const in
 		pHash[3] = hash.h8[3];
 
 		if (outlen == 512) {
-			pHash[4] = 0; //hash.h8[4];
-			pHash[5] = 0; //hash.h8[5];
-			pHash[6] = 0; //hash.h8[6];
-			pHash[7] = 0; //hash.h8[7];
+			pHash[4] = hash.h8[4];
+			pHash[5] = hash.h8[5];
+			pHash[6] = hash.h8[6];
+			pHash[7] = hash.h8[7];
 		}
 	}
 }
diff --git a/x17/cuda_x17_sha512.cu b/x17/cuda_x17_sha512.cu
index bebf17d5bd..a0757d0e5b 100644
--- a/x17/cuda_x17_sha512.cu
+++ b/x17/cuda_x17_sha512.cu
@@ -169,3 +169,80 @@ void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,
 	x17_sha512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash);
 }
+
+__constant__
+static uint64_t c_PaddedMessage80[10];
+
+__global__
+/*__launch_bounds__(256, 4)*/
+void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint64_t W[80];
+		#pragma unroll
+		for (int i = 0; i < 9; i ++) {
+			W[i] = SWAP64(c_PaddedMessage80[i]);
+		}
+		const uint32_t nonce = startNonce + thread;
+		//((uint32_t*)W)[19] = cuda_swab32(nonce);
+		W[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nonce));
+		W[9] = cuda_swab64(W[9]);
+		W[10] = 0x8000000000000000;
+
+		#pragma unroll
+		for (int i = 11; i<15; i++) {
+			W[i] = 0U;
+		}
+		W[15] = 0x0000000000000280;
+
+		#pragma unroll 64
+		for (int i = 16; i < 80; i ++) {
+			W[i] = SSG5_1(W[i-2]) + W[i-7];
+			W[i] += SSG5_0(W[i-15]) + W[i-16];
+		}
+
+		const uint64_t IV512[8] = {
+			0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
+			0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+			0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
+			0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
+		};
+
+		uint64_t r[8];
+		#pragma unroll
+		for (int i = 0; i < 8; i++) {
+			r[i] = IV512[i];
+		}
+
+		#pragma unroll
+		for (int i = 0; i < 80; i++) {
+			SHA3_STEP(c_WB, r, W, i&7, i);
+		}
+
+		const uint64_t hashPosition = thread;
+		uint64_t *pHash = &g_hash[hashPosition << 3];
+		#pragma unroll
+		for (int u = 0; u < 8; u ++) {
+			pHash[u] = SWAP64(r[u] + IV512[u]);
+		}
+	}
+}
+
+__host__
+void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_sha512_gpu_hash_80 <<<grid, block>>> (threads,
startNounce, (uint64_t*)d_hash); +} + +__host__ +void x16_sha512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} \ No newline at end of file diff --git a/x17/hmq17.cu b/x17/hmq17.cu index 8fdbcdf4ff..83c8d29362 100644 --- a/x17/hmq17.cu +++ b/x17/hmq17.cu @@ -395,7 +395,7 @@ extern "C" int scanhash_hmq17(int thr_id, struct work* work, uint32_t max_nonce, hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; TRACE("keccak "); hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); @@ -407,7 +407,7 @@ extern "C" int scanhash_hmq17(int thr_id, struct work* work, uint32_t max_nonce, TRACE("cube "); hmq_filter_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); - quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order++); hmq_merge_cpu(thr_id, throughput, d_hash[thr_id], d_hash_br2[thr_id]); diff --git a/x17/x17.cu b/x17/x17.cu index 816e5e0634..90c7bd5bba 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -32,6 +32,8 @@ extern "C" { static uint32_t *d_hash[MAX_GPUS]; +extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash); + extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); @@ -151,12 +153,14 @@ extern "C" void x17hash(void *output, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static bool use_compat_kernels[MAX_GPUS] = { 0 }; extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8; //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); @@ -166,7 +170,7 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); // reduce cpu usage @@ -174,6 +178,11 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u } gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + cuda_get_arch(thr_id); + use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500); + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_init(thr_id, throughput); + quark_blake512_cpu_init(thr_id, throughput); quark_groestl512_cpu_init(thr_id, throughput); quark_skein512_cpu_init(thr_id, throughput); @@ -183,7 +192,6 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u x11_luffaCubehash512_cpu_init(thr_id, throughput); x11_shavite512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput); - x11_echo512_cpu_init(thr_id, throughput); x13_hamsi512_cpu_init(thr_id, throughput); 
x13_fugue512_cpu_init(thr_id, throughput);
 	x14_shabal512_cpu_init(thr_id, throughput);
@@ -216,11 +224,15 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
 		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++;
 		x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
 		x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		if (use_compat_kernels[thr_id])
+			x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		else {
+			x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
+		}
 		x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
diff --git a/x21/cuda_sha256_2.cu b/x21/cuda_sha256_2.cu
new file mode 100644
index 0000000000..5b5cf1412a
--- /dev/null
+++ b/x21/cuda_sha256_2.cu
@@ -0,0 +1,198 @@
+#include <stdio.h>
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+#include "miner.h"
+
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+
+#define SPH_C64(x) ((uint64_t)(x ## ULL))
+#define SPH_C32(x) ((uint32_t)(x ## U))
+#define SPH_T32(x) (x)
+#define ROTR(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#include "cuda_helper.h"
+
+
+static __constant__ const uint32_t H256[8] = {
+	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372),
+	SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
+	SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
+};
+
+
+__device__ __forceinline__
+uint32_t Maj(const uint32_t a, const uint32_t b, const uint32_t c) { //Sha256 - Maj - andor
+	uint32_t result;
+	asm ("lop3.b32 %0, %1, %2, %3, 0xE8;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); // 0xE8 = ((0xF0 & (0xCC | 0xAA)) | (0xCC & 0xAA))
+	return result;
+}
+
+#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z)))
+
+
+static __device__ __forceinline__ void sha2_step1(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
+	uint32_t in,const uint32_t Kshared) {
+	uint32_t t1,t2;
+	uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
+	uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
+	uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
+	uint32_t andorv = Maj(a, b, c); //((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
+
+	t1 = h + bsg21 + vxandx + Kshared + in;
+	t2 = bsg20 + andorv;
+	d = d + t1;
+	h = t1 + t2;
+}
+
+static __device__ __forceinline__ void sha2_step2(uint32_t a,uint32_t b,uint32_t c,uint32_t &d,uint32_t e,uint32_t f,uint32_t g,uint32_t &h,
+	uint32_t* in,uint32_t pc,const uint32_t Kshared) {
+	uint32_t t1,t2;
+
+	int pcidx1 = (pc-2) & 0xF;
+	int pcidx2 = (pc-7) & 0xF;
+	int pcidx3 = (pc-15) & 0xF;
+	uint32_t inx0 = in[pc];
+	uint32_t inx1 = in[pcidx1];
+	uint32_t inx2 = in[pcidx2];
+	uint32_t inx3 = in[pcidx3];
+
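+	// message schedule kept in a 16-word ring buffer: for round pc,
+	// in[pc] becomes ssg2_1(W[t-2]) + W[t-7] + ssg2_0(W[t-15]) + W[t-16]
+	// (indices reduced mod 16), so the full 64-word W[] array is never materialized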
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); + uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 =ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 =ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv = Maj(a, b, c); //((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + in[pc] = ssg21+inx2+ssg20+inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + + +static __device__ __forceinline__ void sha2_round_body(uint32_t* in, uint32_t* r) { + uint32_t a = r[0]; + uint32_t b = r[1]; + uint32_t c = r[2]; + uint32_t d = r[3]; + uint32_t e = r[4]; + uint32_t f = r[5]; + uint32_t g = r[6]; + uint32_t h = r[7]; + + sha2_step1(a,b,c,d,e,f,g,h,in[ 0],0x428A2F98); + sha2_step1(h,a,b,c,d,e,f,g,in[ 1],0x71374491); + sha2_step1(g,h,a,b,c,d,e,f,in[ 2],0xB5C0FBCF); + sha2_step1(f,g,h,a,b,c,d,e,in[ 3],0xE9B5DBA5); + sha2_step1(e,f,g,h,a,b,c,d,in[ 4],0x3956C25B); + sha2_step1(d,e,f,g,h,a,b,c,in[ 5],0x59F111F1); + sha2_step1(c,d,e,f,g,h,a,b,in[ 6],0x923F82A4); + sha2_step1(b,c,d,e,f,g,h,a,in[ 7],0xAB1C5ED5); + sha2_step1(a,b,c,d,e,f,g,h,in[ 8],0xD807AA98); + sha2_step1(h,a,b,c,d,e,f,g,in[ 9],0x12835B01); + sha2_step1(g,h,a,b,c,d,e,f,in[10],0x243185BE); + sha2_step1(f,g,h,a,b,c,d,e,in[11],0x550C7DC3); + sha2_step1(e,f,g,h,a,b,c,d,in[12],0x72BE5D74); + sha2_step1(d,e,f,g,h,a,b,c,in[13],0x80DEB1FE); + sha2_step1(c,d,e,f,g,h,a,b,in[14],0x9BDC06A7); + sha2_step1(b,c,d,e,f,g,h,a,in[15],0xC19BF174); + + sha2_step2(a,b,c,d,e,f,g,h,in, 0,0xE49B69C1); + sha2_step2(h,a,b,c,d,e,f,g,in, 1,0xEFBE4786); + sha2_step2(g,h,a,b,c,d,e,f,in, 2,0x0FC19DC6); + sha2_step2(f,g,h,a,b,c,d,e,in, 3,0x240CA1CC); + sha2_step2(e,f,g,h,a,b,c,d,in, 4,0x2DE92C6F); + sha2_step2(d,e,f,g,h,a,b,c,in, 5,0x4A7484AA); + sha2_step2(c,d,e,f,g,h,a,b,in, 6,0x5CB0A9DC); + sha2_step2(b,c,d,e,f,g,h,a,in, 7,0x76F988DA); + sha2_step2(a,b,c,d,e,f,g,h,in, 8,0x983E5152); + sha2_step2(h,a,b,c,d,e,f,g,in, 9,0xA831C66D); + sha2_step2(g,h,a,b,c,d,e,f,in,10,0xB00327C8); + sha2_step2(f,g,h,a,b,c,d,e,in,11,0xBF597FC7); + sha2_step2(e,f,g,h,a,b,c,d,in,12,0xC6E00BF3); + sha2_step2(d,e,f,g,h,a,b,c,in,13,0xD5A79147); + sha2_step2(c,d,e,f,g,h,a,b,in,14,0x06CA6351); + sha2_step2(b,c,d,e,f,g,h,a,in,15,0x14292967); + + sha2_step2(a,b,c,d,e,f,g,h,in, 0,0x27B70A85); + sha2_step2(h,a,b,c,d,e,f,g,in, 1,0x2E1B2138); + sha2_step2(g,h,a,b,c,d,e,f,in, 2,0x4D2C6DFC); + sha2_step2(f,g,h,a,b,c,d,e,in, 3,0x53380D13); + sha2_step2(e,f,g,h,a,b,c,d,in, 4,0x650A7354); + sha2_step2(d,e,f,g,h,a,b,c,in, 5,0x766A0ABB); + sha2_step2(c,d,e,f,g,h,a,b,in, 6,0x81C2C92E); + sha2_step2(b,c,d,e,f,g,h,a,in, 7,0x92722C85); + sha2_step2(a,b,c,d,e,f,g,h,in, 8,0xA2BFE8A1); + sha2_step2(h,a,b,c,d,e,f,g,in, 9,0xA81A664B); + sha2_step2(g,h,a,b,c,d,e,f,in,10,0xC24B8B70); + sha2_step2(f,g,h,a,b,c,d,e,in,11,0xC76C51A3); + sha2_step2(e,f,g,h,a,b,c,d,in,12,0xD192E819); + sha2_step2(d,e,f,g,h,a,b,c,in,13,0xD6990624); + sha2_step2(c,d,e,f,g,h,a,b,in,14,0xF40E3585); + sha2_step2(b,c,d,e,f,g,h,a,in,15,0x106AA070); + + sha2_step2(a,b,c,d,e,f,g,h,in, 0,0x19A4C116); + sha2_step2(h,a,b,c,d,e,f,g,in, 1,0x1E376C08); + sha2_step2(g,h,a,b,c,d,e,f,in, 2,0x2748774C); + sha2_step2(f,g,h,a,b,c,d,e,in, 3,0x34B0BCB5); + sha2_step2(e,f,g,h,a,b,c,d,in, 4,0x391C0CB3); + sha2_step2(d,e,f,g,h,a,b,c,in, 5,0x4ED8AA4A); + sha2_step2(c,d,e,f,g,h,a,b,in, 6,0x5B9CCA4F); + 
sha2_step2(b,c,d,e,f,g,h,a,in, 7,0x682E6FF3);
+	sha2_step2(a,b,c,d,e,f,g,h,in, 8,0x748F82EE);
+	sha2_step2(h,a,b,c,d,e,f,g,in, 9,0x78A5636F);
+	sha2_step2(g,h,a,b,c,d,e,f,in,10,0x84C87814);
+	sha2_step2(f,g,h,a,b,c,d,e,in,11,0x8CC70208);
+	sha2_step2(e,f,g,h,a,b,c,d,in,12,0x90BEFFFA);
+	sha2_step2(d,e,f,g,h,a,b,c,in,13,0xA4506CEB);
+	sha2_step2(c,d,e,f,g,h,a,b,in,14,0xBEF9A3F7);
+	sha2_step2(b,c,d,e,f,g,h,a,in,15,0xC67178F2);
+
+	r[0] = r[0] + a;
+	r[1] = r[1] + b;
+	r[2] = r[2] + c;
+	r[3] = r[3] + d;
+	r[4] = r[4] + e;
+	r[5] = r[5] + f;
+	r[6] = r[6] + g;
+	r[7] = r[7] + h;
+}
+
+
+__global__ void __launch_bounds__(512,2) sha256_gpu_hash_64(int threads, uint32_t *g_hash)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads) {
+		uint32_t in[16], in2[16], buf[8];
+		uint32_t* inout = &g_hash[thread<<4];
+
+		#pragma unroll
+		for (int i = 0; i < 8; i++) buf[i] = H256[i];
+
+		#pragma unroll
+		for (int i = 0; i < 16; i++) in[i] = cuda_swab32(inout[i]);
+		sha2_round_body(in,buf);
+
+		in2[0] = 0x80000000;
+		#pragma unroll
+		for (int i = 1 ; i < 15; i++) in2[i] = 0;
+		in2[15] = 0x200;
+		sha2_round_body(in2,buf);
+
+		#pragma unroll
+		for (int i = 0; i < 8; i++) inout[i] = cuda_swab32(buf[i]);
+	}
+}
+
+
+__host__
+void sha256_cpu_hash_64(int thr_id, int threads, uint32_t *d_hash) {
+	const int threadsperblock = 512;
+	dim3 grid(threads/threadsperblock);
+	dim3 block(threadsperblock);
+	sha256_gpu_hash_64<<<grid, block>>>(threads, d_hash);
+}
diff --git a/x21/cuda_tiger192.cu b/x21/cuda_tiger192.cu
new file mode 100644
index 0000000000..c60c57634d
--- /dev/null
+++ b/x21/cuda_tiger192.cu
@@ -0,0 +1,761 @@
+/*
+ * tiger-192 djm34
+ *
+ */
+
+/*
+ * tiger-192 kernel implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2014 djm34
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author phm
+ */
+#include <stdio.h>
+#include "cuda_runtime.h"
+#include "device_launch_parameters.h"
+
+
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include "cuda_helper.h"
+
+#define SPH_C64(x) (x ## ULL)
+#define SPH_C32(x) (x ## U)
+#define SPH_T64(x) (x)
+
+
+__constant__ const uint64_t III[3] = {
+	SPH_C64(0x0123456789ABCDEF),SPH_C64(0xFEDCBA9876543210),SPH_C64(0xF096A5B4C3B2E187)
+};
+
+
+__constant__ const uint64_t T1[256] = {
+	SPH_C64(0x02AAB17CF7E90C5E), SPH_C64(0xAC424B03E243A8EC),
+	SPH_C64(0x72CD5BE30DD5FCD3), SPH_C64(0x6D019B93F6F97F3A),
+	SPH_C64(0xCD9978FFD21F9193), SPH_C64(0x7573A1C9708029E2),
+	SPH_C64(0xB164326B922A83C3), SPH_C64(0x46883EEE04915870),
+	SPH_C64(0xEAACE3057103ECE6), SPH_C64(0xC54169B808A3535C),
+	SPH_C64(0x4CE754918DDEC47C), SPH_C64(0x0AA2F4DFDC0DF40C),
+	SPH_C64(0x10B76F18A74DBEFA), SPH_C64(0xC6CCB6235AD1AB6A),
+	SPH_C64(0x13726121572FE2FF), SPH_C64(0x1A488C6F199D921E),
+	SPH_C64(0x4BC9F9F4DA0007CA), SPH_C64(0x26F5E6F6E85241C7),
+	SPH_C64(0x859079DBEA5947B6), SPH_C64(0x4F1885C5C99E8C92),
+	SPH_C64(0xD78E761EA96F864B), SPH_C64(0x8E36428C52B5C17D),
+	SPH_C64(0x69CF6827373063C1), SPH_C64(0xB607C93D9BB4C56E),
+	SPH_C64(0x7D820E760E76B5EA), SPH_C64(0x645C9CC6F07FDC42),
+	SPH_C64(0xBF38A078243342E0), SPH_C64(0x5F6B343C9D2E7D04),
+	SPH_C64(0xF2C28AEB600B0EC6), SPH_C64(0x6C0ED85F7254BCAC),
+	SPH_C64(0x71592281A4DB4FE5), SPH_C64(0x1967FA69CE0FED9F),
+	SPH_C64(0xFD5293F8B96545DB), SPH_C64(0xC879E9D7F2A7600B),
+	SPH_C64(0x860248920193194E), SPH_C64(0xA4F9533B2D9CC0B3),
+	SPH_C64(0x9053836C15957613), SPH_C64(0xDB6DCF8AFC357BF1),
+	SPH_C64(0x18BEEA7A7A370F57), SPH_C64(0x037117CA50B99066),
+	SPH_C64(0x6AB30A9774424A35), SPH_C64(0xF4E92F02E325249B),
+	SPH_C64(0x7739DB07061CCAE1), SPH_C64(0xD8F3B49CECA42A05),
+	SPH_C64(0xBD56BE3F51382F73), SPH_C64(0x45FAED5843B0BB28),
+	SPH_C64(0x1C813D5C11BF1F83), SPH_C64(0x8AF0E4B6D75FA169),
+	SPH_C64(0x33EE18A487AD9999), SPH_C64(0x3C26E8EAB1C94410),
+	SPH_C64(0xB510102BC0A822F9), SPH_C64(0x141EEF310CE6123B),
+	SPH_C64(0xFC65B90059DDB154), SPH_C64(0xE0158640C5E0E607),
+	SPH_C64(0x884E079826C3A3CF), SPH_C64(0x930D0D9523C535FD),
+	SPH_C64(0x35638D754E9A2B00), SPH_C64(0x4085FCCF40469DD5),
+	SPH_C64(0xC4B17AD28BE23A4C), SPH_C64(0xCAB2F0FC6A3E6A2E),
+	SPH_C64(0x2860971A6B943FCD), SPH_C64(0x3DDE6EE212E30446),
+	SPH_C64(0x6222F32AE01765AE), SPH_C64(0x5D550BB5478308FE),
+	SPH_C64(0xA9EFA98DA0EDA22A), SPH_C64(0xC351A71686C40DA7),
+	SPH_C64(0x1105586D9C867C84), SPH_C64(0xDCFFEE85FDA22853),
+	SPH_C64(0xCCFBD0262C5EEF76), SPH_C64(0xBAF294CB8990D201),
+	SPH_C64(0xE69464F52AFAD975), SPH_C64(0x94B013AFDF133E14),
+	SPH_C64(0x06A7D1A32823C958), SPH_C64(0x6F95FE5130F61119),
+	SPH_C64(0xD92AB34E462C06C0), SPH_C64(0xED7BDE33887C71D2),
+	SPH_C64(0x79746D6E6518393E), SPH_C64(0x5BA419385D713329),
+	SPH_C64(0x7C1BA6B948A97564), SPH_C64(0x31987C197BFDAC67),
+	SPH_C64(0xDE6C23C44B053D02), SPH_C64(0x581C49FED002D64D),
+	SPH_C64(0xDD474D6338261571), SPH_C64(0xAA4546C3E473D062),
+	SPH_C64(0x928FCE349455F860), SPH_C64(0x48161BBACAAB94D9),
+	SPH_C64(0x63912430770E6F68), SPH_C64(0x6EC8A5E602C6641C),
+	SPH_C64(0x87282515337DDD2B), SPH_C64(0x2CDA6B42034B701B),
+	SPH_C64(0xB03D37C181CB096D), SPH_C64(0xE108438266C71C6F),
+	SPH_C64(0x2B3180C7EB51B255), SPH_C64(0xDF92B82F96C08BBC),
+	SPH_C64(0x5C68C8C0A632F3BA), SPH_C64(0x5504CC861C3D0556),
+	SPH_C64(0xABBFA4E55FB26B8F), SPH_C64(0x41848B0AB3BACEB4),
+	SPH_C64(0xB334A273AA445D32), SPH_C64(0xBCA696F0A85AD881),
+	SPH_C64(0x24F6EC65B528D56C),
SPH_C64(0x0CE1512E90F4524A), + SPH_C64(0x4E9DD79D5506D35A), SPH_C64(0x258905FAC6CE9779), + SPH_C64(0x2019295B3E109B33), SPH_C64(0xF8A9478B73A054CC), + SPH_C64(0x2924F2F934417EB0), SPH_C64(0x3993357D536D1BC4), + SPH_C64(0x38A81AC21DB6FF8B), SPH_C64(0x47C4FBF17D6016BF), + SPH_C64(0x1E0FAADD7667E3F5), SPH_C64(0x7ABCFF62938BEB96), + SPH_C64(0xA78DAD948FC179C9), SPH_C64(0x8F1F98B72911E50D), + SPH_C64(0x61E48EAE27121A91), SPH_C64(0x4D62F7AD31859808), + SPH_C64(0xECEBA345EF5CEAEB), SPH_C64(0xF5CEB25EBC9684CE), + SPH_C64(0xF633E20CB7F76221), SPH_C64(0xA32CDF06AB8293E4), + SPH_C64(0x985A202CA5EE2CA4), SPH_C64(0xCF0B8447CC8A8FB1), + SPH_C64(0x9F765244979859A3), SPH_C64(0xA8D516B1A1240017), + SPH_C64(0x0BD7BA3EBB5DC726), SPH_C64(0xE54BCA55B86ADB39), + SPH_C64(0x1D7A3AFD6C478063), SPH_C64(0x519EC608E7669EDD), + SPH_C64(0x0E5715A2D149AA23), SPH_C64(0x177D4571848FF194), + SPH_C64(0xEEB55F3241014C22), SPH_C64(0x0F5E5CA13A6E2EC2), + SPH_C64(0x8029927B75F5C361), SPH_C64(0xAD139FABC3D6E436), + SPH_C64(0x0D5DF1A94CCF402F), SPH_C64(0x3E8BD948BEA5DFC8), + SPH_C64(0xA5A0D357BD3FF77E), SPH_C64(0xA2D12E251F74F645), + SPH_C64(0x66FD9E525E81A082), SPH_C64(0x2E0C90CE7F687A49), + SPH_C64(0xC2E8BCBEBA973BC5), SPH_C64(0x000001BCE509745F), + SPH_C64(0x423777BBE6DAB3D6), SPH_C64(0xD1661C7EAEF06EB5), + SPH_C64(0xA1781F354DAACFD8), SPH_C64(0x2D11284A2B16AFFC), + SPH_C64(0xF1FC4F67FA891D1F), SPH_C64(0x73ECC25DCB920ADA), + SPH_C64(0xAE610C22C2A12651), SPH_C64(0x96E0A810D356B78A), + SPH_C64(0x5A9A381F2FE7870F), SPH_C64(0xD5AD62EDE94E5530), + SPH_C64(0xD225E5E8368D1427), SPH_C64(0x65977B70C7AF4631), + SPH_C64(0x99F889B2DE39D74F), SPH_C64(0x233F30BF54E1D143), + SPH_C64(0x9A9675D3D9A63C97), SPH_C64(0x5470554FF334F9A8), + SPH_C64(0x166ACB744A4F5688), SPH_C64(0x70C74CAAB2E4AEAD), + SPH_C64(0xF0D091646F294D12), SPH_C64(0x57B82A89684031D1), + SPH_C64(0xEFD95A5A61BE0B6B), SPH_C64(0x2FBD12E969F2F29A), + SPH_C64(0x9BD37013FEFF9FE8), SPH_C64(0x3F9B0404D6085A06), + SPH_C64(0x4940C1F3166CFE15), SPH_C64(0x09542C4DCDF3DEFB), + SPH_C64(0xB4C5218385CD5CE3), SPH_C64(0xC935B7DC4462A641), + SPH_C64(0x3417F8A68ED3B63F), SPH_C64(0xB80959295B215B40), + SPH_C64(0xF99CDAEF3B8C8572), SPH_C64(0x018C0614F8FCB95D), + SPH_C64(0x1B14ACCD1A3ACDF3), SPH_C64(0x84D471F200BB732D), + SPH_C64(0xC1A3110E95E8DA16), SPH_C64(0x430A7220BF1A82B8), + SPH_C64(0xB77E090D39DF210E), SPH_C64(0x5EF4BD9F3CD05E9D), + SPH_C64(0x9D4FF6DA7E57A444), SPH_C64(0xDA1D60E183D4A5F8), + SPH_C64(0xB287C38417998E47), SPH_C64(0xFE3EDC121BB31886), + SPH_C64(0xC7FE3CCC980CCBEF), SPH_C64(0xE46FB590189BFD03), + SPH_C64(0x3732FD469A4C57DC), SPH_C64(0x7EF700A07CF1AD65), + SPH_C64(0x59C64468A31D8859), SPH_C64(0x762FB0B4D45B61F6), + SPH_C64(0x155BAED099047718), SPH_C64(0x68755E4C3D50BAA6), + SPH_C64(0xE9214E7F22D8B4DF), SPH_C64(0x2ADDBF532EAC95F4), + SPH_C64(0x32AE3909B4BD0109), SPH_C64(0x834DF537B08E3450), + SPH_C64(0xFA209DA84220728D), SPH_C64(0x9E691D9B9EFE23F7), + SPH_C64(0x0446D288C4AE8D7F), SPH_C64(0x7B4CC524E169785B), + SPH_C64(0x21D87F0135CA1385), SPH_C64(0xCEBB400F137B8AA5), + SPH_C64(0x272E2B66580796BE), SPH_C64(0x3612264125C2B0DE), + SPH_C64(0x057702BDAD1EFBB2), SPH_C64(0xD4BABB8EACF84BE9), + SPH_C64(0x91583139641BC67B), SPH_C64(0x8BDC2DE08036E024), + SPH_C64(0x603C8156F49F68ED), SPH_C64(0xF7D236F7DBEF5111), + SPH_C64(0x9727C4598AD21E80), SPH_C64(0xA08A0896670A5FD7), + SPH_C64(0xCB4A8F4309EBA9CB), SPH_C64(0x81AF564B0F7036A1), + SPH_C64(0xC0B99AA778199ABD), SPH_C64(0x959F1EC83FC8E952), + SPH_C64(0x8C505077794A81B9), SPH_C64(0x3ACAAF8F056338F0), + SPH_C64(0x07B43F50627A6778), 
SPH_C64(0x4A44AB49F5ECCC77), + SPH_C64(0x3BC3D6E4B679EE98), SPH_C64(0x9CC0D4D1CF14108C), + SPH_C64(0x4406C00B206BC8A0), SPH_C64(0x82A18854C8D72D89), + SPH_C64(0x67E366B35C3C432C), SPH_C64(0xB923DD61102B37F2), + SPH_C64(0x56AB2779D884271D), SPH_C64(0xBE83E1B0FF1525AF), + SPH_C64(0xFB7C65D4217E49A9), SPH_C64(0x6BDBE0E76D48E7D4), + SPH_C64(0x08DF828745D9179E), SPH_C64(0x22EA6A9ADD53BD34), + SPH_C64(0xE36E141C5622200A), SPH_C64(0x7F805D1B8CB750EE), + SPH_C64(0xAFE5C7A59F58E837), SPH_C64(0xE27F996A4FB1C23C), + SPH_C64(0xD3867DFB0775F0D0), SPH_C64(0xD0E673DE6E88891A), + SPH_C64(0x123AEB9EAFB86C25), SPH_C64(0x30F1D5D5C145B895), + SPH_C64(0xBB434A2DEE7269E7), SPH_C64(0x78CB67ECF931FA38), + SPH_C64(0xF33B0372323BBF9C), SPH_C64(0x52D66336FB279C74), + SPH_C64(0x505F33AC0AFB4EAA), SPH_C64(0xE8A5CD99A2CCE187), + SPH_C64(0x534974801E2D30BB), SPH_C64(0x8D2D5711D5876D90), + SPH_C64(0x1F1A412891BC038E), SPH_C64(0xD6E2E71D82E56648), + SPH_C64(0x74036C3A497732B7), SPH_C64(0x89B67ED96361F5AB), + SPH_C64(0xFFED95D8F1EA02A2), SPH_C64(0xE72B3BD61464D43D), + SPH_C64(0xA6300F170BDC4820), SPH_C64(0xEBC18760ED78A77A) +}; + +__constant__ const uint64_t T2[256] = { + SPH_C64(0xE6A6BE5A05A12138), SPH_C64(0xB5A122A5B4F87C98), + SPH_C64(0x563C6089140B6990), SPH_C64(0x4C46CB2E391F5DD5), + SPH_C64(0xD932ADDBC9B79434), SPH_C64(0x08EA70E42015AFF5), + SPH_C64(0xD765A6673E478CF1), SPH_C64(0xC4FB757EAB278D99), + SPH_C64(0xDF11C6862D6E0692), SPH_C64(0xDDEB84F10D7F3B16), + SPH_C64(0x6F2EF604A665EA04), SPH_C64(0x4A8E0F0FF0E0DFB3), + SPH_C64(0xA5EDEEF83DBCBA51), SPH_C64(0xFC4F0A2A0EA4371E), + SPH_C64(0xE83E1DA85CB38429), SPH_C64(0xDC8FF882BA1B1CE2), + SPH_C64(0xCD45505E8353E80D), SPH_C64(0x18D19A00D4DB0717), + SPH_C64(0x34A0CFEDA5F38101), SPH_C64(0x0BE77E518887CAF2), + SPH_C64(0x1E341438B3C45136), SPH_C64(0xE05797F49089CCF9), + SPH_C64(0xFFD23F9DF2591D14), SPH_C64(0x543DDA228595C5CD), + SPH_C64(0x661F81FD99052A33), SPH_C64(0x8736E641DB0F7B76), + SPH_C64(0x15227725418E5307), SPH_C64(0xE25F7F46162EB2FA), + SPH_C64(0x48A8B2126C13D9FE), SPH_C64(0xAFDC541792E76EEA), + SPH_C64(0x03D912BFC6D1898F), SPH_C64(0x31B1AAFA1B83F51B), + SPH_C64(0xF1AC2796E42AB7D9), SPH_C64(0x40A3A7D7FCD2EBAC), + SPH_C64(0x1056136D0AFBBCC5), SPH_C64(0x7889E1DD9A6D0C85), + SPH_C64(0xD33525782A7974AA), SPH_C64(0xA7E25D09078AC09B), + SPH_C64(0xBD4138B3EAC6EDD0), SPH_C64(0x920ABFBE71EB9E70), + SPH_C64(0xA2A5D0F54FC2625C), SPH_C64(0xC054E36B0B1290A3), + SPH_C64(0xF6DD59FF62FE932B), SPH_C64(0x3537354511A8AC7D), + SPH_C64(0xCA845E9172FADCD4), SPH_C64(0x84F82B60329D20DC), + SPH_C64(0x79C62CE1CD672F18), SPH_C64(0x8B09A2ADD124642C), + SPH_C64(0xD0C1E96A19D9E726), SPH_C64(0x5A786A9B4BA9500C), + SPH_C64(0x0E020336634C43F3), SPH_C64(0xC17B474AEB66D822), + SPH_C64(0x6A731AE3EC9BAAC2), SPH_C64(0x8226667AE0840258), + SPH_C64(0x67D4567691CAECA5), SPH_C64(0x1D94155C4875ADB5), + SPH_C64(0x6D00FD985B813FDF), SPH_C64(0x51286EFCB774CD06), + SPH_C64(0x5E8834471FA744AF), SPH_C64(0xF72CA0AEE761AE2E), + SPH_C64(0xBE40E4CDAEE8E09A), SPH_C64(0xE9970BBB5118F665), + SPH_C64(0x726E4BEB33DF1964), SPH_C64(0x703B000729199762), + SPH_C64(0x4631D816F5EF30A7), SPH_C64(0xB880B5B51504A6BE), + SPH_C64(0x641793C37ED84B6C), SPH_C64(0x7B21ED77F6E97D96), + SPH_C64(0x776306312EF96B73), SPH_C64(0xAE528948E86FF3F4), + SPH_C64(0x53DBD7F286A3F8F8), SPH_C64(0x16CADCE74CFC1063), + SPH_C64(0x005C19BDFA52C6DD), SPH_C64(0x68868F5D64D46AD3), + SPH_C64(0x3A9D512CCF1E186A), SPH_C64(0x367E62C2385660AE), + SPH_C64(0xE359E7EA77DCB1D7), SPH_C64(0x526C0773749ABE6E), + SPH_C64(0x735AE5F9D09F734B), SPH_C64(0x493FC7CC8A558BA8), 
+ SPH_C64(0xB0B9C1533041AB45), SPH_C64(0x321958BA470A59BD), + SPH_C64(0x852DB00B5F46C393), SPH_C64(0x91209B2BD336B0E5), + SPH_C64(0x6E604F7D659EF19F), SPH_C64(0xB99A8AE2782CCB24), + SPH_C64(0xCCF52AB6C814C4C7), SPH_C64(0x4727D9AFBE11727B), + SPH_C64(0x7E950D0C0121B34D), SPH_C64(0x756F435670AD471F), + SPH_C64(0xF5ADD442615A6849), SPH_C64(0x4E87E09980B9957A), + SPH_C64(0x2ACFA1DF50AEE355), SPH_C64(0xD898263AFD2FD556), + SPH_C64(0xC8F4924DD80C8FD6), SPH_C64(0xCF99CA3D754A173A), + SPH_C64(0xFE477BACAF91BF3C), SPH_C64(0xED5371F6D690C12D), + SPH_C64(0x831A5C285E687094), SPH_C64(0xC5D3C90A3708A0A4), + SPH_C64(0x0F7F903717D06580), SPH_C64(0x19F9BB13B8FDF27F), + SPH_C64(0xB1BD6F1B4D502843), SPH_C64(0x1C761BA38FFF4012), + SPH_C64(0x0D1530C4E2E21F3B), SPH_C64(0x8943CE69A7372C8A), + SPH_C64(0xE5184E11FEB5CE66), SPH_C64(0x618BDB80BD736621), + SPH_C64(0x7D29BAD68B574D0B), SPH_C64(0x81BB613E25E6FE5B), + SPH_C64(0x071C9C10BC07913F), SPH_C64(0xC7BEEB7909AC2D97), + SPH_C64(0xC3E58D353BC5D757), SPH_C64(0xEB017892F38F61E8), + SPH_C64(0xD4EFFB9C9B1CC21A), SPH_C64(0x99727D26F494F7AB), + SPH_C64(0xA3E063A2956B3E03), SPH_C64(0x9D4A8B9A4AA09C30), + SPH_C64(0x3F6AB7D500090FB4), SPH_C64(0x9CC0F2A057268AC0), + SPH_C64(0x3DEE9D2DEDBF42D1), SPH_C64(0x330F49C87960A972), + SPH_C64(0xC6B2720287421B41), SPH_C64(0x0AC59EC07C00369C), + SPH_C64(0xEF4EAC49CB353425), SPH_C64(0xF450244EEF0129D8), + SPH_C64(0x8ACC46E5CAF4DEB6), SPH_C64(0x2FFEAB63989263F7), + SPH_C64(0x8F7CB9FE5D7A4578), SPH_C64(0x5BD8F7644E634635), + SPH_C64(0x427A7315BF2DC900), SPH_C64(0x17D0C4AA2125261C), + SPH_C64(0x3992486C93518E50), SPH_C64(0xB4CBFEE0A2D7D4C3), + SPH_C64(0x7C75D6202C5DDD8D), SPH_C64(0xDBC295D8E35B6C61), + SPH_C64(0x60B369D302032B19), SPH_C64(0xCE42685FDCE44132), + SPH_C64(0x06F3DDB9DDF65610), SPH_C64(0x8EA4D21DB5E148F0), + SPH_C64(0x20B0FCE62FCD496F), SPH_C64(0x2C1B912358B0EE31), + SPH_C64(0xB28317B818F5A308), SPH_C64(0xA89C1E189CA6D2CF), + SPH_C64(0x0C6B18576AAADBC8), SPH_C64(0xB65DEAA91299FAE3), + SPH_C64(0xFB2B794B7F1027E7), SPH_C64(0x04E4317F443B5BEB), + SPH_C64(0x4B852D325939D0A6), SPH_C64(0xD5AE6BEEFB207FFC), + SPH_C64(0x309682B281C7D374), SPH_C64(0xBAE309A194C3B475), + SPH_C64(0x8CC3F97B13B49F05), SPH_C64(0x98A9422FF8293967), + SPH_C64(0x244B16B01076FF7C), SPH_C64(0xF8BF571C663D67EE), + SPH_C64(0x1F0D6758EEE30DA1), SPH_C64(0xC9B611D97ADEB9B7), + SPH_C64(0xB7AFD5887B6C57A2), SPH_C64(0x6290AE846B984FE1), + SPH_C64(0x94DF4CDEACC1A5FD), SPH_C64(0x058A5BD1C5483AFF), + SPH_C64(0x63166CC142BA3C37), SPH_C64(0x8DB8526EB2F76F40), + SPH_C64(0xE10880036F0D6D4E), SPH_C64(0x9E0523C9971D311D), + SPH_C64(0x45EC2824CC7CD691), SPH_C64(0x575B8359E62382C9), + SPH_C64(0xFA9E400DC4889995), SPH_C64(0xD1823ECB45721568), + SPH_C64(0xDAFD983B8206082F), SPH_C64(0xAA7D29082386A8CB), + SPH_C64(0x269FCD4403B87588), SPH_C64(0x1B91F5F728BDD1E0), + SPH_C64(0xE4669F39040201F6), SPH_C64(0x7A1D7C218CF04ADE), + SPH_C64(0x65623C29D79CE5CE), SPH_C64(0x2368449096C00BB1), + SPH_C64(0xAB9BF1879DA503BA), SPH_C64(0xBC23ECB1A458058E), + SPH_C64(0x9A58DF01BB401ECC), SPH_C64(0xA070E868A85F143D), + SPH_C64(0x4FF188307DF2239E), SPH_C64(0x14D565B41A641183), + SPH_C64(0xEE13337452701602), SPH_C64(0x950E3DCF3F285E09), + SPH_C64(0x59930254B9C80953), SPH_C64(0x3BF299408930DA6D), + SPH_C64(0xA955943F53691387), SPH_C64(0xA15EDECAA9CB8784), + SPH_C64(0x29142127352BE9A0), SPH_C64(0x76F0371FFF4E7AFB), + SPH_C64(0x0239F450274F2228), SPH_C64(0xBB073AF01D5E868B), + SPH_C64(0xBFC80571C10E96C1), SPH_C64(0xD267088568222E23), + SPH_C64(0x9671A3D48E80B5B0), SPH_C64(0x55B5D38AE193BB81), + 
SPH_C64(0x693AE2D0A18B04B8), SPH_C64(0x5C48B4ECADD5335F), + SPH_C64(0xFD743B194916A1CA), SPH_C64(0x2577018134BE98C4), + SPH_C64(0xE77987E83C54A4AD), SPH_C64(0x28E11014DA33E1B9), + SPH_C64(0x270CC59E226AA213), SPH_C64(0x71495F756D1A5F60), + SPH_C64(0x9BE853FB60AFEF77), SPH_C64(0xADC786A7F7443DBF), + SPH_C64(0x0904456173B29A82), SPH_C64(0x58BC7A66C232BD5E), + SPH_C64(0xF306558C673AC8B2), SPH_C64(0x41F639C6B6C9772A), + SPH_C64(0x216DEFE99FDA35DA), SPH_C64(0x11640CC71C7BE615), + SPH_C64(0x93C43694565C5527), SPH_C64(0xEA038E6246777839), + SPH_C64(0xF9ABF3CE5A3E2469), SPH_C64(0x741E768D0FD312D2), + SPH_C64(0x0144B883CED652C6), SPH_C64(0xC20B5A5BA33F8552), + SPH_C64(0x1AE69633C3435A9D), SPH_C64(0x97A28CA4088CFDEC), + SPH_C64(0x8824A43C1E96F420), SPH_C64(0x37612FA66EEEA746), + SPH_C64(0x6B4CB165F9CF0E5A), SPH_C64(0x43AA1C06A0ABFB4A), + SPH_C64(0x7F4DC26FF162796B), SPH_C64(0x6CBACC8E54ED9B0F), + SPH_C64(0xA6B7FFEFD2BB253E), SPH_C64(0x2E25BC95B0A29D4F), + SPH_C64(0x86D6A58BDEF1388C), SPH_C64(0xDED74AC576B6F054), + SPH_C64(0x8030BDBC2B45805D), SPH_C64(0x3C81AF70E94D9289), + SPH_C64(0x3EFF6DDA9E3100DB), SPH_C64(0xB38DC39FDFCC8847), + SPH_C64(0x123885528D17B87E), SPH_C64(0xF2DA0ED240B1B642), + SPH_C64(0x44CEFADCD54BF9A9), SPH_C64(0x1312200E433C7EE6), + SPH_C64(0x9FFCC84F3A78C748), SPH_C64(0xF0CD1F72248576BB), + SPH_C64(0xEC6974053638CFE4), SPH_C64(0x2BA7B67C0CEC4E4C), + SPH_C64(0xAC2F4DF3E5CE32ED), SPH_C64(0xCB33D14326EA4C11), + SPH_C64(0xA4E9044CC77E58BC), SPH_C64(0x5F513293D934FCEF), + SPH_C64(0x5DC9645506E55444), SPH_C64(0x50DE418F317DE40A), + SPH_C64(0x388CB31A69DDE259), SPH_C64(0x2DB4A83455820A86), + SPH_C64(0x9010A91E84711AE9), SPH_C64(0x4DF7F0B7B1498371), + SPH_C64(0xD62A2EABC0977179), SPH_C64(0x22FAC097AA8D5C0E) +}; + +__constant__ const uint64_t T3[256] = { + SPH_C64(0xF49FCC2FF1DAF39B), SPH_C64(0x487FD5C66FF29281), + SPH_C64(0xE8A30667FCDCA83F), SPH_C64(0x2C9B4BE3D2FCCE63), + SPH_C64(0xDA3FF74B93FBBBC2), SPH_C64(0x2FA165D2FE70BA66), + SPH_C64(0xA103E279970E93D4), SPH_C64(0xBECDEC77B0E45E71), + SPH_C64(0xCFB41E723985E497), SPH_C64(0xB70AAA025EF75017), + SPH_C64(0xD42309F03840B8E0), SPH_C64(0x8EFC1AD035898579), + SPH_C64(0x96C6920BE2B2ABC5), SPH_C64(0x66AF4163375A9172), + SPH_C64(0x2174ABDCCA7127FB), SPH_C64(0xB33CCEA64A72FF41), + SPH_C64(0xF04A4933083066A5), SPH_C64(0x8D970ACDD7289AF5), + SPH_C64(0x8F96E8E031C8C25E), SPH_C64(0xF3FEC02276875D47), + SPH_C64(0xEC7BF310056190DD), SPH_C64(0xF5ADB0AEBB0F1491), + SPH_C64(0x9B50F8850FD58892), SPH_C64(0x4975488358B74DE8), + SPH_C64(0xA3354FF691531C61), SPH_C64(0x0702BBE481D2C6EE), + SPH_C64(0x89FB24057DEDED98), SPH_C64(0xAC3075138596E902), + SPH_C64(0x1D2D3580172772ED), SPH_C64(0xEB738FC28E6BC30D), + SPH_C64(0x5854EF8F63044326), SPH_C64(0x9E5C52325ADD3BBE), + SPH_C64(0x90AA53CF325C4623), SPH_C64(0xC1D24D51349DD067), + SPH_C64(0x2051CFEEA69EA624), SPH_C64(0x13220F0A862E7E4F), + SPH_C64(0xCE39399404E04864), SPH_C64(0xD9C42CA47086FCB7), + SPH_C64(0x685AD2238A03E7CC), SPH_C64(0x066484B2AB2FF1DB), + SPH_C64(0xFE9D5D70EFBF79EC), SPH_C64(0x5B13B9DD9C481854), + SPH_C64(0x15F0D475ED1509AD), SPH_C64(0x0BEBCD060EC79851), + SPH_C64(0xD58C6791183AB7F8), SPH_C64(0xD1187C5052F3EEE4), + SPH_C64(0xC95D1192E54E82FF), SPH_C64(0x86EEA14CB9AC6CA2), + SPH_C64(0x3485BEB153677D5D), SPH_C64(0xDD191D781F8C492A), + SPH_C64(0xF60866BAA784EBF9), SPH_C64(0x518F643BA2D08C74), + SPH_C64(0x8852E956E1087C22), SPH_C64(0xA768CB8DC410AE8D), + SPH_C64(0x38047726BFEC8E1A), SPH_C64(0xA67738B4CD3B45AA), + SPH_C64(0xAD16691CEC0DDE19), SPH_C64(0xC6D4319380462E07), + SPH_C64(0xC5A5876D0BA61938), 
SPH_C64(0x16B9FA1FA58FD840), + SPH_C64(0x188AB1173CA74F18), SPH_C64(0xABDA2F98C99C021F), + SPH_C64(0x3E0580AB134AE816), SPH_C64(0x5F3B05B773645ABB), + SPH_C64(0x2501A2BE5575F2F6), SPH_C64(0x1B2F74004E7E8BA9), + SPH_C64(0x1CD7580371E8D953), SPH_C64(0x7F6ED89562764E30), + SPH_C64(0xB15926FF596F003D), SPH_C64(0x9F65293DA8C5D6B9), + SPH_C64(0x6ECEF04DD690F84C), SPH_C64(0x4782275FFF33AF88), + SPH_C64(0xE41433083F820801), SPH_C64(0xFD0DFE409A1AF9B5), + SPH_C64(0x4325A3342CDB396B), SPH_C64(0x8AE77E62B301B252), + SPH_C64(0xC36F9E9F6655615A), SPH_C64(0x85455A2D92D32C09), + SPH_C64(0xF2C7DEA949477485), SPH_C64(0x63CFB4C133A39EBA), + SPH_C64(0x83B040CC6EBC5462), SPH_C64(0x3B9454C8FDB326B0), + SPH_C64(0x56F56A9E87FFD78C), SPH_C64(0x2DC2940D99F42BC6), + SPH_C64(0x98F7DF096B096E2D), SPH_C64(0x19A6E01E3AD852BF), + SPH_C64(0x42A99CCBDBD4B40B), SPH_C64(0xA59998AF45E9C559), + SPH_C64(0x366295E807D93186), SPH_C64(0x6B48181BFAA1F773), + SPH_C64(0x1FEC57E2157A0A1D), SPH_C64(0x4667446AF6201AD5), + SPH_C64(0xE615EBCACFB0F075), SPH_C64(0xB8F31F4F68290778), + SPH_C64(0x22713ED6CE22D11E), SPH_C64(0x3057C1A72EC3C93B), + SPH_C64(0xCB46ACC37C3F1F2F), SPH_C64(0xDBB893FD02AAF50E), + SPH_C64(0x331FD92E600B9FCF), SPH_C64(0xA498F96148EA3AD6), + SPH_C64(0xA8D8426E8B6A83EA), SPH_C64(0xA089B274B7735CDC), + SPH_C64(0x87F6B3731E524A11), SPH_C64(0x118808E5CBC96749), + SPH_C64(0x9906E4C7B19BD394), SPH_C64(0xAFED7F7E9B24A20C), + SPH_C64(0x6509EADEEB3644A7), SPH_C64(0x6C1EF1D3E8EF0EDE), + SPH_C64(0xB9C97D43E9798FB4), SPH_C64(0xA2F2D784740C28A3), + SPH_C64(0x7B8496476197566F), SPH_C64(0x7A5BE3E6B65F069D), + SPH_C64(0xF96330ED78BE6F10), SPH_C64(0xEEE60DE77A076A15), + SPH_C64(0x2B4BEE4AA08B9BD0), SPH_C64(0x6A56A63EC7B8894E), + SPH_C64(0x02121359BA34FEF4), SPH_C64(0x4CBF99F8283703FC), + SPH_C64(0x398071350CAF30C8), SPH_C64(0xD0A77A89F017687A), + SPH_C64(0xF1C1A9EB9E423569), SPH_C64(0x8C7976282DEE8199), + SPH_C64(0x5D1737A5DD1F7ABD), SPH_C64(0x4F53433C09A9FA80), + SPH_C64(0xFA8B0C53DF7CA1D9), SPH_C64(0x3FD9DCBC886CCB77), + SPH_C64(0xC040917CA91B4720), SPH_C64(0x7DD00142F9D1DCDF), + SPH_C64(0x8476FC1D4F387B58), SPH_C64(0x23F8E7C5F3316503), + SPH_C64(0x032A2244E7E37339), SPH_C64(0x5C87A5D750F5A74B), + SPH_C64(0x082B4CC43698992E), SPH_C64(0xDF917BECB858F63C), + SPH_C64(0x3270B8FC5BF86DDA), SPH_C64(0x10AE72BB29B5DD76), + SPH_C64(0x576AC94E7700362B), SPH_C64(0x1AD112DAC61EFB8F), + SPH_C64(0x691BC30EC5FAA427), SPH_C64(0xFF246311CC327143), + SPH_C64(0x3142368E30E53206), SPH_C64(0x71380E31E02CA396), + SPH_C64(0x958D5C960AAD76F1), SPH_C64(0xF8D6F430C16DA536), + SPH_C64(0xC8FFD13F1BE7E1D2), SPH_C64(0x7578AE66004DDBE1), + SPH_C64(0x05833F01067BE646), SPH_C64(0xBB34B5AD3BFE586D), + SPH_C64(0x095F34C9A12B97F0), SPH_C64(0x247AB64525D60CA8), + SPH_C64(0xDCDBC6F3017477D1), SPH_C64(0x4A2E14D4DECAD24D), + SPH_C64(0xBDB5E6D9BE0A1EEB), SPH_C64(0x2A7E70F7794301AB), + SPH_C64(0xDEF42D8A270540FD), SPH_C64(0x01078EC0A34C22C1), + SPH_C64(0xE5DE511AF4C16387), SPH_C64(0x7EBB3A52BD9A330A), + SPH_C64(0x77697857AA7D6435), SPH_C64(0x004E831603AE4C32), + SPH_C64(0xE7A21020AD78E312), SPH_C64(0x9D41A70C6AB420F2), + SPH_C64(0x28E06C18EA1141E6), SPH_C64(0xD2B28CBD984F6B28), + SPH_C64(0x26B75F6C446E9D83), SPH_C64(0xBA47568C4D418D7F), + SPH_C64(0xD80BADBFE6183D8E), SPH_C64(0x0E206D7F5F166044), + SPH_C64(0xE258A43911CBCA3E), SPH_C64(0x723A1746B21DC0BC), + SPH_C64(0xC7CAA854F5D7CDD3), SPH_C64(0x7CAC32883D261D9C), + SPH_C64(0x7690C26423BA942C), SPH_C64(0x17E55524478042B8), + SPH_C64(0xE0BE477656A2389F), SPH_C64(0x4D289B5E67AB2DA0), + SPH_C64(0x44862B9C8FBBFD31), 
SPH_C64(0xB47CC8049D141365), + SPH_C64(0x822C1B362B91C793), SPH_C64(0x4EB14655FB13DFD8), + SPH_C64(0x1ECBBA0714E2A97B), SPH_C64(0x6143459D5CDE5F14), + SPH_C64(0x53A8FBF1D5F0AC89), SPH_C64(0x97EA04D81C5E5B00), + SPH_C64(0x622181A8D4FDB3F3), SPH_C64(0xE9BCD341572A1208), + SPH_C64(0x1411258643CCE58A), SPH_C64(0x9144C5FEA4C6E0A4), + SPH_C64(0x0D33D06565CF620F), SPH_C64(0x54A48D489F219CA1), + SPH_C64(0xC43E5EAC6D63C821), SPH_C64(0xA9728B3A72770DAF), + SPH_C64(0xD7934E7B20DF87EF), SPH_C64(0xE35503B61A3E86E5), + SPH_C64(0xCAE321FBC819D504), SPH_C64(0x129A50B3AC60BFA6), + SPH_C64(0xCD5E68EA7E9FB6C3), SPH_C64(0xB01C90199483B1C7), + SPH_C64(0x3DE93CD5C295376C), SPH_C64(0xAED52EDF2AB9AD13), + SPH_C64(0x2E60F512C0A07884), SPH_C64(0xBC3D86A3E36210C9), + SPH_C64(0x35269D9B163951CE), SPH_C64(0x0C7D6E2AD0CDB5FA), + SPH_C64(0x59E86297D87F5733), SPH_C64(0x298EF221898DB0E7), + SPH_C64(0x55000029D1A5AA7E), SPH_C64(0x8BC08AE1B5061B45), + SPH_C64(0xC2C31C2B6C92703A), SPH_C64(0x94CC596BAF25EF42), + SPH_C64(0x0A1D73DB22540456), SPH_C64(0x04B6A0F9D9C4179A), + SPH_C64(0xEFFDAFA2AE3D3C60), SPH_C64(0xF7C8075BB49496C4), + SPH_C64(0x9CC5C7141D1CD4E3), SPH_C64(0x78BD1638218E5534), + SPH_C64(0xB2F11568F850246A), SPH_C64(0xEDFABCFA9502BC29), + SPH_C64(0x796CE5F2DA23051B), SPH_C64(0xAAE128B0DC93537C), + SPH_C64(0x3A493DA0EE4B29AE), SPH_C64(0xB5DF6B2C416895D7), + SPH_C64(0xFCABBD25122D7F37), SPH_C64(0x70810B58105DC4B1), + SPH_C64(0xE10FDD37F7882A90), SPH_C64(0x524DCAB5518A3F5C), + SPH_C64(0x3C9E85878451255B), SPH_C64(0x4029828119BD34E2), + SPH_C64(0x74A05B6F5D3CECCB), SPH_C64(0xB610021542E13ECA), + SPH_C64(0x0FF979D12F59E2AC), SPH_C64(0x6037DA27E4F9CC50), + SPH_C64(0x5E92975A0DF1847D), SPH_C64(0xD66DE190D3E623FE), + SPH_C64(0x5032D6B87B568048), SPH_C64(0x9A36B7CE8235216E), + SPH_C64(0x80272A7A24F64B4A), SPH_C64(0x93EFED8B8C6916F7), + SPH_C64(0x37DDBFF44CCE1555), SPH_C64(0x4B95DB5D4B99BD25), + SPH_C64(0x92D3FDA169812FC0), SPH_C64(0xFB1A4A9A90660BB6), + SPH_C64(0x730C196946A4B9B2), SPH_C64(0x81E289AA7F49DA68), + SPH_C64(0x64669A0F83B1A05F), SPH_C64(0x27B3FF7D9644F48B), + SPH_C64(0xCC6B615C8DB675B3), SPH_C64(0x674F20B9BCEBBE95), + SPH_C64(0x6F31238275655982), SPH_C64(0x5AE488713E45CF05), + SPH_C64(0xBF619F9954C21157), SPH_C64(0xEABAC46040A8EAE9), + SPH_C64(0x454C6FE9F2C0C1CD), SPH_C64(0x419CF6496412691C), + SPH_C64(0xD3DC3BEF265B0F70), SPH_C64(0x6D0E60F5C3578A9E) +}; + +__constant__ const uint64_t T4[256] = { + SPH_C64(0x5B0E608526323C55), SPH_C64(0x1A46C1A9FA1B59F5), + SPH_C64(0xA9E245A17C4C8FFA), SPH_C64(0x65CA5159DB2955D7), + SPH_C64(0x05DB0A76CE35AFC2), SPH_C64(0x81EAC77EA9113D45), + SPH_C64(0x528EF88AB6AC0A0D), SPH_C64(0xA09EA253597BE3FF), + SPH_C64(0x430DDFB3AC48CD56), SPH_C64(0xC4B3A67AF45CE46F), + SPH_C64(0x4ECECFD8FBE2D05E), SPH_C64(0x3EF56F10B39935F0), + SPH_C64(0x0B22D6829CD619C6), SPH_C64(0x17FD460A74DF2069), + SPH_C64(0x6CF8CC8E8510ED40), SPH_C64(0xD6C824BF3A6ECAA7), + SPH_C64(0x61243D581A817049), SPH_C64(0x048BACB6BBC163A2), + SPH_C64(0xD9A38AC27D44CC32), SPH_C64(0x7FDDFF5BAAF410AB), + SPH_C64(0xAD6D495AA804824B), SPH_C64(0xE1A6A74F2D8C9F94), + SPH_C64(0xD4F7851235DEE8E3), SPH_C64(0xFD4B7F886540D893), + SPH_C64(0x247C20042AA4BFDA), SPH_C64(0x096EA1C517D1327C), + SPH_C64(0xD56966B4361A6685), SPH_C64(0x277DA5C31221057D), + SPH_C64(0x94D59893A43ACFF7), SPH_C64(0x64F0C51CCDC02281), + SPH_C64(0x3D33BCC4FF6189DB), SPH_C64(0xE005CB184CE66AF1), + SPH_C64(0xFF5CCD1D1DB99BEA), SPH_C64(0xB0B854A7FE42980F), + SPH_C64(0x7BD46A6A718D4B9F), SPH_C64(0xD10FA8CC22A5FD8C), + SPH_C64(0xD31484952BE4BD31), SPH_C64(0xC7FA975FCB243847), 
+ SPH_C64(0x4886ED1E5846C407), SPH_C64(0x28CDDB791EB70B04), + SPH_C64(0xC2B00BE2F573417F), SPH_C64(0x5C9590452180F877), + SPH_C64(0x7A6BDDFFF370EB00), SPH_C64(0xCE509E38D6D9D6A4), + SPH_C64(0xEBEB0F00647FA702), SPH_C64(0x1DCC06CF76606F06), + SPH_C64(0xE4D9F28BA286FF0A), SPH_C64(0xD85A305DC918C262), + SPH_C64(0x475B1D8732225F54), SPH_C64(0x2D4FB51668CCB5FE), + SPH_C64(0xA679B9D9D72BBA20), SPH_C64(0x53841C0D912D43A5), + SPH_C64(0x3B7EAA48BF12A4E8), SPH_C64(0x781E0E47F22F1DDF), + SPH_C64(0xEFF20CE60AB50973), SPH_C64(0x20D261D19DFFB742), + SPH_C64(0x16A12B03062A2E39), SPH_C64(0x1960EB2239650495), + SPH_C64(0x251C16FED50EB8B8), SPH_C64(0x9AC0C330F826016E), + SPH_C64(0xED152665953E7671), SPH_C64(0x02D63194A6369570), + SPH_C64(0x5074F08394B1C987), SPH_C64(0x70BA598C90B25CE1), + SPH_C64(0x794A15810B9742F6), SPH_C64(0x0D5925E9FCAF8C6C), + SPH_C64(0x3067716CD868744E), SPH_C64(0x910AB077E8D7731B), + SPH_C64(0x6A61BBDB5AC42F61), SPH_C64(0x93513EFBF0851567), + SPH_C64(0xF494724B9E83E9D5), SPH_C64(0xE887E1985C09648D), + SPH_C64(0x34B1D3C675370CFD), SPH_C64(0xDC35E433BC0D255D), + SPH_C64(0xD0AAB84234131BE0), SPH_C64(0x08042A50B48B7EAF), + SPH_C64(0x9997C4EE44A3AB35), SPH_C64(0x829A7B49201799D0), + SPH_C64(0x263B8307B7C54441), SPH_C64(0x752F95F4FD6A6CA6), + SPH_C64(0x927217402C08C6E5), SPH_C64(0x2A8AB754A795D9EE), + SPH_C64(0xA442F7552F72943D), SPH_C64(0x2C31334E19781208), + SPH_C64(0x4FA98D7CEAEE6291), SPH_C64(0x55C3862F665DB309), + SPH_C64(0xBD0610175D53B1F3), SPH_C64(0x46FE6CB840413F27), + SPH_C64(0x3FE03792DF0CFA59), SPH_C64(0xCFE700372EB85E8F), + SPH_C64(0xA7BE29E7ADBCE118), SPH_C64(0xE544EE5CDE8431DD), + SPH_C64(0x8A781B1B41F1873E), SPH_C64(0xA5C94C78A0D2F0E7), + SPH_C64(0x39412E2877B60728), SPH_C64(0xA1265EF3AFC9A62C), + SPH_C64(0xBCC2770C6A2506C5), SPH_C64(0x3AB66DD5DCE1CE12), + SPH_C64(0xE65499D04A675B37), SPH_C64(0x7D8F523481BFD216), + SPH_C64(0x0F6F64FCEC15F389), SPH_C64(0x74EFBE618B5B13C8), + SPH_C64(0xACDC82B714273E1D), SPH_C64(0xDD40BFE003199D17), + SPH_C64(0x37E99257E7E061F8), SPH_C64(0xFA52626904775AAA), + SPH_C64(0x8BBBF63A463D56F9), SPH_C64(0xF0013F1543A26E64), + SPH_C64(0xA8307E9F879EC898), SPH_C64(0xCC4C27A4150177CC), + SPH_C64(0x1B432F2CCA1D3348), SPH_C64(0xDE1D1F8F9F6FA013), + SPH_C64(0x606602A047A7DDD6), SPH_C64(0xD237AB64CC1CB2C7), + SPH_C64(0x9B938E7225FCD1D3), SPH_C64(0xEC4E03708E0FF476), + SPH_C64(0xFEB2FBDA3D03C12D), SPH_C64(0xAE0BCED2EE43889A), + SPH_C64(0x22CB8923EBFB4F43), SPH_C64(0x69360D013CF7396D), + SPH_C64(0x855E3602D2D4E022), SPH_C64(0x073805BAD01F784C), + SPH_C64(0x33E17A133852F546), SPH_C64(0xDF4874058AC7B638), + SPH_C64(0xBA92B29C678AA14A), SPH_C64(0x0CE89FC76CFAADCD), + SPH_C64(0x5F9D4E0908339E34), SPH_C64(0xF1AFE9291F5923B9), + SPH_C64(0x6E3480F60F4A265F), SPH_C64(0xEEBF3A2AB29B841C), + SPH_C64(0xE21938A88F91B4AD), SPH_C64(0x57DFEFF845C6D3C3), + SPH_C64(0x2F006B0BF62CAAF2), SPH_C64(0x62F479EF6F75EE78), + SPH_C64(0x11A55AD41C8916A9), SPH_C64(0xF229D29084FED453), + SPH_C64(0x42F1C27B16B000E6), SPH_C64(0x2B1F76749823C074), + SPH_C64(0x4B76ECA3C2745360), SPH_C64(0x8C98F463B91691BD), + SPH_C64(0x14BCC93CF1ADE66A), SPH_C64(0x8885213E6D458397), + SPH_C64(0x8E177DF0274D4711), SPH_C64(0xB49B73B5503F2951), + SPH_C64(0x10168168C3F96B6B), SPH_C64(0x0E3D963B63CAB0AE), + SPH_C64(0x8DFC4B5655A1DB14), SPH_C64(0xF789F1356E14DE5C), + SPH_C64(0x683E68AF4E51DAC1), SPH_C64(0xC9A84F9D8D4B0FD9), + SPH_C64(0x3691E03F52A0F9D1), SPH_C64(0x5ED86E46E1878E80), + SPH_C64(0x3C711A0E99D07150), SPH_C64(0x5A0865B20C4E9310), + SPH_C64(0x56FBFC1FE4F0682E), SPH_C64(0xEA8D5DE3105EDF9B), + 
SPH_C64(0x71ABFDB12379187A), SPH_C64(0x2EB99DE1BEE77B9C), + SPH_C64(0x21ECC0EA33CF4523), SPH_C64(0x59A4D7521805C7A1), + SPH_C64(0x3896F5EB56AE7C72), SPH_C64(0xAA638F3DB18F75DC), + SPH_C64(0x9F39358DABE9808E), SPH_C64(0xB7DEFA91C00B72AC), + SPH_C64(0x6B5541FD62492D92), SPH_C64(0x6DC6DEE8F92E4D5B), + SPH_C64(0x353F57ABC4BEEA7E), SPH_C64(0x735769D6DA5690CE), + SPH_C64(0x0A234AA642391484), SPH_C64(0xF6F9508028F80D9D), + SPH_C64(0xB8E319A27AB3F215), SPH_C64(0x31AD9C1151341A4D), + SPH_C64(0x773C22A57BEF5805), SPH_C64(0x45C7561A07968633), + SPH_C64(0xF913DA9E249DBE36), SPH_C64(0xDA652D9B78A64C68), + SPH_C64(0x4C27A97F3BC334EF), SPH_C64(0x76621220E66B17F4), + SPH_C64(0x967743899ACD7D0B), SPH_C64(0xF3EE5BCAE0ED6782), + SPH_C64(0x409F753600C879FC), SPH_C64(0x06D09A39B5926DB6), + SPH_C64(0x6F83AEB0317AC588), SPH_C64(0x01E6CA4A86381F21), + SPH_C64(0x66FF3462D19F3025), SPH_C64(0x72207C24DDFD3BFB), + SPH_C64(0x4AF6B6D3E2ECE2EB), SPH_C64(0x9C994DBEC7EA08DE), + SPH_C64(0x49ACE597B09A8BC4), SPH_C64(0xB38C4766CF0797BA), + SPH_C64(0x131B9373C57C2A75), SPH_C64(0xB1822CCE61931E58), + SPH_C64(0x9D7555B909BA1C0C), SPH_C64(0x127FAFDD937D11D2), + SPH_C64(0x29DA3BADC66D92E4), SPH_C64(0xA2C1D57154C2ECBC), + SPH_C64(0x58C5134D82F6FE24), SPH_C64(0x1C3AE3515B62274F), + SPH_C64(0xE907C82E01CB8126), SPH_C64(0xF8ED091913E37FCB), + SPH_C64(0x3249D8F9C80046C9), SPH_C64(0x80CF9BEDE388FB63), + SPH_C64(0x1881539A116CF19E), SPH_C64(0x5103F3F76BD52457), + SPH_C64(0x15B7E6F5AE47F7A8), SPH_C64(0xDBD7C6DED47E9CCF), + SPH_C64(0x44E55C410228BB1A), SPH_C64(0xB647D4255EDB4E99), + SPH_C64(0x5D11882BB8AAFC30), SPH_C64(0xF5098BBB29D3212A), + SPH_C64(0x8FB5EA14E90296B3), SPH_C64(0x677B942157DD025A), + SPH_C64(0xFB58E7C0A390ACB5), SPH_C64(0x89D3674C83BD4A01), + SPH_C64(0x9E2DA4DF4BF3B93B), SPH_C64(0xFCC41E328CAB4829), + SPH_C64(0x03F38C96BA582C52), SPH_C64(0xCAD1BDBD7FD85DB2), + SPH_C64(0xBBB442C16082AE83), SPH_C64(0xB95FE86BA5DA9AB0), + SPH_C64(0xB22E04673771A93F), SPH_C64(0x845358C9493152D8), + SPH_C64(0xBE2A488697B4541E), SPH_C64(0x95A2DC2DD38E6966), + SPH_C64(0xC02C11AC923C852B), SPH_C64(0x2388B1990DF2A87B), + SPH_C64(0x7C8008FA1B4F37BE), SPH_C64(0x1F70D0C84D54E503), + SPH_C64(0x5490ADEC7ECE57D4), SPH_C64(0x002B3C27D9063A3A), + SPH_C64(0x7EAEA3848030A2BF), SPH_C64(0xC602326DED2003C0), + SPH_C64(0x83A7287D69A94086), SPH_C64(0xC57A5FCB30F57A8A), + SPH_C64(0xB56844E479EBE779), SPH_C64(0xA373B40F05DCBCE9), + SPH_C64(0xD71A786E88570EE2), SPH_C64(0x879CBACDBDE8F6A0), + SPH_C64(0x976AD1BCC164A32F), SPH_C64(0xAB21E25E9666D78B), + SPH_C64(0x901063AAE5E5C33C), SPH_C64(0x9818B34448698D90), + SPH_C64(0xE36487AE3E1E8ABB), SPH_C64(0xAFBDF931893BDCB4), + SPH_C64(0x6345A0DC5FBBD519), SPH_C64(0x8628FE269B9465CA), + SPH_C64(0x1E5D01603F9C51EC), SPH_C64(0x4DE44006A15049B7), + SPH_C64(0xBF6C70E5F776CBB1), SPH_C64(0x411218F2EF552BED), + SPH_C64(0xCB0C0708705A36A3), SPH_C64(0xE74D14754F986044), + SPH_C64(0xCD56D9430EA8280E), SPH_C64(0xC12591D7535F5065), + SPH_C64(0xC83223F1720AEF96), SPH_C64(0xC3A0396F7363A51F) +}; + + +#define BYTE(x, n) __byte_perm(((uint32_t*)&(x))[(n) / 4], 0, 0x4440 + ((n) % 4)) + +#define ROUND(a, b, c, x, mul) { \ + uint64_t t0, t1; \ + c ^= x; \ + t0 = sharedMem[BYTE(c, 0)] ^ sharedMem[BYTE(c, 2)+256] ^ sharedMem[BYTE(c, 4)+512] ^ __ldg(&T4[BYTE(c, 6)]); \ + t1 = sharedMem[BYTE(c, 7)] ^ sharedMem[BYTE(c, 5)+256] ^ sharedMem[BYTE(c, 3)+512] ^ __ldg(&T4[BYTE(c, 1)]); \ + a -= t0; \ + b += t1; \ + b *= mul; \ +} + + +#define PASS(a, b, c, mul) { \ + ROUND(a, b, c, X0, mul); \ + ROUND(b, c, a, X1, mul); \ + ROUND(c, a, b, X2, mul); \ + 
ROUND(a, b, c, X3, mul); \ + ROUND(b, c, a, X4, mul); \ + ROUND(c, a, b, X5, mul); \ + ROUND(a, b, c, X6, mul); \ + ROUND(b, c, a, X7, mul); \ + } + + +#define KSCHED { \ + X0 = SPH_T64(X0 - (X7 ^ SPH_C64(0xA5A5A5A5A5A5A5A5))); \ + X1 ^= X0; \ + X2 = SPH_T64(X2 + X1); \ + X3 = SPH_T64(X3 - (X2 ^ (~X1 << 19))); \ + X4 ^= X3; \ + X5 = SPH_T64(X5 + X4); \ + X6 = SPH_T64(X6 - (X5 ^ (~X4 >> 23))); \ + X7 ^= X6; \ + X0 = SPH_T64(X0 + X7); \ + X1 = SPH_T64(X1 - (X0 ^ (~X7 << 19))); \ + X2 ^= X1; \ + X3 = SPH_T64(X3 + X2); \ + X4 = SPH_T64(X4 - (X3 ^ (~X2 >> 23))); \ + X5 ^= X4; \ + X6 = SPH_T64(X6 + X5); \ + X7 = SPH_T64(X7 - (X6 ^ SPH_C64(0x0123456789ABCDEF))); \ + } + + +#define TIGER_ROUND_BODY(in, r) { \ + uint64_t A, B, C; \ + uint64_t X0, X1, X2, X3, X4, X5, X6, X7; \ + \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + \ + X0 = (in[0]); \ + X1 = (in[1]); \ + X2 = (in[2]); \ + X3 = (in[3]); \ + X4 = (in[4]); \ + X5 = (in[5]); \ + X6 = (in[6]); \ + X7 = (in[7]); \ + PASS(A, B, C, 5); \ + KSCHED; \ + PASS(C, A, B, 7); \ + KSCHED; \ + PASS(B, C, A, 9); \ + \ + (r)[0] ^= A; \ + (r)[1] = SPH_T64(B - (r)[1]); \ + (r)[2] = SPH_T64(C + (r)[2]); \ + } + + +__global__ void __launch_bounds__(256,5) tiger192_gpu_hash_64(int threads, int zero_pad_64, uint32_t *d_hash) +{ + __shared__ uint64_t sharedMem[768]; +// if(threadIdx.x < 256) + { + sharedMem[threadIdx.x] = T1[threadIdx.x]; + sharedMem[threadIdx.x+256] = T2[threadIdx.x]; + sharedMem[threadIdx.x+512] = T3[threadIdx.x]; + //sharedMem[threadIdx.x+768] = T4[threadIdx.x]; + } + __syncthreads(); + + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) { + uint64_t* inout = (uint64_t*)&d_hash[thread<<4]; + uint64_t buf[3], in[8], in2[8]; + + #pragma unroll + for (int i = 0; i < 8; i++) in[i] = inout[i]; + + #pragma unroll + for (int i = 0; i < 3; i++) buf[i] = III[i]; + + TIGER_ROUND_BODY(in, buf); + + in2[0] = 1; + #pragma unroll + for (int i = 1 ; i < 7; i++) in2[i] = 0; + in2[7] = 0x200; + TIGER_ROUND_BODY(in2, buf); + + #pragma unroll + for (int i = 0; i < 3; i++) inout[i] = buf[i]; + if (zero_pad_64) + { + #pragma unroll + for (int i = 3; i < 8; i++) inout[i] = 0; + } + } +} + +__constant__ uint64_t c_PaddedMessage80[10]; + +__global__ void __launch_bounds__(256,5) tiger192_gpu_hash_80(int threads, uint32_t startNonce, uint32_t *d_hash) +{ + __shared__ uint64_t sharedMem[768]; +// if(threadIdx.x < 256) + { + sharedMem[threadIdx.x] = T1[threadIdx.x]; + sharedMem[threadIdx.x+256] = T2[threadIdx.x]; + sharedMem[threadIdx.x+512] = T3[threadIdx.x]; + //sharedMem[threadIdx.x+768] = T4[threadIdx.x]; + } + __syncthreads(); + + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) { + uint64_t* out = (uint64_t*)&d_hash[thread<<4]; + uint64_t buf[3], in[8], in2[8]; + + const uint32_t nonce = cuda_swab32(startNonce + thread); + + #pragma unroll + for (int i = 0; i < 8; i++) in[i] = c_PaddedMessage80[i]; + + #pragma unroll + for (int i = 0; i < 3; i++) buf[i] = III[i]; + + TIGER_ROUND_BODY(in, buf); + + in2[0] = c_PaddedMessage80[8]; + in2[1] = (((uint64_t) nonce) << 32) | (c_PaddedMessage80[9] & 0xffffffff); + in2[2] = 1; + #pragma unroll + for (int i = 3; i < 7; i++) in2[i] = 0; + in2[7] = 0x280; + + TIGER_ROUND_BODY(in2, buf); + + #pragma unroll + for (int i = 0; i < 3; i++) out[i] = buf[i]; + #pragma unroll + for (int i = 3; i < 8; i++) out[i] = 0; + } +} + +__host__ void tiger192_cpu_hash_64(int thr_id, int threads, int zero_pad_64, uint32_t *d_hash) +{ + const int threadsperblock = 256; + dim3 
grid(threads/threadsperblock);
+	dim3 block(threadsperblock);
+	tiger192_gpu_hash_64<<<grid, block>>>(threads, zero_pad_64, d_hash);
+}
+
+__host__
+void tiger192_setBlock_80(void *pdata)
+{
+	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+__host__ void tiger192_cpu_hash_80(int thr_id, int threads, uint32_t startNonce, uint32_t *d_hash)
+{
+	const int threadsperblock = 256;
+	dim3 grid(threads/threadsperblock);
+	dim3 block(threadsperblock);
+	tiger192_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_hash);
+}
\ No newline at end of file
diff --git a/x21/x21s.cu b/x21/x21s.cu
new file mode 100644
index 0000000000..a722c3ba57
--- /dev/null
+++ b/x21/x21s.cu
@@ -0,0 +1,658 @@
+/**
+ * X21S algorithm (X16S + 5/6 algorithms from X22i)
+ * penfold 2018
+ *
+ * Based on tpruvot 2018 + SUQA x22i - GPL code
+ */
+
+#include <stdio.h>
+#include <memory.h>
+#include <unistd.h>
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_shabal.h"
+#include "sph/sph_whirlpool.h"
+#include "sph/sph_sha2.h"
+
+#include "sph/sph_haval.h"
+#include "sph/sph_tiger.h"
+#include "lyra2/Lyra2.h"
+#include "sph/sph_streebog.h"
+}
+
+#include "miner.h"
+#include "cuda_helper.h"
+#include "x16/cuda_x16.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+static uint64_t* d_matrix[MAX_GPUS];
+
+extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
+extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen);
+
+extern void streebog_cpu_hash_64_alexis(int thr_id, uint32_t threads, uint32_t *d_hash);
+
+extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
+extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order);
+
+extern void tiger192_cpu_hash_64(int thr_id, int threads, int zero_pad_64, uint32_t *d_hash);
+extern void sha256_cpu_hash_64(int thr_id, int threads, uint32_t *d_hash);
+
+enum Algo {
+	BLAKE = 0,
+	BMW,
+	GROESTL,
+	JH,
+	KECCAK,
+	SKEIN,
+	LUFFA,
+	CUBEHASH,
+	SHAVITE,
+	SIMD,
+	ECHO,
+	HAMSI,
+	FUGUE,
+	SHABAL,
+	WHIRLPOOL,
+	SHA512,
+	HASH_FUNC_COUNT
+};
+
+static const char* algo_strings[] = {
+	"blake",
+	"bmw512",
+	"groestl",
+	"jh512",
+	"keccak",
+	"skein",
+	"luffa",
+	"cube",
+	"shavite",
+	"simd",
+	"echo",
+	"hamsi",
+	"fugue",
+	"shabal",
+	"whirlpool",
+	"sha512",
+	NULL
+};
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
+
+static void getAlgoString(const uint32_t* prevblock, char *output)
+{
+	uint8_t* data = (uint8_t*)prevblock;
+
+	strcpy(output, "0123456789ABCDEF");
+
+	for (uint8_t i = 0; i < HASH_FUNC_COUNT; i++) {
+		uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed
+		uint8_t algoDigit = (i & 1) ?
data[b] & 0xF : data[b] >> 4; + int offset = (int) algoDigit; + char oldVal = output[offset]; + for(int j=offset; j-->0;) + output[j+1] = output[j]; + output[0] = oldVal; + } +} + +// X21S CPU Hash (Validation) +extern "C" void x21s_hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[128]; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + sph_sha512_context ctx_sha512; + sph_haval256_5_context ctx_haval; + sph_tiger_context ctx_tiger; + sph_gost512_context ctx_gost; + sph_sha256_context ctx_sha; + + void *in = (void*) input; + int size = 80; + + uint32_t *in32 = (uint32_t*) input; + getAlgoString(&in32[1], hashOrder); + + for (int i = 0; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, in, size); + sph_luffa512_close(&ctx_luffa, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, in, size); + sph_cubehash512_close(&ctx_cubehash, hash); + break; + case SHAVITE: + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, in, size); + sph_shavite512_close(&ctx_shavite, hash); + break; + case SIMD: + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, in, size); + sph_simd512_close(&ctx_simd, hash); + break; + case ECHO: + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, in, size); + sph_echo512_close(&ctx_echo, hash); + break; + case HAMSI: + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, in, size); + sph_hamsi512_close(&ctx_hamsi, hash); + break; + case FUGUE: + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, in, size); + sph_fugue512_close(&ctx_fugue, hash); + break; + case SHABAL: + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, in, size); + sph_shabal512_close(&ctx_shabal, hash); + break; + case WHIRLPOOL: + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, in, size); + sph_whirlpool_close(&ctx_whirlpool, hash); + break; + case SHA512: + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512,(const void*) in, size); + sph_sha512_close(&ctx_sha512,(void*) hash); + break; + } + in = (void*) hash; + size = 64; + } + + sph_haval256_5_init(&ctx_haval); + 
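+	// x21s tail after the 16 dynamic x16s rounds: haval256 -> tiger192 ->
+	// lyra2 (32-byte key/salt/output) -> streebog/gost512 -> sha256, truncated to 32 bytes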
+	sph_haval256_5_init(&ctx_haval);
+	sph_haval256_5(&ctx_haval, (const void*) hash, 64);
+	sph_haval256_5_close(&ctx_haval, hash);
+
+	sph_tiger_init(&ctx_tiger);
+	sph_tiger(&ctx_tiger, (const void*) hash, 64);
+	sph_tiger_close(&ctx_tiger, (void*) hash);
+
+	LYRA2((void*) hash, 32, (const void*) hash, 32, (const void*) hash, 32, 1, 4, 4);
+
+	sph_gost512_init(&ctx_gost);
+	sph_gost512(&ctx_gost, (const void*) hash, 64);
+	sph_gost512_close(&ctx_gost, (void*) hash);
+
+	sph_sha256_init(&ctx_sha);
+	sph_sha256(&ctx_sha, (const void*) hash, 64);
+	sph_sha256_close(&ctx_sha, (void*) hash);
+
+	memcpy(output, hash, 32);
+}
+
+#if 0 /* in x16r */
+void whirlpool_midstate(void *state, const void *input)
+{
+	sph_whirlpool_context ctx;
+
+	sph_whirlpool_init(&ctx);
+	sph_whirlpool(&ctx, input, 64);
+
+	memcpy(state, ctx.state, 64);
+}
+#endif
+
+static bool init[MAX_GPUS] = { 0 };
+static bool use_compat_kernels[MAX_GPUS] = { 0 };
+
+//#define _DEBUG
+#define _DEBUG_PREFIX "x21s-"
+#include "cuda_debug.cuh"
+
+extern "C" int scanhash_x21s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	const int dev_id = device_map[thr_id];
+	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
+	if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20;
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;
+		// the SM 3.x implementation requires a bit more memory
+		if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500) matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput), -1);
+
+		cuda_get_arch(thr_id);
+		use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
+		if (use_compat_kernels[thr_id])
+			x11_echo512_cpu_init(thr_id, throughput);
+
+		quark_blake512_cpu_init(thr_id, throughput);
+		quark_bmw512_cpu_init(thr_id, throughput);
+		quark_groestl512_cpu_init(thr_id, throughput);
+		quark_skein512_cpu_init(thr_id, throughput);
+		quark_jh512_cpu_init(thr_id, throughput);
+		quark_keccak512_cpu_init(thr_id, throughput);
+		qubit_luffa512_cpu_init(thr_id, throughput);
+		x11_luffa512_cpu_init(thr_id, throughput); // 64
+		x11_shavite512_cpu_init(thr_id, throughput);
+		x11_simd512_cpu_init(thr_id, throughput); // 64
+		x16_echo512_cuda_init(thr_id, throughput);
+		x13_hamsi512_cpu_init(thr_id, throughput);
+		x13_fugue512_cpu_init(thr_id, throughput);
+		x16_fugue512_cpu_init(thr_id, throughput);
+		x14_shabal512_cpu_init(thr_id, throughput);
+		x15_whirlpool_cpu_init(thr_id, throughput, 0);
+		x16_whirlpool512_init(thr_id, throughput);
+		x17_sha512_cpu_init(thr_id, throughput);
+		x17_haval256_cpu_init(thr_id, throughput);
+		lyra2v2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+
+		cuda_check_cpu_init(thr_id, throughput);
+
+		init[thr_id] = true;
+	}
+
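+	// Editor's note (illustrative memory math): the lyra2 scratchpad above
+	// costs 16 * sizeof(uint64_t) * 4 * 3 = 1536 bytes per thread on SM 5.0+,
+	// so intensity 20 (1 << 20 threads) books roughly 1.5 GiB of VRAM, plus
+	// 64 bytes per thread for d_hash; pre-Maxwell devices use the 4*4 layout
+	// (2 KiB per thread, about 2 GiB at the same intensity).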
+	if (opt_benchmark) {
+		((uint32_t*)ptarget)[7] = 0x003f;
+		//((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
+		//((uint8_t*)pdata)[8] = 0x0A; // hashOrder[0] = 'A'; for echo 64
+		//((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64
+		//((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64
+		//((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64
+	}
+	uint32_t _ALIGN(64) endiandata[20];
+
+	for (int k=0; k < 19; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	uint32_t ntime = swab32(pdata[17]);
+	if (s_ntime != ntime) {
+		getAlgoString(&endiandata[1], hashOrder);
+		s_ntime = ntime;
+		if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
+	}
+
+	cuda_check_cpu_setTarget(ptarget);
+
+	char elem = hashOrder[0];
+	const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+	switch (algo80) {
+		case BLAKE:
+			quark_blake512_cpu_setBlock_80(thr_id, endiandata);
+			break;
+		case BMW:
+			quark_bmw512_cpu_setBlock_80(endiandata);
+			break;
+		case GROESTL:
+			groestl512_setBlock_80(thr_id, endiandata);
+			break;
+		case JH:
+			jh512_setBlock_80(thr_id, endiandata);
+			break;
+		case KECCAK:
+			keccak512_setBlock_80(thr_id, endiandata);
+			break;
+		case SKEIN:
+			skein512_cpu_setBlock_80((void*)endiandata);
+			break;
+		case LUFFA:
+			qubit_luffa512_cpu_setBlock_80((void*)endiandata);
+			break;
+		case CUBEHASH:
+			cubehash512_setBlock_80(thr_id, endiandata);
+			break;
+		case SHAVITE:
+			x16_shavite512_setBlock_80((void*)endiandata);
+			break;
+		case SIMD:
+			x16_simd512_setBlock_80((void*)endiandata);
+			break;
+		case ECHO:
+			x16_echo512_setBlock_80((void*)endiandata);
+			break;
+		case HAMSI:
+			x16_hamsi512_setBlock_80((void*)endiandata);
+			break;
+		case FUGUE:
+			x16_fugue512_setBlock_80((void*)pdata);
+			break;
+		case SHABAL:
+			x16_shabal512_setBlock_80((void*)endiandata);
+			break;
+		case WHIRLPOOL:
+			x16_whirlpool512_setBlock_80((void*)endiandata);
+			break;
+		case SHA512:
+			x16_sha512_setBlock_80(endiandata);
+			break;
+		default: {
+			return -1;
+		}
+	}
+
+	int warn = 0;
+
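+	// Editor's note: each pass of the scan loop below runs one 80-byte
+	// first-round kernel keyed by hashOrder[0], fifteen 64-byte rounds picked
+	// by hashOrder[1..15], then the fixed haval/tiger/lyra2/streebog/sha256
+	// tail; cuda_check_hash() finally filters candidates against the target.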
TRACE("hamsi :"); + break; + case FUGUE: + x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("fugue :"); + break; + case SHABAL: + x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("shabal :"); + break; + case WHIRLPOOL: + x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("whirl :"); + break; + case SHA512: + x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + + for (int i = 1; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, NULL, d_hash[thr_id]); order++; + TRACE("keccak :"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + TRACE("cube :"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); + break; + case ECHO: + if (use_compat_kernels[thr_id]) + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + else { + x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++; + } + TRACE("echo :"); + break; + case HAMSI: + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("hamsi :"); + break; + case FUGUE: + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("fugue :"); + break; + case SHABAL: + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case WHIRLPOOL: + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case SHA512: + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + } + + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], 512); order++; + tiger192_cpu_hash_64(thr_id, throughput, 0, d_hash[thr_id]); + lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], (uint64_t*) d_hash[thr_id], order++); + streebog_cpu_hash_64_alexis(thr_id, throughput, d_hash[thr_id]); + sha256_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); + +#ifdef _DEBUG + uint32_t _ALIGN(64) dhash[8]; + be32enc(&endiandata[19], pdata[19]); 
+#ifdef _DEBUG
+		uint32_t _ALIGN(64) dhash[8];
+		be32enc(&endiandata[19], pdata[19]);
+		x21s_hash(dhash, endiandata);
+		applog_hash(dhash);
+		return -1;
+#endif
+		if (work->nonces[0] != UINT32_MAX)
+		{
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			x21s_hash(vhash, endiandata);
+
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					x21s_hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				//gpulog(LOG_INFO, thr_id, "hash found with %s 80 (%s)!", algo_strings[algo80], hashOrder);
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				// x11-family chains can throw a sporadic bad GPU result; retry once before warning
+				gpu_increment_reject(thr_id);
+				if (!warn) {
+					warn++;
+					pdata[19] = work->nonces[0] + 1;
+					continue;
+				} else {
+					if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s",
+						work->nonces[0], algo_strings[algo80], hashOrder);
+					warn = 0;
+				}
+			}
+		}
+
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
+		pdata[19] += throughput;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+	return 0;
+}
+
+// cleanup
+extern "C" void free_x21s(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaDeviceSynchronize(); // cudaThreadSynchronize() is deprecated
+
+	cudaFree(d_matrix[thr_id]);
+	cudaFree(d_hash[thr_id]);
+
+	quark_blake512_cpu_free(thr_id);
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+	x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ?
+	x15_whirlpool_cpu_free(thr_id);
+
+	cuda_check_cpu_free(thr_id);
+
+	cudaDeviceSynchronize();
+	init[thr_id] = false;
+}
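+#if 0
+/* Editor's sketch, illustrative only and not part of the miner: a minimal
+ * CPU self-test for x21s_hash(), assuming the sph/ and Lyra2 sources are
+ * linked; the all-zero header below is made-up input, not real block data. */
+#include <stdio.h>
+int main(void)
+{
+	unsigned char header[80] = { 0 };  // dummy 80-byte block header
+	uint32_t hash[8];                  // x21s_hash() writes 32 bytes
+	x21s_hash(hash, header);           // round order derives from getAlgoString()
+	for (int i = 0; i < 8; i++)
+		printf("%08x", hash[i]);       // print the 256-bit result as hex
+	printf("\n");
+	return 0;
+}
+#endif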